From d1171267a8cf32dbd61ba023e0944b8337ef15dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sebastian=20Kr=C3=BCger?= Date: Mon, 15 Jun 2026 17:09:45 +0200 Subject: [PATCH] feat: scrape tournament meta from Wikipedia, drop world_cup.csv MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add worldcup.meta.json per year with host, teams_count, winner, runner_up, third_place, fourth_place — derived from match results (Final/Third-place match) with infobox as fallback for edge cases like 1950's round-robin final. Fix infobox host extraction to handle <br>
-separated multi-host entries (2002: Japan / South Korea). Fix squad scraper to filter out zero-player phantom sections that Wikipedia appends (References, Captains, etc.). Drop app/data/world_cup.csv and the PLACEMENTS/parseCsv code in seed.ts — all tournament metadata now comes from the scraped JSON files. Co-Authored-By: Claude Sonnet 4.6 --- app/data/wikipedia/1930/worldcup.meta.json | 8 ++ app/data/wikipedia/1934/worldcup.meta.json | 8 ++ app/data/wikipedia/1938/worldcup.meta.json | 8 ++ app/data/wikipedia/1950/worldcup.meta.json | 8 ++ app/data/wikipedia/1954/worldcup.meta.json | 8 ++ app/data/wikipedia/1954/worldcup.squads.json | 16 --- app/data/wikipedia/1958/worldcup.meta.json | 8 ++ app/data/wikipedia/1962/worldcup.meta.json | 8 ++ app/data/wikipedia/1966/worldcup.meta.json | 8 ++ app/data/wikipedia/1970/worldcup.meta.json | 8 ++ app/data/wikipedia/1974/worldcup.meta.json | 8 ++ app/data/wikipedia/1978/worldcup.meta.json | 8 ++ app/data/wikipedia/1982/worldcup.meta.json | 8 ++ app/data/wikipedia/1986/worldcup.meta.json | 8 ++ app/data/wikipedia/1990/worldcup.meta.json | 8 ++ app/data/wikipedia/1990/worldcup.squads.json | 12 -- app/data/wikipedia/1994/worldcup.meta.json | 8 ++ app/data/wikipedia/1994/worldcup.squads.json | 8 -- app/data/wikipedia/1998/worldcup.meta.json | 8 ++ app/data/wikipedia/1998/worldcup.squads.json | 12 -- app/data/wikipedia/2002/worldcup.meta.json | 8 ++ app/data/wikipedia/2002/worldcup.squads.json | 20 --- app/data/wikipedia/2006/worldcup.meta.json | 8 ++ app/data/wikipedia/2006/worldcup.squads.json | 12 -- app/data/wikipedia/2010/worldcup.meta.json | 8 ++ app/data/wikipedia/2010/worldcup.squads.json | 20 --- app/data/wikipedia/2014/worldcup.meta.json | 8 ++ app/data/wikipedia/2014/worldcup.squads.json | 16 --- app/data/wikipedia/2018/worldcup.meta.json | 8 ++ app/data/wikipedia/2018/worldcup.squads.json | 20 --- app/data/wikipedia/2022/worldcup.meta.json | 8 ++ app/data/wikipedia/2022/worldcup.squads.json | 28 ----- 
scripts/scrape-wikipedia.ts | 123 ++++++++++++++++++- scripts/seed.ts | 110 ++++------------- 34 files changed, 319 insertions(+), 254 deletions(-) create mode 100644 app/data/wikipedia/1930/worldcup.meta.json create mode 100644 app/data/wikipedia/1934/worldcup.meta.json create mode 100644 app/data/wikipedia/1938/worldcup.meta.json create mode 100644 app/data/wikipedia/1950/worldcup.meta.json create mode 100644 app/data/wikipedia/1954/worldcup.meta.json create mode 100644 app/data/wikipedia/1958/worldcup.meta.json create mode 100644 app/data/wikipedia/1962/worldcup.meta.json create mode 100644 app/data/wikipedia/1966/worldcup.meta.json create mode 100644 app/data/wikipedia/1970/worldcup.meta.json create mode 100644 app/data/wikipedia/1974/worldcup.meta.json create mode 100644 app/data/wikipedia/1978/worldcup.meta.json create mode 100644 app/data/wikipedia/1982/worldcup.meta.json create mode 100644 app/data/wikipedia/1986/worldcup.meta.json create mode 100644 app/data/wikipedia/1990/worldcup.meta.json create mode 100644 app/data/wikipedia/1994/worldcup.meta.json create mode 100644 app/data/wikipedia/1998/worldcup.meta.json create mode 100644 app/data/wikipedia/2002/worldcup.meta.json create mode 100644 app/data/wikipedia/2006/worldcup.meta.json create mode 100644 app/data/wikipedia/2010/worldcup.meta.json create mode 100644 app/data/wikipedia/2014/worldcup.meta.json create mode 100644 app/data/wikipedia/2018/worldcup.meta.json create mode 100644 app/data/wikipedia/2022/worldcup.meta.json diff --git a/app/data/wikipedia/1930/worldcup.meta.json b/app/data/wikipedia/1930/worldcup.meta.json new file mode 100644 index 0000000..29e0ea2 --- /dev/null +++ b/app/data/wikipedia/1930/worldcup.meta.json @@ -0,0 +1,8 @@ +{ + "host": "Uruguay", + "teams_count": 13, + "winner": "Uruguay", + "runner_up": "Argentina", + "third_place": "United States", + "fourth_place": "Yugoslavia" +} \ No newline at end of file diff --git a/app/data/wikipedia/1934/worldcup.meta.json 
b/app/data/wikipedia/1934/worldcup.meta.json new file mode 100644 index 0000000..5adf54d --- /dev/null +++ b/app/data/wikipedia/1934/worldcup.meta.json @@ -0,0 +1,8 @@ +{ + "host": "Italy", + "teams_count": 16, + "winner": "Italy", + "runner_up": "Czechoslovakia", + "third_place": "Germany", + "fourth_place": "Austria" +} \ No newline at end of file diff --git a/app/data/wikipedia/1938/worldcup.meta.json b/app/data/wikipedia/1938/worldcup.meta.json new file mode 100644 index 0000000..c314da5 --- /dev/null +++ b/app/data/wikipedia/1938/worldcup.meta.json @@ -0,0 +1,8 @@ +{ + "host": "France", + "teams_count": 15, + "winner": "Italy", + "runner_up": "Hungary", + "third_place": "Brazil", + "fourth_place": "Sweden" +} \ No newline at end of file diff --git a/app/data/wikipedia/1950/worldcup.meta.json b/app/data/wikipedia/1950/worldcup.meta.json new file mode 100644 index 0000000..ceb8683 --- /dev/null +++ b/app/data/wikipedia/1950/worldcup.meta.json @@ -0,0 +1,8 @@ +{ + "host": "Brazil", + "teams_count": 13, + "winner": "Uruguay", + "runner_up": "Brazil", + "third_place": "Sweden", + "fourth_place": "Spain" +} \ No newline at end of file diff --git a/app/data/wikipedia/1954/worldcup.meta.json b/app/data/wikipedia/1954/worldcup.meta.json new file mode 100644 index 0000000..0688f05 --- /dev/null +++ b/app/data/wikipedia/1954/worldcup.meta.json @@ -0,0 +1,8 @@ +{ + "host": "Switzerland", + "teams_count": 16, + "winner": "West Germany", + "runner_up": "Hungary", + "third_place": "Austria", + "fourth_place": "Uruguay" +} \ No newline at end of file diff --git a/app/data/wikipedia/1954/worldcup.squads.json b/app/data/wikipedia/1954/worldcup.squads.json index 2cf5585..cef0e08 100644 --- a/app/data/wikipedia/1954/worldcup.squads.json +++ b/app/data/wikipedia/1954/worldcup.squads.json @@ -2178,21 +2178,5 @@ "date_of_birth": "1929-01-07" } ] - }, - { - "name": "Age", - "players": [] - }, - { - "name": "Player representation by club", - "players": [] - }, - { - "name": "Player 
representation by league system", - "players": [] - }, - { - "name": "Coaches representation by country", - "players": [] } ] \ No newline at end of file diff --git a/app/data/wikipedia/1958/worldcup.meta.json b/app/data/wikipedia/1958/worldcup.meta.json new file mode 100644 index 0000000..8cac016 --- /dev/null +++ b/app/data/wikipedia/1958/worldcup.meta.json @@ -0,0 +1,8 @@ +{ + "host": "Sweden", + "teams_count": 16, + "winner": "Brazil", + "runner_up": "Sweden", + "third_place": "France", + "fourth_place": "West Germany" +} \ No newline at end of file diff --git a/app/data/wikipedia/1962/worldcup.meta.json b/app/data/wikipedia/1962/worldcup.meta.json new file mode 100644 index 0000000..0139f8f --- /dev/null +++ b/app/data/wikipedia/1962/worldcup.meta.json @@ -0,0 +1,8 @@ +{ + "host": "Chile", + "teams_count": 16, + "winner": "Brazil", + "runner_up": "Czechoslovakia", + "third_place": "Chile", + "fourth_place": "Yugoslavia" +} \ No newline at end of file diff --git a/app/data/wikipedia/1966/worldcup.meta.json b/app/data/wikipedia/1966/worldcup.meta.json new file mode 100644 index 0000000..5f609ce --- /dev/null +++ b/app/data/wikipedia/1966/worldcup.meta.json @@ -0,0 +1,8 @@ +{ + "host": "England", + "teams_count": 16, + "winner": "England", + "runner_up": "West Germany", + "third_place": "Portugal", + "fourth_place": "Soviet Union" +} \ No newline at end of file diff --git a/app/data/wikipedia/1970/worldcup.meta.json b/app/data/wikipedia/1970/worldcup.meta.json new file mode 100644 index 0000000..b6dc9f0 --- /dev/null +++ b/app/data/wikipedia/1970/worldcup.meta.json @@ -0,0 +1,8 @@ +{ + "host": "Mexico", + "teams_count": 16, + "winner": "Brazil", + "runner_up": "Italy", + "third_place": "West Germany", + "fourth_place": "Uruguay" +} \ No newline at end of file diff --git a/app/data/wikipedia/1974/worldcup.meta.json b/app/data/wikipedia/1974/worldcup.meta.json new file mode 100644 index 0000000..bff3399 --- /dev/null +++ b/app/data/wikipedia/1974/worldcup.meta.json 
@@ -0,0 +1,8 @@ +{ + "host": "West Germany", + "teams_count": 16, + "winner": "West Germany", + "runner_up": "Netherlands", + "third_place": "Poland", + "fourth_place": "Brazil" +} \ No newline at end of file diff --git a/app/data/wikipedia/1978/worldcup.meta.json b/app/data/wikipedia/1978/worldcup.meta.json new file mode 100644 index 0000000..c321770 --- /dev/null +++ b/app/data/wikipedia/1978/worldcup.meta.json @@ -0,0 +1,8 @@ +{ + "host": "Argentina", + "teams_count": 16, + "winner": "Argentina", + "runner_up": "Netherlands", + "third_place": "Brazil", + "fourth_place": "Italy" +} \ No newline at end of file diff --git a/app/data/wikipedia/1982/worldcup.meta.json b/app/data/wikipedia/1982/worldcup.meta.json new file mode 100644 index 0000000..6eb1b5b --- /dev/null +++ b/app/data/wikipedia/1982/worldcup.meta.json @@ -0,0 +1,8 @@ +{ + "host": "Spain", + "teams_count": 24, + "winner": "Italy", + "runner_up": "West Germany", + "third_place": "Poland", + "fourth_place": "France" +} \ No newline at end of file diff --git a/app/data/wikipedia/1986/worldcup.meta.json b/app/data/wikipedia/1986/worldcup.meta.json new file mode 100644 index 0000000..67e8ab4 --- /dev/null +++ b/app/data/wikipedia/1986/worldcup.meta.json @@ -0,0 +1,8 @@ +{ + "host": "Mexico", + "teams_count": 24, + "winner": "Argentina", + "runner_up": "West Germany", + "third_place": "France", + "fourth_place": "Belgium" +} \ No newline at end of file diff --git a/app/data/wikipedia/1990/worldcup.meta.json b/app/data/wikipedia/1990/worldcup.meta.json new file mode 100644 index 0000000..cd5382a --- /dev/null +++ b/app/data/wikipedia/1990/worldcup.meta.json @@ -0,0 +1,8 @@ +{ + "host": "Italy", + "teams_count": 24, + "winner": "West Germany", + "runner_up": "Argentina", + "third_place": "Italy", + "fourth_place": "England" +} \ No newline at end of file diff --git a/app/data/wikipedia/1990/worldcup.squads.json b/app/data/wikipedia/1990/worldcup.squads.json index a13bf2f..125078c 100644 --- 
a/app/data/wikipedia/1990/worldcup.squads.json +++ b/app/data/wikipedia/1990/worldcup.squads.json @@ -3298,17 +3298,5 @@ "date_of_birth": "1956-05-20" } ] - }, - { - "name": "Player", - "players": [] - }, - { - "name": "Captains", - "players": [] - }, - { - "name": "Goalkeepers", - "players": [] } ] \ No newline at end of file diff --git a/app/data/wikipedia/1994/worldcup.meta.json b/app/data/wikipedia/1994/worldcup.meta.json new file mode 100644 index 0000000..6f6d45d --- /dev/null +++ b/app/data/wikipedia/1994/worldcup.meta.json @@ -0,0 +1,8 @@ +{ + "host": "United States", + "teams_count": 24, + "winner": "Brazil", + "runner_up": "Italy", + "third_place": "Sweden", + "fourth_place": "Bulgaria" +} \ No newline at end of file diff --git a/app/data/wikipedia/1994/worldcup.squads.json b/app/data/wikipedia/1994/worldcup.squads.json index 6b77d58..e59adec 100644 --- a/app/data/wikipedia/1994/worldcup.squads.json +++ b/app/data/wikipedia/1994/worldcup.squads.json @@ -3286,13 +3286,5 @@ "date_of_birth": "1972-08-18" } ] - }, - { - "name": "Captains", - "players": [] - }, - { - "name": "Goalkeepers", - "players": [] } ] \ No newline at end of file diff --git a/app/data/wikipedia/1998/worldcup.meta.json b/app/data/wikipedia/1998/worldcup.meta.json new file mode 100644 index 0000000..b9270bf --- /dev/null +++ b/app/data/wikipedia/1998/worldcup.meta.json @@ -0,0 +1,8 @@ +{ + "host": "France", + "teams_count": 32, + "winner": "France", + "runner_up": "Brazil", + "third_place": "Croatia", + "fourth_place": "Netherlands" +} \ No newline at end of file diff --git a/app/data/wikipedia/1998/worldcup.squads.json b/app/data/wikipedia/1998/worldcup.squads.json index f519560..2b5b9a3 100644 --- a/app/data/wikipedia/1998/worldcup.squads.json +++ b/app/data/wikipedia/1998/worldcup.squads.json @@ -4388,17 +4388,5 @@ "date_of_birth": "1974-07-15" } ] - }, - { - "name": "Players", - "players": [] - }, - { - "name": "Captains", - "players": [] - }, - { - "name": "Goalkeepers", - "players": 
[] } ] \ No newline at end of file diff --git a/app/data/wikipedia/2002/worldcup.meta.json b/app/data/wikipedia/2002/worldcup.meta.json new file mode 100644 index 0000000..00a0512 --- /dev/null +++ b/app/data/wikipedia/2002/worldcup.meta.json @@ -0,0 +1,8 @@ +{ + "host": "Japan / South Korea", + "teams_count": 32, + "winner": "Brazil", + "runner_up": "Germany", + "third_place": "Turkey", + "fourth_place": "South Korea" +} \ No newline at end of file diff --git a/app/data/wikipedia/2002/worldcup.squads.json b/app/data/wikipedia/2002/worldcup.squads.json index 87116ad..818f34a 100644 --- a/app/data/wikipedia/2002/worldcup.squads.json +++ b/app/data/wikipedia/2002/worldcup.squads.json @@ -4574,25 +4574,5 @@ "date_of_birth": "1974-03-21" } ] - }, - { - "name": "Players", - "players": [] - }, - { - "name": "Captains", - "players": [] - }, - { - "name": "Goalkeepers", - "players": [] - }, - { - "name": "General references", - "players": [] - }, - { - "name": "Citations and notes", - "players": [] } ] \ No newline at end of file diff --git a/app/data/wikipedia/2006/worldcup.meta.json b/app/data/wikipedia/2006/worldcup.meta.json new file mode 100644 index 0000000..cee3221 --- /dev/null +++ b/app/data/wikipedia/2006/worldcup.meta.json @@ -0,0 +1,8 @@ +{ + "host": "Germany", + "teams_count": 32, + "winner": "Italy", + "runner_up": "France", + "third_place": "Germany", + "fourth_place": "Portugal" +} \ No newline at end of file diff --git a/app/data/wikipedia/2006/worldcup.squads.json b/app/data/wikipedia/2006/worldcup.squads.json index d90db89..45336bd 100644 --- a/app/data/wikipedia/2006/worldcup.squads.json +++ b/app/data/wikipedia/2006/worldcup.squads.json @@ -4574,17 +4574,5 @@ "date_of_birth": "1986-03-04" } ] - }, - { - "name": "Players", - "players": [] - }, - { - "name": "Goalkeeper", - "players": [] - }, - { - "name": "Captains", - "players": [] } ] \ No newline at end of file diff --git a/app/data/wikipedia/2010/worldcup.meta.json 
b/app/data/wikipedia/2010/worldcup.meta.json new file mode 100644 index 0000000..8948c6d --- /dev/null +++ b/app/data/wikipedia/2010/worldcup.meta.json @@ -0,0 +1,8 @@ +{ + "host": "South Africa", + "teams_count": 32, + "winner": "Spain", + "runner_up": "Netherlands", + "third_place": "Germany", + "fourth_place": "Uruguay" +} \ No newline at end of file diff --git a/app/data/wikipedia/2010/worldcup.squads.json b/app/data/wikipedia/2010/worldcup.squads.json index f65697f..7e855a5 100644 --- a/app/data/wikipedia/2010/worldcup.squads.json +++ b/app/data/wikipedia/2010/worldcup.squads.json @@ -4574,25 +4574,5 @@ "date_of_birth": "1991-10-10" } ] - }, - { - "name": "Player representation by age", - "players": [] - }, - { - "name": "Player representation by club", - "players": [] - }, - { - "name": "Player representation by league", - "players": [] - }, - { - "name": "Average age of squads", - "players": [] - }, - { - "name": "Coaches representation by country", - "players": [] } ] \ No newline at end of file diff --git a/app/data/wikipedia/2014/worldcup.meta.json b/app/data/wikipedia/2014/worldcup.meta.json new file mode 100644 index 0000000..8ade189 --- /dev/null +++ b/app/data/wikipedia/2014/worldcup.meta.json @@ -0,0 +1,8 @@ +{ + "host": "Brazil", + "teams_count": 32, + "winner": "Germany", + "runner_up": "Argentina", + "third_place": "Netherlands", + "fourth_place": "Brazil" +} \ No newline at end of file diff --git a/app/data/wikipedia/2014/worldcup.squads.json b/app/data/wikipedia/2014/worldcup.squads.json index c984864..e99ad15 100644 --- a/app/data/wikipedia/2014/worldcup.squads.json +++ b/app/data/wikipedia/2014/worldcup.squads.json @@ -4574,21 +4574,5 @@ "date_of_birth": "1989-04-02" } ] - }, - { - "name": "Player representation by age", - "players": [] - }, - { - "name": "Player representation by league system", - "players": [] - }, - { - "name": "Player representation by club", - "players": [] - }, - { - "name": "Coaches representation by country", - 
"players": [] } ] \ No newline at end of file diff --git a/app/data/wikipedia/2018/worldcup.meta.json b/app/data/wikipedia/2018/worldcup.meta.json new file mode 100644 index 0000000..fedb614 --- /dev/null +++ b/app/data/wikipedia/2018/worldcup.meta.json @@ -0,0 +1,8 @@ +{ + "host": "Russia", + "teams_count": 32, + "winner": "France", + "runner_up": "Croatia", + "third_place": "Belgium", + "fourth_place": "England" +} \ No newline at end of file diff --git a/app/data/wikipedia/2018/worldcup.squads.json b/app/data/wikipedia/2018/worldcup.squads.json index ef809b2..c24f31a 100644 --- a/app/data/wikipedia/2018/worldcup.squads.json +++ b/app/data/wikipedia/2018/worldcup.squads.json @@ -4574,25 +4574,5 @@ "date_of_birth": "1993-09-05" } ] - }, - { - "name": "Age", - "players": [] - }, - { - "name": "Player representation by league system", - "players": [] - }, - { - "name": "Player representation by club", - "players": [] - }, - { - "name": "Player representation by club confederation", - "players": [] - }, - { - "name": "Coaches representation by country", - "players": [] } ] \ No newline at end of file diff --git a/app/data/wikipedia/2022/worldcup.meta.json b/app/data/wikipedia/2022/worldcup.meta.json new file mode 100644 index 0000000..7bc34dc --- /dev/null +++ b/app/data/wikipedia/2022/worldcup.meta.json @@ -0,0 +1,8 @@ +{ + "host": "Qatar", + "teams_count": 32, + "winner": "Argentina", + "runner_up": "France", + "third_place": "Croatia", + "fourth_place": "Morocco" +} \ No newline at end of file diff --git a/app/data/wikipedia/2022/worldcup.squads.json b/app/data/wikipedia/2022/worldcup.squads.json index 304003a..c710e5e 100644 --- a/app/data/wikipedia/2022/worldcup.squads.json +++ b/app/data/wikipedia/2022/worldcup.squads.json @@ -5144,33 +5144,5 @@ "date_of_birth": "1997-03-14" } ] - }, - { - "name": "Age", - "players": [] - }, - { - "name": "Players", - "players": [] - }, - { - "name": "Player representation by league system", - "players": [] - }, - { - "name": 
"Player representation by club", - "players": [] - }, - { - "name": "Player representation by club confederation", - "players": [] - }, - { - "name": "Average age of squads", - "players": [] - }, - { - "name": "Coaches representation by country", - "players": [] } ] \ No newline at end of file diff --git a/scripts/scrape-wikipedia.ts b/scripts/scrape-wikipedia.ts index f1bb6d5..1679843 100644 --- a/scripts/scrape-wikipedia.ts +++ b/scripts/scrape-wikipedia.ts @@ -49,6 +49,14 @@ type Stadium = { name: string; city: string } type Player = { name: string; number?: number; pos?: string; date_of_birth?: string } type Squad = { name: string; players: Player[] } type Group = { name: string; teams: string[] } +type Meta = { + host: string + teams_count: number | null + winner: string | null + runner_up: string | null + third_place: string | null + fourth_place: string | null +} // ── Fetch ────────────────────────────────────────────────────────────────── @@ -309,10 +317,99 @@ function processHeading(text: string, level: number, state: State): void { // ── Main year scraper ────────────────────────────────────────────────────── +// ── Infobox parsing ──────────────────────────────────────────────────────── + +function parseInfobox($: CheerioAPI): Partial { + const result: Partial = {} + + function tdText($td: Cheerio): string { + const $clone = $td.clone() + $clone.find('br').replaceWith(' / ') + $clone.find('sup, img').remove() + return $clone.text().replace(/\[\d+\]/g, '').replace(/\s+/g, ' ').trim() + } + + function tdFirstLink($td: Cheerio): string | null { + let name: string | null = null + $td.find('a').each((_, a) => { + const t = $(a).clone().find('img').remove().end().text().trim() + if (t && !/\[\d+\]/.test(t)) { name = t; return false } + }) + return name ?? 
(tdText($td) || null) + } + + function tdAllLinks($td: Cheerio): string { + const names: string[] = [] + $td.find('a').each((_, a) => { + const t = $(a).clone().find('img').remove().end().text().trim() + if (t && !/\[\d+\]/.test(t)) names.push(t) + }) + return names.length ? names.join(' / ') : tdText($td) + } + + $('table.infobox').first().find('tr').each((_, tr) => { + const $tr = $(tr) + const label = $tr.find('th').text().trim().toLowerCase() + const $td = $tr.find('td').first() + if (!$td.length) return + if (/host countr/i.test(label)) { + result.host = tdAllLinks($td) + } else if (/^teams$/i.test(label)) { + const m = $td.text().match(/\d+/) + if (m) result.teams_count = parseInt(m[0]) + } else if (/champion/i.test(label)) { + result.winner = tdFirstLink($td) + } else if (/runners?.up/i.test(label)) { + result.runner_up = tdFirstLink($td) + } else if (/third.place/i.test(label)) { + result.third_place = tdFirstLink($td) + } else if (/fourth.place/i.test(label)) { + result.fourth_place = tdFirstLink($td) + } + }) + + return result +} + +// ── Placement derivation ─────────────────────────────────────────────────── + +function derivePlacements(matches: Match[]): Pick { + function matchWinner(m: Match): [string, string] | null { + if (!m.score) return null + const [h, a] = m.score.et ?? m.score.ft ?? 
[0, 0] + if (h > a) return [m.team1, m.team2] + if (a > h) return [m.team2, m.team1] + if (m.score.p) { + const [ph, pa] = m.score.p + if (ph > pa) return [m.team1, m.team2] + if (pa > ph) return [m.team2, m.team1] + } + return null + } + + let winner: string | null = null, runner_up: string | null = null + let third_place: string | null = null, fourth_place: string | null = null + + for (const m of matches) { + if (m.round === 'Final') { + const result = matchWinner(m) + if (result) { [winner, runner_up] = result } + } else if (m.round === 'Third-place match') { + const result = matchWinner(m) + if (result) { [third_place, fourth_place] = result } + } + } + + return { winner, runner_up, third_place, fourth_place } +} + +// ── Year result ──────────────────────────────────────────────────────────── + type YearResult = { matches: Match[] stadiums: Map groups: Map> + meta: Meta } async function scrapeYear(year: number, mainHtml: string): Promise { @@ -406,7 +503,18 @@ async function scrapeYear(year: number, mainHtml: string): Promise { process.stdout.write(`[+${page.slice(-8)}] `) } - return { matches, stadiums, groups } + const infobox = parseInfobox($) + const placements = derivePlacements(matches) + const meta: Meta = { + host: infobox.host ?? '', + teams_count: infobox.teams_count ?? null, + winner: placements.winner ?? infobox.winner ?? null, + runner_up: placements.runner_up ?? infobox.runner_up ?? null, + third_place: placements.third_place ?? infobox.third_place ?? null, + fourth_place:placements.fourth_place?? infobox.fourth_place?? 
null, + } + + return { matches, stadiums, groups, meta } } // ── Squad page scraper ───────────────────────────────────────────────────── @@ -465,7 +573,7 @@ function scrapeSquads(html: string): Squad[] { currentTeam.players.push(player) }) - return squads + return squads.filter(s => s.players.length > 0) } // ── Output ───────────────────────────────────────────────────────────────── @@ -476,10 +584,17 @@ function writeOutput( stadiums: Map, groups: Map>, squads: Squad[], + meta: Meta, ): void { const dir = path.join(DATA_DIR, String(year)) mkdirSync(dir, { recursive: true }) + writeFileSync( + path.join(dir, 'worldcup.meta.json'), + JSON.stringify(meta, null, 2), + 'utf-8', + ) + writeFileSync( path.join(dir, 'worldcup.json'), JSON.stringify({ matches }, null, 2), @@ -529,14 +644,14 @@ async function main() { const mainHtml = await fetchWikiHtml(`${year}_FIFA_World_Cup`) if (!mainHtml) { console.log('FAILED'); continue } - const { matches, stadiums, groups } = await scrapeYear(year, mainHtml) + const { matches, stadiums, groups, meta } = await scrapeYear(year, mainHtml) await delay(600) const squadHtml = await fetchWikiHtml(`${year}_FIFA_World_Cup_squads`) const squads = squadHtml ? scrapeSquads(squadHtml) : [] - writeOutput(year, matches, stadiums, groups, squads) + writeOutput(year, matches, stadiums, groups, squads, meta) console.log(`${matches.length} matches, ${stadiums.size} stadiums, ${groups.size} groups, ${squads.length} teams`) diff --git a/scripts/seed.ts b/scripts/seed.ts index c707561..414f359 100644 --- a/scripts/seed.ts +++ b/scripts/seed.ts @@ -8,39 +8,13 @@ import { getIso } from '../lib/iso-codes' const DATABASE_URL = process.env.DATABASE_URL ?? 
'postgres://wc:wc@localhost:5432/worldcup' const __dirname = path.dirname(fileURLToPath(import.meta.url)) -const DATA_DIR = path.join(__dirname, '../app/data') -const WC_DIR = path.join(DATA_DIR, 'wikipedia') +const WC_DIR = path.join(__dirname, '../app/data/wikipedia') const YEARS = [ 1930,1934,1938,1950,1954,1958,1962,1966,1970,1974, 1978,1982,1986,1990,1994,1998,2002,2006,2010,2014,2018,2022, ] -// Third/fourth place not reliably in source data for older years -const PLACEMENTS: Record = { - 1930: { third: 'USA', fourth: 'Yugoslavia' }, - 1934: { third: 'Germany', fourth: 'Austria' }, - 1938: { third: 'Brazil', fourth: 'Sweden' }, - 1954: { third: 'Austria', fourth: 'Uruguay' }, - 1958: { third: 'France', fourth: 'Germany' }, - 1962: { third: 'Chile', fourth: 'Yugoslavia' }, - 1966: { third: 'Portugal', fourth: 'Soviet Union' }, - 1970: { third: 'Germany', fourth: 'Uruguay' }, - 1974: { third: 'Poland', fourth: 'Brazil' }, - 1978: { third: 'Brazil', fourth: 'Italy' }, - 1982: { third: 'Poland', fourth: 'France' }, - 1986: { third: 'France', fourth: 'Belgium' }, - 1990: { third: 'Italy', fourth: 'England' }, - 1994: { third: 'Sweden', fourth: 'Bulgaria' }, - 1998: { third: 'Croatia', fourth: 'Netherlands' }, - 2002: { third: 'Turkey', fourth: 'South Korea' }, - 2006: { third: 'Germany', fourth: 'Portugal' }, - 2010: { third: 'Germany', fourth: 'Uruguay' }, - 2014: { third: 'Netherlands', fourth: 'Brazil' }, - 2018: { third: 'Belgium', fourth: 'England' }, - 2022: { third: 'Croatia', fourth: 'Morocco' }, -} - // Normalize team names from Wikipedia to canonical DB names const TEAM_ALIASES: Record = { 'West Germany': 'Germany', @@ -52,38 +26,6 @@ function normTeam(name: string): string { return TEAM_ALIASES[name] ?? 
name } -// Minimal RFC-4180 CSV parser -function parseCsv(content: string): Record[] { - const rows: string[][] = [] - let row: string[] = [] - let field = '' - let inQ = false - for (let i = 0; i < content.length; i++) { - const ch = content[i] - if (inQ) { - if (ch === '"') { - if (content[i + 1] === '"') { field += '"'; i++ } - else inQ = false - } else { - field += ch - } - } else if (ch === '"') { - inQ = true - } else if (ch === ',') { - row.push(field); field = '' - } else if (ch === '\n') { - row.push(field); rows.push(row); row = []; field = '' - } else if (ch !== '\r') { - field += ch - } - } - if (field || row.length) { row.push(field); rows.push(row) } - const headers = rows[0] - return rows.slice(1) - .filter(r => r.some(f => f.trim())) - .map(r => Object.fromEntries(headers.map((h, i) => [h.trim(), (r[i] ?? '').trim()]))) -} - function readJson(filePath: string): T | null { if (!existsSync(filePath)) return null try { return JSON.parse(readFileSync(filePath, 'utf-8')) as T } catch { return null } @@ -100,6 +42,7 @@ type RawMatch = { group?: string; ground?: string; } type RawData = { matches: RawMatch[] } +type RawMeta = { host: string; teams_count: number | null; winner: string | null; runner_up: string | null; third_place: string | null; fourth_place: string | null } type RawStadiums = { stadiums: { name: string; city: string; cc?: string; capacity?: number; timezone?: string; coords?: string }[] } type RawSquad = { name: string; players: { name: string; number?: number; pos?: string; date_of_birth?: string }[] } @@ -245,33 +188,7 @@ async function run() { return id } - // 1. Tournaments from world_cup.csv (host, winner, runner_up) - const wcRows = parseCsv(readFileSync(path.join(DATA_DIR, 'world_cup.csv'), 'utf-8')) - for (const r of wcRows) { - const year = parseInt(r['Year']) - if (isNaN(year)) continue - const winner = normTeam(r['Champion'] || '') - const runnerUp = normTeam(r['Runner-Up'] || '') - const p = PLACEMENTS[year] ?? 
{} - await db.execute(sql` - INSERT INTO tournaments (year, host, winner, runner_up, third_place, fourth_place, teams_count) - VALUES ( - ${year}, ${r['Host']}, - ${winner || null}, ${runnerUp || null}, - ${p.third ?? null}, ${p.fourth ?? null}, - ${parseInt(r['Teams']) || null} - ) - ON CONFLICT (year) DO UPDATE SET - host = EXCLUDED.host, - winner = EXCLUDED.winner, - runner_up = EXCLUDED.runner_up, - third_place = EXCLUDED.third_place, - fourth_place = EXCLUDED.fourth_place, - teams_count = EXCLUDED.teams_count - `) - } - - // 2. Per-year match/stadium/squad data from openfootball JSON files + // Per-year data from Wikipedia JSON files let totalMatches = 0 let totalGoals = 0 @@ -283,6 +200,27 @@ async function run() { continue } + // Tournament row from meta.json + const meta = readJson(path.join(yearDir, 'worldcup.meta.json')) + if (meta) { + await db.execute(sql` + INSERT INTO tournaments (year, host, winner, runner_up, third_place, fourth_place, teams_count) + VALUES ( + ${year}, ${meta.host || null}, + ${normTeam(meta.winner ?? '') || null}, ${normTeam(meta.runner_up ?? '') || null}, + ${normTeam(meta.third_place ?? '') || null}, ${normTeam(meta.fourth_place ?? '') || null}, + ${meta.teams_count ?? null} + ) + ON CONFLICT (year) DO UPDATE SET + host = EXCLUDED.host, + winner = EXCLUDED.winner, + runner_up = EXCLUDED.runner_up, + third_place = EXCLUDED.third_place, + fourth_place = EXCLUDED.fourth_place, + teams_count = EXCLUDED.teams_count + `) + } + let matchCount = 0, goalCount = 0 // Stadiums