diff --git a/app/data/wikipedia/1930/worldcup.meta.json b/app/data/wikipedia/1930/worldcup.meta.json
new file mode 100644
index 0000000..29e0ea2
--- /dev/null
+++ b/app/data/wikipedia/1930/worldcup.meta.json
@@ -0,0 +1,8 @@
+{
+ "host": "Uruguay",
+ "teams_count": 13,
+ "winner": "Uruguay",
+ "runner_up": "Argentina",
+ "third_place": "United States",
+ "fourth_place": "Yugoslavia"
+}
\ No newline at end of file
diff --git a/app/data/wikipedia/1934/worldcup.meta.json b/app/data/wikipedia/1934/worldcup.meta.json
new file mode 100644
index 0000000..5adf54d
--- /dev/null
+++ b/app/data/wikipedia/1934/worldcup.meta.json
@@ -0,0 +1,8 @@
+{
+ "host": "Italy",
+ "teams_count": 16,
+ "winner": "Italy",
+ "runner_up": "Czechoslovakia",
+ "third_place": "Germany",
+ "fourth_place": "Austria"
+}
\ No newline at end of file
diff --git a/app/data/wikipedia/1938/worldcup.meta.json b/app/data/wikipedia/1938/worldcup.meta.json
new file mode 100644
index 0000000..c314da5
--- /dev/null
+++ b/app/data/wikipedia/1938/worldcup.meta.json
@@ -0,0 +1,8 @@
+{
+ "host": "France",
+ "teams_count": 15,
+ "winner": "Italy",
+ "runner_up": "Hungary",
+ "third_place": "Brazil",
+ "fourth_place": "Sweden"
+}
\ No newline at end of file
diff --git a/app/data/wikipedia/1950/worldcup.meta.json b/app/data/wikipedia/1950/worldcup.meta.json
new file mode 100644
index 0000000..ceb8683
--- /dev/null
+++ b/app/data/wikipedia/1950/worldcup.meta.json
@@ -0,0 +1,8 @@
+{
+ "host": "Brazil",
+ "teams_count": 13,
+ "winner": "Uruguay",
+ "runner_up": "Brazil",
+ "third_place": "Sweden",
+ "fourth_place": "Spain"
+}
\ No newline at end of file
diff --git a/app/data/wikipedia/1954/worldcup.meta.json b/app/data/wikipedia/1954/worldcup.meta.json
new file mode 100644
index 0000000..0688f05
--- /dev/null
+++ b/app/data/wikipedia/1954/worldcup.meta.json
@@ -0,0 +1,8 @@
+{
+ "host": "Switzerland",
+ "teams_count": 16,
+ "winner": "West Germany",
+ "runner_up": "Hungary",
+ "third_place": "Austria",
+ "fourth_place": "Uruguay"
+}
\ No newline at end of file
diff --git a/app/data/wikipedia/1954/worldcup.squads.json b/app/data/wikipedia/1954/worldcup.squads.json
index 2cf5585..cef0e08 100644
--- a/app/data/wikipedia/1954/worldcup.squads.json
+++ b/app/data/wikipedia/1954/worldcup.squads.json
@@ -2178,21 +2178,5 @@
"date_of_birth": "1929-01-07"
}
]
- },
- {
- "name": "Age",
- "players": []
- },
- {
- "name": "Player representation by club",
- "players": []
- },
- {
- "name": "Player representation by league system",
- "players": []
- },
- {
- "name": "Coaches representation by country",
- "players": []
}
]
\ No newline at end of file
diff --git a/app/data/wikipedia/1958/worldcup.meta.json b/app/data/wikipedia/1958/worldcup.meta.json
new file mode 100644
index 0000000..8cac016
--- /dev/null
+++ b/app/data/wikipedia/1958/worldcup.meta.json
@@ -0,0 +1,8 @@
+{
+ "host": "Sweden",
+ "teams_count": 16,
+ "winner": "Brazil",
+ "runner_up": "Sweden",
+ "third_place": "France",
+ "fourth_place": "West Germany"
+}
\ No newline at end of file
diff --git a/app/data/wikipedia/1962/worldcup.meta.json b/app/data/wikipedia/1962/worldcup.meta.json
new file mode 100644
index 0000000..0139f8f
--- /dev/null
+++ b/app/data/wikipedia/1962/worldcup.meta.json
@@ -0,0 +1,8 @@
+{
+ "host": "Chile",
+ "teams_count": 16,
+ "winner": "Brazil",
+ "runner_up": "Czechoslovakia",
+ "third_place": "Chile",
+ "fourth_place": "Yugoslavia"
+}
\ No newline at end of file
diff --git a/app/data/wikipedia/1966/worldcup.meta.json b/app/data/wikipedia/1966/worldcup.meta.json
new file mode 100644
index 0000000..5f609ce
--- /dev/null
+++ b/app/data/wikipedia/1966/worldcup.meta.json
@@ -0,0 +1,8 @@
+{
+ "host": "England",
+ "teams_count": 16,
+ "winner": "England",
+ "runner_up": "West Germany",
+ "third_place": "Portugal",
+ "fourth_place": "Soviet Union"
+}
\ No newline at end of file
diff --git a/app/data/wikipedia/1970/worldcup.meta.json b/app/data/wikipedia/1970/worldcup.meta.json
new file mode 100644
index 0000000..b6dc9f0
--- /dev/null
+++ b/app/data/wikipedia/1970/worldcup.meta.json
@@ -0,0 +1,8 @@
+{
+ "host": "Mexico",
+ "teams_count": 16,
+ "winner": "Brazil",
+ "runner_up": "Italy",
+ "third_place": "West Germany",
+ "fourth_place": "Uruguay"
+}
\ No newline at end of file
diff --git a/app/data/wikipedia/1974/worldcup.meta.json b/app/data/wikipedia/1974/worldcup.meta.json
new file mode 100644
index 0000000..bff3399
--- /dev/null
+++ b/app/data/wikipedia/1974/worldcup.meta.json
@@ -0,0 +1,8 @@
+{
+ "host": "West Germany",
+ "teams_count": 16,
+ "winner": "West Germany",
+ "runner_up": "Netherlands",
+ "third_place": "Poland",
+ "fourth_place": "Brazil"
+}
\ No newline at end of file
diff --git a/app/data/wikipedia/1978/worldcup.meta.json b/app/data/wikipedia/1978/worldcup.meta.json
new file mode 100644
index 0000000..c321770
--- /dev/null
+++ b/app/data/wikipedia/1978/worldcup.meta.json
@@ -0,0 +1,8 @@
+{
+ "host": "Argentina",
+ "teams_count": 16,
+ "winner": "Argentina",
+ "runner_up": "Netherlands",
+ "third_place": "Brazil",
+ "fourth_place": "Italy"
+}
\ No newline at end of file
diff --git a/app/data/wikipedia/1982/worldcup.meta.json b/app/data/wikipedia/1982/worldcup.meta.json
new file mode 100644
index 0000000..6eb1b5b
--- /dev/null
+++ b/app/data/wikipedia/1982/worldcup.meta.json
@@ -0,0 +1,8 @@
+{
+ "host": "Spain",
+ "teams_count": 24,
+ "winner": "Italy",
+ "runner_up": "West Germany",
+ "third_place": "Poland",
+ "fourth_place": "France"
+}
\ No newline at end of file
diff --git a/app/data/wikipedia/1986/worldcup.meta.json b/app/data/wikipedia/1986/worldcup.meta.json
new file mode 100644
index 0000000..67e8ab4
--- /dev/null
+++ b/app/data/wikipedia/1986/worldcup.meta.json
@@ -0,0 +1,8 @@
+{
+ "host": "Mexico",
+ "teams_count": 24,
+ "winner": "Argentina",
+ "runner_up": "West Germany",
+ "third_place": "France",
+ "fourth_place": "Belgium"
+}
\ No newline at end of file
diff --git a/app/data/wikipedia/1990/worldcup.meta.json b/app/data/wikipedia/1990/worldcup.meta.json
new file mode 100644
index 0000000..cd5382a
--- /dev/null
+++ b/app/data/wikipedia/1990/worldcup.meta.json
@@ -0,0 +1,8 @@
+{
+ "host": "Italy",
+ "teams_count": 24,
+ "winner": "West Germany",
+ "runner_up": "Argentina",
+ "third_place": "Italy",
+ "fourth_place": "England"
+}
\ No newline at end of file
diff --git a/app/data/wikipedia/1990/worldcup.squads.json b/app/data/wikipedia/1990/worldcup.squads.json
index a13bf2f..125078c 100644
--- a/app/data/wikipedia/1990/worldcup.squads.json
+++ b/app/data/wikipedia/1990/worldcup.squads.json
@@ -3298,17 +3298,5 @@
"date_of_birth": "1956-05-20"
}
]
- },
- {
- "name": "Player",
- "players": []
- },
- {
- "name": "Captains",
- "players": []
- },
- {
- "name": "Goalkeepers",
- "players": []
}
]
\ No newline at end of file
diff --git a/app/data/wikipedia/1994/worldcup.meta.json b/app/data/wikipedia/1994/worldcup.meta.json
new file mode 100644
index 0000000..6f6d45d
--- /dev/null
+++ b/app/data/wikipedia/1994/worldcup.meta.json
@@ -0,0 +1,8 @@
+{
+ "host": "United States",
+ "teams_count": 24,
+ "winner": "Brazil",
+ "runner_up": "Italy",
+ "third_place": "Sweden",
+ "fourth_place": "Bulgaria"
+}
\ No newline at end of file
diff --git a/app/data/wikipedia/1994/worldcup.squads.json b/app/data/wikipedia/1994/worldcup.squads.json
index 6b77d58..e59adec 100644
--- a/app/data/wikipedia/1994/worldcup.squads.json
+++ b/app/data/wikipedia/1994/worldcup.squads.json
@@ -3286,13 +3286,5 @@
"date_of_birth": "1972-08-18"
}
]
- },
- {
- "name": "Captains",
- "players": []
- },
- {
- "name": "Goalkeepers",
- "players": []
}
]
\ No newline at end of file
diff --git a/app/data/wikipedia/1998/worldcup.meta.json b/app/data/wikipedia/1998/worldcup.meta.json
new file mode 100644
index 0000000..b9270bf
--- /dev/null
+++ b/app/data/wikipedia/1998/worldcup.meta.json
@@ -0,0 +1,8 @@
+{
+ "host": "France",
+ "teams_count": 32,
+ "winner": "France",
+ "runner_up": "Brazil",
+ "third_place": "Croatia",
+ "fourth_place": "Netherlands"
+}
\ No newline at end of file
diff --git a/app/data/wikipedia/1998/worldcup.squads.json b/app/data/wikipedia/1998/worldcup.squads.json
index f519560..2b5b9a3 100644
--- a/app/data/wikipedia/1998/worldcup.squads.json
+++ b/app/data/wikipedia/1998/worldcup.squads.json
@@ -4388,17 +4388,5 @@
"date_of_birth": "1974-07-15"
}
]
- },
- {
- "name": "Players",
- "players": []
- },
- {
- "name": "Captains",
- "players": []
- },
- {
- "name": "Goalkeepers",
- "players": []
}
]
\ No newline at end of file
diff --git a/app/data/wikipedia/2002/worldcup.meta.json b/app/data/wikipedia/2002/worldcup.meta.json
new file mode 100644
index 0000000..00a0512
--- /dev/null
+++ b/app/data/wikipedia/2002/worldcup.meta.json
@@ -0,0 +1,8 @@
+{
+ "host": "Japan / South Korea",
+ "teams_count": 32,
+ "winner": "Brazil",
+ "runner_up": "Germany",
+ "third_place": "Turkey",
+ "fourth_place": "South Korea"
+}
\ No newline at end of file
diff --git a/app/data/wikipedia/2002/worldcup.squads.json b/app/data/wikipedia/2002/worldcup.squads.json
index 87116ad..818f34a 100644
--- a/app/data/wikipedia/2002/worldcup.squads.json
+++ b/app/data/wikipedia/2002/worldcup.squads.json
@@ -4574,25 +4574,5 @@
"date_of_birth": "1974-03-21"
}
]
- },
- {
- "name": "Players",
- "players": []
- },
- {
- "name": "Captains",
- "players": []
- },
- {
- "name": "Goalkeepers",
- "players": []
- },
- {
- "name": "General references",
- "players": []
- },
- {
- "name": "Citations and notes",
- "players": []
}
]
\ No newline at end of file
diff --git a/app/data/wikipedia/2006/worldcup.meta.json b/app/data/wikipedia/2006/worldcup.meta.json
new file mode 100644
index 0000000..cee3221
--- /dev/null
+++ b/app/data/wikipedia/2006/worldcup.meta.json
@@ -0,0 +1,8 @@
+{
+ "host": "Germany",
+ "teams_count": 32,
+ "winner": "Italy",
+ "runner_up": "France",
+ "third_place": "Germany",
+ "fourth_place": "Portugal"
+}
\ No newline at end of file
diff --git a/app/data/wikipedia/2006/worldcup.squads.json b/app/data/wikipedia/2006/worldcup.squads.json
index d90db89..45336bd 100644
--- a/app/data/wikipedia/2006/worldcup.squads.json
+++ b/app/data/wikipedia/2006/worldcup.squads.json
@@ -4574,17 +4574,5 @@
"date_of_birth": "1986-03-04"
}
]
- },
- {
- "name": "Players",
- "players": []
- },
- {
- "name": "Goalkeeper",
- "players": []
- },
- {
- "name": "Captains",
- "players": []
}
]
\ No newline at end of file
diff --git a/app/data/wikipedia/2010/worldcup.meta.json b/app/data/wikipedia/2010/worldcup.meta.json
new file mode 100644
index 0000000..8948c6d
--- /dev/null
+++ b/app/data/wikipedia/2010/worldcup.meta.json
@@ -0,0 +1,8 @@
+{
+ "host": "South Africa",
+ "teams_count": 32,
+ "winner": "Spain",
+ "runner_up": "Netherlands",
+ "third_place": "Germany",
+ "fourth_place": "Uruguay"
+}
\ No newline at end of file
diff --git a/app/data/wikipedia/2010/worldcup.squads.json b/app/data/wikipedia/2010/worldcup.squads.json
index f65697f..7e855a5 100644
--- a/app/data/wikipedia/2010/worldcup.squads.json
+++ b/app/data/wikipedia/2010/worldcup.squads.json
@@ -4574,25 +4574,5 @@
"date_of_birth": "1991-10-10"
}
]
- },
- {
- "name": "Player representation by age",
- "players": []
- },
- {
- "name": "Player representation by club",
- "players": []
- },
- {
- "name": "Player representation by league",
- "players": []
- },
- {
- "name": "Average age of squads",
- "players": []
- },
- {
- "name": "Coaches representation by country",
- "players": []
}
]
\ No newline at end of file
diff --git a/app/data/wikipedia/2014/worldcup.meta.json b/app/data/wikipedia/2014/worldcup.meta.json
new file mode 100644
index 0000000..8ade189
--- /dev/null
+++ b/app/data/wikipedia/2014/worldcup.meta.json
@@ -0,0 +1,8 @@
+{
+ "host": "Brazil",
+ "teams_count": 32,
+ "winner": "Germany",
+ "runner_up": "Argentina",
+ "third_place": "Netherlands",
+ "fourth_place": "Brazil"
+}
\ No newline at end of file
diff --git a/app/data/wikipedia/2014/worldcup.squads.json b/app/data/wikipedia/2014/worldcup.squads.json
index c984864..e99ad15 100644
--- a/app/data/wikipedia/2014/worldcup.squads.json
+++ b/app/data/wikipedia/2014/worldcup.squads.json
@@ -4574,21 +4574,5 @@
"date_of_birth": "1989-04-02"
}
]
- },
- {
- "name": "Player representation by age",
- "players": []
- },
- {
- "name": "Player representation by league system",
- "players": []
- },
- {
- "name": "Player representation by club",
- "players": []
- },
- {
- "name": "Coaches representation by country",
- "players": []
}
]
\ No newline at end of file
diff --git a/app/data/wikipedia/2018/worldcup.meta.json b/app/data/wikipedia/2018/worldcup.meta.json
new file mode 100644
index 0000000..fedb614
--- /dev/null
+++ b/app/data/wikipedia/2018/worldcup.meta.json
@@ -0,0 +1,8 @@
+{
+ "host": "Russia",
+ "teams_count": 32,
+ "winner": "France",
+ "runner_up": "Croatia",
+ "third_place": "Belgium",
+ "fourth_place": "England"
+}
\ No newline at end of file
diff --git a/app/data/wikipedia/2018/worldcup.squads.json b/app/data/wikipedia/2018/worldcup.squads.json
index ef809b2..c24f31a 100644
--- a/app/data/wikipedia/2018/worldcup.squads.json
+++ b/app/data/wikipedia/2018/worldcup.squads.json
@@ -4574,25 +4574,5 @@
"date_of_birth": "1993-09-05"
}
]
- },
- {
- "name": "Age",
- "players": []
- },
- {
- "name": "Player representation by league system",
- "players": []
- },
- {
- "name": "Player representation by club",
- "players": []
- },
- {
- "name": "Player representation by club confederation",
- "players": []
- },
- {
- "name": "Coaches representation by country",
- "players": []
}
]
\ No newline at end of file
diff --git a/app/data/wikipedia/2022/worldcup.meta.json b/app/data/wikipedia/2022/worldcup.meta.json
new file mode 100644
index 0000000..7bc34dc
--- /dev/null
+++ b/app/data/wikipedia/2022/worldcup.meta.json
@@ -0,0 +1,8 @@
+{
+ "host": "Qatar",
+ "teams_count": 32,
+ "winner": "Argentina",
+ "runner_up": "France",
+ "third_place": "Croatia",
+ "fourth_place": "Morocco"
+}
\ No newline at end of file
diff --git a/app/data/wikipedia/2022/worldcup.squads.json b/app/data/wikipedia/2022/worldcup.squads.json
index 304003a..c710e5e 100644
--- a/app/data/wikipedia/2022/worldcup.squads.json
+++ b/app/data/wikipedia/2022/worldcup.squads.json
@@ -5144,33 +5144,5 @@
"date_of_birth": "1997-03-14"
}
]
- },
- {
- "name": "Age",
- "players": []
- },
- {
- "name": "Players",
- "players": []
- },
- {
- "name": "Player representation by league system",
- "players": []
- },
- {
- "name": "Player representation by club",
- "players": []
- },
- {
- "name": "Player representation by club confederation",
- "players": []
- },
- {
- "name": "Average age of squads",
- "players": []
- },
- {
- "name": "Coaches representation by country",
- "players": []
}
]
\ No newline at end of file
diff --git a/scripts/scrape-wikipedia.ts b/scripts/scrape-wikipedia.ts
index f1bb6d5..1679843 100644
--- a/scripts/scrape-wikipedia.ts
+++ b/scripts/scrape-wikipedia.ts
@@ -49,6 +49,14 @@ type Stadium = { name: string; city: string }
type Player = { name: string; number?: number; pos?: string; date_of_birth?: string }
type Squad = { name: string; players: Player[] }
type Group = { name: string; teams: string[] }
+// Tournament summary for one year; serialized as-is to worldcup.meta.json.
+type Meta = {
+ host: string // '' when the infobox yields no host
+ teams_count: number | null // null when the infobox yields no team count
+ // Placements: taken from the match results first, then the infobox; null when neither has a value.
+ winner: string | null
+ runner_up: string | null
+ third_place: string | null
+ fourth_place: string | null
+}
// ── Fetch ──────────────────────────────────────────────────────────────────
@@ -309,10 +317,99 @@ function processHeading(text: string, level: number, state: State): void {
// ── Main year scraper ──────────────────────────────────────────────────────
+// ── Infobox parsing ────────────────────────────────────────────────────────
+
+/**
+ * Extracts tournament summary fields from the first `table.infobox` in the document.
+ *
+ * Each row is identified by its header (`th`) text and read from its first `td`:
+ * "host countr…" → host, "teams" → teams_count, "champion…" → winner,
+ * "runner(s)-up" → runner_up, "third place" → third_place, "fourth place" → fourth_place.
+ * A field whose row is absent is left unset, so every key of the result is optional.
+ * If several rows match the same field, the last one wins.
+ *
+ * @param $ Cheerio handle on the parsed article HTML.
+ * @returns The infobox fields that were found.
+ */
+function parseInfobox($: CheerioAPI): Partial {
+  const result: Partial = {}
+
+  // Plain text of a cell: <br> becomes " / ", footnotes and images are dropped,
+  // whitespace is collapsed.
+  function tdText($td: Cheerio): string {
+    const $clone = $td.clone()
+    $clone.find('br').replaceWith(' / ')
+    $clone.find('sup, img').remove()
+    return $clone.text().replace(/\[\d+\]/g, '').replace(/\s+/g, ' ').trim()
+  }
+
+  // Non-empty link texts of a cell, in document order.
+  // <sup> content is stripped first (as tdText does) so footnote links that are not
+  // purely numeric, e.g. "[a]", cannot be mistaken for a team or host name.
+  function linkTexts($td: Cheerio): string[] {
+    const $clone = $td.clone()
+    $clone.find('sup, img').remove()
+    const names: string[] = []
+    $clone.find('a').each((_, a) => {
+      const text = $(a).text().trim()
+      if (text && !/\[\d+\]/.test(text)) names.push(text)
+    })
+    return names
+  }
+
+  // First linked name in the cell; falls back to the cell text, or null if that is empty.
+  function tdFirstLink($td: Cheerio): string | null {
+    const names = linkTexts($td)
+    return names.length ? names[0] : (tdText($td) || null)
+  }
+
+  // All linked names joined with " / " (co-hosts); falls back to the cell text.
+  function tdAllLinks($td: Cheerio): string {
+    const names = linkTexts($td)
+    return names.length ? names.join(' / ') : tdText($td)
+  }
+
+  $('table.infobox').first().find('tr').each((_, tr) => {
+    const $tr = $(tr)
+    // Lower-cased once here, so the patterns below need no /i flag.
+    const label = $tr.find('th').text().trim().toLowerCase()
+    const $td = $tr.find('td').first()
+    if (!$td.length) return
+    if (/host countr/.test(label)) {
+      result.host = tdAllLinks($td)
+    } else if (/^teams$/.test(label)) {
+      // The first run of digits in the cell is taken as the team count.
+      const m = $td.text().match(/\d+/)
+      if (m) result.teams_count = parseInt(m[0], 10)
+    } else if (/champion/.test(label)) {
+      result.winner = tdFirstLink($td)
+    } else if (/runners?.up/.test(label)) {
+      result.runner_up = tdFirstLink($td)
+    } else if (/third.place/.test(label)) {
+      result.third_place = tdFirstLink($td)
+    } else if (/fourth.place/.test(label)) {
+      result.fourth_place = tdFirstLink($td)
+    }
+  })
+
+  return result
+}
+
+// ── Placement derivation ───────────────────────────────────────────────────
+
+/**
+ * Derives the top-four finish from the scraped match list.
+ *
+ * The match whose round is 'Final' supplies winner / runner_up, and the one whose round is
+ * 'Third-place match' supplies third_place / fourth_place. A match without a decisive result
+ * contributes nothing; if several matches share a round, the last decisive one wins.
+ *
+ * @param matches All matches scraped for one tournament.
+ * @returns The four placements, each null when it could not be derived.
+ */
+function derivePlacements(matches: Match[]): Pick {
+  // Orders the two sides of a match as [victor, loser]; null when the result is undecided.
+  const rank = (game: Match): [string, string] | null => {
+    const { score } = game
+    if (!score) return null
+
+    const forward: [string, string] = [game.team1, game.team2]
+    const reverse: [string, string] = [game.team2, game.team1]
+    const decide = (first: number, second: number): [string, string] | null => {
+      if (first > second) return forward
+      if (second > first) return reverse
+      return null
+    }
+
+    // Extra-time score takes precedence over full-time; a missing score counts as level.
+    const [home, away] = score.et ?? score.ft ?? [0, 0]
+    const outright = decide(home, away)
+    if (outright) return outright
+
+    // Level after play: only a penalty shoot-out can separate the sides.
+    if (!score.p) return null
+    const [homePens, awayPens] = score.p
+    return decide(homePens, awayPens)
+  }
+
+  let finalists: [string, string] | null = null
+  let bronzePair: [string, string] | null = null
+
+  for (const game of matches) {
+    switch (game.round) {
+      case 'Final':
+        finalists = rank(game) ?? finalists
+        break
+      case 'Third-place match':
+        bronzePair = rank(game) ?? bronzePair
+        break
+    }
+  }
+
+  const [winner, runner_up] = finalists ?? [null, null]
+  const [third_place, fourth_place] = bronzePair ?? [null, null]
+  return { winner, runner_up, third_place, fourth_place }
+}
+
+// ── Year result ────────────────────────────────────────────────────────────
+
+// Bundle of per-year scrape results; scrapeYear returns an object with these four keys.
type YearResult = {
matches: Match[]
stadiums: Map
groups: Map>
+ meta: Meta
}
async function scrapeYear(year: number, mainHtml: string): Promise {
@@ -406,7 +503,18 @@ async function scrapeYear(year: number, mainHtml: string): Promise {
process.stdout.write(`[+${page.slice(-8)}] `)
}
- return { matches, stadiums, groups }
+ const infobox = parseInfobox($)
+ const placements = derivePlacements(matches)
+ const meta: Meta = {
+ host: infobox.host ?? '',
+ teams_count: infobox.teams_count ?? null,
+ winner: placements.winner ?? infobox.winner ?? null,
+ runner_up: placements.runner_up ?? infobox.runner_up ?? null,
+ third_place: placements.third_place ?? infobox.third_place ?? null,
+ fourth_place:placements.fourth_place?? infobox.fourth_place?? null,
+ }
+
+ return { matches, stadiums, groups, meta }
}
// ── Squad page scraper ─────────────────────────────────────────────────────
@@ -465,7 +573,7 @@ function scrapeSquads(html: string): Squad[] {
currentTeam.players.push(player)
})
- return squads
+ return squads.filter(s => s.players.length > 0)
}
// ── Output ─────────────────────────────────────────────────────────────────
@@ -476,10 +584,17 @@ function writeOutput(
stadiums: Map,
groups: Map>,
squads: Squad[],
+ meta: Meta,
): void {
const dir = path.join(DATA_DIR, String(year))
mkdirSync(dir, { recursive: true })
+ writeFileSync(
+ path.join(dir, 'worldcup.meta.json'),
+ JSON.stringify(meta, null, 2),
+ 'utf-8',
+ )
+
writeFileSync(
path.join(dir, 'worldcup.json'),
JSON.stringify({ matches }, null, 2),
@@ -529,14 +644,14 @@ async function main() {
const mainHtml = await fetchWikiHtml(`${year}_FIFA_World_Cup`)
if (!mainHtml) { console.log('FAILED'); continue }
- const { matches, stadiums, groups } = await scrapeYear(year, mainHtml)
+ const { matches, stadiums, groups, meta } = await scrapeYear(year, mainHtml)
await delay(600)
const squadHtml = await fetchWikiHtml(`${year}_FIFA_World_Cup_squads`)
const squads = squadHtml ? scrapeSquads(squadHtml) : []
- writeOutput(year, matches, stadiums, groups, squads)
+ writeOutput(year, matches, stadiums, groups, squads, meta)
console.log(`${matches.length} matches, ${stadiums.size} stadiums, ${groups.size} groups, ${squads.length} teams`)
diff --git a/scripts/seed.ts b/scripts/seed.ts
index c707561..414f359 100644
--- a/scripts/seed.ts
+++ b/scripts/seed.ts
@@ -8,39 +8,13 @@ import { getIso } from '../lib/iso-codes'
const DATABASE_URL = process.env.DATABASE_URL ?? 'postgres://wc:wc@localhost:5432/worldcup'
const __dirname = path.dirname(fileURLToPath(import.meta.url))
-const DATA_DIR = path.join(__dirname, '../app/data')
-const WC_DIR = path.join(DATA_DIR, 'wikipedia')
+const WC_DIR = path.join(__dirname, '../app/data/wikipedia')
const YEARS = [
1930,1934,1938,1950,1954,1958,1962,1966,1970,1974,
1978,1982,1986,1990,1994,1998,2002,2006,2010,2014,2018,2022,
]
-// Third/fourth place not reliably in source data for older years
-const PLACEMENTS: Record = {
- 1930: { third: 'USA', fourth: 'Yugoslavia' },
- 1934: { third: 'Germany', fourth: 'Austria' },
- 1938: { third: 'Brazil', fourth: 'Sweden' },
- 1954: { third: 'Austria', fourth: 'Uruguay' },
- 1958: { third: 'France', fourth: 'Germany' },
- 1962: { third: 'Chile', fourth: 'Yugoslavia' },
- 1966: { third: 'Portugal', fourth: 'Soviet Union' },
- 1970: { third: 'Germany', fourth: 'Uruguay' },
- 1974: { third: 'Poland', fourth: 'Brazil' },
- 1978: { third: 'Brazil', fourth: 'Italy' },
- 1982: { third: 'Poland', fourth: 'France' },
- 1986: { third: 'France', fourth: 'Belgium' },
- 1990: { third: 'Italy', fourth: 'England' },
- 1994: { third: 'Sweden', fourth: 'Bulgaria' },
- 1998: { third: 'Croatia', fourth: 'Netherlands' },
- 2002: { third: 'Turkey', fourth: 'South Korea' },
- 2006: { third: 'Germany', fourth: 'Portugal' },
- 2010: { third: 'Germany', fourth: 'Uruguay' },
- 2014: { third: 'Netherlands', fourth: 'Brazil' },
- 2018: { third: 'Belgium', fourth: 'England' },
- 2022: { third: 'Croatia', fourth: 'Morocco' },
-}
-
// Normalize team names from Wikipedia to canonical DB names
const TEAM_ALIASES: Record = {
'West Germany': 'Germany',
@@ -52,38 +26,6 @@ function normTeam(name: string): string {
return TEAM_ALIASES[name] ?? name
}
-// Minimal RFC-4180 CSV parser
-function parseCsv(content: string): Record[] {
- const rows: string[][] = []
- let row: string[] = []
- let field = ''
- let inQ = false
- for (let i = 0; i < content.length; i++) {
- const ch = content[i]
- if (inQ) {
- if (ch === '"') {
- if (content[i + 1] === '"') { field += '"'; i++ }
- else inQ = false
- } else {
- field += ch
- }
- } else if (ch === '"') {
- inQ = true
- } else if (ch === ',') {
- row.push(field); field = ''
- } else if (ch === '\n') {
- row.push(field); rows.push(row); row = []; field = ''
- } else if (ch !== '\r') {
- field += ch
- }
- }
- if (field || row.length) { row.push(field); rows.push(row) }
- const headers = rows[0]
- return rows.slice(1)
- .filter(r => r.some(f => f.trim()))
- .map(r => Object.fromEntries(headers.map((h, i) => [h.trim(), (r[i] ?? '').trim()])))
-}
-
function readJson(filePath: string): T | null {
if (!existsSync(filePath)) return null
try { return JSON.parse(readFileSync(filePath, 'utf-8')) as T } catch { return null }
@@ -100,6 +42,7 @@ type RawMatch = {
group?: string; ground?: string;
}
type RawData = { matches: RawMatch[] }
+// Shape of <year>/worldcup.meta.json; used to populate that year's tournaments row.
+type RawMeta = { host: string; teams_count: number | null; winner: string | null; runner_up: string | null; third_place: string | null; fourth_place: string | null }
type RawStadiums = { stadiums: { name: string; city: string; cc?: string; capacity?: number; timezone?: string; coords?: string }[] }
type RawSquad = { name: string; players: { name: string; number?: number; pos?: string; date_of_birth?: string }[] }
@@ -245,33 +188,7 @@ async function run() {
return id
}
- // 1. Tournaments from world_cup.csv (host, winner, runner_up)
- const wcRows = parseCsv(readFileSync(path.join(DATA_DIR, 'world_cup.csv'), 'utf-8'))
- for (const r of wcRows) {
- const year = parseInt(r['Year'])
- if (isNaN(year)) continue
- const winner = normTeam(r['Champion'] || '')
- const runnerUp = normTeam(r['Runner-Up'] || '')
- const p = PLACEMENTS[year] ?? {}
- await db.execute(sql`
- INSERT INTO tournaments (year, host, winner, runner_up, third_place, fourth_place, teams_count)
- VALUES (
- ${year}, ${r['Host']},
- ${winner || null}, ${runnerUp || null},
- ${p.third ?? null}, ${p.fourth ?? null},
- ${parseInt(r['Teams']) || null}
- )
- ON CONFLICT (year) DO UPDATE SET
- host = EXCLUDED.host,
- winner = EXCLUDED.winner,
- runner_up = EXCLUDED.runner_up,
- third_place = EXCLUDED.third_place,
- fourth_place = EXCLUDED.fourth_place,
- teams_count = EXCLUDED.teams_count
- `)
- }
-
- // 2. Per-year match/stadium/squad data from openfootball JSON files
+ // Per-year data from Wikipedia JSON files
let totalMatches = 0
let totalGoals = 0
@@ -283,6 +200,27 @@ async function run() {
continue
}
+ // Tournament row from meta.json
+ const meta = readJson(path.join(yearDir, 'worldcup.meta.json'))
+ if (meta) {
+ await db.execute(sql`
+ INSERT INTO tournaments (year, host, winner, runner_up, third_place, fourth_place, teams_count)
+ VALUES (
+ ${year}, ${meta.host || null},
+ ${normTeam(meta.winner ?? '') || null}, ${normTeam(meta.runner_up ?? '') || null},
+ ${normTeam(meta.third_place ?? '') || null}, ${normTeam(meta.fourth_place ?? '') || null},
+ ${meta.teams_count ?? null}
+ )
+ ON CONFLICT (year) DO UPDATE SET
+ host = EXCLUDED.host,
+ winner = EXCLUDED.winner,
+ runner_up = EXCLUDED.runner_up,
+ third_place = EXCLUDED.third_place,
+ fourth_place = EXCLUDED.fourth_place,
+ teams_count = EXCLUDED.teams_count
+ `)
+ }
+
let matchCount = 0, goalCount = 0
// Stadiums