feat: scrape tournament meta from Wikipedia, drop world_cup.csv

Add worldcup.meta.json per year with host, teams_count, winner, runner_up,
third_place, fourth_place — derived from match results (Final/Third-place
match) with infobox as fallback for edge cases like 1950's round-robin final.

Fix infobox host extraction to handle <br>-separated multi-host entries
(2002: Japan / South Korea). Fix squad scraper to filter out zero-player
phantom sections that Wikipedia appends (References, Captains, etc.).

Drop app/data/world_cup.csv and the PLACEMENTS/parseCsv code in seed.ts —
all tournament metadata now comes from the scraped JSON files.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-06-15 17:09:45 +02:00
parent ff4989f39f
commit d1171267a8
34 changed files with 319 additions and 254 deletions
@@ -0,0 +1,8 @@
{
"host": "Uruguay",
"teams_count": 13,
"winner": "Uruguay",
"runner_up": "Argentina",
"third_place": "United States",
"fourth_place": "Yugoslavia"
}
@@ -0,0 +1,8 @@
{
"host": "Italy",
"teams_count": 16,
"winner": "Italy",
"runner_up": "Czechoslovakia",
"third_place": "Germany",
"fourth_place": "Austria"
}
@@ -0,0 +1,8 @@
{
"host": "France",
"teams_count": 15,
"winner": "Italy",
"runner_up": "Hungary",
"third_place": "Brazil",
"fourth_place": "Sweden"
}
@@ -0,0 +1,8 @@
{
"host": "Brazil",
"teams_count": 13,
"winner": "Uruguay",
"runner_up": "Brazil",
"third_place": "Sweden",
"fourth_place": "Spain"
}
@@ -0,0 +1,8 @@
{
"host": "Switzerland",
"teams_count": 16,
"winner": "West Germany",
"runner_up": "Hungary",
"third_place": "Austria",
"fourth_place": "Uruguay"
}
@@ -2178,21 +2178,5 @@
"date_of_birth": "1929-01-07" "date_of_birth": "1929-01-07"
} }
] ]
},
{
"name": "Age",
"players": []
},
{
"name": "Player representation by club",
"players": []
},
{
"name": "Player representation by league system",
"players": []
},
{
"name": "Coaches representation by country",
"players": []
} }
] ]
@@ -0,0 +1,8 @@
{
"host": "Sweden",
"teams_count": 16,
"winner": "Brazil",
"runner_up": "Sweden",
"third_place": "France",
"fourth_place": "West Germany"
}
@@ -0,0 +1,8 @@
{
"host": "Chile",
"teams_count": 16,
"winner": "Brazil",
"runner_up": "Czechoslovakia",
"third_place": "Chile",
"fourth_place": "Yugoslavia"
}
@@ -0,0 +1,8 @@
{
"host": "England",
"teams_count": 16,
"winner": "England",
"runner_up": "West Germany",
"third_place": "Portugal",
"fourth_place": "Soviet Union"
}
@@ -0,0 +1,8 @@
{
"host": "Mexico",
"teams_count": 16,
"winner": "Brazil",
"runner_up": "Italy",
"third_place": "West Germany",
"fourth_place": "Uruguay"
}
@@ -0,0 +1,8 @@
{
"host": "West Germany",
"teams_count": 16,
"winner": "West Germany",
"runner_up": "Netherlands",
"third_place": "Poland",
"fourth_place": "Brazil"
}
@@ -0,0 +1,8 @@
{
"host": "Argentina",
"teams_count": 16,
"winner": "Argentina",
"runner_up": "Netherlands",
"third_place": "Brazil",
"fourth_place": "Italy"
}
@@ -0,0 +1,8 @@
{
"host": "Spain",
"teams_count": 24,
"winner": "Italy",
"runner_up": "West Germany",
"third_place": "Poland",
"fourth_place": "France"
}
@@ -0,0 +1,8 @@
{
"host": "Mexico",
"teams_count": 24,
"winner": "Argentina",
"runner_up": "West Germany",
"third_place": "France",
"fourth_place": "Belgium"
}
@@ -0,0 +1,8 @@
{
"host": "Italy",
"teams_count": 24,
"winner": "West Germany",
"runner_up": "Argentina",
"third_place": "Italy",
"fourth_place": "England"
}
@@ -3298,17 +3298,5 @@
"date_of_birth": "1956-05-20" "date_of_birth": "1956-05-20"
} }
] ]
},
{
"name": "Player",
"players": []
},
{
"name": "Captains",
"players": []
},
{
"name": "Goalkeepers",
"players": []
} }
] ]
@@ -0,0 +1,8 @@
{
"host": "United States",
"teams_count": 24,
"winner": "Brazil",
"runner_up": "Italy",
"third_place": "Sweden",
"fourth_place": "Bulgaria"
}
@@ -3286,13 +3286,5 @@
"date_of_birth": "1972-08-18" "date_of_birth": "1972-08-18"
} }
] ]
},
{
"name": "Captains",
"players": []
},
{
"name": "Goalkeepers",
"players": []
} }
] ]
@@ -0,0 +1,8 @@
{
"host": "France",
"teams_count": 32,
"winner": "France",
"runner_up": "Brazil",
"third_place": "Croatia",
"fourth_place": "Netherlands"
}
@@ -4388,17 +4388,5 @@
"date_of_birth": "1974-07-15" "date_of_birth": "1974-07-15"
} }
] ]
},
{
"name": "Players",
"players": []
},
{
"name": "Captains",
"players": []
},
{
"name": "Goalkeepers",
"players": []
} }
] ]
@@ -0,0 +1,8 @@
{
"host": "Japan / South Korea",
"teams_count": 32,
"winner": "Brazil",
"runner_up": "Germany",
"third_place": "Turkey",
"fourth_place": "South Korea"
}
@@ -4574,25 +4574,5 @@
"date_of_birth": "1974-03-21" "date_of_birth": "1974-03-21"
} }
] ]
},
{
"name": "Players",
"players": []
},
{
"name": "Captains",
"players": []
},
{
"name": "Goalkeepers",
"players": []
},
{
"name": "General references",
"players": []
},
{
"name": "Citations and notes",
"players": []
} }
] ]
@@ -0,0 +1,8 @@
{
"host": "Germany",
"teams_count": 32,
"winner": "Italy",
"runner_up": "France",
"third_place": "Germany",
"fourth_place": "Portugal"
}
@@ -4574,17 +4574,5 @@
"date_of_birth": "1986-03-04" "date_of_birth": "1986-03-04"
} }
] ]
},
{
"name": "Players",
"players": []
},
{
"name": "Goalkeeper",
"players": []
},
{
"name": "Captains",
"players": []
} }
] ]
@@ -0,0 +1,8 @@
{
"host": "South Africa",
"teams_count": 32,
"winner": "Spain",
"runner_up": "Netherlands",
"third_place": "Germany",
"fourth_place": "Uruguay"
}
@@ -4574,25 +4574,5 @@
"date_of_birth": "1991-10-10" "date_of_birth": "1991-10-10"
} }
] ]
},
{
"name": "Player representation by age",
"players": []
},
{
"name": "Player representation by club",
"players": []
},
{
"name": "Player representation by league",
"players": []
},
{
"name": "Average age of squads",
"players": []
},
{
"name": "Coaches representation by country",
"players": []
} }
] ]
@@ -0,0 +1,8 @@
{
"host": "Brazil",
"teams_count": 32,
"winner": "Germany",
"runner_up": "Argentina",
"third_place": "Netherlands",
"fourth_place": "Brazil"
}
@@ -4574,21 +4574,5 @@
"date_of_birth": "1989-04-02" "date_of_birth": "1989-04-02"
} }
] ]
},
{
"name": "Player representation by age",
"players": []
},
{
"name": "Player representation by league system",
"players": []
},
{
"name": "Player representation by club",
"players": []
},
{
"name": "Coaches representation by country",
"players": []
} }
] ]
@@ -0,0 +1,8 @@
{
"host": "Russia",
"teams_count": 32,
"winner": "France",
"runner_up": "Croatia",
"third_place": "Belgium",
"fourth_place": "England"
}
@@ -4574,25 +4574,5 @@
"date_of_birth": "1993-09-05" "date_of_birth": "1993-09-05"
} }
] ]
},
{
"name": "Age",
"players": []
},
{
"name": "Player representation by league system",
"players": []
},
{
"name": "Player representation by club",
"players": []
},
{
"name": "Player representation by club confederation",
"players": []
},
{
"name": "Coaches representation by country",
"players": []
} }
] ]
@@ -0,0 +1,8 @@
{
"host": "Qatar",
"teams_count": 32,
"winner": "Argentina",
"runner_up": "France",
"third_place": "Croatia",
"fourth_place": "Morocco"
}
@@ -5144,33 +5144,5 @@
"date_of_birth": "1997-03-14" "date_of_birth": "1997-03-14"
} }
] ]
},
{
"name": "Age",
"players": []
},
{
"name": "Players",
"players": []
},
{
"name": "Player representation by league system",
"players": []
},
{
"name": "Player representation by club",
"players": []
},
{
"name": "Player representation by club confederation",
"players": []
},
{
"name": "Average age of squads",
"players": []
},
{
"name": "Coaches representation by country",
"players": []
} }
] ]
+119 -4
View File
@@ -49,6 +49,14 @@ type Stadium = { name: string; city: string }
type Player = { name: string; number?: number; pos?: string; date_of_birth?: string } type Player = { name: string; number?: number; pos?: string; date_of_birth?: string }
type Squad = { name: string; players: Player[] } type Squad = { name: string; players: Player[] }
type Group = { name: string; teams: string[] } type Group = { name: string; teams: string[] }
// Tournament-level summary for one World Cup year; serialized as-is into
// worldcup.meta.json by writeOutput.
type Meta = {
// Host name(s) from the infobox; several hosts are joined with ' / '.
// scrapeYear stores '' (not null) when the infobox yields no host.
host: string
// Number of participating teams from the infobox "Teams" row, or null if absent.
teams_count: number | null
// Placements: scrapeYear prefers the value derived from match results and
// falls back to the infobox; null when neither source provides one.
winner: string | null
runner_up: string | null
third_place: string | null
fourth_place: string | null
}
// ── Fetch ────────────────────────────────────────────────────────────────── // ── Fetch ──────────────────────────────────────────────────────────────────
@@ -309,10 +317,99 @@ function processHeading(text: string, level: number, state: State): void {
// ── Main year scraper ────────────────────────────────────────────────────── // ── Main year scraper ──────────────────────────────────────────────────────
// ── Infobox parsing ────────────────────────────────────────────────────────
/**
 * Reads tournament metadata from the first `table.infobox` on the page.
 *
 * Each infobox row is matched by its (lower-cased) header label:
 *   - "host countr…"  → `host`          (all linked names joined with ' / ')
 *   - "teams"         → `teams_count`   (first integer in the cell)
 *   - "champion…"     → `winner`        (first linked name)
 *   - "runner(s)-up"  → `runner_up`
 *   - "third place"   → `third_place`
 *   - "fourth place"  → `fourth_place`
 *
 * Only fields whose row is present are set; everything else stays undefined
 * so the caller can apply its own fallbacks.
 *
 * @param $ Cheerio handle for the tournament's main article.
 * @returns The subset of Meta that could be read from the infobox.
 */
function parseInfobox($: CheerioAPI): Partial<Meta> {
  const result: Partial<Meta> = {}

  // Matches link text that carries a footnote marker. Besides numeric
  // references ("[1]") this also covers lettered/named notes ("[a]",
  // "[note 1]"), which would otherwise be mistaken for a country name.
  const FOOTNOTE = /\[[^\]]+\]/

  // Plain text of a cell: <br> becomes ' / ', footnotes and images are dropped.
  function tdText($td: Cheerio<Element>): string {
    const $clone = $td.clone()
    $clone.find('br').replaceWith(' / ')
    $clone.find('sup, img').remove()
    return $clone.text().replace(/\[\d+\]/g, '').replace(/\s+/g, ' ').trim()
  }

  // Visible text of every real link in the cell, in document order.
  // Image-only links (flag icons) end up empty and are skipped, as are
  // footnote anchors (anything inside <sup> or shaped like "[…]").
  function linkTexts($td: Cheerio<Element>): string[] {
    const names: string[] = []
    $td.find('a').each((_, a) => {
      const $a = $(a)
      if ($a.closest('sup').length) return
      const t = $a.clone().find('img').remove().end().text().trim()
      if (t && !FOOTNOTE.test(t)) names.push(t)
    })
    return names
  }

  // First linked name, falling back to the cell's plain text (null if empty).
  function tdFirstLink($td: Cheerio<Element>): string | null {
    const names = linkTexts($td)
    return names.length ? names[0] : (tdText($td) || null)
  }

  // All linked names joined with ' / ', falling back to the cell's plain text.
  function tdAllLinks($td: Cheerio<Element>): string {
    const names = linkTexts($td)
    return names.length ? names.join(' / ') : tdText($td)
  }

  $('table.infobox').first().find('tr').each((_, tr) => {
    const $tr = $(tr)
    // Lower-cased once here, so the patterns below need no /i flag.
    const label = $tr.find('th').text().trim().toLowerCase()
    const $td = $tr.find('td').first()
    if (!$td.length) return

    if (/host countr/.test(label)) {
      result.host = tdAllLinks($td)
    } else if (/^teams$/.test(label)) {
      const m = $td.text().match(/\d+/)
      if (m) result.teams_count = Number.parseInt(m[0], 10)
    } else if (/champion/.test(label)) {
      result.winner = tdFirstLink($td)
    } else if (/runners?.up/.test(label)) {
      result.runner_up = tdFirstLink($td)
    } else if (/third.place/.test(label)) {
      result.third_place = tdFirstLink($td)
    } else if (/fourth.place/.test(label)) {
      result.fourth_place = tdFirstLink($td)
    }
  })

  return result
}
// ── Placement derivation ───────────────────────────────────────────────────
/**
 * Works out the top-four finish from the scraped match list.
 *
 * Only matches whose round is exactly 'Final' or 'Third-place match' are
 * considered. A match contributes a placement pair only when it has a score
 * and one side comes out ahead; otherwise the corresponding fields keep their
 * previous value (initially null). When several matches share a round, the
 * last decided one wins.
 *
 * @param matches All matches of one tournament.
 * @returns winner/runner_up from the Final and third_place/fourth_place from
 *          the third-place match, each null when it could not be derived.
 */
function derivePlacements(matches: Match[]): Pick<Meta, 'winner' | 'runner_up' | 'third_place' | 'fourth_place'> {
  // Resolves one match to [victor, loser]; null if unscored or level throughout.
  const decide = (game: Match): [string, string] | null => {
    const { score } = game
    if (!score) return null

    // The extra-time score takes precedence over full time; the shoot-out is
    // only consulted when that score is level.
    const tallies = [score.et ?? score.ft ?? [0, 0]]
    if (score.p) tallies.push(score.p)

    for (const [first, second] of tallies) {
      if (first > second) return [game.team1, game.team2]
      if (second > first) return [game.team2, game.team1]
    }
    return null
  }

  const placed: Pick<Meta, 'winner' | 'runner_up' | 'third_place' | 'fourth_place'> = {
    winner: null,
    runner_up: null,
    third_place: null,
    fourth_place: null,
  }

  for (const game of matches) {
    const isFinal = game.round === 'Final'
    if (!isFinal && game.round !== 'Third-place match') continue

    const decided = decide(game)
    if (decided === null) continue

    const [victor, loser] = decided
    if (isFinal) {
      placed.winner = victor
      placed.runner_up = loser
    } else {
      placed.third_place = victor
      placed.fourth_place = loser
    }
  }

  return placed
}
// ── Year result ────────────────────────────────────────────────────────────
type YearResult = { type YearResult = {
matches: Match[] matches: Match[]
stadiums: Map<string, Stadium> stadiums: Map<string, Stadium>
groups: Map<string, Set<string>> groups: Map<string, Set<string>>
meta: Meta
} }
async function scrapeYear(year: number, mainHtml: string): Promise<YearResult> { async function scrapeYear(year: number, mainHtml: string): Promise<YearResult> {
@@ -406,7 +503,18 @@ async function scrapeYear(year: number, mainHtml: string): Promise<YearResult> {
process.stdout.write(`[+${page.slice(-8)}] `) process.stdout.write(`[+${page.slice(-8)}] `)
} }
return { matches, stadiums, groups } const infobox = parseInfobox($)
const placements = derivePlacements(matches)
const meta: Meta = {
host: infobox.host ?? '',
teams_count: infobox.teams_count ?? null,
winner: placements.winner ?? infobox.winner ?? null,
runner_up: placements.runner_up ?? infobox.runner_up ?? null,
third_place: placements.third_place ?? infobox.third_place ?? null,
fourth_place:placements.fourth_place?? infobox.fourth_place?? null,
}
return { matches, stadiums, groups, meta }
} }
// ── Squad page scraper ───────────────────────────────────────────────────── // ── Squad page scraper ─────────────────────────────────────────────────────
@@ -465,7 +573,7 @@ function scrapeSquads(html: string): Squad[] {
currentTeam.players.push(player) currentTeam.players.push(player)
}) })
return squads return squads.filter(s => s.players.length > 0)
} }
// ── Output ───────────────────────────────────────────────────────────────── // ── Output ─────────────────────────────────────────────────────────────────
@@ -476,10 +584,17 @@ function writeOutput(
stadiums: Map<string, Stadium>, stadiums: Map<string, Stadium>,
groups: Map<string, Set<string>>, groups: Map<string, Set<string>>,
squads: Squad[], squads: Squad[],
meta: Meta,
): void { ): void {
const dir = path.join(DATA_DIR, String(year)) const dir = path.join(DATA_DIR, String(year))
mkdirSync(dir, { recursive: true }) mkdirSync(dir, { recursive: true })
writeFileSync(
path.join(dir, 'worldcup.meta.json'),
JSON.stringify(meta, null, 2),
'utf-8',
)
writeFileSync( writeFileSync(
path.join(dir, 'worldcup.json'), path.join(dir, 'worldcup.json'),
JSON.stringify({ matches }, null, 2), JSON.stringify({ matches }, null, 2),
@@ -529,14 +644,14 @@ async function main() {
const mainHtml = await fetchWikiHtml(`${year}_FIFA_World_Cup`) const mainHtml = await fetchWikiHtml(`${year}_FIFA_World_Cup`)
if (!mainHtml) { console.log('FAILED'); continue } if (!mainHtml) { console.log('FAILED'); continue }
const { matches, stadiums, groups } = await scrapeYear(year, mainHtml) const { matches, stadiums, groups, meta } = await scrapeYear(year, mainHtml)
await delay(600) await delay(600)
const squadHtml = await fetchWikiHtml(`${year}_FIFA_World_Cup_squads`) const squadHtml = await fetchWikiHtml(`${year}_FIFA_World_Cup_squads`)
const squads = squadHtml ? scrapeSquads(squadHtml) : [] const squads = squadHtml ? scrapeSquads(squadHtml) : []
writeOutput(year, matches, stadiums, groups, squads) writeOutput(year, matches, stadiums, groups, squads, meta)
console.log(`${matches.length} matches, ${stadiums.size} stadiums, ${groups.size} groups, ${squads.length} teams`) console.log(`${matches.length} matches, ${stadiums.size} stadiums, ${groups.size} groups, ${squads.length} teams`)
+24 -86
View File
@@ -8,39 +8,13 @@ import { getIso } from '../lib/iso-codes'
const DATABASE_URL = process.env.DATABASE_URL ?? 'postgres://wc:wc@localhost:5432/worldcup' const DATABASE_URL = process.env.DATABASE_URL ?? 'postgres://wc:wc@localhost:5432/worldcup'
const __dirname = path.dirname(fileURLToPath(import.meta.url)) const __dirname = path.dirname(fileURLToPath(import.meta.url))
const DATA_DIR = path.join(__dirname, '../app/data') const WC_DIR = path.join(__dirname, '../app/data/wikipedia')
const WC_DIR = path.join(DATA_DIR, 'wikipedia')
const YEARS = [ const YEARS = [
1930,1934,1938,1950,1954,1958,1962,1966,1970,1974, 1930,1934,1938,1950,1954,1958,1962,1966,1970,1974,
1978,1982,1986,1990,1994,1998,2002,2006,2010,2014,2018,2022, 1978,1982,1986,1990,1994,1998,2002,2006,2010,2014,2018,2022,
] ]
// Hand-maintained third/fourth-place finishers, keyed by tournament year.
// Third/fourth place not reliably in source data for older years, so the
// tournaments insert takes them from this table instead of world_cup.csv.
// Years without an entry (e.g. 1950) are looked up as `PLACEMENTS[year] ?? {}`
// and end up with NULL third_place/fourth_place.
// NOTE: these names are inserted as written; unlike winner/runner-up they are
// not passed through normTeam.
const PLACEMENTS: Record<number, { third?: string; fourth?: string }> = {
1930: { third: 'USA', fourth: 'Yugoslavia' },
1934: { third: 'Germany', fourth: 'Austria' },
1938: { third: 'Brazil', fourth: 'Sweden' },
1954: { third: 'Austria', fourth: 'Uruguay' },
1958: { third: 'France', fourth: 'Germany' },
1962: { third: 'Chile', fourth: 'Yugoslavia' },
1966: { third: 'Portugal', fourth: 'Soviet Union' },
1970: { third: 'Germany', fourth: 'Uruguay' },
1974: { third: 'Poland', fourth: 'Brazil' },
1978: { third: 'Brazil', fourth: 'Italy' },
1982: { third: 'Poland', fourth: 'France' },
1986: { third: 'France', fourth: 'Belgium' },
1990: { third: 'Italy', fourth: 'England' },
1994: { third: 'Sweden', fourth: 'Bulgaria' },
1998: { third: 'Croatia', fourth: 'Netherlands' },
2002: { third: 'Turkey', fourth: 'South Korea' },
2006: { third: 'Germany', fourth: 'Portugal' },
2010: { third: 'Germany', fourth: 'Uruguay' },
2014: { third: 'Netherlands', fourth: 'Brazil' },
2018: { third: 'Belgium', fourth: 'England' },
2022: { third: 'Croatia', fourth: 'Morocco' },
}
// Normalize team names from Wikipedia to canonical DB names // Normalize team names from Wikipedia to canonical DB names
const TEAM_ALIASES: Record<string, string> = { const TEAM_ALIASES: Record<string, string> = {
'West Germany': 'Germany', 'West Germany': 'Germany',
@@ -52,38 +26,6 @@ function normTeam(name: string): string {
return TEAM_ALIASES[name] ?? name return TEAM_ALIASES[name] ?? name
} }
/**
 * Minimal RFC-4180 CSV parser.
 *
 * Handles quoted fields (including embedded commas, newlines and doubled
 * quotes) and ignores carriage returns outside quotes. The first row is the
 * header; every following row that has at least one non-blank field becomes
 * an object keyed by the trimmed header names, with trimmed values. Cells
 * missing from a short row are returned as ''.
 *
 * @param content Raw CSV text.
 * @returns One record per non-empty data row (empty array if there are none).
 */
function parseCsv(content: string): Record<string, string>[] {
  const table: string[][] = []
  let cells: string[] = []
  let buf = ''
  let quoted = false

  const endCell = (): void => {
    cells.push(buf)
    buf = ''
  }
  const endRow = (): void => {
    endCell()
    table.push(cells)
    cells = []
  }

  let pos = 0
  while (pos < content.length) {
    const c = content[pos]
    pos += 1

    if (quoted) {
      if (c !== '"') {
        buf += c
      } else if (content[pos] === '"') {
        // A doubled quote inside a quoted field is a literal quote.
        buf += '"'
        pos += 1
      } else {
        quoted = false
      }
      continue
    }

    switch (c) {
      case '"':
        quoted = true
        break
      case ',':
        endCell()
        break
      case '\n':
        endRow()
        break
      case '\r':
        // Dropped so CRLF files parse the same as LF files.
        break
      default:
        buf += c
    }
  }

  // Flush a final row that is not terminated by a newline.
  if (buf || cells.length) endRow()

  const [header, ...body] = table
  return body
    .filter(record => record.some(cell => cell.trim()))
    .map(record =>
      Object.fromEntries(
        header.map((title, col) => [title.trim(), (record[col] ?? '').trim()]),
      ),
    )
}
function readJson<T>(filePath: string): T | null { function readJson<T>(filePath: string): T | null {
if (!existsSync(filePath)) return null if (!existsSync(filePath)) return null
try { return JSON.parse(readFileSync(filePath, 'utf-8')) as T } catch { return null } try { return JSON.parse(readFileSync(filePath, 'utf-8')) as T } catch { return null }
@@ -100,6 +42,7 @@ type RawMatch = {
group?: string; ground?: string; group?: string; ground?: string;
} }
type RawData = { matches: RawMatch[] } type RawData = { matches: RawMatch[] }
// Shape of a per-year worldcup.meta.json file as read by the seeder; every
// placement and teams_count may be null when the source had no value for it.
type RawMeta = { host: string; teams_count: number | null; winner: string | null; runner_up: string | null; third_place: string | null; fourth_place: string | null }
type RawStadiums = { stadiums: { name: string; city: string; cc?: string; capacity?: number; timezone?: string; coords?: string }[] } type RawStadiums = { stadiums: { name: string; city: string; cc?: string; capacity?: number; timezone?: string; coords?: string }[] }
type RawSquad = { name: string; players: { name: string; number?: number; pos?: string; date_of_birth?: string }[] } type RawSquad = { name: string; players: { name: string; number?: number; pos?: string; date_of_birth?: string }[] }
@@ -245,33 +188,7 @@ async function run() {
return id return id
} }
// 1. Tournaments from world_cup.csv (host, winner, runner_up) // Per-year data from Wikipedia JSON files
const wcRows = parseCsv(readFileSync(path.join(DATA_DIR, 'world_cup.csv'), 'utf-8'))
for (const r of wcRows) {
const year = parseInt(r['Year'])
if (isNaN(year)) continue
const winner = normTeam(r['Champion'] || '')
const runnerUp = normTeam(r['Runner-Up'] || '')
const p = PLACEMENTS[year] ?? {}
await db.execute(sql`
INSERT INTO tournaments (year, host, winner, runner_up, third_place, fourth_place, teams_count)
VALUES (
${year}, ${r['Host']},
${winner || null}, ${runnerUp || null},
${p.third ?? null}, ${p.fourth ?? null},
${parseInt(r['Teams']) || null}
)
ON CONFLICT (year) DO UPDATE SET
host = EXCLUDED.host,
winner = EXCLUDED.winner,
runner_up = EXCLUDED.runner_up,
third_place = EXCLUDED.third_place,
fourth_place = EXCLUDED.fourth_place,
teams_count = EXCLUDED.teams_count
`)
}
// 2. Per-year match/stadium/squad data from openfootball JSON files
let totalMatches = 0 let totalMatches = 0
let totalGoals = 0 let totalGoals = 0
@@ -283,6 +200,27 @@ async function run() {
continue continue
} }
// Tournament row from meta.json
const meta = readJson<RawMeta>(path.join(yearDir, 'worldcup.meta.json'))
if (meta) {
await db.execute(sql`
INSERT INTO tournaments (year, host, winner, runner_up, third_place, fourth_place, teams_count)
VALUES (
${year}, ${meta.host || null},
${normTeam(meta.winner ?? '') || null}, ${normTeam(meta.runner_up ?? '') || null},
${normTeam(meta.third_place ?? '') || null}, ${normTeam(meta.fourth_place ?? '') || null},
${meta.teams_count ?? null}
)
ON CONFLICT (year) DO UPDATE SET
host = EXCLUDED.host,
winner = EXCLUDED.winner,
runner_up = EXCLUDED.runner_up,
third_place = EXCLUDED.third_place,
fourth_place = EXCLUDED.fourth_place,
teams_count = EXCLUDED.teams_count
`)
}
let matchCount = 0, goalCount = 0 let matchCount = 0, goalCount = 0
// Stadiums // Stadiums