feat: scrape tournament meta from Wikipedia, drop world_cup.csv

Add worldcup.meta.json per year with host, teams_count, winner, runner_up,
third_place, fourth_place. Host and teams_count are read from the Wikipedia
infobox; the four placements are derived from match results (Final /
Third-place match), with the infobox as fallback for edge cases like 1950's
round-robin final.

Fix infobox host extraction to handle <br>-separated multi-host entries
(2002: Japan / South Korea). Fix squad scraper to filter out zero-player
phantom sections that Wikipedia appends (References, Captains, etc.).

Drop app/data/world_cup.csv and the PLACEMENTS/parseCsv code in seed.ts —
all tournament metadata now comes from the scraped JSON files.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-06-15 17:09:45 +02:00
parent ff4989f39f
commit d1171267a8
34 changed files with 319 additions and 254 deletions
+119 -4
View File
@@ -49,6 +49,14 @@ type Stadium = { name: string; city: string }
// A single squad member; only `name` is required, the other fields are optional.
type Player = { name: string; number?: number; pos?: string; date_of_birth?: string }
// A named squad together with its list of players.
type Squad = { name: string; players: Player[] }
// A named group and the teams in it (presumably team names; verify against the scraper).
type Group = { name: string; teams: string[] }
/**
 * Tournament-level metadata for one World Cup year.
 * This is the object that gets serialized to `worldcup.meta.json`.
 */
type Meta = {
// Host name(s) from the infobox; multiple hosts are joined with " / ".
// Empty string when the infobox yields no host.
host: string
// First integer found in the infobox "Teams" row; null when unavailable.
teams_count: number | null
// Placements: taken from match results when they can be derived,
// otherwise from the infobox, otherwise null.
winner: string | null
runner_up: string | null
third_place: string | null
fourth_place: string | null
}
// ── Fetch ──────────────────────────────────────────────────────────────────
@@ -309,10 +317,99 @@ function processHeading(text: string, level: number, state: State): void {
// ── Main year scraper ──────────────────────────────────────────────────────
// ── Infobox parsing ────────────────────────────────────────────────────────
/**
 * Reads tournament metadata out of the first `table.infobox` on the page.
 *
 * Each infobox row is matched by its `<th>` label; the first `<td>` of the row
 * supplies the value. Only the fields whose rows are present end up in the
 * returned object, so callers must treat every key as optional.
 *
 * @param $ Cheerio handle for the tournament's main Wikipedia page.
 * @returns The subset of `Meta` that could be read from the infobox.
 */
function parseInfobox($: CheerioAPI): Partial<Meta> {
  // Matches footnote markers such as "[3]" so they are not mistaken for names.
  const footnote = /\[\d+\]/

  // Flattened cell text: <br> becomes " / ", <sup>/<img> are dropped,
  // footnote markers are stripped and whitespace is collapsed.
  const plainText = ($cell: Cheerio<Element>): string => {
    const $copy = $cell.clone()
    $copy.find('br').replaceWith(' / ')
    $copy.find('sup, img').remove()
    const raw = $copy.text()
    return raw.replace(/\[\d+\]/g, '').replace(/\s+/g, ' ').trim()
  }

  // Visible text of every link in the cell, in document order, skipping
  // image-only links (empty text) and footnote links.
  const linkLabels = ($cell: Cheerio<Element>): string[] => {
    const labels: string[] = []
    $cell.find('a').each((_, anchor) => {
      const label = $(anchor).clone().find('img').remove().end().text().trim()
      if (label && !footnote.test(label)) labels.push(label)
    })
    return labels
  }

  // First usable link label, else the flattened cell text, else null.
  const firstLinkOrText = ($cell: Cheerio<Element>): string | null => {
    const [first] = linkLabels($cell)
    return first ?? (plainText($cell) || null)
  }

  // All usable link labels joined with " / ", else the flattened cell text.
  const allLinksOrText = ($cell: Cheerio<Element>): string => {
    const labels = linkLabels($cell)
    return labels.length > 0 ? labels.join(' / ') : plainText($cell)
  }

  const meta: Partial<Meta> = {}

  $('table.infobox').first().find('tr').each((_, row) => {
    const $row = $(row)
    const heading = $row.find('th').text().trim().toLowerCase()
    const $cell = $row.find('td').first()
    if ($cell.length === 0) return

    // The checks run in a fixed order; the first label pattern that matches wins.
    if (/host countr/i.test(heading)) {
      meta.host = allLinksOrText($cell)
      return
    }
    if (/^teams$/i.test(heading)) {
      const digits = $cell.text().match(/\d+/)
      if (digits) meta.teams_count = parseInt(digits[0])
      return
    }
    if (/champion/i.test(heading)) {
      meta.winner = firstLinkOrText($cell)
      return
    }
    if (/runners?.up/i.test(heading)) {
      meta.runner_up = firstLinkOrText($cell)
      return
    }
    if (/third.place/i.test(heading)) {
      meta.third_place = firstLinkOrText($cell)
      return
    }
    if (/fourth.place/i.test(heading)) {
      meta.fourth_place = firstLinkOrText($cell)
    }
  })

  return meta
}
// ── Placement derivation ───────────────────────────────────────────────────
/**
 * Derives the top-four finish from the scraped match list.
 *
 * Only matches whose `round` is exactly `'Final'` or `'Third-place match'` are
 * considered. A match is decided by its extra-time score if present, otherwise
 * its full-time score; if that is level, the penalty score is used. Matches
 * with no score, or with no decisive result, leave the placements untouched,
 * so a field stays `null` when nothing could be derived.
 *
 * @param matches All matches scraped for one tournament.
 * @returns winner / runner_up from the final, third_place / fourth_place from
 *          the third-place match; `null` for anything that cannot be derived.
 */
function derivePlacements(matches: Match[]): Pick<Meta, 'winner' | 'runner_up' | 'third_place' | 'fourth_place'> {
  // Orders the two teams as [better, worse] for a pair of scores; null when level.
  const rank = (m: Match, first: number, second: number): [string, string] | null => {
    if (first > second) return [m.team1, m.team2]
    if (second > first) return [m.team2, m.team1]
    return null
  }

  // [winner, loser] of a match, or null when it has no score or no decisive result.
  const decide = (m: Match): [string, string] | null => {
    const score = m.score
    if (!score) return null
    // Extra-time score takes precedence over full-time; 0–0 when neither exists.
    const [goals1, goals2] = score.et ?? score.ft ?? [0, 0]
    const byGoals = rank(m, goals1, goals2)
    if (byGoals) return byGoals
    if (!score.p) return null
    const [pens1, pens2] = score.p
    return rank(m, pens1, pens2)
  }

  const placements: Pick<Meta, 'winner' | 'runner_up' | 'third_place' | 'fourth_place'> = {
    winner: null,
    runner_up: null,
    third_place: null,
    fourth_place: null,
  }

  for (const match of matches) {
    const isFinal = match.round === 'Final'
    if (!isFinal && match.round !== 'Third-place match') continue

    const ordered = decide(match)
    if (!ordered) continue

    // A later decisive match of the same round overwrites an earlier one.
    const [better, worse] = ordered
    if (isFinal) {
      placements.winner = better
      placements.runner_up = worse
    } else {
      placements.third_place = better
      placements.fourth_place = worse
    }
  }

  return placements
}
// ── Year result ────────────────────────────────────────────────────────────
// Everything scrapeYear returns for a single tournament year.
type YearResult = {
// All matches scraped for the year.
matches: Match[]
// Stadiums collected while scraping (key semantics are not visible here — TODO confirm).
stadiums: Map<string, Stadium>
// Presumably group name → set of team names; verify against the scraper body.
groups: Map<string, Set<string>>
// Tournament-level metadata built from the infobox and the match results.
meta: Meta
}
async function scrapeYear(year: number, mainHtml: string): Promise<YearResult> {
@@ -406,7 +503,18 @@ async function scrapeYear(year: number, mainHtml: string): Promise<YearResult> {
process.stdout.write(`[+${page.slice(-8)}] `)
}
return { matches, stadiums, groups }
const infobox = parseInfobox($)
const placements = derivePlacements(matches)
const meta: Meta = {
host: infobox.host ?? '',
teams_count: infobox.teams_count ?? null,
winner: placements.winner ?? infobox.winner ?? null,
runner_up: placements.runner_up ?? infobox.runner_up ?? null,
third_place: placements.third_place ?? infobox.third_place ?? null,
fourth_place:placements.fourth_place?? infobox.fourth_place?? null,
}
return { matches, stadiums, groups, meta }
}
// ── Squad page scraper ─────────────────────────────────────────────────────
@@ -465,7 +573,7 @@ function scrapeSquads(html: string): Squad[] {
currentTeam.players.push(player)
})
return squads
return squads.filter(s => s.players.length > 0)
}
// ── Output ─────────────────────────────────────────────────────────────────
@@ -476,10 +584,17 @@ function writeOutput(
stadiums: Map<string, Stadium>,
groups: Map<string, Set<string>>,
squads: Squad[],
meta: Meta,
): void {
const dir = path.join(DATA_DIR, String(year))
mkdirSync(dir, { recursive: true })
writeFileSync(
path.join(dir, 'worldcup.meta.json'),
JSON.stringify(meta, null, 2),
'utf-8',
)
writeFileSync(
path.join(dir, 'worldcup.json'),
JSON.stringify({ matches }, null, 2),
@@ -529,14 +644,14 @@ async function main() {
const mainHtml = await fetchWikiHtml(`${year}_FIFA_World_Cup`)
if (!mainHtml) { console.log('FAILED'); continue }
const { matches, stadiums, groups } = await scrapeYear(year, mainHtml)
const { matches, stadiums, groups, meta } = await scrapeYear(year, mainHtml)
await delay(600)
const squadHtml = await fetchWikiHtml(`${year}_FIFA_World_Cup_squads`)
const squads = squadHtml ? scrapeSquads(squadHtml) : []
writeOutput(year, matches, stadiums, groups, squads)
writeOutput(year, matches, stadiums, groups, squads, meta)
console.log(`${matches.length} matches, ${stadiums.size} stadiums, ${groups.size} groups, ${squads.length} teams`)