import { load } from 'cheerio'
import type { CheerioAPI } from 'cheerio'
import type { Cheerio } from 'cheerio'
import type { Element } from 'domhandler'
import { mkdirSync, writeFileSync } from 'fs'
import path from 'path'
import { fileURLToPath } from 'url'

const __dirname = path.dirname(fileURLToPath(import.meta.url))

/** Root directory under which one sub-directory per tournament year is written. */
const DATA_DIR = path.join(__dirname, '../app/data/wikipedia')

/** Tournament years scraped when no year is given on the command line. */
const YEARS = [
  1930, 1934, 1938, 1950, 1954, 1958, 1962, 1966, 1970, 1974,
  1978, 1982, 1986, 1990, 1994, 1998, 2002, 2006, 2010, 2014, 2018, 2022,
]

/** Resolves after `ms` milliseconds. */
const delay = (ms: number) => new Promise<void>(r => setTimeout(r, ms))

// ── Types ──────────────────────────────────────────────────────────────────

type Goal = {
  name: string
  minute?: number
  offset?: number
  penalty?: boolean
  owngoal?: boolean
}

type ScoreObj = {
  ft?: [number, number]
  et?: [number, number]
  p?: [number, number]
}

type Match = {
  round: string
  group?: string
  date?: string
  time?: string
  team1: string
  team2: string
  score?: ScoreObj
  goals1?: Goal[]
  goals2?: Goal[]
  ground?: string
}

type Stadium = { name: string; city: string }
type Player = { name: string; number?: number; pos?: string; date_of_birth?: string }
type Squad = { name: string; players: Player[] }
type Group = { name: string; teams: string[] }

type Meta = {
  host: string
  teams_count: number | null
  winner: string | null
  runner_up: string | null
  third_place: string | null
  fourth_place: string | null
}

// ── Fetch ──────────────────────────────────────────────────────────────────

/**
 * Fetches the rendered HTML of a Wikipedia page through the `action=parse` API.
 *
 * @param page    Page title; it is URL-encoded before being put in the query.
 * @param retries Maximum number of attempts. Attempt `n` (n > 0) is preceded
 *                by a `3000 * n` ms pause.
 * @returns The HTML string, or `null` when every attempt failed (non-OK
 *          response, thrown error, or a response without `parse.text['*']`).
 */
async function fetchWikiHtml(page: string, retries = 5): Promise<string | null> {
  const url = `https://en.wikipedia.org/w/api.php?action=parse&page=${encodeURIComponent(page)}&format=json&prop=text&disabletoc=1`
  for (let attempt = 0; attempt < retries; attempt++) {
    try {
      if (attempt > 0) await delay(3000 * attempt)
      const res = await fetch(url, {
        headers: { 'User-Agent': 'WorldCupScraper/1.0 (github.com/worldcup)' },
      })
      if (!res.ok) continue
      const data = await res.json() as { parse?: { text?: { '*': string } } }
      const html = data?.parse?.text?.['*']
      if (html) return html
    } catch {
      // Deliberate best-effort: any fetch/JSON error just moves on to the next attempt.
    }
  }
  return null
}

// ── Score parsing ──────────────────────────────────────────────────────────

/**
 * Extracts the first `N–M` (en dash or hyphen) pair from `text`.
 *
 * @returns `[N, M]`, or `null` when no such pair is present.
 */
function parseScoreText(text: string): [number, number] | null {
  const m = text.match(/(\d+)\s*[–\-]\s*(\d+)/)
  if (!m) return null
  return [parseInt(m[1], 10), parseInt(m[2], 10)]
}

// ── Team name extraction ───────────────────────────────────────────────────

/**
 * Returns the text of the first `<a>` in `$cell` that has non-empty text and
 * does not contain an `<img>`, or `''` when there is none.
 */
function extractTeam($: CheerioAPI, $cell: Cheerio<Element>): string {
  for (const a of $cell.find('a').toArray()) {
    const $a = $(a)
    const label = $a.text().trim()
    if (!$a.find('img').length && label) return label
  }
  return ''
}

// ── Goal parsing ───────────────────────────────────────────────────────────

/**
 * Parses the goal list of one side of a football box.
 *
 * Every `<li>` contributes one scorer; each direct `<span>` child of that
 * item's `.fb-goal` element that contains a minute such as `45'` or `90+3'`
 * becomes one `Goal`. List items without a scorer link or without `.fb-goal`
 * are skipped.
 */
function parseGoals($: CheerioAPI, $td: Cheerio<Element>): Goal[] {
  const goals: Goal[] = []

  $td.find('li').each((_, li) => {
    const $li = $(li)

    // Player name: first <a> NOT inside .fb-goal that has non-empty text.
    let playerName = ''
    for (const a of $li.find('a').toArray()) {
      const $a = $(a)
      if ($a.closest('.fb-goal').length) continue
      const t = $a.text().trim()
      if (t) { playerName = t; break }
    }
    if (!playerName) return

    const $fbGoal = $li.find('.fb-goal')
    if (!$fbGoal.length) return

    // Each direct child <span> inside .fb-goal (excluding the image wrapper).
    $fbGoal.children('span').each((_, span) => {
      const $span = $(span)
      if ($span.attr('typeof')) return // image wrapper

      const text = $span.text()
      const minMatch = text.match(/(\d+)(?:\+(\d+))?['′]/)
      if (!minMatch) return

      // Group 1 is `\d+`, so `minute` is always a valid number here.
      const minute = parseInt(minMatch[1], 10)
      const offset = minMatch[2] ? parseInt(minMatch[2], 10) : 0

      const goal: Goal = { name: playerName, minute }
      if (offset) goal.offset = offset
      if (text.includes('pen.')) goal.penalty = true
      if (text.includes('o.g.')) goal.owngoal = true
      goals.push(goal)
    })
  })

  return goals
}

// ── Ground extraction ──────────────────────────────────────────────────────

/**
 * Returns the venue text of a football box: the `[itemprop="name address"]`
 * element when present, otherwise the first line of the first `.fright`
 * element. May be `''`.
 */
function extractGround($: CheerioAPI, $box: Cheerio<Element>): string {
  const $loc = $box.find('[itemprop="name address"]').first()
  if ($loc.length) return $loc.text().trim()
  return $box.find('.fright').first().text().split('\n')[0].trim()
}

/**
 * Splits `"Name, City"` at the first comma. Without a comma the whole string
 * is the name and the city is `''`.
 */
function parseGroundParts(ground: string): { name: string; city: string } {
  const commaIdx = ground.indexOf(',')
  if (commaIdx === -1) return { name: ground, city: '' }
  return {
    name: ground.slice(0, commaIdx).trim(),
    city: ground.slice(commaIdx + 1).trim(),
  }
}

// ── Footballbox parsing ────────────────────────────────────────────────────

/**
 * Finds the penalty shoot-out score of a football box: the first row whose
 * `th[colspan]` text contains "penalt" marks the header, and the score is read
 * from the first non-team `<th>` of the following row.
 */
function extractPenaltyScore(
  $: CheerioAPI,
  $box: Cheerio<Element>,
): [number, number] | undefined {
  for (const tr of $box.find('tr').toArray()) {
    const $tr = $(tr)
    if (!$tr.find('th[colspan]').text().toLowerCase().includes('penalt')) continue
    const penText = $tr.next('tr').find('th').not('.fhome,.faway').first().text().trim()
    // Only the first "penalties" header row is considered, parsable or not.
    return parseScoreText(penText) ?? undefined
  }
  return undefined
}

/**
 * Derives the score after 90 minutes from the goal lists (used when the
 * displayed score is the after-extra-time total).
 *
 * NOTE(review): an own goal is credited to the side opposite the column it
 * is listed in, i.e. this assumes own goals appear in the column of the
 * player's own team. Confirm against the football-box markup — if own goals
 * are listed under the benefiting team, this miscounts them.
 */
function regulationScore(goals1: Goal[], goals2: Goal[]): [number, number] {
  const within90 = (g: Goal) => g.minute === undefined || g.minute <= 90
  const regular = (gs: Goal[]) => gs.filter(g => !g.owngoal && within90(g)).length
  const own = (gs: Goal[]) => gs.filter(g => g.owngoal === true && within90(g)).length
  return [regular(goals1) + own(goals2), regular(goals2) + own(goals1)]
}

/**
 * Parses one `.footballbox` element into a `Match`.
 *
 * @param round Round label stored on the match.
 * @param group Group label, or `null` for matches outside a group.
 * @returns The match, or `null` when either team name cannot be extracted.
 *          Optional fields are omitted (not set to `undefined`) when absent.
 */
function parseBox(
  $: CheerioAPI,
  $box: Cheerio<Element>,
  round: string,
  group: string | null,
): Match | null {
  const team1 = extractTeam($, $box.find('.fhome'))
  const team2 = extractTeam($, $box.find('.faway'))
  if (!team1 || !team2) return null

  const dateStr = $box.find('.bday, .dtstart').first().text().trim() || undefined
  const timeText = $box.find('.ftime').first().text().trim()
  const timeStr = timeText.match(/(\d{2}:\d{2})/)?.[1]

  const scoreText = $box.find('.fscore').first().text().trim()
  const hasAET = scoreText.toLowerCase().includes('a.e.t.')
  const scoreArr = parseScoreText(scoreText)

  // Use first fgoals row only (exclude penalty shootout row).
  const $regularRow = $box.find('tr.fgoals').first()
  const goals1 = parseGoals($, $regularRow.find('.fhgoal'))
  const goals2 = parseGoals($, $regularRow.find('.fagoal'))

  const penScore = extractPenaltyScore($, $box)

  let score: ScoreObj | undefined
  if (scoreArr) {
    // With "a.e.t." the displayed score is the extra-time total; the
    // full-time score is reconstructed from goals scored in ≤ 90 minutes.
    score = hasAET
      ? { ft: regulationScore(goals1, goals2), et: scoreArr }
      : { ft: scoreArr }
    if (penScore) score.p = penScore
  }

  const ground = extractGround($, $box) || undefined

  return {
    round,
    ...(group ? { group } : {}),
    ...(dateStr ? { date: dateStr } : {}),
    ...(timeStr ? { time: timeStr } : {}),
    team1,
    team2,
    ...(score ? { score } : {}),
    ...(goals1.length ? { goals1 } : {}),
    ...(goals2.length ? { goals2 } : {}),
    ...(ground ? { ground } : {}),
  }
}

// ── Collect matches from a pre-loaded page ─────────────────────────────────

/**
 * Parses every `.footballbox` on a loaded page, assigning them all the same
 * `round` and `group`. Boxes that fail to parse are dropped.
 */
function collectBoxes(
  $: CheerioAPI,
  round: string,
  group: string | null,
): Match[] {
  const matches: Match[] = []
  $('.footballbox').each((_, el) => {
    const m = parseBox($, $(el), round, group)
    if (m) matches.push(m)
  })
  return matches
}

// ── Section heading state machine ──────────────────────────────────────────

type State = {
  active: boolean
  round: string
  group: string | null
}

/**
 * Maps a lower-cased heading to a knockout round name, or `null` when the
 * heading is not one of the recognised knockout rounds.
 */
function knockoutRound(t: string): string | null {
  if (/round of 16/.test(t)) return 'Round of 16'
  if (/quarter.final/.test(t)) return 'Quarter-finals'
  if (/semi.final/.test(t)) return 'Semi-finals'
  if (/third.place|match for third|play.off for third/.test(t)) return 'Third-place match'
  if (t === 'final') return 'Final'
  return null
}

/**
 * Maps a lower-cased level-2 heading to the round it opens. `''` means
 * "match section whose round is set by sub-headings"; `null` means the
 * section contains no matches of interest.
 */
function sectionRound(t: string): string | null {
  // Also covers "first group stage".
  if (/group stage/.test(t) && !/second/.test(t)) return 'Group stage'
  if (/second group stage/.test(t)) return 'Second group stage'
  if (t === 'final round') return 'Final round'
  if (/final tournament/.test(t)) return ''
  if (/knock.?out stage/.test(t)) return ''
  return knockoutRound(t)
}

/**
 * Updates `state` in place for one section heading.
 *
 * - Level 2: a recognised heading activates the state, sets the round and
 *   clears the group; any other heading only deactivates the state (round
 *   and group are left untouched).
 * - Level 3/4 (only while active): a `Group X` heading sets the group; a
 *   recognised round heading sets the round and clears the group; anything
 *   else (bracket, draw, seeding, replay, …) keeps the current state.
 */
function processHeading(text: string, level: number, state: State): void {
  const t = text.toLowerCase().trim()

  if (level === 2) {
    const round = sectionRound(t)
    if (round === null) {
      state.active = false
      return
    }
    state.active = true
    state.round = round
    state.group = null
    return
  }

  if (!state.active) return
  if (level !== 3 && level !== 4) return

  if (/^group [a-h1-9]+$/.test(t)) {
    state.group = text.trim()
    return
  }

  const round = /round of 32/.test(t) ? 'Round of 32' : knockoutRound(t)
  if (round !== null) {
    state.round = round
    state.group = null
  }
}

// ── Infobox parsing ────────────────────────────────────────────────────────

/**
 * Reads host, team count and the top-four placements from the first
 * `table.infobox` on the page. Fields whose row is missing are left unset.
 */
function parseInfobox($: CheerioAPI): Partial<Meta> {
  const result: Partial<Meta> = {}

  /** Cell text with `<br>` turned into " / ", `<sup>`/`<img>` and `[n]` markers removed. */
  function tdText($td: Cheerio<Element>): string {
    const $clone = $td.clone()
    $clone.find('br').replaceWith(' / ')
    $clone.find('sup, img').remove()
    return $clone.text().replace(/\[\d+\]/g, '').replace(/\s+/g, ' ').trim()
  }

  /** Texts of all links in the cell, ignoring images, empty links and `[n]` reference links. */
  function linkTexts($td: Cheerio<Element>): string[] {
    return $td.find('a').toArray()
      .map(a => $(a).clone().find('img').remove().end().text().trim())
      .filter(t => t !== '' && !/\[\d+\]/.test(t))
  }

  /** First usable link text, else the cleaned cell text, else `null`. */
  function tdFirstLink($td: Cheerio<Element>): string | null {
    const [first] = linkTexts($td)
    return first ?? (tdText($td) || null)
  }

  /** All usable link texts joined with " / ", else the cleaned cell text. */
  function tdAllLinks($td: Cheerio<Element>): string {
    const names = linkTexts($td)
    return names.length ? names.join(' / ') : tdText($td)
  }

  $('table.infobox').first().find('tr').each((_, tr) => {
    const $tr = $(tr)
    const label = $tr.find('th').text().trim().toLowerCase()
    const $td = $tr.find('td').first()
    if (!$td.length) return

    if (/host countr/i.test(label)) {
      result.host = tdAllLinks($td)
    } else if (/^teams$/i.test(label)) {
      const m = $td.text().match(/\d+/)
      if (m) result.teams_count = parseInt(m[0], 10)
    } else if (/champion/i.test(label)) {
      result.winner = tdFirstLink($td)
    } else if (/runners?.up/i.test(label)) {
      result.runner_up = tdFirstLink($td)
    } else if (/third.place/i.test(label)) {
      result.third_place = tdFirstLink($td)
    } else if (/fourth.place/i.test(label)) {
      result.fourth_place = tdFirstLink($td)
    }
  })

  return result
}

// ── Placement derivation ───────────────────────────────────────────────────

/**
 * Derives the top-four placements from the matches whose round is `'Final'`
 * and `'Third-place match'`. A placement stays `null` when the corresponding
 * match is missing or undecided; when several matches share a round, the
 * last decided one wins.
 */
function derivePlacements(
  matches: Match[],
): Pick<Meta, 'winner' | 'runner_up' | 'third_place' | 'fourth_place'> {
  /** Returns `[winner, loser]`, using the penalty score to break a tie. */
  function matchWinner(m: Match): [string, string] | null {
    if (!m.score) return null
    const [h, a] = m.score.et ?? m.score.ft ?? [0, 0]
    if (h > a) return [m.team1, m.team2]
    if (a > h) return [m.team2, m.team1]
    if (m.score.p) {
      const [ph, pa] = m.score.p
      if (ph > pa) return [m.team1, m.team2]
      if (pa > ph) return [m.team2, m.team1]
    }
    return null
  }

  let winner: string | null = null
  let runner_up: string | null = null
  let third_place: string | null = null
  let fourth_place: string | null = null

  for (const m of matches) {
    if (m.round === 'Final') {
      const result = matchWinner(m)
      if (result) [winner, runner_up] = result
    } else if (m.round === 'Third-place match') {
      const result = matchWinner(m)
      if (result) [third_place, fourth_place] = result
    }
  }

  return { winner, runner_up, third_place, fourth_place }
}

// ── Main year scraper ──────────────────────────────────────────────────────

type YearResult = {
  matches: Match[]
  stadiums: Map<string, Stadium>
  groups: Map<string, Set<string>>
  meta: Meta
}

/**
 * Extracts matches, stadiums, groups and metadata from a tournament's main
 * page, fetching a group's sub-page when the main page has no match boxes
 * for that group.
 *
 * @param year     Tournament year (currently not used by the body).
 * @param mainHtml Rendered HTML of the tournament's main page.
 */
async function scrapeYear(year: number, mainHtml: string): Promise<YearResult> {
  const $ = load(mainHtml)
  const matches: Match[] = []
  const stadiums = new Map<string, Stadium>()
  const groups = new Map<string, Set<string>>()
  const state: State = { active: false, round: '', group: null }

  // Group name → sub-page to fetch if the main page has no matches for that
  // group, plus the round that was current when the link was seen.
  const groupSubpages = new Map<string, { page: string; round: string }>()
  // Groups that got at least one match from the main page.
  const groupsOnMainPage = new Set<string>()

  function recordMatch(m: Match): void {
    matches.push(m)

    if (m.ground) {
      const { name, city } = parseGroundParts(m.ground)
      if (name && !stadiums.has(name)) stadiums.set(name, { name, city })
    }

    if (m.group) {
      groupsOnMainPage.add(m.group)
      let teams = groups.get(m.group)
      if (!teams) {
        teams = new Set<string>()
        groups.set(m.group, teams)
      }
      teams.add(m.team1)
      teams.add(m.team2)
    }
  }

  // Walk elements in document order: headings, hatnotes, footballboxes.
  $('.mw-parser-output').find('div.mw-heading, .footballbox, .hatnote').each((_, el) => {
    const $el = $(el)

    if ($el.hasClass('mw-heading')) {
      const $h = $el.find('h2, h3, h4').first()
      const level = parseInt($h.prop('tagName')?.slice(1) ?? '9', 10)
      const text = $h.text().replace(/\[edit\]/g, '').trim()
      processHeading(text, level, state)
    } else if ($el.hasClass('hatnote') && $el.text().includes('Main article')) {
      // Remember the sub-page link for the current group (first one wins).
      if (!state.active || !state.group) return
      const link = $el.find('a[href^="/wiki/"]').first().attr('href')
      if (!link) return
      const page = link.replace('/wiki/', '').split('#')[0]
      if (/World_Cup_Group/i.test(page) && !groupSubpages.has(state.group)) {
        groupSubpages.set(state.group, { page, round: state.round || 'Group stage' })
      }
    } else if ($el.hasClass('footballbox')) {
      if (!state.active) return
      const round = state.round || state.group || 'Unknown'
      const m = parseBox($, $el, round, state.group)
      if (m) recordMatch(m)
    }
  })

  // Fetch group sub-pages for any group that got 0 matches from the main page.
  for (const [group, { page, round }] of groupSubpages) {
    if (groupsOnMainPage.has(group)) continue

    await delay(1200)
    const subHtml = await fetchWikiHtml(page)
    if (!subHtml) {
      process.stdout.write(`(failed: ${page}) `)
      continue
    }

    for (const m of collectBoxes(load(subHtml), round, group)) recordMatch(m)
    process.stdout.write(`[+${page.slice(-8)}] `)
  }

  // Placements derived from match results take precedence over the infobox.
  const infobox = parseInfobox($)
  const placements = derivePlacements(matches)
  const meta: Meta = {
    host: infobox.host ?? '',
    teams_count: infobox.teams_count ?? null,
    winner: placements.winner ?? infobox.winner ?? null,
    runner_up: placements.runner_up ?? infobox.runner_up ?? null,
    third_place: placements.third_place ?? infobox.third_place ?? null,
    fourth_place: placements.fourth_place ?? infobox.fourth_place ?? null,
  }

  return { matches, stadiums, groups, meta }
}

// ── Squad page scraper ─────────────────────────────────────────────────────

/**
 * Parses a squads page into one `Squad` per level-3 heading (headings that
 * start with "Group " are skipped), filling it with the `tr.nat-fs-player`
 * rows that follow. Squads without any player are dropped.
 */
function scrapeSquads(html: string): Squad[] {
  const $ = load(html)
  const squads: Squad[] = []
  let currentTeam: Squad | null = null

  const nodes = $('.mw-parser-output').find('div.mw-heading, tr.nat-fs-player').toArray()
  for (const el of nodes) {
    const $el = $(el)

    if ($el.hasClass('mw-heading')) {
      const $h = $el.find('h3, h4').first()
      if (!$h.length) continue
      const level = parseInt($h.prop('tagName')?.slice(1) ?? '9', 10)
      if (level !== 3) continue
      const name = $h.text().replace(/\[edit\]/g, '').trim()
      if (/^group /i.test(name)) continue // skip group headers
      currentTeam = { name, players: [] }
      squads.push(currentTeam)
      continue
    }

    if (!currentTeam) continue

    let number: number | undefined
    let pos: string | undefined
    let playerName = ''
    let dob: string | undefined

    // Cells in document order: index 0 is read as the shirt number and
    // index 1 as the position, as long as the name cell has not been seen yet.
    const cells = $el.find('td, th[scope="row"]').toArray()
    for (const [i, td] of cells.entries()) {
      const $td = $(td)
      const text = $td.text().trim()

      if ($td.is('th[scope="row"]')) {
        playerName = $td.find('a').first().text().trim() || text
      } else if (i === 0 && !playerName) {
        const n = parseInt(text, 10)
        if (!isNaN(n)) number = n
      } else if (i === 1 && !playerName && !pos) {
        const posLink = $td.find('a').first().text().trim()
        if (['GK', 'DF', 'MF', 'FW'].includes(posLink)) pos = posLink
      }

      const $bday = $td.find('.bday')
      if ($bday.length) dob = $bday.text().trim()
    }

    if (!playerName) continue

    const player: Player = { name: playerName }
    if (number !== undefined) player.number = number
    if (pos) player.pos = pos
    if (dob) player.date_of_birth = dob
    currentTeam.players.push(player)
  }

  return squads.filter(s => s.players.length > 0)
}

// ── Output ─────────────────────────────────────────────────────────────────

/** Writes `payload` as 2-space-indented JSON to `dir/file`. */
function writeJson(dir: string, file: string, payload: unknown): void {
  writeFileSync(path.join(dir, file), JSON.stringify(payload, null, 2), 'utf-8')
}

/**
 * Writes the JSON files for one tournament into `DATA_DIR/<year>/`, creating
 * the directory if needed. The meta and match files are always written; the
 * stadium, group and squad files only when they would be non-empty.
 */
function writeOutput(
  year: number,
  matches: Match[],
  stadiums: Map<string, Stadium>,
  groups: Map<string, Set<string>>,
  squads: Squad[],
  meta: Meta,
): void {
  const dir = path.join(DATA_DIR, String(year))
  mkdirSync(dir, { recursive: true })

  writeJson(dir, 'worldcup.meta.json', meta)
  writeJson(dir, 'worldcup.json', { matches })

  if (stadiums.size > 0) {
    writeJson(dir, 'worldcup.stadiums.json', { stadiums: Array.from(stadiums.values()) })
  }

  const groupList: Group[] = Array.from(groups, ([name, teams]) => ({
    name,
    teams: Array.from(teams),
  }))
  if (groupList.length > 0) {
    writeJson(dir, 'worldcup.groups.json', { groups: groupList })
  }

  if (squads.length > 0) {
    writeJson(dir, 'worldcup.squads.json', squads)
  }
}

// ── Entry point ────────────────────────────────────────────────────────────

/**
 * Scrapes every year in `YEARS`, or only the year given as the first CLI
 * argument, and writes the results to disk. A year whose main page cannot be
 * fetched is reported as FAILED and skipped.
 *
 * @throws Error when the CLI argument is present but not a number.
 */
async function main(): Promise<void> {
  const arg = process.argv[2]
  let yearsToScrape = YEARS
  if (arg) {
    const onlyYear = parseInt(arg, 10)
    // Fail loudly instead of silently falling back to scraping every year.
    if (Number.isNaN(onlyYear)) throw new Error(`Invalid year argument: ${arg}`)
    yearsToScrape = [onlyYear]
  }

  console.log(`Scraping ${yearsToScrape.length} World Cup(s) from Wikipedia...`)

  for (const year of yearsToScrape) {
    process.stdout.write(`  ${year}... `)

    const mainHtml = await fetchWikiHtml(`${year}_FIFA_World_Cup`)
    if (!mainHtml) {
      console.log('FAILED')
      continue
    }

    const { matches, stadiums, groups, meta } = await scrapeYear(year, mainHtml)

    await delay(600)
    const squadHtml = await fetchWikiHtml(`${year}_FIFA_World_Cup_squads`)
    const squads = squadHtml ? scrapeSquads(squadHtml) : []

    writeOutput(year, matches, stadiums, groups, squads, meta)
    console.log(`${matches.length} matches, ${stadiums.size} stadiums, ${groups.size} groups, ${squads.length} teams`)

    await delay(600)
  }

  console.log('\nDone! Files written to app/data/wikipedia/{year}/')
}

main().catch(e => { console.error(e); process.exit(1) })