import { load } from 'cheerio'
import type { CheerioAPI, Cheerio } from 'cheerio'
import type { Element } from 'domhandler'

// ── Types ──────────────────────────────────────────────────────────────────

/** One goal parsed from a match box. `offset` is the stoppage-time part of e.g. `90+3'`. */
export type Goal = {
  name: string
  minute?: number
  offset?: number
  penalty?: boolean
  owngoal?: boolean
}

/** Score after 90 minutes (`ft`), after extra time (`et`) and of the penalty shoot-out (`p`). */
export type ScoreObj = {
  ft?: [number, number]
  et?: [number, number]
  p?: [number, number]
}

/** A single match as parsed from a `.footballbox` element. */
export type Match = {
  round: string
  group?: string
  date?: string
  time?: string
  team1: string
  team2: string
  score?: ScoreObj
  goals1?: Goal[]
  goals2?: Goal[]
  ground?: string
}

export type Stadium = { name: string; city: string }

export type Group = { name: string; teams: string[] }

/** Tournament-level facts taken from the infobox and/or derived from the matches. */
export type Meta = {
  host: string
  teams_count: number | null
  winner: string | null
  runner_up: string | null
  third_place: string | null
  fourth_place: string | null
}

export type Player = { name: string; number?: number; pos?: string; date_of_birth?: string }

export type Squad = { name: string; players: Player[] }

/** Everything `scrapeYear` extracts for one tournament. */
export type YearResult = {
  matches: Match[]
  /** Stadiums keyed by stadium name. */
  stadiums: Map<string, Stadium>
  /** Team names keyed by group heading text (e.g. "Group A"). */
  groups: Map<string, Set<string>>
  meta: Meta
}

/** Heading-walk state: whether match boxes are currently collected, and under which round/group. */
type State = { active: boolean; round: string; group: string | null }

// ── Fetch ──────────────────────────────────────────────────────────────────

/** Resolves after `ms` milliseconds. */
const delay = (ms: number): Promise<void> => new Promise<void>(resolve => setTimeout(resolve, ms))

/**
 * Fetches the rendered HTML of an English-Wikipedia page through the `action=parse` API.
 *
 * Each attempt after the first waits `3000 * attempt` ms beforehand. Non-OK responses,
 * thrown errors and responses without HTML all lead to the next attempt.
 *
 * @param page    Page title (URL-encoded by this function).
 * @param retries Maximum number of attempts.
 * @returns The page HTML, or `null` when every attempt failed.
 */
export async function fetchWikiHtml(page: string, retries = 5): Promise<string | null> {
  const url = `https://en.wikipedia.org/w/api.php?action=parse&page=${encodeURIComponent(page)}&format=json&prop=text&disabletoc=1`
  for (let attempt = 0; attempt < retries; attempt++) {
    try {
      if (attempt > 0) await delay(3000 * attempt)
      const res = await fetch(url, { headers: { 'User-Agent': 'WorldCupScraper/1.0' } })
      if (!res.ok) continue
      const data = await res.json() as { parse?: { text?: { '*': string } } }
      const html = data?.parse?.text?.['*']
      if (html) return html
    } catch {
      // Deliberate best-effort: any network/JSON error just triggers the next attempt.
    }
  }
  return null
}

// ── Parsing helpers ────────────────────────────────────────────────────────

/** Parses the first `N–M` / `N-M` pair found in `text`; `null` when there is none. */
function parseScoreText(text: string): [number, number] | null {
  const m = text.match(/(\d+)\s*[–\-]\s*(\d+)/)
  if (!m) return null
  const [, home = '', away = ''] = m
  return [parseInt(home, 10), parseInt(away, 10)]
}

/** Returns the text of the first link in `$cell` that has text and wraps no image, or `''`. */
function extractTeam($: CheerioAPI, $cell: Cheerio<Element>): string {
  for (const a of $cell.find('a').toArray()) {
    const $a = $(a)
    const text = $a.text().trim()
    // Flag icons are links wrapping an <img>; the team name is the first plain text link.
    if (!$a.find('img').length && text) return text
  }
  return ''
}

/**
 * Parses the goal list of one team column.
 *
 * Every `<li>` contributes one `Goal` per minute marker found in the direct `<span>`
 * children of its `.fb-goal` element; items without a scorer link or without `.fb-goal`
 * are skipped.
 */
function parseGoals($: CheerioAPI, $td: Cheerio<Element>): Goal[] {
  const goals: Goal[] = []
  $td.find('li').each((_, li) => {
    const $li = $(li)

    // Scorer = first non-empty link that is not part of the goal-minute markup.
    let playerName = ''
    for (const a of $li.find('a').toArray()) {
      const $a = $(a)
      if ($a.closest('.fb-goal').length) continue
      const linkText = $a.text().trim()
      if (linkText) {
        playerName = linkText
        break
      }
    }
    if (!playerName) return

    const $fbGoal = $li.find('.fb-goal')
    if (!$fbGoal.length) return

    $fbGoal.children('span').each((_, span) => {
      const $span = $(span)
      // Spans carrying a `typeof` attribute are skipped.
      if ($span.attr('typeof')) return
      const text = $span.text()
      const minMatch = text.match(/(\d+)(?:\+(\d+))?['′]/)
      if (!minMatch) return
      const [, minuteText = '', offsetText] = minMatch
      const minute = parseInt(minuteText, 10)
      const offset = offsetText ? parseInt(offsetText, 10) : 0

      const goal: Goal = { name: playerName }
      if (!Number.isNaN(minute)) goal.minute = minute
      if (offset) goal.offset = offset
      if (text.includes('pen.')) goal.penalty = true
      if (text.includes('o.g.')) goal.owngoal = true
      goals.push(goal)
    })
  })
  return goals
}

/** Returns the venue text of a match box: the `itemprop="name address"` element, else the first line of `.fright`. */
function extractGround($: CheerioAPI, $box: Cheerio<Element>): string {
  const $loc = $box.find('[itemprop="name address"]').first()
  if ($loc.length) return $loc.text().trim()
  const [firstLine = ''] = $box.find('.fright').first().text().split('\n')
  return firstLine.trim()
}

/** Splits `"Stadium, City"` at the first comma; without a comma the whole string is the name. */
function parseGroundParts(ground: string): { name: string; city: string } {
  const commaIdx = ground.indexOf(',')
  if (commaIdx === -1) return { name: ground, city: '' }
  return { name: ground.slice(0, commaIdx).trim(), city: ground.slice(commaIdx + 1).trim() }
}

/**
 * Parses one `.footballbox` element into a `Match`.
 *
 * @returns `null` when either team name cannot be extracted.
 */
function parseBox($: CheerioAPI, $box: Cheerio<Element>, round: string, group: string | null): Match | null {
  const team1 = extractTeam($, $box.find('.fhome'))
  const team2 = extractTeam($, $box.find('.faway'))
  if (!team1 || !team2) return null

  const dateStr = $box.find('.bday, .dtstart').first().text().trim() || undefined
  const timeText = $box.find('.ftime').first().text().trim()
  const timeStr = timeText.match(/(\d{2}:\d{2})/)?.[1]

  const scoreText = $box.find('.fscore').first().text().trim()
  const hasAET = scoreText.toLowerCase().includes('a.e.t.')
  const scoreArr = parseScoreText(scoreText)

  const $regularRow = $box.find('tr.fgoals').first()
  const goals1 = parseGoals($, $regularRow.find('.fhgoal'))
  const goals2 = parseGoals($, $regularRow.find('.fagoal'))

  // Shoot-out score: the row following the first header row whose text mentions "penalt".
  let penScore: [number, number] | undefined
  for (const tr of $box.find('tr').toArray()) {
    const $tr = $(tr)
    if (!$tr.find('th[colspan]').text().toLowerCase().includes('penalt')) continue
    const ps = parseScoreText($tr.next('tr').find('th').not('.fhome,.faway').first().text().trim())
    if (ps) penScore = ps
    break
  }

  let score: ScoreObj | undefined
  if (scoreArr) {
    if (hasAET) {
      // The displayed score is the extra-time result; rebuild the 90-minute score from
      // the goal lists. Wikipedia football boxes list an own goal (marked "o.g.") in the
      // column of the team it was awarded to, so each column is counted as listed —
      // own goals must not be moved to the opposite side.
      const within90 = (gs: Goal[]): number =>
        gs.filter(g => g.minute === undefined || g.minute <= 90).length
      score = { ft: [within90(goals1), within90(goals2)], et: scoreArr }
    } else {
      score = { ft: scoreArr }
    }
    if (penScore) score.p = penScore
  }

  const ground = extractGround($, $box) || undefined

  return {
    round,
    ...(group ? { group } : {}),
    ...(dateStr ? { date: dateStr } : {}),
    ...(timeStr ? { time: timeStr } : {}),
    team1,
    team2,
    ...(score ? { score } : {}),
    ...(goals1.length ? { goals1 } : {}),
    ...(goals2.length ? { goals2 } : {}),
    ...(ground ? { ground } : {}),
  }
}

/** Parses every `.footballbox` of a document, assigning all of them the given round and group. */
function collectBoxes($: CheerioAPI, round: string, group: string | null): Match[] {
  const matches: Match[] = []
  $('.footballbox').each((_, el) => {
    const m = parseBox($, $(el), round, group)
    if (m) matches.push(m)
  })
  return matches
}

/** Heading text with `[edit]` markers removed. */
function headingText($h: Cheerio<Element>): string {
  return $h.text().replace(/\[edit\]/g, '').trim()
}

/** Numeric level of an `<hN>` element; 9 when the selection is empty. */
function headingLevel($h: Cheerio<Element>): number {
  return parseInt($h.prop('tagName')?.slice(1) ?? '9', 10)
}

/** Knockout-round heading patterns, checked in order, with their canonical round names. */
const KNOCKOUT_ROUNDS: ReadonlyArray<readonly [RegExp, string]> = [
  [/round of 32/i, 'Round of 32'],
  [/round of 16/i, 'Round of 16'],
  [/quarter.final/i, 'Quarter-finals'],
  [/semi.final/i, 'Semi-finals'],
  [/third.place|match for third|play.off for third/i, 'Third-place match'],
]

/** Maps a lower-cased heading to its knockout round name, or `null` when it is not one. */
function knockoutRound(t: string): string | null {
  for (const [pattern, round] of KNOCKOUT_ROUNDS) {
    if (pattern.test(t)) return round
  }
  return t === 'final' ? 'Final' : null
}

/**
 * Updates `state` for one section heading.
 *
 * Level-2 headings switch collection on (recognised tournament sections, resetting
 * round and group) or off (anything else; round and group are left untouched).
 * Level-3/4 headings are only considered while active: a "Group X" heading sets the
 * group (keeping the round), a knockout-round heading sets the round and clears the group.
 */
function processHeading(text: string, level: number, state: State): void {
  const t = text.toLowerCase().trim()

  if (level === 2) {
    let round: string | null
    if (/group stage/i.test(t) && !/second/i.test(t)) round = 'Group stage'
    else if (/first group stage/i.test(t)) round = 'Group stage'
    else if (/second group stage/i.test(t)) round = 'Second group stage'
    else if (t === 'final round') round = 'Final round'
    // Container sections: the actual round comes from the sub-headings.
    else if (/final tournament/i.test(t) || /knock.?out stage/i.test(t)) round = ''
    else round = knockoutRound(t)

    if (round === null) {
      state.active = false
      return
    }
    state.active = true
    state.round = round
    state.group = null
    return
  }

  if (!state.active) return
  if (level !== 3 && level !== 4) return

  // Letters up to L cover the 12-group (48-team) format as well as the older A–H one.
  if (/^group [a-l1-9]+$/i.test(t)) {
    state.group = text.trim()
    return
  }
  const round = knockoutRound(t)
  if (round !== null) {
    state.round = round
    state.group = null
  }
}

/**
 * Walks the headings of a page once and returns, for every group heading, the round
 * that was active the first time that group became current (`'Group stage'` when the
 * round is empty).
 */
function firstRoundByGroup($: CheerioAPI): Map<string, string> {
  const rounds = new Map<string, string>()
  const state: State = { active: false, round: '', group: null }
  $('.mw-parser-output').find('div.mw-heading').each((_, el) => {
    const $h = $(el).find('h2, h3, h4').first()
    processHeading(headingText($h), headingLevel($h), state)
    if (state.group !== null && !rounds.has(state.group)) {
      rounds.set(state.group, state.round || 'Group stage')
    }
  })
  return rounds
}

// ── Infobox ────────────────────────────────────────────────────────────────

/** Extracts host, team count and the top-four placements from the first `table.infobox`. */
function parseInfobox($: CheerioAPI): Partial<Meta> {
  const result: Partial<Meta> = {}

  /** Cell text with `<br>` turned into " / ", `<sup>`/`<img>` and `[n]` markers removed, whitespace collapsed. */
  function tdText($td: Cheerio<Element>): string {
    const $clone = $td.clone()
    $clone.find('br').replaceWith(' / ')
    $clone.find('sup, img').remove()
    return $clone.text().replace(/\[\d+\]/g, '').replace(/\s+/g, ' ').trim()
  }

  /** Text of a link without its images. */
  function linkText(a: Element): string {
    return $(a).clone().find('img').remove().end().text().trim()
  }

  /** First non-empty, non-footnote link text of the cell; falls back to the cell text, then `null`. */
  function tdFirstLink($td: Cheerio<Element>): string | null {
    for (const a of $td.find('a').toArray()) {
      const t = linkText(a)
      if (t && !/\[\d+\]/.test(t)) return t
    }
    return tdText($td) || null
  }

  /** All non-empty, non-footnote link texts joined by " / "; falls back to the cell text. */
  function tdAllLinks($td: Cheerio<Element>): string {
    const names = $td
      .find('a')
      .toArray()
      .map(linkText)
      .filter(t => t && !/\[\d+\]/.test(t))
    return names.length ? names.join(' / ') : tdText($td)
  }

  $('table.infobox').first().find('tr').each((_, tr) => {
    const $tr = $(tr)
    const label = $tr.find('th').text().trim().toLowerCase()
    const $td = $tr.find('td').first()
    if (!$td.length) return

    if (/host countr/i.test(label)) {
      result.host = tdAllLinks($td)
    } else if (/^teams$/i.test(label)) {
      const m = $td.text().match(/\d+/)
      if (m) result.teams_count = parseInt(m[0], 10)
    } else if (/champion/i.test(label)) {
      result.winner = tdFirstLink($td)
    } else if (/runners?.up/i.test(label)) {
      result.runner_up = tdFirstLink($td)
    } else if (/third.place/i.test(label)) {
      result.third_place = tdFirstLink($td)
    } else if (/fourth.place/i.test(label)) {
      result.fourth_place = tdFirstLink($td)
    }
  })
  return result
}

/**
 * Derives the top-four placements from the matches whose round is `'Final'` and
 * `'Third-place match'`. A placement stays `null` when the deciding match is missing
 * or has no determinable winner; when several matches share a round, the last one
 * with a winner is used.
 */
function derivePlacements(
  matches: Match[],
): Pick<Meta, 'winner' | 'runner_up' | 'third_place' | 'fourth_place'> {
  /** Returns `[winner, loser]`, decided by the et/ft score and then by penalties. */
  function matchWinner(m: Match): [string, string] | null {
    if (!m.score) return null
    const [h, a] = m.score.et ?? m.score.ft ?? [0, 0]
    if (h > a) return [m.team1, m.team2]
    if (a > h) return [m.team2, m.team1]
    if (m.score.p) {
      const [ph, pa] = m.score.p
      if (ph > pa) return [m.team1, m.team2]
      if (pa > ph) return [m.team2, m.team1]
    }
    return null
  }

  let winner: string | null = null
  let runner_up: string | null = null
  let third_place: string | null = null
  let fourth_place: string | null = null

  for (const m of matches) {
    if (m.round === 'Final') {
      const r = matchWinner(m)
      if (r) [winner, runner_up] = r
    } else if (m.round === 'Third-place match') {
      const r = matchWinner(m)
      if (r) [third_place, fourth_place] = r
    }
  }
  return { winner, runner_up, third_place, fourth_place }
}

// ── Main year scraper ──────────────────────────────────────────────────────

/**
 * Scrapes one tournament page.
 *
 * Match boxes on the main page are collected under the round/group given by the
 * preceding headings. For groups that have a "Main article" link to a
 * `…World_Cup_Group…` page but no match on the main page, the group sub-page is
 * fetched (after a 1200 ms pause) and all of its match boxes are added. Progress
 * markers are written to stdout.
 *
 * @param year     Tournament year (not used by the parsing itself).
 * @param mainHtml Rendered HTML of the tournament's main page.
 * @param opts     `skipGroups`: group names whose sub-pages must not be fetched.
 */
export async function scrapeYear(
  year: number,
  mainHtml: string,
  opts?: { skipGroups?: Set<string> },
): Promise<YearResult> {
  const $ = load(mainHtml)
  const matches: Match[] = []
  const stadiums = new Map<string, Stadium>()
  const groups = new Map<string, Set<string>>()
  const state: State = { active: false, round: '', group: null }
  const groupSubpages = new Map<string, string>()
  const groupsOnMainPage = new Set<string>()

  /** Stores a match and updates the stadium and group indexes. */
  function recordMatch(m: Match): void {
    matches.push(m)
    if (m.ground) {
      const { name, city } = parseGroundParts(m.ground)
      if (name && !stadiums.has(name)) stadiums.set(name, { name, city })
    }
    if (m.group) {
      groupsOnMainPage.add(m.group)
      let teams = groups.get(m.group)
      if (!teams) {
        teams = new Set<string>()
        groups.set(m.group, teams)
      }
      teams.add(m.team1)
      teams.add(m.team2)
    }
  }

  $('.mw-parser-output').find('div.mw-heading, .footballbox, .hatnote').each((_, el) => {
    const $el = $(el)
    if ($el.hasClass('mw-heading')) {
      const $h = $el.find('h2, h3, h4').first()
      processHeading(headingText($h), headingLevel($h), state)
    } else if ($el.hasClass('hatnote') && $el.text().includes('Main article')) {
      if (state.active && state.group) {
        const link = $el.find('a[href^="/wiki/"]').first().attr('href')
        if (link) {
          const [page = ''] = link.replace('/wiki/', '').split('#')
          // Only the first group sub-page link seen for a group is kept.
          if (/World_Cup_Group/i.test(page) && !groupSubpages.has(state.group)) {
            groupSubpages.set(state.group, page)
          }
        }
      }
    } else if ($el.hasClass('footballbox')) {
      if (!state.active) return
      const m = parseBox($, $el, state.round || state.group || 'Unknown', state.group)
      if (m) recordMatch(m)
    }
  })

  // Round under which each group first appears on the main page (computed once).
  const roundByGroup = firstRoundByGroup($)

  for (const [group, page] of groupSubpages) {
    if (groupsOnMainPage.has(group)) continue
    if (opts?.skipGroups?.has(group)) {
      process.stdout.write(`[skip ${group}] `)
      continue
    }
    await delay(1200)
    const subHtml = await fetchWikiHtml(page)
    if (!subHtml) {
      process.stdout.write(`(failed: ${page}) `)
      continue
    }
    const round = roundByGroup.get(group) ?? 'Group stage'
    const $sub = load(subHtml)
    for (const m of collectBoxes($sub, round, group)) recordMatch(m)
    process.stdout.write(`[+${page.slice(-8)}] `)
  }

  const infobox = parseInfobox($)
  const placements = derivePlacements(matches)
  // Placements derived from actual match results win over the infobox values.
  const meta: Meta = {
    host: infobox.host ?? '',
    teams_count: infobox.teams_count ?? null,
    winner: placements.winner ?? infobox.winner ?? null,
    runner_up: placements.runner_up ?? infobox.runner_up ?? null,
    third_place: placements.third_place ?? infobox.third_place ?? null,
    fourth_place: placements.fourth_place ?? infobox.fourth_place ?? null,
  }

  return { matches, stadiums, groups, meta }
}

// ── Squad page scraper ─────────────────────────────────────────────────────

/**
 * Parses a squads page into one `Squad` per level-3 heading (headings starting with
 * "Group " are ignored). Player rows are `tr.nat-fs-player`; squads without any
 * player are dropped from the result.
 */
export function scrapeSquads(html: string): Squad[] {
  const $ = load(html)
  const squads: Squad[] = []
  let currentTeam: Squad | null = null

  for (const el of $('.mw-parser-output').find('div.mw-heading, tr.nat-fs-player').toArray()) {
    const $el = $(el)

    if ($el.hasClass('mw-heading')) {
      const $h = $el.find('h3, h4').first()
      if (!$h.length) continue
      if (headingLevel($h) !== 3) continue
      const name = headingText($h)
      if (/^group /i.test(name)) continue
      currentTeam = { name, players: [] }
      squads.push(currentTeam)
      continue
    }

    if (!currentTeam) continue

    let shirtNumber: number | undefined
    let pos: string | undefined
    let playerName = ''
    let dob: string | undefined

    // Cells in document order: index 0 is read as the shirt number and index 1 as the
    // position while no name has been seen yet; the row-header cell holds the name.
    for (const [i, td] of $el.find('td, th[scope="row"]').toArray().entries()) {
      const $td = $(td)
      const text = $td.text().trim()
      if ($td.is('th[scope="row"]')) {
        playerName = $td.find('a').first().text().trim() || text
      } else if (i === 0 && !playerName) {
        const n = parseInt(text, 10)
        if (!Number.isNaN(n)) shirtNumber = n
      } else if (i === 1 && !playerName && !pos) {
        const p = $td.find('a').first().text().trim()
        if (['GK', 'DF', 'MF', 'FW'].includes(p)) pos = p
      }
      const $bday = $td.find('.bday')
      if ($bday.length) dob = $bday.text().trim()
    }

    if (!playerName) continue
    const player: Player = { name: playerName }
    if (shirtNumber !== undefined) player.number = shirtNumber
    if (pos) player.pos = pos
    if (dob) player.date_of_birth = dob
    currentTeam.players.push(player)
  }

  return squads.filter(s => s.players.length > 0)
}