Files
worldcup/lib/wiki-scraper.ts
T
valknar 9ce2a4e27c fix: use full player names from title attr, preserve UTC offset in match times
Wikipedia abbreviates goal scorer display text (e.g. "Müller") but the
<a title="Thomas Müller"> attribute always has the full name. Switch
parseGoals() to prefer title attr and strip disambiguation suffixes like
"(soccer, born 1993)". This ensures Gerd Müller and Thomas Müller get
separate player pages.

Also preserve the UTC offset from Wikipedia's ftime (e.g. "12:00 UTC-4")
so that isLive() can accurately compute UTC kickoff time instead of
treating local time as UTC. upcomingMatches sorts by SPLIT_PART on the
HH:MM part to ignore the timezone suffix.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-15 18:14:53 +02:00

502 lines
18 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import { load } from 'cheerio'
import type { CheerioAPI, Cheerio } from 'cheerio'
import type { Element } from 'domhandler'
// ── Types ──────────────────────────────────────────────────────────────────
/** One goal credited to a player, as extracted from a match box. */
export type Goal = {
/** Scorer name (full name from the link title when available). */
name: string
/** Minute parsed from the goal text, e.g. 90 for "90+2'". */
minute?: number
/** Minutes after the "+" sign (the 2 in "90+2'"); omitted when zero. */
offset?: number
/** Set when the goal text contains "pen.". */
penalty?: boolean
/** Set when the goal text contains "o.g.". */
owngoal?: boolean
}
/** Score of a match as [team1, team2] pairs. */
export type ScoreObj = {
/** Score without extra time. */
ft?: [number, number]
/** Displayed score when the match is marked "a.e.t.". */
et?: [number, number]
/** Penalty shoot-out score, when a shoot-out row is present. */
p?: [number, number]
}
/** A single match parsed from a `.footballbox` element. */
export type Match = {
round: string
group?: string
/** Text of the box's `.bday` / `.dtstart` element. */
date?: string
/** "HH:MM", optionally followed by " UTC±N" (see parseTime12h). */
time?: string
/** Team from the `.fhome` cell. */
team1: string
/** Team from the `.faway` cell. */
team2: string
score?: ScoreObj
/** Goals listed in the `.fhgoal` cell. */
goals1?: Goal[]
/** Goals listed in the `.fagoal` cell. */
goals2?: Goal[]
/** Raw venue text; parseGroundParts splits it into name and city. */
ground?: string
}
export type Stadium = { name: string; city: string }
export type Group = { name: string; teams: string[] }
/** Tournament-level facts taken from the infobox and/or derived from matches. */
export type Meta = {
host: string
teams_count: number | null
winner: string | null
runner_up: string | null
third_place: string | null
fourth_place: string | null
}
export type Player = { name: string; number?: number; pos?: string; date_of_birth?: string }
export type Squad = { name: string; players: Player[] }
/** Everything scrapeYear returns for one tournament. */
export type YearResult = {
matches: Match[]
/** Keyed by stadium name. */
stadiums: Map<string, Stadium>
/** Group name → names of the teams that played in it. */
groups: Map<string, Set<string>>
meta: Meta
}
/** Mutable cursor updated by processHeading while walking page headings. */
type State = { active: boolean; round: string; group: string | null }
// ── Fetch ──────────────────────────────────────────────────────────────────
/** Returns a promise that settles after `ms` milliseconds. */
const delay = (ms: number) =>
  new Promise(resolve => {
    setTimeout(resolve, ms)
  })
/**
 * Downloads the rendered HTML of an English Wikipedia page through the
 * `action=parse` API.
 *
 * Makes up to `retries` attempts. Before every attempt except the first it
 * waits `3000 * attempt` ms. A non-OK response, a thrown error, or a payload
 * without HTML all lead to the next attempt.
 *
 * @param page    Wikipedia page title (URL-encoded by this function).
 * @param retries Maximum number of attempts.
 * @returns The page HTML, or `null` when every attempt failed.
 */
export async function fetchWikiHtml(page: string, retries = 5): Promise<string | null> {
  const url = `https://en.wikipedia.org/w/api.php?action=parse&page=${encodeURIComponent(page)}&format=json&prop=text&disabletoc=1`
  let attempt = 0
  while (attempt < retries) {
    const backoffMs = 3000 * attempt
    attempt += 1
    try {
      if (backoffMs > 0) await delay(backoffMs)
      const response = await fetch(url, { headers: { 'User-Agent': 'WorldCupScraper/1.0' } })
      if (response.ok) {
        const payload = await response.json() as { parse?: { text?: { '*': string } } }
        const html = payload?.parse?.text?.['*']
        if (html) return html
      }
    } catch {
      // retry
    }
  }
  return null
}
// ── Team name normalisation ────────────────────────────────────────────────
/** Maps alternative team names found on Wikipedia to the canonical name used here. */
const TEAM_ALIASES: Record<string, string> = {
  'West Germany': 'Germany',
  'Korea Republic': 'South Korea',
  'IR Iran': 'Iran',
  'Bosnia & Herzegovina': 'Bosnia and Herzegovina',
  'USA': 'United States',
}

/**
 * Returns the canonical team name for `name`, or `name` unchanged when no
 * alias is registered.
 *
 * Only own properties of the alias table are consulted, so names that collide
 * with `Object.prototype` members ("constructor", "toString", ...) are
 * returned as-is instead of leaking an inherited function.
 */
export function normalizeTeam(name: string): string {
  const alias = Object.prototype.hasOwnProperty.call(TEAM_ALIASES, name)
    ? TEAM_ALIASES[name]
    : undefined
  return alias ?? name
}
// ── Parsing helpers ────────────────────────────────────────────────────────
/**
 * Extracts the first "N–M" score from `text`.
 *
 * The separator may be an ASCII hyphen, any dash in U+2010–U+2015 (this
 * includes the en dash Wikipedia uses in scorelines) or the minus sign
 * U+2212. The characters are written as escapes so they cannot be lost or
 * confused with look-alike glyphs.
 *
 * @returns `[first, second]` as integers, or `null` when no score is present.
 */
function parseScoreText(text: string): [number, number] | null {
  const m = text.match(/(\d+)\s*[-\u2010-\u2015\u2212]\s*(\d+)/)
  if (!m) return null
  return [parseInt(m[1], 10), parseInt(m[2], 10)]
}
/**
 * Reads the team name from a home/away cell: the trimmed text of the first
 * link that does not wrap an image and has non-empty text, passed through
 * normalizeTeam. When no such link exists, the empty string is normalised
 * and returned.
 */
function extractTeam($: CheerioAPI, $cell: Cheerio<Element>): string {
  for (const anchor of $cell.find('a').toArray()) {
    const $anchor = $(anchor)
    // Links wrapping an image are skipped.
    if ($anchor.find('img').length) continue
    const label = $anchor.text().trim()
    if (label) return normalizeTeam(label)
  }
  return normalizeTeam('')
}
/**
 * Parses the goal list of one side of a match box.
 *
 * Each `<li>` is expected to hold a player link followed by a `.fb-goal`
 * element whose direct `<span>` children each describe one goal ("45+2'",
 * optionally with "pen." / "o.g."). Spans carrying a `typeof` attribute are
 * skipped. List items without a player name or without `.fb-goal` are ignored.
 *
 * The player name prefers the link's `title` attribute (full name) over the
 * possibly abbreviated link text. Every trailing parenthetical is removed
 * from the title, so both a disambiguation suffix and a stacked
 * "(page does not exist)" marker are stripped.
 *
 * @returns One Goal per recognised minute marker, in document order.
 */
function parseGoals($: CheerioAPI, $td: Cheerio<Element>): Goal[] {
  const goals: Goal[] = []
  $td.find('li').each((_, li) => {
    const $li = $(li)
    let playerName = ''
    $li.find('a').each((_, a) => {
      const $a = $(a)
      // Links inside the goal marker itself are not player links.
      if ($a.closest('.fb-goal').length) return
      const display = $a.text().trim()
      if (!display) return
      // Strip all trailing "(...)" groups, not just the last one.
      const fullName = ($a.attr('title') ?? '').replace(/(?:\s*\([^)]*\))+\s*$/, '').trim()
      playerName = fullName || display
      return false
    })
    if (!playerName) return
    const $fbGoal = $li.find('.fb-goal')
    if (!$fbGoal.length) return
    $fbGoal.children('span').each((_, span) => {
      const $span = $(span)
      if ($span.attr('typeof')) return
      const text = $span.text()
      // Minute marker: ASCII apostrophe, right single quote (U+2019) or prime (U+2032).
      const minMatch = text.match(/(\d+)(?:\+(\d+))?['\u2019\u2032]/)
      if (!minMatch) return
      const minute = parseInt(minMatch[1], 10)
      const offset = minMatch[2] ? parseInt(minMatch[2], 10) : 0
      const goal: Goal = { name: playerName }
      if (!isNaN(minute)) goal.minute = minute
      if (offset) goal.offset = offset
      if (text.includes('pen.')) goal.penalty = true
      if (text.includes('o.g.')) goal.owngoal = true
      goals.push(goal)
    })
  })
  return goals
}
/**
 * Returns the venue text of a match box.
 *
 * Uses the first `[itemprop="name address"]` element when one exists;
 * otherwise falls back to the first line of the first `.fright` element.
 * The result is trimmed and may be empty.
 */
function extractGround($: CheerioAPI, $box: Cheerio<Element>): string {
  const $venue = $box.find('[itemprop="name address"]').first()
  if ($venue.length > 0) return $venue.text().trim()
  const [firstLine] = $box.find('.fright').first().text().split('\n')
  return firstLine.trim()
}
/**
 * Splits a venue string at its first comma into stadium name and city.
 * Both parts are trimmed. Without a comma the whole input is returned
 * (untrimmed) as the name and the city is empty.
 */
function parseGroundParts(ground: string): { name: string; city: string } {
  const [venue, ...remainder] = ground.split(',')
  if (remainder.length === 0) return { name: ground, city: '' }
  // Re-join so that only the first comma acts as the separator.
  return { name: venue.trim(), city: remainder.join(',').trim() }
}
/**
 * Normalises a kick-off time string to 24-hour "HH:MM".
 *
 * Accepts 12-hour times ("3:00 p.m.", "12:30 am") and, as a fallback,
 * two-digit 24-hour times ("18:00"). When the text also carries a UTC offset
 * ("UTC-4", "UTC+3", "UTC+5.5") it is appended as " UTC±N" for BOTH input
 * styles, so callers can compute the real UTC kick-off time.
 *
 * @returns "HH:MM" or "HH:MM UTC±N", or `undefined` when no time is found.
 */
function parseTime12h(text: string): string | undefined {
  // Normalise the Unicode minus sign (U+2212) to an ASCII hyphen. Written as
  // an escape so the character cannot be lost or mistaken for a hyphen.
  const t = text.replace(/\u2212/g, '-')

  let time24: string | undefined
  const m = t.match(/(\d{1,2}):(\d{2})\s*([ap]\.?m\.?)/i)
  if (m) {
    let h = parseInt(m[1], 10)
    const isPm = m[3].toLowerCase().startsWith('p')
    if (isPm && h !== 12) h += 12 // 1pm–11pm → 13–23
    else if (!isPm && h === 12) h = 0 // 12am → 00
    time24 = `${String(h).padStart(2, '0')}:${m[2]}`
  } else {
    time24 = t.match(/(\d{2}:\d{2})/)?.[1]
  }
  if (!time24) return undefined

  // Preserve the UTC offset regardless of which time format matched.
  const tz = t.match(/UTC([+-]\d+(?:\.\d+)?)/i)
  return tz ? `${time24} UTC${tz[1]}` : time24
}
/**
 * Converts one `.footballbox` element into a Match.
 *
 * @param $     Cheerio instance the box belongs to.
 * @param $box  The `.footballbox` element.
 * @param round Round label stored on the match.
 * @param group Group label, or null when the match is not in a group.
 * @returns The parsed match, or null when either team name is missing.
 *          Optional fields are only present when a value was found.
 */
function parseBox($: CheerioAPI, $box: Cheerio<Element>, round: string, group: string | null): Match | null {
const team1 = extractTeam($, $box.find('.fhome'))
const team2 = extractTeam($, $box.find('.faway'))
if (!team1 || !team2) return null
const dateStr = $box.find('.bday, .dtstart').first().text().trim() || undefined
const timeText = $box.find('.ftime').first().text().trim()
const timeStr = parseTime12h(timeText)
const scoreText = $box.find('.fscore').first().text().trim()
const hasAET = scoreText.toLowerCase().includes('a.e.t.')
const scoreArr = parseScoreText(scoreText)
// Only the first goals row is read for scorers.
const $regularRow = $box.find('tr.fgoals').first()
const goals1 = parseGoals($, $regularRow.find('.fhgoal'))
const goals2 = parseGoals($, $regularRow.find('.fagoal'))
// Shoot-out score: find the header row mentioning "penalt" and read the score
// from the row that follows it; stop at the first such header.
let penScore: [number, number] | undefined
$box.find('tr').each((_, tr) => {
const $tr = $(tr)
if ($tr.find('th[colspan]').text().toLowerCase().includes('penalt')) {
const ps = parseScoreText($tr.next('tr').find('th').not('.fhome,.faway').first().text().trim())
if (ps) penScore = ps
return false
}
})
let score: ScoreObj | undefined
if (scoreArr) {
if (hasAET) {
// The displayed score includes extra time, so the 90-minute score is rebuilt
// from the goal list: goals with minute <= 90 (or no minute) count.
// NOTE(review): an own goal listed in one team's column is counted for the
// OTHER team here — confirm this matches how the source lists own goals.
const ftGoals = (gs: Goal[], includeOG = false) =>
gs.filter(g => {
const w90 = g.minute === undefined || g.minute <= 90
return includeOG ? g.owngoal === true && w90 : !g.owngoal && w90
}).length
score = { ft: [ftGoals(goals1) + ftGoals(goals2, true), ftGoals(goals2) + ftGoals(goals1, true)], et: scoreArr }
} else {
score = { ft: scoreArr }
}
if (penScore) score.p = penScore
}
const ground = extractGround($, $box) || undefined
// Spread-with-condition keeps absent fields out of the object entirely.
return {
round,
...(group ? { group } : {}),
...(dateStr ? { date: dateStr } : {}),
...(timeStr ? { time: timeStr } : {}),
team1, team2,
...(score ? { score } : {}),
...(goals1.length ? { goals1 } : {}),
...(goals2.length ? { goals2 } : {}),
...(ground ? { ground } : {}),
}
}
/**
 * Parses every `.footballbox` in the document with the same round and group
 * labels and returns the matches that parsed successfully, in document order.
 */
function collectBoxes($: CheerioAPI, round: string, group: string | null): Match[] {
  return $('.footballbox')
    .toArray()
    .map(box => parseBox($, $(box), round, group))
    .filter((parsed): parsed is Match => Boolean(parsed))
}
/**
 * Updates the heading-walk state for one page heading.
 *
 * Level 2 headings decide whether match boxes below them are collected
 * (`state.active`) and set the base round; a recognised heading also clears
 * `state.group`. An unrecognised level-2 heading only sets `active = false`.
 *
 * Level 3/4 headings are honoured only while active: "Group X" sets the
 * group, and a knockout-round heading sets the round and clears the group.
 * Other levels are ignored.
 *
 * Knockout rounds are recognised by one shared helper, so both heading
 * depths accept the same set — including "Round of 32", which was previously
 * missing at level 2.
 *
 * @param text  Heading text (already stripped of "[edit]").
 * @param level Heading level (2 for h2, 3 for h3, ...).
 * @param state State object, mutated in place.
 */
function processHeading(text: string, level: number, state: State): void {
  const t = text.toLowerCase().trim()

  // Maps the heading to a knockout-round label; order matters (first hit wins).
  const knockoutRound = (): string | null => {
    if (/round of 32/i.test(t)) return 'Round of 32'
    if (/round of 16/i.test(t)) return 'Round of 16'
    if (/quarter.final/i.test(t)) return 'Quarter-finals'
    if (/semi.final/i.test(t)) return 'Semi-finals'
    if (/third.place|match for third|play.off for third/i.test(t)) return 'Third-place match'
    if (t === 'final') return 'Final'
    return null
  }

  if (level === 2) {
    let round: string | null
    if ((/group stage/i.test(t) && !/second/i.test(t)) || /first group stage/i.test(t)) {
      round = 'Group stage'
    } else if (/second group stage/i.test(t)) {
      round = 'Second group stage'
    } else if (t === 'final round') {
      round = 'Final round'
    } else if (/final tournament/i.test(t) || /knock.?out stage/i.test(t)) {
      // Container sections: the actual round comes from sub-headings.
      round = ''
    } else {
      round = knockoutRound()
    }

    if (round === null) {
      state.active = false
    } else {
      state.active = true
      state.round = round
      state.group = null
    }
    return
  }

  if (!state.active) return

  if (level === 3 || level === 4) {
    if (/^group [a-z1-9]+$/i.test(t)) {
      // Keep the original casing of the heading for the group label.
      state.group = text.trim()
      return
    }
    const round = knockoutRound()
    if (round !== null) {
      state.round = round
      state.group = null
    }
  }
}
// ── Infobox ────────────────────────────────────────────────────────────────
/**
 * Reads tournament metadata from the first `table.infobox` on the page.
 *
 * Rows are matched by their header label; only rows that have a `<td>` are
 * considered. Fields that are not found stay absent from the result.
 */
function parseInfobox($: CheerioAPI): Partial<Meta> {
const result: Partial<Meta> = {}
// Plain text of a cell: <br> becomes " / ", <sup>/<img> are dropped,
// "[n]" reference markers are removed and whitespace is collapsed.
function tdText($td: Cheerio<Element>): string {
const $clone = $td.clone()
$clone.find('br').replaceWith(' / ')
$clone.find('sup, img').remove()
return $clone.text().replace(/\[\d+\]/g, '').replace(/\s+/g, ' ').trim()
}
// Text of the first link with non-empty text that is not a "[n]" reference;
// falls back to the cell text, or null when that is empty too.
function tdFirstLink($td: Cheerio<Element>): string | null {
let name: string | null = null
$td.find('a').each((_, a) => {
const t = $(a).clone().find('img').remove().end().text().trim()
if (t && !/\[\d+\]/.test(t)) { name = t; return false }
})
return name ?? (tdText($td) || null)
}
// Like tdFirstLink but joins every qualifying link with " / "
// (used for the host row, which may list several links).
function tdAllLinks($td: Cheerio<Element>): string {
const names: string[] = []
$td.find('a').each((_, a) => {
const t = $(a).clone().find('img').remove().end().text().trim()
if (t && !/\[\d+\]/.test(t)) names.push(t)
})
return names.length ? names.join(' / ') : tdText($td)
}
$('table.infobox').first().find('tr').each((_, tr) => {
const $tr = $(tr)
const label = $tr.find('th').text().trim().toLowerCase()
const $td = $tr.find('td').first()
if (!$td.length) return
if (/host countr/i.test(label)) result.host = tdAllLinks($td)
// Teams row: the first integer in the cell is taken as the team count.
else if (/^teams$/i.test(label)) { const m = $td.text().match(/\d+/); if (m) result.teams_count = parseInt(m[0]) }
else if (/champion/i.test(label)) result.winner = tdFirstLink($td)
else if (/runners?.up/i.test(label)) result.runner_up = tdFirstLink($td)
else if (/third.place/i.test(label)) result.third_place = tdFirstLink($td)
else if (/fourth.place/i.test(label)) result.fourth_place = tdFirstLink($td)
})
return result
}
/**
 * Works out the top four finishers from the match list.
 *
 * The match with round 'Final' yields winner / runner-up, the one with round
 * 'Third-place match' yields third / fourth. A match is decided by its
 * extra-time score if present, else its full-time score; a level score is
 * broken by the shoot-out score. Undecided or score-less matches contribute
 * nothing. When several matches share a round, the last decided one wins.
 */
function derivePlacements(matches: Match[]): Pick<Meta, 'winner' | 'runner_up' | 'third_place' | 'fourth_place'> {
  // Returns [winner, loser] or null when the match has no decisive result.
  const decide = (match: Match): [string, string] | null => {
    const { score } = match
    if (!score) return null
    const forwards: [string, string] = [match.team1, match.team2]
    const backwards: [string, string] = [match.team2, match.team1]

    const [home, away] = score.et ?? score.ft ?? [0, 0]
    if (home > away) return forwards
    if (away > home) return backwards

    const shootout = score.p
    if (shootout) {
      if (shootout[0] > shootout[1]) return forwards
      if (shootout[1] > shootout[0]) return backwards
    }
    return null
  }

  const placements: Pick<Meta, 'winner' | 'runner_up' | 'third_place' | 'fourth_place'> = {
    winner: null,
    runner_up: null,
    third_place: null,
    fourth_place: null,
  }

  for (const match of matches) {
    const isFinal = match.round === 'Final'
    if (!isFinal && match.round !== 'Third-place match') continue
    const outcome = decide(match)
    if (!outcome) continue
    const [better, worse] = outcome
    if (isFinal) {
      placements.winner = better
      placements.runner_up = worse
    } else {
      placements.third_place = better
      placements.fourth_place = worse
    }
  }
  return placements
}
// ── Main year scraper ──────────────────────────────────────────────────────
/**
 * Scrapes one tournament from its main Wikipedia page HTML.
 *
 * Walks headings, match boxes and hatnotes in document order to collect
 * matches, then fetches group sub-articles for groups that had a
 * "Main article" link but no match boxes on the main page.
 *
 * @param year     Tournament year (not read anywhere in this function body).
 * @param mainHtml Rendered HTML of the tournament's main page.
 * @param opts     `skipGroups`: group labels whose sub-articles must not be fetched.
 * @returns Matches, stadiums (by name), groups (label → teams) and metadata.
 */
export async function scrapeYear(
year: number,
mainHtml: string,
opts?: { skipGroups?: Set<string> },
): Promise<YearResult> {
const $ = load(mainHtml)
const matches: Match[] = []
const stadiums = new Map<string, Stadium>()
const groups = new Map<string, Set<string>>()
const state: State = { active: false, round: '', group: null }
// group label → sub-article page title found via a "Main article" hatnote
const groupSubpages = new Map<string, string>()
// groups that already produced at least one match from the main page
const groupsOnMainPage = new Set<string>()
// Stores a match and updates the stadium / group indexes derived from it.
function recordMatch(m: Match) {
matches.push(m)
if (m.group) groupsOnMainPage.add(m.group)
if (m.ground) {
const { name, city } = parseGroundParts(m.ground)
// First occurrence of a stadium name wins.
if (name && !stadiums.has(name)) stadiums.set(name, { name, city })
}
if (m.group) {
if (!groups.has(m.group)) groups.set(m.group, new Set())
groups.get(m.group)!.add(m.team1)
groups.get(m.group)!.add(m.team2)
}
}
// Single pass in document order so each box sees the state set by the
// headings that precede it.
$('.mw-parser-output').find('div.mw-heading, .footballbox, .hatnote').each((_, el) => {
const $el = $(el)
if ($el.hasClass('mw-heading')) {
const $h = $el.find('h2, h3, h4').first()
// Level 9 is used when no h2–h4 is found; processHeading ignores it.
const level = parseInt($h.prop('tagName')?.slice(1) ?? '9')
processHeading($h.text().replace(/\[edit\]/g, '').trim(), level, state)
} else if ($el.hasClass('hatnote') && $el.text().includes('Main article')) {
if (state.active && state.group) {
const link = $el.find('a[href^="/wiki/"]').first().attr('href')
if (link) {
const page = link.replace('/wiki/', '').split('#')[0]
// Only the first group sub-article per group label is remembered.
if (/World_Cup_Group/i.test(page) && !groupSubpages.has(state.group))
groupSubpages.set(state.group, page)
}
}
} else if ($el.hasClass('footballbox')) {
if (!state.active) return
const m = parseBox($, $el, state.round || state.group || 'Unknown', state.group)
if (m) recordMatch(m)
}
})
// Fetch sub-articles only for groups without matches on the main page.
for (const [group, page] of groupSubpages) {
if (groupsOnMainPage.has(group)) continue
if (opts?.skipGroups?.has(group)) {
process.stdout.write(`[skip ${group}] `)
continue
}
await delay(1200)
const subHtml = await fetchWikiHtml(page)
if (!subHtml) { process.stdout.write(`(failed: ${page}) `); continue }
// Re-walk the MAIN page headings (note: `$`, not the sub-page) to find the
// round under which this group heading appears.
const stateTemp: State = { active: false, round: '', group: null }
let round = 'Group stage'
$('.mw-parser-output').find('div.mw-heading').each((_, el) => {
const $h = $(el).find('h2, h3, h4').first()
const level = parseInt($h.prop('tagName')?.slice(1) ?? '9')
processHeading($h.text().replace(/\[edit\]/g, '').trim(), level, stateTemp)
if (stateTemp.group === group) { round = stateTemp.round || 'Group stage'; return false }
})
const $sub = load(subHtml)
// Every match box on the sub-article is attributed to this group.
for (const m of collectBoxes($sub, round, group)) recordMatch(m)
process.stdout.write(`[+${page.slice(-8)}] `)
}
const infobox = parseInfobox($)
const placements = derivePlacements(matches)
// Placements derived from match results take precedence over the infobox.
const meta: Meta = {
host: infobox.host ?? '',
teams_count: infobox.teams_count ?? null,
winner: placements.winner ?? infobox.winner ?? null,
runner_up: placements.runner_up ?? infobox.runner_up ?? null,
third_place: placements.third_place ?? infobox.third_place ?? null,
fourth_place: placements.fourth_place?? infobox.fourth_place?? null,
}
return { matches, stadiums, groups, meta }
}
// ── Squad page scraper ─────────────────────────────────────────────────────
/**
 * Parses a squads page into one Squad per team.
 *
 * Level-3 headings (other than "Group ...") start a new team; each
 * `tr.nat-fs-player` row below it becomes a player of that team. Teams that
 * end up with no players are dropped from the result.
 *
 * @param html Rendered HTML of the squads page.
 */
export function scrapeSquads(html: string): Squad[] {
const $ = load(html)
const squads: Squad[] = []
let currentTeam: Squad | null = null
$('.mw-parser-output').find('div.mw-heading, tr.nat-fs-player').each((_, el) => {
const $el = $(el)
if ($el.hasClass('mw-heading')) {
const $h = $el.find('h3, h4').first()
if (!$h.length) return
// Only h3 headings name a team; h4 headings are ignored.
if (parseInt($h.prop('tagName')?.slice(1) ?? '9') !== 3) return
const name = $h.text().replace(/\[edit\]/g, '').trim()
if (/^group /i.test(name)) return
currentTeam = { name, players: [] }
squads.push(currentTeam)
return
}
// Player rows before the first team heading are skipped.
if (!currentTeam) return
let number: number | undefined
let pos: string | undefined
let playerName = ''
let dob: string | undefined
// Cells are visited in document order: index 0 is read as the shirt number
// and index 1 as the position, but only while the name cell
// (th[scope="row"]) has not been seen yet.
$el.find('td, th[scope="row"]').each((i, td) => {
const $td = $(td)
const text = $td.text().trim()
if ($td.is('th[scope="row"]')) {
playerName = $td.find('a').first().text().trim() || text
} else if (i === 0 && !playerName) {
const n = parseInt(text); if (!isNaN(n)) number = n
} else if (i === 1 && !playerName && !pos) {
const p = $td.find('a').first().text().trim()
if (['GK', 'DF', 'MF', 'FW'].includes(p)) pos = p
}
// Any cell containing a .bday element supplies the date of birth.
const $bday = $td.find('.bday')
if ($bday.length) dob = $bday.text().trim()
})
if (!playerName) return
const player: Player = { name: playerName }
if (number !== undefined) player.number = number
if (pos) player.pos = pos
if (dob) player.date_of_birth = dob
currentTeam.players.push(player)
})
return squads.filter(s => s.players.length > 0)
}