Files
worldcup/lib/wiki-scraper.ts
T
valknar 187ee2e312 fix: parse Wikipedia 12h time format and sort upcoming matches with NULLS LAST
Wikipedia stores match times as "6:00 p.m." (1-digit hour) which didn't
match the \d{2}:\d{2} regex, producing NULL for those matches. Introduced
parseTime12h() to handle 1-2 digit hours + AM/PM and convert to 24h.
Also sort upcomingMatches by NULLS LAST so unscheduled games appear after
timed ones rather than first. Dropped "openfootball" data attribution.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-15 17:50:30 +02:00

493 lines
18 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import { load } from 'cheerio'
import type { CheerioAPI, Cheerio } from 'cheerio'
import type { Element } from 'domhandler'
// ── Types ──────────────────────────────────────────────────────────────────
/** A single goal entry parsed from a match's goal list. */
export type Goal = {
// Text of the scorer's link in the goal list.
name: string
// Minute number taken from the "NN'" marker.
minute?: number
// The "+N" part of a marker such as "45+2'"; only set when non-zero.
offset?: number
// Set when the goal text contains "pen.".
penalty?: boolean
// Set when the goal text contains "o.g.".
owngoal?: boolean
}
/** Score of a match as [team1, team2] pairs; only the parts that apply are present. */
export type ScoreObj = {
// Displayed score, or for "a.e.t." matches the tally of goals with minute <= 90.
ft?: [number, number]
// Displayed score when the score text contains "a.e.t.".
et?: [number, number]
// Score read from the row that follows a "penalt…" header row.
p?: [number, number]
}
/** One match as extracted from a `.footballbox` element. */
export type Match = {
// Round label, e.g. 'Group stage', 'Quarter-finals', 'Final'.
round: string
// Group heading text (e.g. "Group A") when the match sits under one.
group?: string
// Text of the box's `.bday` / `.dtstart` element.
date?: string
// Kick-off time normalised to "HH:MM" where it could be parsed.
time?: string
team1: string
team2: string
score?: ScoreObj
// Goals listed in the home (`.fhgoal`) column.
goals1?: Goal[]
// Goals listed in the away (`.fagoal`) column.
goals2?: Goal[]
// Raw venue text; split into name/city by parseGroundParts.
ground?: string
}
/** Venue split at the first comma of the ground text. */
export type Stadium = { name: string; city: string }
/** A named group and its teams. */
export type Group = { name: string; teams: string[] }
/** Tournament-level facts taken from the infobox and/or derived from the matches. */
export type Meta = {
// Host name(s); multiple infobox links are joined with " / ".
host: string
teams_count: number | null
winner: string | null
runner_up: string | null
third_place: string | null
fourth_place: string | null
}
/** One squad member row. */
export type Player = { name: string; number?: number; pos?: string; date_of_birth?: string }
/** A team heading together with the players listed under it. */
export type Squad = { name: string; players: Player[] }
/** Everything scrapeYear returns for one tournament page. */
export type YearResult = {
matches: Match[]
// Keyed by stadium name; the first occurrence wins.
stadiums: Map<string, Stadium>
// Keyed by group name; values are the teams seen in that group's matches.
groups: Map<string, Set<string>>
meta: Meta
}
// Cursor kept while walking page headings: whether the current section holds
// matches, and which round / group subsequent boxes belong to.
type State = { active: boolean; round: string; group: string | null }
// ── Fetch ──────────────────────────────────────────────────────────────────
/** Resolves after `ms` milliseconds; used to pace requests and retries. */
const delay = (ms: number) =>
  new Promise(resolve => {
    setTimeout(resolve, ms)
  })

/**
 * Downloads the rendered HTML of a Wikipedia page through the `action=parse` API.
 *
 * Makes up to `retries` attempts. Before every attempt after the first it waits
 * 3000 ms multiplied by the attempt index. Non-OK responses, thrown errors and
 * responses without HTML all lead to the next attempt.
 *
 * @param page    Page title; URL-encoded before being placed in the query string.
 * @param retries Maximum number of attempts.
 * @returns The HTML string, or `null` when no attempt produced one.
 */
export async function fetchWikiHtml(page: string, retries = 5): Promise<string | null> {
  const endpoint = `https://en.wikipedia.org/w/api.php?action=parse&page=${encodeURIComponent(page)}&format=json&prop=text&disabletoc=1`
  const requestInit = { headers: { 'User-Agent': 'WorldCupScraper/1.0' } }

  let attempt = 0
  while (attempt < retries) {
    const backoffMs = 3000 * attempt
    attempt += 1
    try {
      if (backoffMs > 0) await delay(backoffMs)
      const response = await fetch(endpoint, requestInit)
      if (response.ok) {
        const payload = await response.json() as { parse?: { text?: { '*': string } } }
        const html = payload?.parse?.text?.['*']
        if (html) return html
      }
    } catch {
      // Request or JSON decoding failed: fall through to the next attempt.
    }
  }
  return null
}
// ── Team name normalisation ────────────────────────────────────────────────
/** Alternative team names mapped to the canonical name used in the output. */
const TEAM_ALIASES: Record<string, string> = {
  'West Germany': 'Germany',
  'Korea Republic': 'South Korea',
  'IR Iran': 'Iran',
  'Bosnia & Herzegovina': 'Bosnia and Herzegovina',
  'USA': 'United States',
}

/**
 * Maps a scraped team name to its canonical form.
 *
 * @param name Team name exactly as scraped.
 * @returns The alias target when `name` is a key of TEAM_ALIASES, otherwise `name` unchanged.
 */
export function normalizeTeam(name: string): string {
  // Only own keys count: a bare index lookup would also resolve inherited
  // Object.prototype members ("constructor", "toString", …) and return a
  // function instead of a string for such scraped text.
  if (!Object.prototype.hasOwnProperty.call(TEAM_ALIASES, name)) return name
  return TEAM_ALIASES[name] ?? name
}
// ── Parsing helpers ────────────────────────────────────────────────────────
/**
 * Extracts the first "N<dash>M" pair from a score string.
 *
 * The separator may be an ASCII hyphen, any dash in U+2010–U+2015 (which
 * includes the en dash used in typeset scorelines such as "2–1") or the
 * minus sign U+2212. Whitespace around the separator is allowed.
 *
 * @param text Score text, possibly with a suffix such as "(a.e.t.)".
 * @returns `[left, right]` as numbers, or `null` when no pair is found.
 */
function parseScoreText(text: string): [number, number] | null {
  const m = /(\d+)\s*[-\u2010-\u2015\u2212]\s*(\d+)/.exec(text)
  if (!m) return null
  return [parseInt(m[1], 10), parseInt(m[2], 10)]
}
/**
 * Reads the team name from a football-box team cell.
 *
 * The name is the trimmed text of the first link that contains no image and
 * has non-empty text. The result is passed through normalizeTeam.
 *
 * @returns The normalised team name, or normalizeTeam('') when no link qualifies.
 */
function extractTeam($: CheerioAPI, $cell: Cheerio<Element>): string {
  for (const anchor of $cell.find('a').toArray()) {
    const $anchor = $(anchor)
    // Links that wrap an image are skipped.
    if ($anchor.find('img').length) continue
    const label = $anchor.text().trim()
    if (label) return normalizeTeam(label)
  }
  return normalizeTeam('')
}
/**
 * Parses the goal list inside one goals cell of a football box.
 *
 * Each `<li>` supplies a scorer (text of the first non-empty link that is not
 * inside `.fb-goal`) and a `.fb-goal` element whose direct `<span>` children
 * are read for minute markers. One Goal is produced per span that contains a
 * marker of the form "NN'" or "NN+M'".
 *
 * The minute marker accepts the ASCII apostrophe as well as the typographic
 * right single quote (U+2019) and prime (U+2032), so differently typeset
 * markup is not silently dropped.
 *
 * @returns Goals in document order; empty when nothing could be parsed.
 */
function parseGoals($: CheerioAPI, $td: Cheerio<Element>): Goal[] {
  const goals: Goal[] = []
  $td.find('li').each((_, li) => {
    const $li = $(li)

    // Scorer: first link with text that is not part of the goal marker itself.
    let playerName = ''
    $li.find('a').each((_, a) => {
      if (!$(a).closest('.fb-goal').length) {
        const t = $(a).text().trim()
        if (t) { playerName = t; return false }
      }
    })
    if (!playerName) return

    const $fbGoal = $li.find('.fb-goal')
    if (!$fbGoal.length) return

    $fbGoal.children('span').each((_, span) => {
      const $span = $(span)
      // Spans carrying a `typeof` attribute are skipped.
      if ($span.attr('typeof')) return
      const text = $span.text()
      const minMatch = text.match(/(\d+)(?:\+(\d+))?['\u2019\u2032]/)
      if (!minMatch) return
      const minute = parseInt(minMatch[1], 10)
      const offset = minMatch[2] ? parseInt(minMatch[2], 10) : 0
      const goal: Goal = { name: playerName }
      if (!isNaN(minute)) goal.minute = minute
      if (offset) goal.offset = offset
      if (text.includes('pen.')) goal.penalty = true
      if (text.includes('o.g.')) goal.owngoal = true
      goals.push(goal)
    })
  })
  return goals
}
/**
 * Reads the venue text of a football box.
 *
 * Prefers the first element with `itemprop="name address"`; otherwise uses the
 * first line of the first `.fright` element's text.
 *
 * @returns Trimmed venue text, possibly empty.
 */
function extractGround($: CheerioAPI, $box: Cheerio<Element>): string {
  const $venue = $box.find('[itemprop="name address"]').first()
  if ($venue.length) {
    return $venue.text().trim()
  }
  const fallbackText = $box.find('.fright').first().text()
  return fallbackText.split('\n')[0].trim()
}
/**
 * Splits venue text into stadium name and city at the first comma.
 *
 * Both parts are trimmed when a comma exists. Without a comma the whole input
 * is returned as the name, untouched, with an empty city.
 */
function parseGroundParts(ground: string): { name: string; city: string } {
  const sep = ground.indexOf(',')
  if (sep === -1) {
    return { name: ground, city: '' }
  }
  const name = ground.slice(0, sep).trim()
  const city = ground.slice(sep + 1).trim()
  return { name, city }
}
/**
 * Normalises a kick-off time found in free text to 24-hour "HH:MM".
 *
 * Recognised forms:
 *  - 12-hour clock with an am/pm marker, with or without dots ("6:00 p.m.", "12:15 am").
 *  - 24-hour clock with a one- or two-digit hour ("9:00", "21:00").
 *
 * If a marker follows an hour above 12 ("18:00 p.m."), the hour is taken as
 * already being on the 24-hour clock rather than shifted again.
 *
 * @param text Text that may contain a time.
 * @returns The time as "HH:MM", or `undefined` when no time is found or the
 *          hour/minute is out of range (hour > 23 or minute > 59).
 */
function parseTime12h(text: string): string | undefined {
  const toClock = (hour: number, minute: number): string | undefined =>
    hour <= 23 && minute <= 59
      ? `${String(hour).padStart(2, '0')}:${String(minute).padStart(2, '0')}`
      : undefined

  const m12 = /(\d{1,2}):(\d{2})\s*([ap])\.?m\.?/i.exec(text)
  if (m12) {
    let hour = parseInt(m12[1], 10)
    const minute = parseInt(m12[2], 10)
    const isPm = m12[3].toLowerCase() === 'p'
    // Only shift genuine 12-hour values; 13-23 are already 24-hour.
    if (hour <= 12) {
      if (isPm && hour !== 12) hour += 12
      else if (!isPm && hour === 12) hour = 0
    }
    return toClock(hour, minute)
  }

  // No am/pm marker: treat as a 24-hour time, allowing a single-digit hour.
  const m24 = /(\d{1,2}):(\d{2})/.exec(text)
  if (!m24) return undefined
  return toClock(parseInt(m24[1], 10), parseInt(m24[2], 10))
}
/**
 * Converts one `.footballbox` element into a Match.
 *
 * @param $     Cheerio instance for the document that contains `$box`.
 * @param $box  The football box element.
 * @param round Round label stored on the match.
 * @param group Group label, or null; only included in the result when truthy.
 * @returns The match, or null when either team name could not be extracted.
 *          Optional fields are omitted from the result when empty.
 */
function parseBox($: CheerioAPI, $box: Cheerio<Element>, round: string, group: string | null): Match | null {
const team1 = extractTeam($, $box.find('.fhome'))
const team2 = extractTeam($, $box.find('.faway'))
if (!team1 || !team2) return null
const dateStr = $box.find('.bday, .dtstart').first().text().trim() || undefined
const timeText = $box.find('.ftime').first().text().trim()
const timeStr = parseTime12h(timeText)
const scoreText = $box.find('.fscore').first().text().trim()
// An "a.e.t." marker in the score text means the displayed score is the extra-time score.
const hasAET = scoreText.toLowerCase().includes('a.e.t.')
const scoreArr = parseScoreText(scoreText)
// Only the first goals row is read.
const $regularRow = $box.find('tr.fgoals').first()
const goals1 = parseGoals($, $regularRow.find('.fhgoal'))
const goals2 = parseGoals($, $regularRow.find('.fagoal'))
let penScore: [number, number] | undefined
// Find the first row whose spanning header mentions "penalt…"; the score is
// read from the first non-team header cell of the row that follows it.
$box.find('tr').each((_, tr) => {
const $tr = $(tr)
if ($tr.find('th[colspan]').text().toLowerCase().includes('penalt')) {
const ps = parseScoreText($tr.next('tr').find('th').not('.fhome,.faway').first().text().trim())
if (ps) penScore = ps
return false
}
})
let score: ScoreObj | undefined
if (scoreArr) {
if (hasAET) {
// Rebuild the 90-minute score from the goal lists: goals without a minute
// or with minute <= 90 count (stoppage time such as 90+3 has minute 90).
// With includeOG the helper counts only own goals, otherwise only non-own goals.
const ftGoals = (gs: Goal[], includeOG = false) =>
gs.filter(g => {
const w90 = g.minute === undefined || g.minute <= 90
return includeOG ? g.owngoal === true && w90 : !g.owngoal && w90
}).length
// NOTE(review): own goals listed in one team's column are credited to the
// opposite team here — confirm this matches how the page attributes own goals.
score = { ft: [ftGoals(goals1) + ftGoals(goals2, true), ftGoals(goals2) + ftGoals(goals1, true)], et: scoreArr }
} else {
score = { ft: scoreArr }
}
if (penScore) score.p = penScore
}
const ground = extractGround($, $box) || undefined
return {
round,
...(group ? { group } : {}),
...(dateStr ? { date: dateStr } : {}),
...(timeStr ? { time: timeStr } : {}),
team1, team2,
...(score ? { score } : {}),
...(goals1.length ? { goals1 } : {}),
...(goals2.length ? { goals2 } : {}),
...(ground ? { ground } : {}),
}
}
/**
 * Parses every `.footballbox` in the document with one shared round and group.
 *
 * @returns The matches in document order; boxes that parseBox rejects are left out.
 */
function collectBoxes($: CheerioAPI, round: string, group: string | null): Match[] {
  return $('.footballbox')
    .toArray()
    .map(box => parseBox($, $(box), round, group))
    .filter((parsed): parsed is Match => parsed !== null)
}
/**
 * Updates the walking state from one section heading.
 *
 * Level-2 headings decide whether the section contains matches: a recognised
 * heading activates the state, sets the round (possibly to '') and clears the
 * group; any other level-2 heading only deactivates the state.
 * Level-3/4 headings are considered only while active: "Group X" sets the
 * group, a recognised knockout heading sets the round and clears the group,
 * anything else leaves the state untouched. Other levels are ignored.
 *
 * @param text  Heading text.
 * @param level Heading level (2 for h2, 3 for h3, …).
 * @param state Mutated in place.
 */
function processHeading(text: string, level: number, state: State): void {
  const t = text.toLowerCase().trim()

  // Returns the round of the first rule that applies; order matters.
  const firstMatch = (
    rules: ReadonlyArray<readonly [(heading: string) => boolean, string]>,
  ): string | undefined => {
    const hit = rules.find(([applies]) => applies(t))
    return hit?.[1]
  }

  if (level === 2) {
    const sectionRound = firstMatch([
      [h => /group stage/i.test(h) && !/second/i.test(h), 'Group stage'],
      [h => /first group stage/i.test(h), 'Group stage'],
      [h => /second group stage/i.test(h), 'Second group stage'],
      [h => h === 'final round', 'Final round'],
      [h => /final tournament/i.test(h), ''],
      [h => /knock.?out stage/i.test(h), ''],
      [h => /round of 16/i.test(h), 'Round of 16'],
      [h => /quarter.final/i.test(h), 'Quarter-finals'],
      [h => /semi.final/i.test(h), 'Semi-finals'],
      [h => /third.place|match for third|play.off for third/i.test(h), 'Third-place match'],
      [h => h === 'final', 'Final'],
    ])
    if (sectionRound === undefined) {
      // Unrecognised section: round and group keep their previous values.
      state.active = false
    } else {
      state.active = true
      state.round = sectionRound
      state.group = null
    }
    return
  }

  if (!state.active) return
  if (level !== 3 && level !== 4) return

  if (/^group [a-z1-9]+$/i.test(t)) {
    // The group keeps the heading's original casing.
    state.group = text.trim()
    return
  }

  const subRound = firstMatch([
    [h => /round of 32/i.test(h), 'Round of 32'],
    [h => /round of 16/i.test(h), 'Round of 16'],
    [h => /quarter.final/i.test(h), 'Quarter-finals'],
    [h => /semi.final/i.test(h), 'Semi-finals'],
    [h => /third.place|match for third|play.off for third/i.test(h), 'Third-place match'],
    [h => h === 'final', 'Final'],
  ])
  if (subRound !== undefined) {
    state.round = subRound
    state.group = null
  }
}
// ── Infobox ────────────────────────────────────────────────────────────────
/**
 * Reads tournament facts from the first `table.infobox` of the page.
 *
 * Rows are matched by their header text: host country, teams, champions,
 * runners-up, third place and fourth place. Rows without a `<td>` are skipped.
 *
 * @returns Only the fields that were found.
 */
function parseInfobox($: CheerioAPI): Partial<Meta> {
  // Plain text of a cell: <br> becomes " / ", <sup>/<img> and "[n]" markers are
  // dropped, whitespace is collapsed.
  const cleanCellText = ($td: Cheerio<Element>): string => {
    const $copy = $td.clone()
    $copy.find('br').replaceWith(' / ')
    $copy.find('sup, img').remove()
    return $copy.text().replace(/\[\d+\]/g, '').replace(/\s+/g, ' ').trim()
  }

  // Text of every link in the cell (images removed), without empty labels and
  // labels containing a "[n]" marker.
  const linkLabels = ($td: Cheerio<Element>): string[] =>
    $td
      .find('a')
      .toArray()
      .map(a => $(a).clone().find('img').remove().end().text().trim())
      .filter(label => label !== '' && !/\[\d+\]/.test(label))

  // First link label, else the cleaned cell text, else null.
  const firstLinkOrText = ($td: Cheerio<Element>): string | null => {
    const labels = linkLabels($td)
    if (labels.length) return labels[0]
    return cleanCellText($td) || null
  }

  // All link labels joined by " / ", else the cleaned cell text.
  const allLinksOrText = ($td: Cheerio<Element>): string => {
    const labels = linkLabels($td)
    return labels.length ? labels.join(' / ') : cleanCellText($td)
  }

  const found: Partial<Meta> = {}
  for (const row of $('table.infobox').first().find('tr').toArray()) {
    const $row = $(row)
    const label = $row.find('th').text().trim().toLowerCase()
    const $value = $row.find('td').first()
    if (!$value.length) continue

    if (/host countr/i.test(label)) {
      found.host = allLinksOrText($value)
    } else if (/^teams$/i.test(label)) {
      const digits = $value.text().match(/\d+/)
      if (digits) found.teams_count = parseInt(digits[0])
    } else if (/champion/i.test(label)) {
      found.winner = firstLinkOrText($value)
    } else if (/runners?.up/i.test(label)) {
      found.runner_up = firstLinkOrText($value)
    } else if (/third.place/i.test(label)) {
      found.third_place = firstLinkOrText($value)
    } else if (/fourth.place/i.test(label)) {
      found.fourth_place = firstLinkOrText($value)
    }
  }
  return found
}
/**
 * Derives the top four places from the 'Final' and 'Third-place match' rounds.
 *
 * A match is decided by its extra-time score if present, otherwise its
 * full-time score, and by the penalty score when those are level. Matches
 * without a score or without a decisive result contribute nothing. When
 * several matches of the same round are decisive, the last one wins.
 *
 * @returns Each placement, or null where it could not be derived.
 */
function derivePlacements(matches: Match[]): Pick<Meta, 'winner' | 'runner_up' | 'third_place' | 'fourth_place'> {
  // Returns [winner, loser], or null when the match has no decisive score.
  const decide = (match: Match): [string, string] | null => {
    const { score } = match
    if (!score) return null
    const tallies: Array<[number, number]> = [score.et ?? score.ft ?? [0, 0]]
    if (score.p) tallies.push(score.p)
    for (const [home, away] of tallies) {
      if (home > away) return [match.team1, match.team2]
      if (away > home) return [match.team2, match.team1]
    }
    return null
  }

  const placements: Pick<Meta, 'winner' | 'runner_up' | 'third_place' | 'fourth_place'> = {
    winner: null,
    runner_up: null,
    third_place: null,
    fourth_place: null,
  }

  for (const match of matches) {
    const isFinal = match.round === 'Final'
    if (!isFinal && match.round !== 'Third-place match') continue
    const outcome = decide(match)
    if (!outcome) continue
    const [better, worse] = outcome
    if (isFinal) {
      placements.winner = better
      placements.runner_up = worse
    } else {
      placements.third_place = better
      placements.fourth_place = worse
    }
  }
  return placements
}
// ── Main year scraper ──────────────────────────────────────────────────────
/**
 * Scrapes one tournament's main page: matches, stadiums, group membership and
 * tournament metadata.
 *
 * Matches found directly on the main page are collected first. For groups that
 * only link to a "Main article" sub-page (and have no matches on the main
 * page), the sub-page is fetched and all of its football boxes are added.
 * Progress markers for sub-pages are written to process.stdout.
 *
 * @param year     Not referenced in this function body.
 * @param mainHtml Rendered HTML of the tournament's main page.
 * @param opts     `skipGroups`: group names whose sub-pages must not be fetched.
 * @returns Matches, stadiums keyed by name, groups keyed by name, and metadata.
 */
export async function scrapeYear(
year: number,
mainHtml: string,
opts?: { skipGroups?: Set<string> },
): Promise<YearResult> {
const $ = load(mainHtml)
const matches: Match[] = []
const stadiums = new Map<string, Stadium>()
const groups = new Map<string, Set<string>>()
const state: State = { active: false, round: '', group: null }
// Group name -> wiki page of that group's dedicated article.
const groupSubpages = new Map<string, string>()
// Groups that already have at least one match on the main page.
const groupsOnMainPage = new Set<string>()
// Stores a match and updates the stadium and group indexes from it.
function recordMatch(m: Match) {
matches.push(m)
if (m.group) groupsOnMainPage.add(m.group)
if (m.ground) {
const { name, city } = parseGroundParts(m.ground)
// First occurrence of a stadium name wins.
if (name && !stadiums.has(name)) stadiums.set(name, { name, city })
}
if (m.group) {
if (!groups.has(m.group)) groups.set(m.group, new Set())
groups.get(m.group)!.add(m.team1)
groups.get(m.group)!.add(m.team2)
}
}
// Single pass over headings, football boxes and hatnotes in document order,
// so each box is attributed to the round/group of the headings before it.
$('.mw-parser-output').find('div.mw-heading, .footballbox, .hatnote').each((_, el) => {
const $el = $(el)
if ($el.hasClass('mw-heading')) {
const $h = $el.find('h2, h3, h4').first()
// Level comes from the tag name ("H2" -> 2); 9 when no h2-h4 is present.
const level = parseInt($h.prop('tagName')?.slice(1) ?? '9')
processHeading($h.text().replace(/\[edit\]/g, '').trim(), level, state)
} else if ($el.hasClass('hatnote') && $el.text().includes('Main article')) {
if (state.active && state.group) {
const link = $el.find('a[href^="/wiki/"]').first().attr('href')
if (link) {
const page = link.replace('/wiki/', '').split('#')[0]
// Keep only the first matching sub-page per group.
if (/World_Cup_Group/i.test(page) && !groupSubpages.has(state.group))
groupSubpages.set(state.group, page)
}
}
} else if ($el.hasClass('footballbox')) {
if (!state.active) return
// Sections with an empty round fall back to the group name, then 'Unknown'.
const m = parseBox($, $el, state.round || state.group || 'Unknown', state.group)
if (m) recordMatch(m)
}
})
for (const [group, page] of groupSubpages) {
if (groupsOnMainPage.has(group)) continue
if (opts?.skipGroups?.has(group)) {
process.stdout.write(`[skip ${group}] `)
continue
}
// Pause before each sub-page request.
await delay(1200)
const subHtml = await fetchWikiHtml(page)
if (!subHtml) { process.stdout.write(`(failed: ${page}) `); continue }
// Re-walk the MAIN page's headings (note `$`, not the sub-page) to find the
// round under which this group's heading appears.
const stateTemp: State = { active: false, round: '', group: null }
let round = 'Group stage'
$('.mw-parser-output').find('div.mw-heading').each((_, el) => {
const $h = $(el).find('h2, h3, h4').first()
const level = parseInt($h.prop('tagName')?.slice(1) ?? '9')
processHeading($h.text().replace(/\[edit\]/g, '').trim(), level, stateTemp)
if (stateTemp.group === group) { round = stateTemp.round || 'Group stage'; return false }
})
const $sub = load(subHtml)
// Every football box on the sub-page is attributed to this group and round.
for (const m of collectBoxes($sub, round, group)) recordMatch(m)
process.stdout.write(`[+${page.slice(-8)}] `)
}
const infobox = parseInfobox($)
const placements = derivePlacements(matches)
// Placements derived from match results take precedence over the infobox.
const meta: Meta = {
host: infobox.host ?? '',
teams_count: infobox.teams_count ?? null,
winner: placements.winner ?? infobox.winner ?? null,
runner_up: placements.runner_up ?? infobox.runner_up ?? null,
third_place: placements.third_place ?? infobox.third_place ?? null,
fourth_place: placements.fourth_place?? infobox.fourth_place?? null,
}
return { matches, stadiums, groups, meta }
}
// ── Squad page scraper ─────────────────────────────────────────────────────
/**
 * Parses a squads page into one Squad per team heading.
 *
 * Walks headings and `tr.nat-fs-player` rows in document order. A level-3
 * heading whose text does not start with "Group " starts a new squad; each
 * player row after it is added to that squad.
 *
 * @param html Rendered HTML of the squads page.
 * @returns Squads that have at least one player, in document order.
 */
export function scrapeSquads(html: string): Squad[] {
const $ = load(html)
const squads: Squad[] = []
let currentTeam: Squad | null = null
$('.mw-parser-output').find('div.mw-heading, tr.nat-fs-player').each((_, el) => {
const $el = $(el)
if ($el.hasClass('mw-heading')) {
const $h = $el.find('h3, h4').first()
// Headings without an h3/h4 leave the current team unchanged.
if (!$h.length) return
// Only h3 headings start a team.
if (parseInt($h.prop('tagName')?.slice(1) ?? '9') !== 3) return
const name = $h.text().replace(/\[edit\]/g, '').trim()
if (/^group /i.test(name)) return
currentTeam = { name, players: [] }
squads.push(currentTeam)
return
}
// Player rows that appear before any team heading are ignored.
if (!currentTeam) return
let number: number | undefined
let pos: string | undefined
let playerName = ''
let dob: string | undefined
// Cells are visited in document order; `i` indexes the combined td/th set.
$el.find('td, th[scope="row"]').each((i, td) => {
const $td = $(td)
const text = $td.text().trim()
if ($td.is('th[scope="row"]')) {
// Row header holds the player name: link text if any, else the cell text.
playerName = $td.find('a').first().text().trim() || text
} else if (i === 0 && !playerName) {
// First cell, before the name was seen: shirt number.
const n = parseInt(text); if (!isNaN(n)) number = n
} else if (i === 1 && !playerName && !pos) {
// Second cell, before the name was seen: position code from its first link.
const p = $td.find('a').first().text().trim()
if (['GK', 'DF', 'MF', 'FW'].includes(p)) pos = p
}
// A `.bday` element in any cell provides the date of birth.
const $bday = $td.find('.bday')
if ($bday.length) dob = $bday.text().trim()
})
if (!playerName) return
const player: Player = { name: playerName }
if (number !== undefined) player.number = number
if (pos) player.pos = pos
if (dob) player.date_of_birth = dob
currentTeam.players.push(player)
})
// Headings that produced no players are dropped.
return squads.filter(s => s.players.length > 0)
}