9ce2a4e27c
Wikipedia abbreviates goal scorer display text (e.g. "Müller") but the <a title="Thomas Müller"> attribute always has the full name. Switch parseGoals() to prefer title attr and strip disambiguation suffixes like "(soccer, born 1993)". This ensures Gerd Müller and Thomas Müller get separate player pages. Also preserve the UTC offset from Wikipedia's ftime (e.g. "12:00 UTC-4") so that isLive() can accurately compute UTC kickoff time instead of treating local time as UTC. upcomingMatches sorts by SPLIT_PART on the HH:MM part to ignore the timezone suffix. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
502 lines
18 KiB
TypeScript
502 lines
18 KiB
TypeScript
import { load } from 'cheerio'
|
||
import type { CheerioAPI, Cheerio } from 'cheerio'
|
||
import type { Element } from 'domhandler'
|
||
|
||
// ── Types ──────────────────────────────────────────────────────────────────
|
||
|
||
/** One goal entry read from a match box's goal list. */
export type Goal = {
  /** Scorer name (full name from the link title when available, otherwise the link text). */
  name: string
  /** Minute printed before any `+` stoppage-time part. */
  minute?: number
  /** Stoppage-time minutes printed after the `+`; omitted when there are none. */
  offset?: number
  /** Present (true) when the goal text contains "pen.". */
  penalty?: boolean
  /** Present (true) when the goal text contains "o.g.". */
  owngoal?: boolean
}

/** Scores of one match as `[team1, team2]` pairs. */
export type ScoreObj = {
  /** Score at the end of regulation time. */
  ft?: [number, number]
  /** Score after extra time; only set for matches marked "a.e.t.". */
  et?: [number, number]
  /** Penalty shoot-out score. */
  p?: [number, number]
}

/** One match parsed from a `.footballbox` element. */
export type Match = {
  round: string
  group?: string
  /** Date text taken from the box's `.bday` / `.dtstart` element. */
  date?: string
  /** Kick-off as `HH:MM`, optionally followed by ` UTC±N`. */
  time?: string
  team1: string
  team2: string
  score?: ScoreObj
  /** Goals listed in the home (team1) column. */
  goals1?: Goal[]
  /** Goals listed in the away (team2) column. */
  goals2?: Goal[]
  /** Raw venue text, typically "Stadium, City". */
  ground?: string
}

/** A venue split into stadium name and city. */
export type Stadium = { name: string; city: string }

/** A named group and its member teams. */
export type Group = { name: string; teams: string[] }

/** Tournament-level facts; `null` marks a value that could not be determined. */
export type Meta = {
  host: string
  teams_count: number | null
  winner: string | null
  runner_up: string | null
  third_place: string | null
  fourth_place: string | null
}

/** One squad member as listed on a squads page. */
export type Player = { name: string; number?: number; pos?: string; date_of_birth?: string }

/** A team's squad. */
export type Squad = { name: string; players: Player[] }

/** Everything scraped for one tournament year. */
export type YearResult = {
  matches: Match[]
  /** Stadiums keyed by stadium name. */
  stadiums: Map<string, Stadium>
  /** Team names keyed by group name. */
  groups: Map<string, Set<string>>
  meta: Meta
}

/** Heading-walk state: whether match boxes are currently collected, and under which round/group. */
type State = { active: boolean; round: string; group: string | null }
|
||
|
||
// ── Fetch ──────────────────────────────────────────────────────────────────
|
||
|
||
/** Returns a promise that resolves (with no value) after `ms` milliseconds. */
const delay = (ms: number): Promise<void> => new Promise<void>(resolve => setTimeout(resolve, ms))
|
||
|
||
/**
 * Fetches the rendered HTML of an English Wikipedia page via the MediaWiki `action=parse` API.
 *
 * @param page Page title; it is URL-encoded here, so pass it unencoded.
 * @param retries Maximum number of attempts.
 * @returns The HTML string, or `null` when no attempt produced HTML.
 */
export async function fetchWikiHtml(page: string, retries = 5): Promise<string | null> {
  const url = `https://en.wikipedia.org/w/api.php?action=parse&page=${encodeURIComponent(page)}&format=json&prop=text&disabletoc=1`
  for (let attempt = 0; attempt < retries; attempt++) {
    try {
      // Linear back-off before every retry: 3s, 6s, 9s, ... (no wait before the first attempt).
      if (attempt > 0) await delay(3000 * attempt)
      const res = await fetch(url, { headers: { 'User-Agent': 'WorldCupScraper/1.0' } })
      if (!res.ok) continue
      const data = await res.json() as {
        parse?: { text?: { '*': string } }
        error?: { code?: string }
      }
      const html = data?.parse?.text?.['*']
      if (html) return html
      // The API reports a non-existent page as an error object in an otherwise successful
      // response. That cannot succeed on retry, so stop instead of sleeping through the
      // remaining attempts.
      if (data?.error?.code === 'missingtitle') return null
    } catch {
      // Network or JSON failure: fall through to the next attempt.
    }
  }
  return null
}
|
||
|
||
// ── Team name normalisation ────────────────────────────────────────────────
|
||
|
||
/** Alternative team names mapped to the canonical name used in the output. */
const TEAM_ALIASES: Record<string, string> = {
  'West Germany': 'Germany',
  'Korea Republic': 'South Korea',
  'IR Iran': 'Iran',
  'Bosnia & Herzegovina': 'Bosnia and Herzegovina',
  'USA': 'United States',
}

/**
 * Maps a team name to its canonical form.
 *
 * @param name Team name as scraped.
 * @returns The alias target when `name` is a key of `TEAM_ALIASES`, otherwise `name` unchanged.
 */
export function normalizeTeam(name: string): string {
  // Own-property check: a plain object lookup would also resolve inherited keys such as
  // "constructor" or "toString" and return a function instead of a string.
  const alias = Object.prototype.hasOwnProperty.call(TEAM_ALIASES, name) ? TEAM_ALIASES[name] : undefined
  return alias ?? name
}
|
||
|
||
// ── Parsing helpers ────────────────────────────────────────────────────────
|
||
|
||
/**
 * Extracts the first "N–M" score from a piece of text.
 *
 * Accepts an en dash, em dash, Unicode minus (U+2212) or ASCII hyphen as separator, with
 * optional whitespace around it.
 *
 * @returns `[first, second]` as numbers, or `null` when the text contains no score.
 */
function parseScoreText(text: string): [number, number] | null {
  const found = /(\d+)\s*[–—−-]\s*(\d+)/.exec(text)
  if (!found) return null
  const [, first, second] = found
  // Both captures are pure digit runs, so Number() yields the decimal value.
  return [Number(first), Number(second)]
}
|
||
|
||
/**
 * Reads the team name from a home/away cell of a match box.
 *
 * The name is the text of the first link that has visible text and does not wrap an image
 * (image links are skipped). The result is passed through `normalizeTeam`.
 *
 * @returns The normalised team name, or `''` when the cell has no suitable link.
 */
function extractTeam($: CheerioAPI, $cell: Cheerio<Element>): string {
  for (const anchor of $cell.find('a').toArray()) {
    const $anchor = $(anchor)
    if ($anchor.find('img').length) continue
    const label = $anchor.text().trim()
    if (label) return normalizeTeam(label)
  }
  return ''
}
|
||
|
||
/** One or more parenthesised suffixes at the end of a link title, e.g. " (soccer, born 1993)". */
const TRAILING_PARENTHETICALS = /(?:\s*\([^)]*\))+\s*$/

/** A goal minute such as 45' or 90+3′ (ASCII apostrophe or prime). */
const GOAL_MINUTE_PATTERN = /(\d+)(?:\+(\d+))?['′]/

/**
 * Finds the scorer's name in one goal-list item: the first link outside `.fb-goal` that has
 * visible text. The link's `title` (with trailing parentheticals stripped) is preferred because
 * the visible text may be abbreviated; the visible text is the fallback.
 */
function goalScorerName($: CheerioAPI, $li: Cheerio<Element>): string {
  for (const anchor of $li.find('a').toArray()) {
    const $anchor = $(anchor)
    if ($anchor.closest('.fb-goal').length) continue
    const display = $anchor.text().trim()
    if (!display) continue
    // Strip every trailing parenthetical, so a title carrying two of them (a disambiguation
    // suffix followed by another note) is reduced to the bare name.
    const fullName = ($anchor.attr('title') ?? '').replace(TRAILING_PARENTHETICALS, '').trim()
    return fullName || display
  }
  return ''
}

/**
 * Parses the goal list of one side of a match box.
 *
 * Each `<li>` supplies one scorer; every direct `<span>` child of its `.fb-goal` element that
 * contains a minute yields one goal. Items without a scorer link or without `.fb-goal` are
 * skipped.
 *
 * @returns Goals in document order.
 */
function parseGoals($: CheerioAPI, $td: Cheerio<Element>): Goal[] {
  const goals: Goal[] = []
  $td.find('li').each((_, li) => {
    const $li = $(li)
    const name = goalScorerName($, $li)
    if (!name) return

    const $fbGoal = $li.find('.fb-goal')
    if (!$fbGoal.length) return

    $fbGoal.children('span').each((_, span) => {
      const $span = $(span)
      // Spans carrying a `typeof` attribute are skipped.
      if ($span.attr('typeof')) return
      const text = $span.text()
      const minuteMatch = GOAL_MINUTE_PATTERN.exec(text)
      if (!minuteMatch) return

      const goal: Goal = { name, minute: Number(minuteMatch[1]) }
      const offset = minuteMatch[2] ? Number(minuteMatch[2]) : 0
      if (offset) goal.offset = offset
      if (text.includes('pen.')) goal.penalty = true
      if (text.includes('o.g.')) goal.owngoal = true
      goals.push(goal)
    })
  })
  return goals
}
|
||
|
||
/**
 * Reads the venue text of a match box.
 *
 * Prefers the element marked `itemprop="name address"`; otherwise falls back to the first line
 * of the first `.fright` element.
 *
 * @returns The trimmed venue text, possibly `''`.
 */
function extractGround($: CheerioAPI, $box: Cheerio<Element>): string {
  const $location = $box.find('[itemprop="name address"]').first()
  if ($location.length) return $location.text().trim()
  const [firstLine = ''] = $box.find('.fright').first().text().split('\n')
  return firstLine.trim()
}
|
||
|
||
/**
 * Splits venue text at its first comma into stadium name and city.
 *
 * Everything after the first comma becomes the city (so "A, B, C" gives city "B, C"). Without
 * a comma the whole text is the name and the city is `''`. Both parts are trimmed.
 */
function parseGroundParts(ground: string): { name: string; city: string } {
  const commaIdx = ground.indexOf(',')
  // Trim here as well, so the name is normalised the same way in both branches.
  if (commaIdx === -1) return { name: ground.trim(), city: '' }
  return { name: ground.slice(0, commaIdx).trim(), city: ground.slice(commaIdx + 1).trim() }
}
|
||
|
||
/**
 * Normalises a kick-off time to 24-hour `HH:MM`, keeping a `UTC±N` offset when one is present.
 *
 * Handles 12-hour text ("4:00 p.m.", "12:15 am") and, as a fallback, an already 24-hour
 * `HH:MM`. The offset is appended in both cases so a caller can convert local time to UTC.
 *
 * @returns e.g. `"16:00 UTC-4"` or `"16:00"`; `undefined` when no time is found.
 */
function parseTime12h(text: string): string | undefined {
  // Normalise Unicode minus (U+2212) used by Wikipedia to ASCII hyphen
  const t = text.replace(/−/g, '-')

  let time24: string | undefined
  const twelveHour = t.match(/(\d{1,2}):(\d{2})\s*([ap]\.?m\.?)/i)
  if (twelveHour) {
    const [, hourText = '', minuteText = '', meridiem = ''] = twelveHour
    let hour = parseInt(hourText, 10)
    const isPm = meridiem.toLowerCase().startsWith('p')
    if (isPm && hour !== 12) hour += 12 // 1 pm-11 pm -> 13-23
    else if (!isPm && hour === 12) hour = 0 // 12 am -> 00
    time24 = `${String(hour).padStart(2, '0')}:${minuteText}`
  } else {
    time24 = t.match(/(\d{2}:\d{2})/)?.[1]
  }
  if (!time24) return undefined

  // Preserve the UTC offset for 12-hour AND 24-hour input. Previously the 24-hour fallback
  // returned before this point, so "12:00 UTC-4" lost its offset.
  // NOTE(review): only whole or decimal offsets are captured; "UTC+5:30" would yield "+5".
  const tz = t.match(/UTC([+-]\d+(?:\.\d+)?)/i)
  return tz ? `${time24} UTC${tz[1]}` : time24
}
|
||
|
||
/**
 * Converts one `.footballbox` element into a `Match`.
 *
 * @param round Round label to store on the match.
 * @param group Group label, or `null` for matches outside a group.
 * @returns The match, or `null` when either team name cannot be read.
 */
function parseBox($: CheerioAPI, $box: Cheerio<Element>, round: string, group: string | null): Match | null {
  const team1 = extractTeam($, $box.find('.fhome'))
  const team2 = extractTeam($, $box.find('.faway'))
  if (!team1 || !team2) return null

  const dateStr = $box.find('.bday, .dtstart').first().text().trim() || undefined
  const timeStr = parseTime12h($box.find('.ftime').first().text().trim())
  const scoreText = $box.find('.fscore').first().text().trim()
  const hasAET = scoreText.toLowerCase().includes('a.e.t.')
  const scoreArr = parseScoreText(scoreText)

  // Only the first goals row is read.
  const $regularRow = $box.find('tr.fgoals').first()
  const goals1 = parseGoals($, $regularRow.find('.fhgoal'))
  const goals2 = parseGoals($, $regularRow.find('.fagoal'))

  // The shoot-out score sits in the row after the first header row mentioning "penalt...".
  let penScore: [number, number] | undefined
  for (const row of $box.find('tr').toArray()) {
    const $row = $(row)
    if (!$row.find('th[colspan]').text().toLowerCase().includes('penalt')) continue
    penScore = parseScoreText($row.next('tr').find('th').not('.fhome,.faway').first().text().trim()) ?? undefined
    break
  }

  let score: ScoreObj | undefined
  if (scoreArr) {
    if (hasAET) {
      // The displayed score is the extra-time result; the regulation score is reconstructed
      // by counting goals up to minute 90 (goals without a minute count as regulation goals).
      // NOTE(review): each column is counted as the goals credited to that side, own goals
      // included, on the understanding that Wikipedia football boxes list an opponent's own
      // goal (marked "o.g.") in the beneficiary's column. The previous code moved own goals
      // to the other side, which miscounts under that convention. Confirm against a real box.
      // NOTE(review): when no goal list was parsed this yields ft [0, 0] regardless of the
      // actual regulation score.
      const within90 = (gs: Goal[]): number =>
        gs.filter(g => g.minute === undefined || g.minute <= 90).length
      score = { ft: [within90(goals1), within90(goals2)], et: scoreArr }
    } else {
      score = { ft: scoreArr }
    }
    if (penScore) score.p = penScore
  }

  const ground = extractGround($, $box) || undefined

  // Optional fields are omitted entirely (not set to undefined) when they have no value.
  return {
    round,
    ...(group ? { group } : {}),
    ...(dateStr ? { date: dateStr } : {}),
    ...(timeStr ? { time: timeStr } : {}),
    team1, team2,
    ...(score ? { score } : {}),
    ...(goals1.length ? { goals1 } : {}),
    ...(goals2.length ? { goals2 } : {}),
    ...(ground ? { ground } : {}),
  }
}
|
||
|
||
/**
 * Parses every `.footballbox` in the document, assigning the same round and group to all.
 *
 * @returns The successfully parsed matches in document order; boxes that fail to parse are dropped.
 */
function collectBoxes($: CheerioAPI, round: string, group: string | null): Match[] {
  return $('.footballbox')
    .toArray()
    .map(box => parseBox($, $(box), round, group))
    .filter((m): m is Match => m !== null)
}
|
||
|
||
/** Knockout-round heading patterns (tested against lower-cased text) and their round labels. */
const KNOCKOUT_HEADING_RULES: ReadonlyArray<readonly [RegExp, string]> = [
  [/round of 32/, 'Round of 32'],
  [/round of 16/, 'Round of 16'],
  [/quarter.final/, 'Quarter-finals'],
  [/semi.final/, 'Semi-finals'],
  [/third.place|match for third|play.off for third/, 'Third-place match'],
]

/** Returns the knockout round named by a lower-cased heading, or `null` when it names none. */
function knockoutRoundFor(t: string): string | null {
  for (const [pattern, round] of KNOCKOUT_HEADING_RULES) {
    if (pattern.test(t)) return round
  }
  // "final" must match exactly so headings merely containing the word are not mistaken for it.
  return t === 'final' ? 'Final' : null
}

/**
 * Returns the round a level-2 heading opens, or `null` when the heading is not a match section.
 * `''` means "match section whose round is named by a sub-heading".
 */
function sectionRoundFor(t: string): string | null {
  if (/group stage/.test(t) && !/second/.test(t)) return 'Group stage'
  if (/second group stage/.test(t)) return 'Second group stage'
  if (t === 'final round') return 'Final round'
  if (/final tournament/.test(t) || /knock.?out stage/.test(t)) return ''
  return knockoutRoundFor(t)
}

/**
 * Updates the heading-walk state for one heading.
 *
 * Level 2 headings switch collection on (setting the round and clearing the group) or off.
 * While collection is on, level 3/4 headings set either the group ("Group A", "Group 10") or
 * the knockout round (which clears the group). Other levels are ignored.
 *
 * @param text Heading text; matching is case-insensitive.
 * @param level Heading level (2 for `h2`, 3 for `h3`, ...).
 * @param state Mutated in place.
 */
function processHeading(text: string, level: number, state: State): void {
  const t = text.toLowerCase().trim()

  if (level === 2) {
    const round = sectionRoundFor(t)
    if (round === null) {
      // Round and group are intentionally left as they were; only collection stops.
      state.active = false
    } else {
      state.active = true
      state.round = round
      state.group = null
    }
    return
  }

  if (!state.active || (level !== 3 && level !== 4)) return

  if (/^group [a-z0-9]+$/.test(t)) {
    // Keep the original capitalisation for the stored label.
    state.group = text.trim()
    return
  }
  const round = knockoutRoundFor(t)
  if (round !== null) {
    state.round = round
    state.group = null
  }
}
|
||
|
||
// ── Infobox ────────────────────────────────────────────────────────────────
|
||
|
||
/**
 * Reads tournament facts from the first `table.infobox` on the page.
 *
 * Rows are recognised by their header text (host country, teams, champions, runners-up, third
 * place, fourth place). Only fields whose row is present are set on the result.
 */
function parseInfobox($: CheerioAPI): Partial<Meta> {
  const result: Partial<Meta> = {}

  /** Cell text with `<br>` turned into " / ", footnotes and images removed, whitespace collapsed. */
  const tdText = ($td: Cheerio<Element>): string => {
    const $clone = $td.clone()
    $clone.find('br').replaceWith(' / ')
    $clone.find('sup, img').remove()
    return $clone.text().replace(/\[\d+\]/g, '').replace(/\s+/g, ' ').trim()
  }

  /** Visible text of every link in the cell, skipping image-only links and "[1]"-style citations. */
  const linkTexts = ($td: Cheerio<Element>): string[] =>
    $td
      .find('a')
      .toArray()
      .map(a => $(a).clone().find('img').remove().end().text().trim())
      .filter(label => label !== '' && !/\[\d+\]/.test(label))

  /** First link text, falling back to the cell text, then `null`. */
  const tdFirstLink = ($td: Cheerio<Element>): string | null =>
    linkTexts($td)[0] ?? (tdText($td) || null)

  /** All link texts joined with " / ", falling back to the cell text. */
  const tdAllLinks = ($td: Cheerio<Element>): string => {
    const names = linkTexts($td)
    return names.length ? names.join(' / ') : tdText($td)
  }

  $('table.infobox').first().find('tr').each((_, tr) => {
    const $tr = $(tr)
    const label = $tr.find('th').text().trim().toLowerCase()
    const $td = $tr.find('td').first()
    if (!$td.length) return

    if (/host countr/i.test(label)) {
      result.host = tdAllLinks($td)
    } else if (/^teams$/i.test(label)) {
      // The cell may hold more than the count; take the first number.
      const count = $td.text().match(/\d+/)
      if (count) result.teams_count = parseInt(count[0], 10)
    } else if (/champion/i.test(label)) {
      result.winner = tdFirstLink($td)
    } else if (/runners?.up/i.test(label)) {
      result.runner_up = tdFirstLink($td)
    } else if (/third.place/i.test(label)) {
      result.third_place = tdFirstLink($td)
    } else if (/fourth.place/i.test(label)) {
      result.fourth_place = tdFirstLink($td)
    }
  })

  return result
}
|
||
|
||
/**
 * Derives the top-four placements from the "Final" and "Third-place match" results.
 *
 * A match is decided by its extra-time score when present, otherwise its full-time score; a
 * level score is decided by the penalty shoot-out when one is recorded. Undecided or unscored
 * matches leave the corresponding placements `null`. When several matches share a round label,
 * the last decided one wins.
 */
function derivePlacements(matches: Match[]): Pick<Meta, 'winner' | 'runner_up' | 'third_place' | 'fourth_place'> {
  /** `[winner, loser]` for a score pair, or `null` when level. */
  const ranked = (m: Match, [home, away]: [number, number]): [string, string] | null => {
    if (home > away) return [m.team1, m.team2]
    if (away > home) return [m.team2, m.team1]
    return null
  }

  /** `[winner, loser]` of a match, or `null` when it has no score or no decision. */
  const outcome = (m: Match): [string, string] | null => {
    if (!m.score) return null
    const played = ranked(m, m.score.et ?? m.score.ft ?? [0, 0])
    if (played) return played
    return m.score.p ? ranked(m, m.score.p) : null
  }

  const placements: Pick<Meta, 'winner' | 'runner_up' | 'third_place' | 'fourth_place'> = {
    winner: null,
    runner_up: null,
    third_place: null,
    fourth_place: null,
  }

  for (const m of matches) {
    if (m.round !== 'Final' && m.round !== 'Third-place match') continue
    const decided = outcome(m)
    if (!decided) continue
    const [first, second] = decided
    if (m.round === 'Final') {
      placements.winner = first
      placements.runner_up = second
    } else {
      placements.third_place = first
      placements.fourth_place = second
    }
  }
  return placements
}
|
||
|
||
// ── Main year scraper ──────────────────────────────────────────────────────
|
||
|
||
/**
 * Scrapes one tournament from its main Wikipedia page.
 *
 * Walks headings, hatnotes and match boxes in document order. Boxes inside match sections are
 * parsed directly. For groups that only link to a "Main article" sub-page (no boxes on the main
 * page), the sub-page is fetched and all of its boxes are assigned to that group.
 *
 * @param year Tournament year (not used by this function's body).
 * @param mainHtml Rendered HTML of the tournament's main page.
 * @param opts.skipGroups Group names whose sub-pages must not be fetched.
 * @returns Matches, stadiums (by name), group membership and tournament metadata.
 */
export async function scrapeYear(
  year: number,
  mainHtml: string,
  opts?: { skipGroups?: Set<string> },
): Promise<YearResult> {
  const $ = load(mainHtml)
  const matches: Match[] = []
  const stadiums = new Map<string, Stadium>()
  const groups = new Map<string, Set<string>>()
  const state: State = { active: false, round: '', group: null }
  const groupSubpages = new Map<string, string>()
  const groupsOnMainPage = new Set<string>()

  /** Text (without "[edit]") and level of the h2/h3/h4 inside a heading wrapper; level 9 when none. */
  const readHeading = ($wrapper: Cheerio<Element>): { text: string; level: number } => {
    const $h = $wrapper.find('h2, h3, h4').first()
    const level = parseInt($h.prop('tagName')?.slice(1) ?? '9')
    return { text: $h.text().replace(/\[edit\]/g, '').trim(), level }
  }

  /** Stores a match and indexes its stadium and group membership. */
  const recordMatch = (m: Match): void => {
    matches.push(m)
    if (m.ground) {
      const { name, city } = parseGroundParts(m.ground)
      // First occurrence wins, so a stadium keeps the city it was first seen with.
      if (name && !stadiums.has(name)) stadiums.set(name, { name, city })
    }
    if (m.group) {
      groupsOnMainPage.add(m.group)
      let members = groups.get(m.group)
      if (!members) {
        members = new Set()
        groups.set(m.group, members)
      }
      members.add(m.team1)
      members.add(m.team2)
    }
  }

  $('.mw-parser-output').find('div.mw-heading, .footballbox, .hatnote').each((_, el) => {
    const $el = $(el)
    if ($el.hasClass('mw-heading')) {
      const { text, level } = readHeading($el)
      processHeading(text, level, state)
    } else if ($el.hasClass('hatnote') && $el.text().includes('Main article')) {
      if (!state.active || !state.group) return
      const link = $el.find('a[href^="/wiki/"]').first().attr('href')
      if (!link) return
      const encoded = link.slice('/wiki/'.length).split('#')[0] ?? ''
      // The href is percent-encoded and fetchWikiHtml encodes its argument again, so decode
      // here to avoid double-encoding titles that contain escaped characters.
      let page = encoded
      try {
        page = decodeURIComponent(encoded)
      } catch {
        // Malformed escape sequence: keep the raw path segment.
      }
      if (/World_Cup_Group/i.test(page) && !groupSubpages.has(state.group))
        groupSubpages.set(state.group, page)
    } else if ($el.hasClass('footballbox')) {
      if (!state.active) return
      const m = parseBox($, $el, state.round || state.group || 'Unknown', state.group)
      if (m) recordMatch(m)
    }
  })

  for (const [group, page] of groupSubpages) {
    // Groups whose boxes were already found on the main page need no sub-page.
    if (groupsOnMainPage.has(group)) continue
    if (opts?.skipGroups?.has(group)) {
      process.stdout.write(`[skip ${group}] `)
      continue
    }
    await delay(1200)
    const subHtml = await fetchWikiHtml(page)
    if (!subHtml) { process.stdout.write(`(failed: ${page}) `); continue }

    // Replay the MAIN page's headings to find the round under which this group's heading sits.
    const stateTemp: State = { active: false, round: '', group: null }
    let round = 'Group stage'
    for (const el of $('.mw-parser-output').find('div.mw-heading').toArray()) {
      const { text, level } = readHeading($(el))
      processHeading(text, level, stateTemp)
      if (stateTemp.group === group) {
        round = stateTemp.round || 'Group stage'
        break
      }
    }

    const $sub = load(subHtml)
    for (const m of collectBoxes($sub, round, group)) recordMatch(m)
    process.stdout.write(`[+${page.slice(-8)}] `)
  }

  // Results derived from parsed matches take precedence over the infobox.
  const infobox = parseInfobox($)
  const placements = derivePlacements(matches)
  const meta: Meta = {
    host: infobox.host ?? '',
    teams_count: infobox.teams_count ?? null,
    winner: placements.winner ?? infobox.winner ?? null,
    runner_up: placements.runner_up ?? infobox.runner_up ?? null,
    third_place: placements.third_place ?? infobox.third_place ?? null,
    fourth_place: placements.fourth_place ?? infobox.fourth_place ?? null,
  }

  return { matches, stadiums, groups, meta }
}
|
||
|
||
// ── Squad page scraper ─────────────────────────────────────────────────────
|
||
|
||
/** Position codes accepted from the position cell of a squad row. */
const SQUAD_POSITIONS: readonly string[] = ['GK', 'DF', 'MF', 'FW']

/**
 * Parses one `tr.nat-fs-player` row.
 *
 * Cells are visited in document order: before the name cell (`th[scope="row"]`) is seen, the
 * first cell is read as the shirt number and the second as the position; a `.bday` element in
 * any cell supplies the date of birth.
 *
 * @returns The player, or `null` when the row has no name.
 */
function parseSquadRow($: CheerioAPI, $row: Cheerio<Element>): Player | null {
  const row: { name: string; number?: number; pos?: string; dob?: string } = { name: '' }

  $row.find('td, th[scope="row"]').each((i, cell) => {
    const $cell = $(cell)
    const text = $cell.text().trim()
    if ($cell.is('th[scope="row"]')) {
      row.name = $cell.find('a').first().text().trim() || text
    } else if (i === 0 && !row.name) {
      const n = parseInt(text)
      if (!isNaN(n)) row.number = n
    } else if (i === 1 && !row.name && !row.pos) {
      const code = $cell.find('a').first().text().trim()
      if (SQUAD_POSITIONS.includes(code)) row.pos = code
    }
    const $bday = $cell.find('.bday')
    if ($bday.length) row.dob = $bday.text().trim()
  })

  if (!row.name) return null
  const player: Player = { name: row.name }
  if (row.number !== undefined) player.number = row.number
  if (row.pos) player.pos = row.pos
  if (row.dob) player.date_of_birth = row.dob
  return player
}

/**
 * Scrapes team squads from a squads page.
 *
 * Every level-3 heading that does not start with "Group " opens a new team; each following
 * `tr.nat-fs-player` row is added to the team opened most recently.
 *
 * @param html Rendered HTML of the squads page.
 * @returns Teams that ended up with at least one player, in document order.
 */
export function scrapeSquads(html: string): Squad[] {
  const $ = load(html)
  const squads: Squad[] = []
  let currentTeam: Squad | null = null

  $('.mw-parser-output').find('div.mw-heading, tr.nat-fs-player').each((_, el) => {
    const $el = $(el)

    if ($el.hasClass('mw-heading')) {
      const $h = $el.find('h3, h4').first()
      if (!$h.length) return
      // The first h3/h4 in the wrapper must itself be an h3; otherwise the heading is ignored.
      if (parseInt($h.prop('tagName')?.slice(1) ?? '9') !== 3) return
      const name = $h.text().replace(/\[edit\]/g, '').trim()
      if (/^group /i.test(name)) return
      currentTeam = { name, players: [] }
      squads.push(currentTeam)
      return
    }

    // Player rows that appear before any team heading are dropped.
    if (!currentTeam) return
    const player = parseSquadRow($, $el)
    if (player) currentTeam.players.push(player)
  })

  return squads.filter(s => s.players.length > 0)
}
|