2026-06-15 17:23:17 +02:00
|
|
|
|
import { load } from 'cheerio'
|
|
|
|
|
|
import type { CheerioAPI, Cheerio } from 'cheerio'
|
|
|
|
|
|
import type { Element } from 'domhandler'
|
|
|
|
|
|
|
|
|
|
|
|
// ── Types ──────────────────────────────────────────────────────────────────
|
|
|
|
|
|
|
|
|
|
|
|
/** One goal entry parsed from a match report. */
export type Goal = {
  /** Scorer's name. */
  name: string
  /** Minute of the goal (the part before any "+"). */
  minute?: number
  /** Added minutes after the "+" (e.g. 3 for 90+3'); omitted when zero. */
  offset?: number
  /** Set to true when the entry is marked "pen.". */
  penalty?: boolean
  /** Set to true when the entry is marked "o.g.". */
  owngoal?: boolean
}

/** Score of a match as [team1, team2] pairs. */
export type ScoreObj = {
  /** Full-time score. */
  ft?: [number, number]
  /** Score after extra time. */
  et?: [number, number]
  /** Penalty shoot-out score. */
  p?: [number, number]
}

/** A single match together with its round/group context. */
export type Match = {
  round: string
  group?: string
  date?: string
  /** "HH:MM", optionally followed by " UTC<offset>". */
  time?: string
  team1: string
  team2: string
  score?: ScoreObj
  goals1?: Goal[]
  goals2?: Goal[]
  /** Venue text as scraped; typically "Stadium, City". */
  ground?: string
}

export type Stadium = { name: string; city: string }

export type Group = { name: string; teams: string[] }

/** Tournament-level summary. */
export type Meta = {
  host: string
  teams_count: number | null
  winner: string | null
  runner_up: string | null
  third_place: string | null
  fourth_place: string | null
}

export type Player = { name: string; number?: number; pos?: string; date_of_birth?: string }

export type Squad = { name: string; players: Player[] }

/** Everything scraped for one tournament. */
export type YearResult = {
  matches: Match[]
  /** Stadiums keyed by stadium name. */
  stadiums: Map<string, Stadium>
  /** Team names keyed by group name. */
  groups: Map<string, Set<string>>
  meta: Meta
}

/** Mutable cursor kept while walking page headings in document order. */
type State = { active: boolean; round: string; group: string | null }
|
|
|
|
|
|
|
|
|
|
|
|
// ── Fetch ──────────────────────────────────────────────────────────────────
|
|
|
|
|
|
|
|
|
|
|
|
/** Returns a promise that settles once `ms` milliseconds have elapsed. */
const delay = (ms: number) => new Promise(resolve => { setTimeout(resolve, ms) })
|
|
|
|
|
|
|
2026-06-15 18:44:54 +02:00
|
|
|
|
/**
 * Fetches the rendered HTML of an English-Wikipedia page through the
 * MediaWiki `action=parse` API.
 *
 * Best-effort: network errors, non-OK responses and unparsable bodies are
 * retried; the function never throws.
 *
 * @param page    Page title (unencoded; it is URL-encoded here).
 * @param retries Maximum number of attempts.
 * @returns The HTML string, or `null` when every attempt failed or the API
 *          reports that the page does not exist.
 */
export async function fetchWikiHtml(page: string, retries = 6): Promise<string | null> {
  const url = `https://en.wikipedia.org/w/api.php?action=parse&page=${encodeURIComponent(page)}&format=json&prop=text&disabletoc=1`
  for (let attempt = 0; attempt < retries; attempt++) {
    try {
      // Linear back-off before each retry (15 s, 30 s, 45 s, ...).
      if (attempt > 0) await delay(15000 * attempt)
      const res = await fetch(url, { headers: { 'User-Agent': 'WorldCupScraper/1.0 (worldcup-stats)' } })
      if (res.status === 429) {
        // Honour a numeric Retry-After when the server sends one (capped at
        // five minutes); otherwise keep the fixed 30 s pause.
        const retryAfterSec = Number(res.headers.get('retry-after'))
        const waitMs = Number.isFinite(retryAfterSec) && retryAfterSec > 0
          ? Math.min(retryAfterSec * 1000, 300000)
          : 30000
        await delay(waitMs)
        continue
      }
      if (!res.ok) continue
      const text = await res.text()
      // Plain-text throttling notice served instead of JSON.
      if (text.toLowerCase().startsWith('you are making')) { await delay(30000); continue }
      const data = JSON.parse(text) as {
        parse?: { text?: { '*': string } }
        error?: { code?: string }
      }
      // A missing page is a permanent error: retrying only burns the back-off.
      if (data?.error?.code === 'missingtitle') return null
      const html = data?.parse?.text?.['*']
      if (html) return html
    } catch {
      // Deliberately swallowed: any failure simply moves on to the next attempt.
    }
  }
  return null
}
|
|
|
|
|
|
|
2026-06-15 17:33:05 +02:00
|
|
|
|
// ── Team name normalisation ────────────────────────────────────────────────
|
|
|
|
|
|
|
|
|
|
|
|
/** Alternative team names mapped to the canonical name used in the output. */
const TEAM_ALIASES: Record<string, string> = {
  'West Germany': 'Germany',
  'Korea Republic': 'South Korea',
  'IR Iran': 'Iran',
  'Bosnia & Herzegovina': 'Bosnia and Herzegovina',
  'USA': 'United States',
}

/**
 * Maps a scraped team name to its canonical form.
 *
 * @param name Team name as scraped.
 * @returns The alias target when `name` is a known alias, otherwise `name` unchanged.
 */
export function normalizeTeam(name: string): string {
  // Own-property check: a bare index lookup on an object literal would also
  // resolve inherited keys such as "constructor" and return a non-string.
  const alias = Object.prototype.hasOwnProperty.call(TEAM_ALIASES, name)
    ? TEAM_ALIASES[name]
    : undefined
  return alias ?? name
}
|
|
|
|
|
|
|
2026-06-15 17:23:17 +02:00
|
|
|
|
// ── Parsing helpers ────────────────────────────────────────────────────────
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Extracts the first "N–M" score from a piece of text.
 *
 * Accepts ASCII hyphen, en dash (U+2013), em dash (U+2014) and the Unicode
 * minus sign (U+2212) as the separator.
 *
 * @returns `[left, right]`, or `null` when no score is present.
 */
function parseScoreText(text: string): [number, number] | null {
  const m = text.match(/(\d+)\s*[-\u2013\u2014\u2212]\s*(\d+)/)
  if (!m) return null
  return [parseInt(m[1], 10), parseInt(m[2], 10)]
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Reads the team name from a team cell.
 *
 * Uses the text of the first link that has non-empty text and contains no
 * `<img>`, then normalises it.
 *
 * @returns The normalised team name, or '' when the cell has no such link.
 */
function extractTeam($: CheerioAPI, $cell: Cheerio<Element>): string {
  for (const anchor of $cell.find('a').toArray()) {
    const $anchor = $(anchor)
    const label = $anchor.text().trim()
    if (label && !$anchor.find('img').length) return normalizeTeam(label)
  }
  return normalizeTeam('')
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Parses the goal list of one team from a goals cell.
 *
 * Each `<li>` contributes one scorer; every minute span inside its
 * `.fb-goal` element becomes a separate {@link Goal}.
 *
 * @param $td Cell containing the `<li>` goal entries.
 * @returns Goals in document order; entries without a scorer link or a
 *          parsable minute are skipped.
 */
function parseGoals($: CheerioAPI, $td: Cheerio<Element>): Goal[] {
  const goals: Goal[] = []
  $td.find('li').each((_, li) => {
    const $li = $(li)

    // Scorer = first non-empty link that is not part of the minute markup.
    let playerName = ''
    for (const anchor of $li.find('a').toArray()) {
      const $anchor = $(anchor)
      if ($anchor.closest('.fb-goal').length) continue
      const display = $anchor.text().trim()
      if (!display) continue
      // title attr has the full unabbreviated name; strip disambiguation suffix
      const titleAttr = ($anchor.attr('title') ?? '').replace(/\s*\([^)]*\)\s*$/, '').trim()
      playerName = titleAttr || display
      break
    }
    if (!playerName) return

    $li.find('.fb-goal').children('span').each((_, span) => {
      const $span = $(span)
      // Spans carrying a `typeof` attribute are skipped.
      if ($span.attr('typeof')) return
      const text = $span.text()
      const minMatch = text.match(/(\d+)(?:\+(\d+))?['′]/)
      if (!minMatch) return
      // Group 1 is all digits, so the parsed minute is always a valid number.
      const goal: Goal = { name: playerName, minute: parseInt(minMatch[1], 10) }
      const offset = minMatch[2] ? parseInt(minMatch[2], 10) : 0
      if (offset) goal.offset = offset
      if (text.includes('pen.')) goal.penalty = true
      if (text.includes('o.g.')) goal.owngoal = true
      goals.push(goal)
    })
  })
  return goals
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Reads the venue text of a match box.
 *
 * Prefers the element marked `itemprop="name address"`; otherwise falls back
 * to the first line of the first `.fright` element.
 *
 * @returns Trimmed venue text ('' when nothing is found).
 */
function extractGround($: CheerioAPI, $box: Cheerio<Element>): string {
  const $location = $box.find('[itemprop="name address"]').first()
  if ($location.length) return $location.text().trim()
  const [firstLine] = $box.find('.fright').first().text().split('\n')
  return firstLine.trim()
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Splits a venue string at its first comma into stadium name and city.
 *
 * Both parts are trimmed. Without a comma the whole (trimmed) string is the
 * name and the city is ''.
 */
function parseGroundParts(ground: string): { name: string; city: string } {
  const commaIdx = ground.indexOf(',')
  if (commaIdx === -1) return { name: ground.trim(), city: '' }
  return {
    name: ground.slice(0, commaIdx).trim(),
    city: ground.slice(commaIdx + 1).trim(),
  }
}
|
|
|
|
|
|
|
2026-06-15 17:50:30 +02:00
|
|
|
|
/**
 * Extracts a kick-off time from free text and returns it in 24-hour form.
 *
 * Recognises 12-hour times ("3:00 p.m.", "9:05 am") and, failing that, a
 * plain two-digit-hour "HH:MM". When the text carries a "UTC+N" / "UTC-N"
 * marker (optionally with ".5" or ":30" style minutes) it is appended, for
 * both the 12-hour and the 24-hour form.
 *
 * @returns "HH:MM" or "HH:MM UTC<offset>", or `undefined` when no time is found.
 */
function parseTime12h(text: string): string | undefined {
  // Normalise Unicode minus (U+2212) used by Wikipedia to ASCII hyphen
  const t = text.replace(/−/g, '-')

  let time24: string | undefined
  const m = t.match(/(\d{1,2}):(\d{2})\s*([ap]\.?m\.?)/i)
  if (m) {
    let h = parseInt(m[1], 10)
    const isPm = m[3].toLowerCase().startsWith('p')
    if (isPm && h !== 12) h += 12        // 1 pm..11 pm -> 13..23
    else if (!isPm && h === 12) h = 0    // 12 am -> 00
    time24 = `${String(h).padStart(2, '0')}:${m[2]}`
  } else {
    time24 = t.match(/(\d{2}:\d{2})/)?.[1]
  }
  if (!time24) return undefined

  // Preserve UTC offset so isLive() can compute correct UTC kickoff time
  const tz = t.match(/UTC([+-]\d+(?:[.:]\d+)?)/i)
  return tz ? `${time24} UTC${tz[1]}` : time24
}
|
|
|
|
|
|
|
2026-06-15 17:23:17 +02:00
|
|
|
|
/**
 * Parses one match box into a {@link Match}.
 *
 * @param $box  The `.footballbox` element.
 * @param round Round label to store on the match.
 * @param group Group label, or `null` outside group play.
 * @returns The match, or `null` when either team name cannot be read.
 *          Optional fields are only present when a value was found.
 */
function parseBox($: CheerioAPI, $box: Cheerio<Element>, round: string, group: string | null): Match | null {
  const team1 = extractTeam($, $box.find('.fhome'))
  const team2 = extractTeam($, $box.find('.faway'))
  if (!team1 || !team2) return null

  const dateStr = $box.find('.bday, .dtstart').first().text().trim() || undefined
  const timeStr = parseTime12h($box.find('.ftime').first().text().trim())

  const scoreText = $box.find('.fscore').first().text().trim()
  const hasAET = scoreText.toLowerCase().includes('a.e.t.')
  const scoreArr = parseScoreText(scoreText)

  // Only the first goals row is read.
  const $regularRow = $box.find('tr.fgoals').first()
  const goals1 = parseGoals($, $regularRow.find('.fhgoal'))
  const goals2 = parseGoals($, $regularRow.find('.fagoal'))

  // Shoot-out score: taken from the row that follows the first row whose
  // spanning header mentions "penalt…".
  let penScore: [number, number] | undefined
  $box.find('tr').each((_, tr) => {
    const $tr = $(tr)
    if (!$tr.find('th[colspan]').text().toLowerCase().includes('penalt')) return
    const ps = parseScoreText($tr.next('tr').find('th').not('.fhome,.faway').first().text().trim())
    if (ps) penScore = ps
    return false
  })

  let score: ScoreObj | undefined
  if (scoreArr) {
    if (hasAET) {
      // The displayed score is the extra-time score; the full-time score is
      // reconstructed by counting goals up to minute 90 (stoppage time such
      // as 90+3 has minute 90 and is included).
      // NOTE(review): this credits an own goal to the side opposite the column
      // it is listed in — confirm that matches how the source lists own goals.
      // NOTE(review): when no goal list is parsed this yields ft = [0, 0]
      // regardless of the displayed score — confirm that is acceptable.
      const countWithin90 = (gs: Goal[], ownGoals = false) =>
        gs.filter(g => {
          const within90 = g.minute === undefined || g.minute <= 90
          return ownGoals ? g.owngoal === true && within90 : !g.owngoal && within90
        }).length
      score = {
        ft: [
          countWithin90(goals1) + countWithin90(goals2, true),
          countWithin90(goals2) + countWithin90(goals1, true),
        ],
        et: scoreArr,
      }
    } else {
      score = { ft: scoreArr }
    }
    if (penScore) score.p = penScore
  }

  const ground = extractGround($, $box) || undefined
  return {
    round,
    ...(group ? { group } : {}),
    ...(dateStr ? { date: dateStr } : {}),
    ...(timeStr ? { time: timeStr } : {}),
    team1, team2,
    ...(score ? { score } : {}),
    ...(goals1.length ? { goals1 } : {}),
    ...(goals2.length ? { goals2 } : {}),
    ...(ground ? { ground } : {}),
  }
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Parses every `.footballbox` in a document with a fixed round and group.
 *
 * @returns The successfully parsed matches in document order.
 */
function collectBoxes($: CheerioAPI, round: string, group: string | null): Match[] {
  return $('.footballbox')
    .toArray()
    .map(box => parseBox($, $(box), round, group))
    .filter((m): m is Match => m !== null)
}
|
|
|
|
|
|
|
|
|
|
|
|
/** Knockout-round heading patterns (tested against lower-cased text) and their labels. */
const KNOCKOUT_HEADING_ROUNDS: ReadonlyArray<readonly [RegExp, string]> = [
  [/round of 32/, 'Round of 32'],
  [/round of 16/, 'Round of 16'],
  [/quarter.final/, 'Quarter-finals'],
  [/semi.final/, 'Semi-finals'],
  [/third.place|match for third|play.off for third/, 'Third-place match'],
]

/** Returns the knockout round named by a lower-cased, trimmed heading, or null. */
function knockoutRoundForHeading(t: string): string | null {
  for (const [pattern, round] of KNOCKOUT_HEADING_ROUNDS) {
    if (pattern.test(t)) return round
  }
  // "Final" must match exactly so that e.g. "final standings" is not a round.
  return t === 'final' ? 'Final' : null
}

/**
 * Updates the heading-walk state for one section heading.
 *
 * Level-2 headings switch parsing on (recognised stage/round) or off
 * (anything else). While active, level-3/4 headings select a group
 * ("Group A", "Group 1", ...) or a knockout round.
 *
 * @param text  Heading text.
 * @param level Heading level (2 for h2, 3 for h3, ...).
 * @param state Mutated in place.
 */
function processHeading(text: string, level: number, state: State): void {
  const t = text.toLowerCase().trim()

  if (level === 2) {
    let round: string | null
    if (/group stage/.test(t) && !/second/.test(t)) round = 'Group stage'
    else if (/first group stage/.test(t)) round = 'Group stage'
    else if (/second group stage/.test(t)) round = 'Second group stage'
    else if (t === 'final round') round = 'Final round'
    // Container sections: the round comes from the sub-headings.
    else if (/final tournament/.test(t) || /knock.?out stage/.test(t)) round = ''
    else round = knockoutRoundForHeading(t)

    if (round === null) {
      // Unrelated section: stop collecting; round/group are left as they were.
      state.active = false
      return
    }
    state.active = true
    state.round = round
    state.group = null
    return
  }

  if (!state.active) return

  if (level === 3 || level === 4) {
    if (/^group [a-z0-9]+$/.test(t)) {
      // Keep the heading's original casing for the group label.
      state.group = text.trim()
      return
    }
    const round = knockoutRoundForHeading(t)
    if (round !== null) {
      state.round = round
      state.group = null
    }
  }
}
|
|
|
|
|
|
|
|
|
|
|
|
// ── Infobox ────────────────────────────────────────────────────────────────
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Reads tournament metadata from the first `table.infobox` of the page.
 *
 * Rows are matched on their header-cell text: host country, teams,
 * champions, runners-up, third place, fourth place.
 *
 * @returns Only the fields for which a matching row was found.
 */
function parseInfobox($: CheerioAPI): Partial<Meta> {
  const result: Partial<Meta> = {}

  /** Cell text with <br> turned into " / ", <sup>/<img> and "[n]" markers removed. */
  const tdText = ($td: Cheerio<Element>): string => {
    const $clone = $td.clone()
    $clone.find('br').replaceWith(' / ')
    $clone.find('sup, img').remove()
    return $clone.text().replace(/\[\d+\]/g, '').replace(/\s+/g, ' ').trim()
  }

  /** Non-empty link texts of a cell (images stripped), skipping "[n]" reference links. */
  const linkTexts = ($td: Cheerio<Element>): string[] =>
    $td.find('a').toArray()
      .map(a => $(a).clone().find('img').remove().end().text().trim())
      .filter(t => t !== '' && !/\[\d+\]/.test(t))

  /** First link text, else the cleaned cell text, else null. */
  const tdFirstLink = ($td: Cheerio<Element>): string | null =>
    linkTexts($td)[0] ?? (tdText($td) || null)

  /** All link texts joined by " / ", else the cleaned cell text. */
  const tdAllLinks = ($td: Cheerio<Element>): string => {
    const names = linkTexts($td)
    return names.length ? names.join(' / ') : tdText($td)
  }

  $('table.infobox').first().find('tr').each((_, tr) => {
    const $tr = $(tr)
    const label = $tr.find('th').text().trim().toLowerCase()
    const $td = $tr.find('td').first()
    if (!$td.length) return

    if (/host countr/.test(label)) {
      result.host = tdAllLinks($td)
    } else if (/^teams$/.test(label)) {
      // First integer in the cell.
      const m = $td.text().match(/\d+/)
      if (m) result.teams_count = parseInt(m[0], 10)
    } else if (/champion/.test(label)) {
      result.winner = tdFirstLink($td)
    } else if (/runners?.up/.test(label)) {
      result.runner_up = tdFirstLink($td)
    } else if (/third.place/.test(label)) {
      result.third_place = tdFirstLink($td)
    } else if (/fourth.place/.test(label)) {
      result.fourth_place = tdFirstLink($td)
    }
  })

  return result
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Derives the top-four placements from the 'Final' and 'Third-place match'
 * results.
 *
 * A match is decided by its extra-time score if present, otherwise its
 * full-time score; a draw is decided by the shoot-out score. Undecidable
 * matches leave the corresponding fields `null`. If several matches share a
 * round label, the last decidable one wins.
 */
function derivePlacements(matches: Match[]): Pick<Meta, 'winner' | 'runner_up' | 'third_place' | 'fourth_place'> {
  /** Returns [winner, loser], or null when the match has no score or no decider. */
  const decide = (m: Match): [string, string] | null => {
    if (!m.score) return null
    const { et, ft, p } = m.score
    const [home, away] = et ?? ft ?? [0, 0]
    if (home > away) return [m.team1, m.team2]
    if (away > home) return [m.team2, m.team1]
    if (p) {
      const [penHome, penAway] = p
      if (penHome > penAway) return [m.team1, m.team2]
      if (penAway > penHome) return [m.team2, m.team1]
    }
    return null
  }

  let winner: string | null = null
  let runner_up: string | null = null
  let third_place: string | null = null
  let fourth_place: string | null = null

  for (const m of matches) {
    if (m.round !== 'Final' && m.round !== 'Third-place match') continue
    const outcome = decide(m)
    if (!outcome) continue
    if (m.round === 'Final') [winner, runner_up] = outcome
    else [third_place, fourth_place] = outcome
  }

  return { winner, runner_up, third_place, fourth_place }
}
|
|
|
|
|
|
|
|
|
|
|
|
// ── Main year scraper ──────────────────────────────────────────────────────
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Scrapes one tournament from the HTML of its main article.
 *
 * Walks headings, "Main article" hatnotes and match boxes in document order.
 * Groups that only link to a "…World_Cup_Group…" sub-page (no match box on
 * the main page) are fetched afterwards, one request at a time.
 *
 * @param year     Tournament year (not read by this function).
 * @param mainHtml Rendered HTML of the main tournament article.
 * @param opts     `skipGroups`: group labels whose sub-page must not be fetched.
 * @returns Matches, stadiums (by name), group membership and tournament meta.
 */
export async function scrapeYear(
  year: number,
  mainHtml: string,
  opts?: { skipGroups?: Set<string> },
): Promise<YearResult> {
  const $ = load(mainHtml)
  const matches: Match[] = []
  const stadiums = new Map<string, Stadium>()
  const groups = new Map<string, Set<string>>()
  const state: State = { active: false, round: '', group: null }
  const groupSubpages = new Map<string, string>()
  const groupsOnMainPage = new Set<string>()

  /** Stores a match and updates the stadium and group indexes. */
  function recordMatch(m: Match) {
    matches.push(m)
    if (m.ground) {
      const { name, city } = parseGroundParts(m.ground)
      // First spelling of a stadium wins.
      if (name && !stadiums.has(name)) stadiums.set(name, { name, city })
    }
    if (m.group) {
      groupsOnMainPage.add(m.group)
      let members = groups.get(m.group)
      if (!members) {
        members = new Set()
        groups.set(m.group, members)
      }
      members.add(m.team1)
      members.add(m.team2)
    }
  }

  $('.mw-parser-output').find('div.mw-heading, .footballbox, .hatnote').each((_, el) => {
    const $el = $(el)
    if ($el.hasClass('mw-heading')) {
      const $h = $el.find('h2, h3, h4').first()
      const level = parseInt($h.prop('tagName')?.slice(1) ?? '9')
      processHeading($h.text().replace(/\[edit\]/g, '').trim(), level, state)
    } else if ($el.hasClass('hatnote') && $el.text().includes('Main article')) {
      if (!state.active || !state.group) return
      const link = $el.find('a[href^="/wiki/"]').first().attr('href')
      if (!link) return
      const rawPage = link.replace('/wiki/', '').split('#')[0] ?? ''
      // The href is percent-encoded and fetchWikiHtml encodes its argument
      // again, so decode here to avoid double encoding.
      let page = rawPage
      try {
        page = decodeURIComponent(rawPage)
      } catch {
        // Malformed escape sequence: keep the raw value.
      }
      if (/World_Cup_Group/i.test(page) && !groupSubpages.has(state.group))
        groupSubpages.set(state.group, page)
    } else if ($el.hasClass('footballbox')) {
      if (!state.active) return
      const m = parseBox($, $el, state.round || state.group || 'Unknown', state.group)
      if (m) recordMatch(m)
    }
  })

  for (const [group, page] of groupSubpages) {
    if (groupsOnMainPage.has(group)) continue
    if (opts?.skipGroups?.has(group)) {
      process.stdout.write(`[skip ${group}] `)
      continue
    }
    // Pause between sub-page requests.
    await delay(3000)
    const subHtml = await fetchWikiHtml(page)
    if (!subHtml) { process.stdout.write(`(failed: ${page}) `); continue }

    // Replay the MAIN page's headings to learn which round this group sits
    // under (e.g. 'Second group stage'); defaults to 'Group stage'.
    const stateTemp: State = { active: false, round: '', group: null }
    let round = 'Group stage'
    $('.mw-parser-output').find('div.mw-heading').each((_, el) => {
      const $h = $(el).find('h2, h3, h4').first()
      const level = parseInt($h.prop('tagName')?.slice(1) ?? '9')
      processHeading($h.text().replace(/\[edit\]/g, '').trim(), level, stateTemp)
      if (stateTemp.group === group) { round = stateTemp.round || 'Group stage'; return false }
    })

    const $sub = load(subHtml)
    for (const m of collectBoxes($sub, round, group)) recordMatch(m)
    process.stdout.write(`[+${page.slice(-8)}] `)
  }

  const infobox = parseInfobox($)
  const placements = derivePlacements(matches)
  // Infobox names are normalised like match team names so both sources agree.
  const fromInfobox = (name: string | null | undefined): string | null =>
    name ? normalizeTeam(name) : null
  const meta: Meta = {
    host: infobox.host ?? '',
    teams_count: infobox.teams_count ?? null,
    // Match-derived placements take precedence over the infobox.
    winner: placements.winner ?? fromInfobox(infobox.winner),
    runner_up: placements.runner_up ?? fromInfobox(infobox.runner_up),
    third_place: placements.third_place ?? fromInfobox(infobox.third_place),
    fourth_place: placements.fourth_place ?? fromInfobox(infobox.fourth_place),
  }

  return { matches, stadiums, groups, meta }
}
|
|
|
|
|
|
|
|
|
|
|
|
// ── Squad page scraper ─────────────────────────────────────────────────────
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Parses a squads page into one {@link Squad} per team.
 *
 * Every h3 heading (except "Group …" headings) starts a new team; each
 * following `tr.nat-fs-player` row becomes a player of that team. An h2
 * heading ends the current team so rows in a later section are not
 * attributed to it.
 *
 * @param html Rendered HTML of the squads page.
 * @returns Teams that have at least one player, in document order.
 */
export function scrapeSquads(html: string): Squad[] {
  const $ = load(html)
  const squads: Squad[] = []
  let currentTeam: Squad | null = null

  $('.mw-parser-output').find('div.mw-heading, tr.nat-fs-player').each((_, el) => {
    const $el = $(el)

    if ($el.hasClass('mw-heading')) {
      // A new top-level section closes the team that was being filled.
      if ($el.find('h2').length) { currentTeam = null; return }
      const $h = $el.find('h3, h4').first()
      if (!$h.length) return
      // Only h3 headings name a team.
      if (parseInt($h.prop('tagName')?.slice(1) ?? '9') !== 3) return
      const name = $h.text().replace(/\[edit\]/g, '').trim()
      if (/^group /i.test(name)) return
      currentTeam = { name, players: [] }
      squads.push(currentTeam)
      return
    }

    if (!currentTeam) return

    let number: number | undefined
    let pos: string | undefined
    let playerName = ''
    let dob: string | undefined

    $el.find('td, th[scope="row"]').each((i, cell) => {
      const $cell = $(cell)
      const text = $cell.text().trim()
      if ($cell.is('th[scope="row"]')) {
        // Row header holds the player's name (link text preferred).
        playerName = $cell.find('a').first().text().trim() || text
      } else if (i === 0 && !playerName) {
        const n = parseInt(text, 10)
        if (!isNaN(n)) number = n
      } else if (i === 1 && !playerName && !pos) {
        const p = $cell.find('a').first().text().trim()
        if (['GK', 'DF', 'MF', 'FW'].includes(p)) pos = p
      }
      // Date of birth may sit in any cell of the row.
      const $bday = $cell.find('.bday')
      if ($bday.length) dob = $bday.text().trim()
    })

    if (!playerName) return
    const player: Player = { name: playerName }
    if (number !== undefined) player.number = number
    if (pos) player.pos = pos
    if (dob) player.date_of_birth = dob
    currentTeam.players.push(player)
  })

  return squads.filter(s => s.players.length > 0)
}
|