Files
worldcup/scripts/scrape-wikipedia.ts
T
valknar ff4989f39f refactor: rename data/openfootball → data/wikipedia, drop data/kaggle
Move world_cup.csv to app/data/ directly (the only remaining Kaggle file
used by seed.ts for tournament metadata). Delete the rest of the Kaggle CSVs.
Update path constants in scrape-wikipedia.ts and seed.ts accordingly.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-15 16:10:21 +02:00

550 lines
19 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import { load } from 'cheerio'
import type { CheerioAPI } from 'cheerio'
import type { Cheerio } from 'cheerio'
import type { Element } from 'domhandler'
import { mkdirSync, writeFileSync } from 'fs'
import path from 'path'
import { fileURLToPath } from 'url'
// Directory containing this script. ES modules do not provide __dirname,
// so it is derived from import.meta.url.
const __dirname = path.dirname(fileURLToPath(import.meta.url))
// Root output directory; writeOutput creates one sub-directory per year beneath it.
const DATA_DIR = path.join(__dirname, '../app/data/wikipedia')
// Tournament years scraped when no year is passed on the command line.
const YEARS = [
1930,1934,1938,1950,1954,1958,1962,1966,1970,1974,
1978,1982,1986,1990,1994,1998,2002,2006,2010,2014,2018,2022,
]
// Resolves after `ms` milliseconds; used to pace requests and to back off between retries.
const delay = (ms: number) => new Promise(r => setTimeout(r, ms))
// ── Types ──────────────────────────────────────────────────────────────────
// One goal entry of a match, as produced by parseGoals.
type Goal = {
// Scorer name: text of the first link in the list item that is not inside `.fb-goal`.
name: string
// Minute of the goal; for stoppage-time goals this is the base minute before the `+`.
minute?: number
// Stoppage-time minutes after the `+` (e.g. 2 for "45+2"); omitted when zero.
offset?: number
// Set to true when the goal text contains "pen."; omitted otherwise.
penalty?: boolean
// Set to true when the goal text contains "o.g."; omitted otherwise.
owngoal?: boolean
}
// Score of a match as [team1, team2] pairs, built by parseBox.
type ScoreObj = {
// Full-time score. For matches marked "a.e.t." it is recomputed from goals with minute <= 90.
ft?: [number, number]
// Score after extra time; only set when the score text contains "a.e.t.".
et?: [number, number]
// Penalty shoot-out score; only set when a "Penalties" row with a parsable score was found.
p?: [number, number]
}
// One match parsed from a `.footballbox` element; optional fields are omitted when not found.
type Match = {
// Round label taken from the heading state (e.g. 'Group stage', 'Final').
round: string
// Group heading text (e.g. "Group A") when the match belongs to a group.
group?: string
// Text of the first `.bday` / `.dtstart` element — presumably an ISO date; verify against the markup.
date?: string
// First HH:MM occurrence in the `.ftime` element.
time?: string
// Team name extracted from the `.fhome` cell.
team1: string
// Team name extracted from the `.faway` cell.
team2: string
score?: ScoreObj
// Goals listed in the `.fhgoal` cell of the first `tr.fgoals` row.
goals1?: Goal[]
// Goals listed in the `.fagoal` cell of the first `tr.fgoals` row.
goals2?: Goal[]
// Venue text as returned by extractGround; parseGroundParts later splits it at the first comma.
ground?: string
}
// Venue split into the part before the first comma (name) and the rest (city, possibly empty).
type Stadium = { name: string; city: string }
// Squad member parsed from a `tr.nat-fs-player` row; `pos` is one of GK/DF/MF/FW,
// `date_of_birth` is the text of the row's `.bday` element.
type Player = { name: string; number?: number; pos?: string; date_of_birth?: string }
// A team (level-3 heading on the squads page) together with its parsed players.
type Squad = { name: string; players: Player[] }
// A group and the teams seen in its matches, as serialised to worldcup.groups.json.
type Group = { name: string; teams: string[] }
// ── Fetch ──────────────────────────────────────────────────────────────────
/**
 * Fetches the rendered HTML of an English Wikipedia page through the
 * MediaWiki `action=parse` API.
 *
 * Transient failures (network/JSON errors, non-2xx status, a response without
 * HTML) are retried after a linearly growing pause of `3000 * attempt` ms.
 * A `missingtitle` API error is permanent, so it ends the loop immediately
 * instead of sleeping through the whole back-off schedule.
 *
 * @param page - Page title as used in the wiki URL, e.g. `2022_FIFA_World_Cup`.
 * @param retries - Maximum number of attempts.
 * @returns The page HTML, or `null` when the page is missing or every attempt failed.
 */
async function fetchWikiHtml(page: string, retries = 5): Promise<string | null> {
  const url = `https://en.wikipedia.org/w/api.php?action=parse&page=${encodeURIComponent(page)}&format=json&prop=text&disabletoc=1`
  for (let attempt = 0; attempt < retries; attempt++) {
    try {
      if (attempt > 0) await delay(3000 * attempt)
      const res = await fetch(url, { headers: { 'User-Agent': 'WorldCupScraper/1.0 (github.com/worldcup)' } })
      if (!res.ok) continue
      const data = await res.json() as {
        parse?: { text?: { '*': string } }
        error?: { code?: string }
      }
      const html = data?.parse?.text?.['*']
      if (html) return html
      // The page does not exist: retrying cannot change the outcome.
      if (data?.error?.code === 'missingtitle') return null
    } catch {
      // Network or JSON parsing failure: fall through to the next attempt.
    }
  }
  return null
}
// ── Score parsing ──────────────────────────────────────────────────────────
/**
 * Extracts the first "home–away" score found in `text`.
 *
 * The separator may be an ASCII hyphen, an en dash (U+2013), an em dash
 * (U+2014) or a minus sign (U+2212), optionally surrounded by whitespace.
 * The dashes are written as escapes so the source stays free of
 * look-alike characters.
 *
 * @param text - Text such as "4–1" or "3–3 (a.e.t.)".
 * @returns `[home, away]`, or `null` when no score is present.
 */
function parseScoreText(text: string): [number, number] | null {
  const m = text.match(/(\d+)\s*[-\u2013\u2014\u2212]\s*(\d+)/)
  if (!m) return null
  return [Number.parseInt(m[1], 10), Number.parseInt(m[2], 10)]
}
// ── Team name extraction ───────────────────────────────────────────────────
/**
 * Returns the team name shown in a footballbox team cell.
 *
 * The name is the trimmed text of the first link that wraps no image
 * (which skips flag icons) and has non-empty text.
 *
 * @param $ - Cheerio instance of the page being parsed.
 * @param $cell - The `.fhome` or `.faway` cell.
 * @returns The team name, or an empty string when no suitable link exists.
 */
function extractTeam($: CheerioAPI, $cell: Cheerio<Element>): string {
  for (const anchor of $cell.find('a').toArray()) {
    const $anchor = $(anchor)
    if ($anchor.find('img').length > 0) continue
    const label = $anchor.text().trim()
    if (label) return label
  }
  return ''
}
// ── Goal parsing ───────────────────────────────────────────────────────────
/**
 * Parses the goals listed in one goals cell of a footballbox.
 *
 * Each `<li>` is expected to hold a player link followed by a `.fb-goal`
 * element whose direct `<span>` children each describe one goal
 * (e.g. "23'", "45+2' (pen.)", "67' (o.g.)"). List items without a player
 * link or without a `.fb-goal` element are skipped.
 *
 * The minute mark may be an ASCII apostrophe, a right single quotation mark
 * (U+2019) or a prime (U+2032), so typographic variants do not cause a goal
 * to be dropped silently.
 *
 * @param $ - Cheerio instance of the page being parsed.
 * @param $td - The `.fhgoal` or `.fagoal` cell.
 * @returns One entry per goal, in document order.
 */
function parseGoals($: CheerioAPI, $td: Cheerio<Element>): Goal[] {
  // minute, optional "+stoppage", then the minute mark
  const minutePattern = /(\d+)(?:\+(\d+))?['\u2019\u2032]/
  const goals: Goal[] = []

  $td.find('li').each((_, li) => {
    const $li = $(li)

    // Player name: first non-empty <a> that is NOT inside .fb-goal
    let playerName = ''
    $li.find('a').each((_, a) => {
      const $a = $(a)
      if ($a.closest('.fb-goal').length) return
      const label = $a.text().trim()
      if (label) { playerName = label; return false }
    })
    if (!playerName) return

    const $fbGoal = $li.find('.fb-goal')
    if (!$fbGoal.length) return

    // Each direct child <span> of .fb-goal is one goal, except image wrappers.
    $fbGoal.children('span').each((_, span) => {
      const $span = $(span)
      if ($span.attr('typeof')) return // image wrapper
      const text = $span.text()
      const minMatch = minutePattern.exec(text)
      if (!minMatch) return

      // The pattern guarantees digits, so the minute is always a valid number.
      const goal: Goal = { name: playerName, minute: Number.parseInt(minMatch[1], 10) }
      const offset = minMatch[2] ? Number.parseInt(minMatch[2], 10) : 0
      if (offset) goal.offset = offset
      if (text.includes('pen.')) goal.penalty = true
      if (text.includes('o.g.')) goal.owngoal = true
      goals.push(goal)
    })
  })

  return goals
}
// ── Ground extraction ──────────────────────────────────────────────────────
/**
 * Reads the venue text of a footballbox.
 *
 * Prefers the element carrying `itemprop="name address"`; when the box has
 * none, falls back to the first line of the first `.fright` element.
 *
 * @param $ - Cheerio instance of the page being parsed (unused, kept for a uniform signature).
 * @param $box - The `.footballbox` element.
 * @returns The trimmed venue text, possibly empty.
 */
function extractGround($: CheerioAPI, $box: Cheerio<Element>): string {
  const $location = $box.find('[itemprop="name address"]').first()
  if ($location.length === 0) {
    const fallbackText = $box.find('.fright').first().text()
    return fallbackText.split('\n')[0].trim()
  }
  return $location.text().trim()
}
/**
 * Splits a venue string at its first comma into stadium name and city.
 *
 * Both parts are trimmed when a comma is present. Without a comma the input
 * is returned unchanged as the name and the city is empty.
 *
 * @param ground - Venue text such as "Estadio Centenario, Montevideo".
 * @returns The `{ name, city }` pair.
 */
function parseGroundParts(ground: string): { name: string; city: string } {
  const cut = ground.indexOf(',')
  const hasComma = cut >= 0
  const name = hasComma ? ground.slice(0, cut).trim() : ground
  const city = hasComma ? ground.slice(cut + 1).trim() : ''
  return { name, city }
}
// ── Footballbox parsing ────────────────────────────────────────────────────
/**
 * Converts one `.footballbox` element into a {@link Match}.
 *
 * Reads teams, date, kick-off time, score, goals, penalty shoot-out score and
 * venue. Fields that cannot be found are left out of the result.
 *
 * @param $ - Cheerio instance of the page being parsed.
 * @param $box - The `.footballbox` element.
 * @param round - Round label to store on the match.
 * @param group - Group label, or `null` outside group play.
 * @returns The parsed match, or `null` when either team name is missing.
 */
function parseBox(
  $: CheerioAPI,
  $box: Cheerio<Element>,
  round: string,
  group: string | null,
): Match | null {
  const home = extractTeam($, $box.find('.fhome'))
  const away = extractTeam($, $box.find('.faway'))
  if (home === '' || away === '') return null

  // Trimmed text of the first element matching `selector` inside the box.
  const firstText = (selector: string): string => $box.find(selector).first().text().trim()

  const matchDate = firstText('.bday, .dtstart') || undefined
  const kickOff = firstText('.ftime').match(/(\d{2}:\d{2})/)?.[1]

  const rawScore = firstText('.fscore')
  const wentToExtraTime = rawScore.toLowerCase().includes('a.e.t.')
  const parsedScore = parseScoreText(rawScore)

  // Only the first goals row holds match goals; a later one lists shoot-out kicks.
  const $goalRow = $box.find('tr.fgoals').first()
  const homeGoals = parseGoals($, $goalRow.find('.fhgoal'))
  const awayGoals = parseGoals($, $goalRow.find('.fagoal'))

  // The shoot-out score sits in the row following the "Penalties" header row.
  let shootout: [number, number] | undefined
  for (const row of $box.find('tr').toArray()) {
    const $row = $(row)
    if (!$row.find('th[colspan]').text().toLowerCase().includes('penalt')) continue
    const shootoutText = $row.next('tr').find('th').not('.fhome,.faway').first().text().trim()
    shootout = parseScoreText(shootoutText) ?? undefined
    break
  }

  let score: ScoreObj | undefined
  if (parsedScore) {
    if (!wentToExtraTime) {
      score = { ft: parsedScore }
    } else {
      // The displayed score includes extra time, so the full-time score is
      // rebuilt from goals up to minute 90 (goals without a minute count too).
      // NOTE(review): this credits an own goal to the opposite column's team,
      // i.e. it assumes own goals are listed under the scorer's own side —
      // confirm against the page markup.
      const within90 = (g: Goal): boolean => g.minute === undefined || g.minute <= 90
      const regularCount = (gs: Goal[]): number => gs.filter(g => !g.owngoal && within90(g)).length
      const ownGoalCount = (gs: Goal[]): number => gs.filter(g => g.owngoal === true && within90(g)).length
      const homeAt90 = regularCount(homeGoals) + ownGoalCount(awayGoals)
      const awayAt90 = regularCount(awayGoals) + ownGoalCount(homeGoals)
      score = { ft: [homeAt90, awayAt90], et: parsedScore }
    }
    if (shootout) score.p = shootout
  }

  const venue = extractGround($, $box) || undefined

  // Property order is kept stable because the object is serialised to JSON as-is.
  return {
    round,
    ...(group ? { group } : {}),
    ...(matchDate ? { date: matchDate } : {}),
    ...(kickOff ? { time: kickOff } : {}),
    team1: home,
    team2: away,
    ...(score ? { score } : {}),
    ...(homeGoals.length ? { goals1: homeGoals } : {}),
    ...(awayGoals.length ? { goals2: awayGoals } : {}),
    ...(venue ? { ground: venue } : {}),
  }
}
// ── Collect matches from a pre-loaded page ─────────────────────────────────
/**
 * Parses every `.footballbox` on an already loaded page, assigning the same
 * round and group to all of them.
 *
 * @param $ - Cheerio instance of the page.
 * @param round - Round label for every match.
 * @param group - Group label for every match, or `null`.
 * @returns The successfully parsed matches in document order.
 */
function collectBoxes(
  $: CheerioAPI,
  round: string,
  group: string | null,
): Match[] {
  const found: Match[] = []
  for (const node of $('.footballbox').toArray()) {
    const parsed = parseBox($, $(node), round, group)
    if (parsed) found.push(parsed)
  }
  return found
}
// ── Section heading state machine ──────────────────────────────────────────
/** Position of the heading walk within the tournament structure. */
type State = {
  /** True while inside a level-2 section that contains matches. */
  active: boolean
  /** Current round label; empty inside a container section until a sub-heading names the round. */
  round: string
  /** Current group heading text, or null outside group play. */
  group: string | null
}

/**
 * Maps a lower-cased, trimmed heading to a knockout round label.
 * Returns null when the heading does not name a knockout round.
 */
function knockoutRoundFor(t: string): string | null {
  if (/round of 32/.test(t)) return 'Round of 32'
  if (/round of 16/.test(t)) return 'Round of 16'
  if (/quarter.final/.test(t)) return 'Quarter-finals'
  if (/semi.final/.test(t)) return 'Semi-finals'
  if (/third.place|match for third|play.off for third/.test(t)) return 'Third-place match'
  if (t === 'final') return 'Final'
  return null
}

/**
 * Updates `state` in place for one section heading.
 *
 * Level-2 headings decide whether the section holds matches and, where the
 * heading names it, which round. Level-3/4 headings inside an active section
 * select a group ("Group A" … "Group L", or "Group 1" … "Group 9") or a
 * knockout round. Any other heading (bracket, draw, seeding, replay, …)
 * leaves the state untouched.
 *
 * @param text - Heading text without the "[edit]" suffix.
 * @param level - Heading level (2 for `<h2>`, 3 for `<h3>`, …).
 * @param state - Walk state, mutated in place.
 */
function processHeading(text: string, level: number, state: State): void {
  const t = text.toLowerCase().trim()

  if (level === 2) {
    let round: string | null
    if (t.includes('group stage') && !t.includes('second')) {
      // Also covers "First group stage".
      round = 'Group stage'
    } else if (t.includes('second group stage')) {
      round = 'Second group stage'
    } else if (t === 'final round') {
      round = 'Final round'
    } else if (t.includes('final tournament') || /knock.?out stage/.test(t)) {
      // Container section: the concrete round comes from its sub-headings.
      round = ''
    } else {
      round = knockoutRoundFor(t)
    }

    if (round === null) {
      state.active = false
      return
    }
    state.active = true
    state.round = round
    state.group = null
    return
  }

  if (!state.active) return

  if (level === 3 || level === 4) {
    // Letters go up to L so that twelve-group tournaments are recognised.
    if (/^group [a-l1-9]+$/.test(t)) {
      state.group = text.trim()
      return
    }
    const round = knockoutRoundFor(t)
    if (round !== null) {
      state.round = round
      state.group = null
    }
    // bracket, draw, seeding, replay → keep current state
  }
}
// ── Main year scraper ──────────────────────────────────────────────────────
// Everything scrapeYear collects for one tournament.
type YearResult = {
// All recorded matches: main-page matches first, then matches fetched from group sub-pages.
matches: Match[]
// Venues keyed by stadium name; the first occurrence of a name wins.
stadiums: Map<string, Stadium>
// Group name → names of the teams that appeared in that group's matches.
groups: Map<string, Set<string>>
}
/**
 * Extracts matches, stadiums and group membership for one tournament.
 *
 * The main article is walked in document order. Headings drive the
 * round/group state, every footballbox inside an active section becomes a
 * match, and "Main article" hatnotes that point to a `World_Cup_Group` page
 * are remembered per group. Groups that yielded no match on the main page are
 * then loaded from their remembered sub-page.
 *
 * @param year - Tournament year (currently unused; the page content is passed in).
 * @param mainHtml - HTML of the tournament's main article.
 * @returns The collected matches, stadiums and groups.
 */
async function scrapeYear(year: number, mainHtml: string): Promise<YearResult> {
  const $ = load(mainHtml)
  const matches: Match[] = []
  const stadiums = new Map<string, Stadium>()
  const groups = new Map<string, Set<string>>()
  const state: State = { active: false, round: '', group: null }
  // Group name → sub-page to fall back to, plus the round that was active
  // where the link was found (so the headings need not be walked again).
  const groupSubpages = new Map<string, { page: string; round: string }>()
  // Groups that got at least one match from the main page
  const groupsOnMainPage = new Set<string>()

  // Stores a match and updates the stadium and group indexes.
  function recordMatch(m: Match): void {
    matches.push(m)
    if (m.ground) {
      const { name, city } = parseGroundParts(m.ground)
      if (name && !stadiums.has(name)) stadiums.set(name, { name, city })
    }
    if (m.group) {
      groupsOnMainPage.add(m.group)
      let teams = groups.get(m.group)
      if (!teams) {
        teams = new Set()
        groups.set(m.group, teams)
      }
      teams.add(m.team1)
      teams.add(m.team2)
    }
  }

  // Walk elements in document order: headings, hatnotes, footballboxes
  $('.mw-parser-output').find('div.mw-heading, .footballbox, .hatnote').each((_, el) => {
    const $el = $(el)
    if ($el.hasClass('mw-heading')) {
      const $h = $el.find('h2, h3, h4').first()
      // A wrapper without h2–h4 gets level 9, which processHeading ignores.
      const level = parseInt($h.prop('tagName')?.slice(1) ?? '9')
      const text = $h.text().replace(/\[edit\]/g, '').trim()
      processHeading(text, level, state)
    } else if ($el.hasClass('hatnote') && $el.text().includes('Main article')) {
      // Remember the sub-page of the current group as a fallback source.
      if (!state.active || !state.group) return
      const link = $el.find('a[href^="/wiki/"]').first().attr('href')
      if (!link) return
      const page = link.replace('/wiki/', '').split('#')[0]
      if (/World_Cup_Group/i.test(page) && !groupSubpages.has(state.group)) {
        groupSubpages.set(state.group, { page, round: state.round || 'Group stage' })
      }
    } else if ($el.hasClass('footballbox')) {
      if (!state.active) return
      const round = state.round || state.group || 'Unknown'
      const m = parseBox($, $el, round, state.group)
      if (m) recordMatch(m)
    }
  })

  // Fetch group sub-pages for any group that got 0 matches from main page
  for (const [group, { page, round }] of groupSubpages) {
    if (groupsOnMainPage.has(group)) continue
    await delay(1200)
    const subHtml = await fetchWikiHtml(page)
    if (!subHtml) { process.stdout.write(`(failed: ${page}) `); continue }
    for (const m of collectBoxes(load(subHtml), round, group)) {
      recordMatch(m)
    }
    process.stdout.write(`[+${page.slice(-8)}] `)
  }

  return { matches, stadiums, groups }
}
// ── Squad page scraper ─────────────────────────────────────────────────────
/**
 * Parses a tournament's squads page into one {@link Squad} per team.
 *
 * Level-3 headings (other than "Group …" headings) open a new team. Each
 * following `tr.nat-fs-player` row adds a player to the most recent team.
 * Rows before the first team and rows without a player name are ignored.
 *
 * @param html - HTML of the squads page.
 * @returns The squads in document order.
 */
function scrapeSquads(html: string): Squad[] {
  const $ = load(html)
  const result: Squad[] = []
  let team: Squad | null = null

  const nodes = $('.mw-parser-output').find('div.mw-heading, tr.nat-fs-player').toArray()
  for (const node of nodes) {
    const $node = $(node)

    if ($node.hasClass('mw-heading')) {
      const $heading = $node.find('h3, h4').first()
      if ($heading.length === 0) continue
      const depth = parseInt($heading.prop('tagName')?.slice(1) ?? '9')
      if (depth !== 3) continue
      const title = $heading.text().replace(/\[edit\]/g, '').trim()
      if (/^group /i.test(title)) continue // skip group headers
      team = { name: title, players: [] }
      result.push(team)
      continue
    }

    if (team === null) continue

    let shirt: number | undefined
    let position: string | undefined
    let fullName = ''
    let birthDate: string | undefined

    // Cells are visited in order: the number and position cells are only
    // read while the name cell (the row header) has not been seen yet.
    $node.find('td, th[scope="row"]').toArray().forEach((cell, idx) => {
      const $cell = $(cell)
      const cellText = $cell.text().trim()

      if ($cell.is('th[scope="row"]')) {
        fullName = $cell.find('a').first().text().trim() || cellText
      } else if (fullName === '') {
        if (idx === 0) {
          const parsed = parseInt(cellText)
          if (!Number.isNaN(parsed)) shirt = parsed
        } else if (idx === 1 && !position) {
          const code = $cell.find('a').first().text().trim()
          if (['GK', 'DF', 'MF', 'FW'].includes(code)) position = code
        }
      }

      const $birth = $cell.find('.bday')
      if ($birth.length) birthDate = $birth.text().trim()
    })

    if (fullName === '') continue

    const player: Player = { name: fullName }
    if (shirt !== undefined) player.number = shirt
    if (position) player.pos = position
    if (birthDate) player.date_of_birth = birthDate
    team.players.push(player)
  }

  return result
}
// ── Output ─────────────────────────────────────────────────────────────────
/**
 * Writes the scraped data of one tournament as pretty-printed JSON files
 * into `<DATA_DIR>/<year>/`, creating the directory when needed.
 *
 * `worldcup.json` is always written. The stadiums, groups and squads files
 * are written only when there is data for them. Squads are stored as a bare
 * array, the other files wrap their list in an object.
 *
 * @param year - Tournament year, used as the directory name.
 * @param matches - Matches to store.
 * @param stadiums - Venues keyed by name.
 * @param groups - Group name → team names.
 * @param squads - Parsed squads.
 */
function writeOutput(
  year: number,
  matches: Match[],
  stadiums: Map<string, Stadium>,
  groups: Map<string, Set<string>>,
  squads: Squad[],
): void {
  const yearDir = path.join(DATA_DIR, String(year))
  mkdirSync(yearDir, { recursive: true })

  // Serialises `payload` with two-space indentation into the year directory.
  const dump = (fileName: string, payload: unknown): void => {
    writeFileSync(path.join(yearDir, fileName), JSON.stringify(payload, null, 2), 'utf-8')
  }

  dump('worldcup.json', { matches })

  if (stadiums.size > 0) {
    dump('worldcup.stadiums.json', { stadiums: [...stadiums.values()] })
  }

  const groupList: Group[] = Array.from(groups, ([name, teams]) => ({ name, teams: [...teams] }))
  if (groupList.length > 0) {
    dump('worldcup.groups.json', { groups: groupList })
  }

  if (squads.length > 0) {
    dump('worldcup.squads.json', squads)
  }
}
// ── Entry point ────────────────────────────────────────────────────────────
/**
 * Command-line entry point.
 *
 * Scrapes either the single year given as the first argument or every year
 * in `YEARS`. For each tournament it fetches the main article and the squads
 * page and writes the JSON files, pausing between requests.
 *
 * @throws Error when the year argument is not a four-digit number. Previously
 * such an argument parsed to NaN and silently triggered a scrape of all years.
 */
async function main(): Promise<void> {
  const yearArg = process.argv[2]
  let yearsToScrape = YEARS
  if (yearArg) {
    if (!/^\d{4}$/.test(yearArg)) {
      throw new Error(`Invalid year argument "${yearArg}": expected a four-digit year such as 2022`)
    }
    yearsToScrape = [Number.parseInt(yearArg, 10)]
  }

  console.log(`Scraping ${yearsToScrape.length} World Cup(s) from Wikipedia...`)
  for (const year of yearsToScrape) {
    process.stdout.write(` ${year}... `)
    const mainHtml = await fetchWikiHtml(`${year}_FIFA_World_Cup`)
    if (!mainHtml) { console.log('FAILED'); continue }
    const { matches, stadiums, groups } = await scrapeYear(year, mainHtml)
    await delay(600)
    // A missing squads page is not fatal: the year is written without squads.
    const squadHtml = await fetchWikiHtml(`${year}_FIFA_World_Cup_squads`)
    const squads = squadHtml ? scrapeSquads(squadHtml) : []
    writeOutput(year, matches, stadiums, groups, squads)
    console.log(`${matches.length} matches, ${stadiums.size} stadiums, ${groups.size} groups, ${squads.length} teams`)
    await delay(600)
  }
  console.log('\nDone! Files written to app/data/wikipedia/{year}/')
}

// Any unhandled failure is printed and reported through a non-zero exit code.
main().catch(e => { console.error(e); process.exit(1) })