refactor: extract lib/wiki-scraper.ts, make scraper composable, sync from Wikipedia

Move all scraping logic (fetchWikiHtml, scrapeYear, scrapeSquads and all
helpers) into lib/wiki-scraper.ts as exported functions shared by both scripts.

scrape-wikipedia.ts becomes a composable CLI:
  pnpm scrape [year]             — matches + squads (default)
  pnpm scrape [year] --matches   — matches/meta/stadiums only
  pnpm scrape [year] --squads    — squads only

sync.ts drops the openfootball GitHub dependency entirely and scrapes
Wikipedia directly. Incremental: completed groups (all matches have FT
scores) are detected via DB query and their sub-pages are skipped each run.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-06-15 17:23:17 +02:00
parent d1171267a8
commit f885e4312c
3 changed files with 635 additions and 757 deletions
+55 -618
View File
@@ -1,10 +1,20 @@
import { load } from 'cheerio'
import type { CheerioAPI } from 'cheerio'
import type { Cheerio } from 'cheerio'
import type { Element } from 'domhandler'
/**
* Scrape English Wikipedia for World Cup data and write JSON files to
* app/data/wikipedia/{year}/.
*
* Usage:
* pnpm scrape # all years, matches + squads
* pnpm scrape 2022 # single year, matches + squads
* pnpm scrape 2022 --matches # matches + meta + stadiums only
* pnpm scrape 2022 --squads # squads only
*/
import { mkdirSync, writeFileSync } from 'fs'
import path from 'path'
import { fileURLToPath } from 'url'
import {
fetchWikiHtml, scrapeYear, scrapeSquads,
type Match, type Stadium, type Group, type Meta, type Squad,
} from '../lib/wiki-scraper'
const __dirname = path.dirname(fileURLToPath(import.meta.url))
const DATA_DIR = path.join(__dirname, '../app/data/wikipedia')
@@ -16,646 +26,73 @@ const YEARS = [
/** Resolve after roughly `ms` milliseconds; used to pace successive requests. */
const delay = (ms: number) =>
  new Promise(resolve => {
    setTimeout(resolve, ms)
  })
// ── Types ──────────────────────────────────────────────────────────────────
// ── File output ────────────────────────────────────────────────────────────
/** One goal credited to a player, as parsed from a footballbox goal list. */
type Goal = {
// Scorer's name (link text of the player).
name: string
// Minute shown before any "+" in the goal marker.
minute?: number
// Added minutes shown after "+" (e.g. the 2 in "45+2"); left unset when zero.
offset?: number
// Set to true when the marker text contains "pen.".
penalty?: boolean
// Set to true when the marker text contains "o.g.".
owngoal?: boolean
}
/** Score of a match; every pair is ordered [team1, team2]. */
type ScoreObj = {
// Score at full time (for a.e.t. matches it is recomputed from goals scored in <= 90 min).
ft?: [number, number]
// Score after extra time; only set when the score text carries "a.e.t.".
et?: [number, number]
// Penalty shoot-out score, when a "Penalties" row is present.
p?: [number, number]
}
/** A single match extracted from one footballbox. */
type Match = {
// Round label, e.g. 'Group stage', 'Quarter-finals', 'Final'.
round: string
// Group heading text (e.g. 'Group A') when the match sits under a group section.
group?: string
// Text of the box's .bday / .dtstart element.
date?: string
// Kick-off time as HH:MM.
time?: string
// Team from the .fhome cell.
team1: string
// Team from the .faway cell.
team2: string
score?: ScoreObj
// Goals listed in the home-goals cell (.fhgoal).
goals1?: Goal[]
// Goals listed in the away-goals cell (.fagoal).
goals2?: Goal[]
// Raw venue text; split into name/city by parseGroundParts.
ground?: string
}
/** Venue name and city, split at the first comma of a match's ground text. */
type Stadium = { name: string; city: string }
/** Squad member; `pos` is one of GK / DF / MF / FW when recognised. */
type Player = { name: string; number?: number; pos?: string; date_of_birth?: string }
/** A team's squad: heading text plus its player rows. */
type Squad = { name: string; players: Player[] }
/** A group and the teams seen playing in it. */
type Group = { name: string; teams: string[] }
/** Tournament summary; placements prefer match results and fall back to the infobox. */
type Meta = {
host: string
teams_count: number | null
winner: string | null
runner_up: string | null
third_place: string | null
fourth_place: string | null
}
// ── Fetch ──────────────────────────────────────────────────────────────────
/**
 * Fetch the rendered HTML of an English Wikipedia page through the MediaWiki
 * `action=parse` API.
 *
 * Transient failures (network errors, non-2xx responses, unparsable or empty
 * bodies) are retried with a linearly growing pause of `3000 * attempt` ms.
 * A response whose `error.code` says the title is missing or invalid is
 * treated as permanent and ends the loop at once, so a page that does not
 * exist no longer costs every retry plus the full back-off.
 *
 * @param page    Page title as used in the wiki URL (e.g. `2022_FIFA_World_Cup`).
 * @param retries Maximum number of attempts.
 * @returns The page HTML, or `null` when it could not be obtained.
 */
async function fetchWikiHtml(page: string, retries = 5): Promise<string | null> {
  type ParseResponse = {
    parse?: { text?: { '*': string } }
    error?: { code?: string }
  }
  // Error codes that retrying cannot fix.
  const permanentErrors = ['missingtitle', 'invalidtitle']

  const url = `https://en.wikipedia.org/w/api.php?action=parse&page=${encodeURIComponent(page)}&format=json&prop=text&disabletoc=1`
  for (let attempt = 0; attempt < retries; attempt++) {
    try {
      if (attempt > 0) await delay(3000 * attempt)
      const res = await fetch(url, { headers: { 'User-Agent': 'WorldCupScraper/1.0 (github.com/worldcup)' } })
      if (!res.ok) continue
      const data = await res.json() as ParseResponse
      const html = data?.parse?.text?.['*']
      if (html) return html
      // The API reports failures inside the JSON body; stop early on permanent ones.
      const code = data?.error?.code
      if (code !== undefined && permanentErrors.includes(code)) return null
    } catch {
      // Network or JSON failure: fall through to the next attempt.
    }
  }
  return null
}
// ── Score parsing ──────────────────────────────────────────────────────────
/**
 * Extract the first "home SEP away" score pair from a text fragment.
 *
 * The separator may be an ASCII hyphen or any of the Unicode dashes
 * (U+2010–U+2015, which includes the en dash Wikipedia uses in scores) or the
 * minus sign U+2212. Surrounding text such as "(a.e.t.)" is ignored.
 *
 * @param text Raw score text, e.g. "2–1" or "3-3 (a.e.t.)".
 * @returns `[home, away]`, or `null` when no score pair is present.
 */
function parseScoreText(text: string): [number, number] | null {
  const m = text.match(/(\d+)\s*[-\u2010-\u2015\u2212]\s*(\d+)/)
  if (!m) return null
  return [parseInt(m[1], 10), parseInt(m[2], 10)]
}
// ── Team name extraction ───────────────────────────────────────────────────
/**
 * Return the team name shown in a footballbox team cell.
 *
 * The name is the trimmed text of the first link that contains no image and
 * has non-empty text (links wrapping only an image, such as flag icons, are
 * skipped). Returns '' when no link qualifies.
 */
function extractTeam($: CheerioAPI, $cell: Cheerio<Element>): string {
  for (const anchor of $cell.find('a').toArray()) {
    const $anchor = $(anchor)
    if ($anchor.find('img').length) continue
    const label = $anchor.text().trim()
    if (label) return label
  }
  return ''
}
// ── Goal parsing ───────────────────────────────────────────────────────────
/**
 * Parse the goal list of one side from a footballbox goals cell.
 *
 * Each <li> names a scorer (first non-empty link outside `.fb-goal`) and holds
 * a `.fb-goal` element whose direct <span> children each describe one goal.
 * A player with several goals therefore yields several entries.
 */
function parseGoals($: CheerioAPI, $td: Cheerio<Element>): Goal[] {
  const parsed: Goal[] = []

  for (const item of $td.find('li').toArray()) {
    const $item = $(item)

    // Scorer: first link with text that is not part of the goal marker itself.
    let scorer = ''
    for (const anchor of $item.find('a').toArray()) {
      const $anchor = $(anchor)
      if ($anchor.closest('.fb-goal').length) continue
      const label = $anchor.text().trim()
      if (label) {
        scorer = label
        break
      }
    }
    if (!scorer) continue

    const $marker = $item.find('.fb-goal')
    if (!$marker.length) continue

    for (const span of $marker.children('span').toArray()) {
      const $span = $(span)
      // Spans carrying a `typeof` attribute wrap the ball image, not a minute.
      if ($span.attr('typeof')) continue

      const raw = $span.text()
      const hit = raw.match(/(\d+)(?:\+(\d+))?[']/)
      if (!hit) continue

      const minute = parseInt(hit[1])
      const offset = hit[2] ? parseInt(hit[2]) : 0

      const goal: Goal = { name: scorer }
      if (!isNaN(minute)) goal.minute = minute
      if (offset) goal.offset = offset
      if (raw.includes('pen.')) goal.penalty = true
      if (raw.includes('o.g.')) goal.owngoal = true
      parsed.push(goal)
    }
  }

  return parsed
}
// ── Ground extraction ──────────────────────────────────────────────────────
/**
 * Return the venue text of a footballbox.
 *
 * Prefers the element marked `itemprop="name address"`; otherwise falls back
 * to the first line of the first `.fright` element's text.
 */
function extractGround($: CheerioAPI, $box: Cheerio<Element>): string {
  const $venue = $box.find('[itemprop="name address"]').first()
  if ($venue.length > 0) {
    return $venue.text().trim()
  }
  const fallbackText = $box.find('.fright').first().text()
  return fallbackText.split('\n')[0].trim()
}
/**
 * Split a venue string at its first comma into stadium name and city.
 *
 * Both parts are trimmed. Without a comma the whole string is returned
 * unchanged as the name and the city is ''.
 */
function parseGroundParts(ground: string): { name: string; city: string } {
  const comma = ground.indexOf(',')
  if (comma === -1) return { name: ground, city: '' }

  const name = ground.slice(0, comma).trim()
  const city = ground.slice(comma + 1).trim()
  return { name, city }
}
// ── Footballbox parsing ────────────────────────────────────────────────────
/**
 * Parse one `.footballbox` element into a Match.
 *
 * @param $     Cheerio instance for the page that contains the box.
 * @param $box  The footballbox element.
 * @param round Round label to store on the match.
 * @param group Group label, or null when the match is not in a group.
 * @returns The match, or null when either team name cannot be extracted.
 *          Optional fields are omitted from the result when empty.
 */
function parseBox(
$: CheerioAPI,
$box: Cheerio<Element>,
round: string,
group: string | null,
): Match | null {
const team1 = extractTeam($, $box.find('.fhome'))
const team2 = extractTeam($, $box.find('.faway'))
if (!team1 || !team2) return null
const dateStr = $box.find('.bday, .dtstart').first().text().trim() || undefined
const timeText = $box.find('.ftime').first().text().trim()
// Keep only the first HH:MM in the time cell.
const timeMatch = timeText.match(/(\d{2}:\d{2})/)
const timeStr = timeMatch?.[1]
const scoreText = $box.find('.fscore').first().text().trim()
const hasAET = scoreText.toLowerCase().includes('a.e.t.')
const scoreArr = parseScoreText(scoreText)
// Use first fgoals row only (exclude penalty shootout row)
const $regularRow = $box.find('tr.fgoals').first()
const goals1 = parseGoals($, $regularRow.find('.fhgoal'))
const goals2 = parseGoals($, $regularRow.find('.fagoal'))
// Penalty shootout score: row after "Penalties" header tr
let penScore: [number, number] | undefined
$box.find('tr').each((_, tr) => {
const $tr = $(tr)
if ($tr.find('th[colspan]').text().toLowerCase().includes('penalt')) {
// The score sits in the <th> of the next row that is not a team cell.
const penText = $tr.next('tr').find('th').not('.fhome,.faway').first().text().trim()
const ps = parseScoreText(penText)
if (ps) penScore = ps
// Stop at the first "Penalties" header, whether or not a score was parsed.
return false
}
})
let score: ScoreObj | undefined
if (scoreArr) {
if (hasAET) {
// scoreArr is ET total; compute FT from goals in ≤90 min
// ftGoals counts a side's own (non-o.g.) goals, or with includeOG only its own goals;
// a goal without a minute is counted as within 90 minutes.
const ftGoals = (gs: Goal[], includeOG = false) =>
gs.filter(g => {
const w90 = g.minute === undefined || g.minute <= 90
return includeOG ? g.owngoal === true && w90 : !g.owngoal && w90
}).length
// Own goals listed under one side are credited to the other side.
const ftHome = ftGoals(goals1) + ftGoals(goals2, true)
const ftAway = ftGoals(goals2) + ftGoals(goals1, true)
score = { ft: [ftHome, ftAway], et: scoreArr }
} else {
score = { ft: scoreArr }
}
if (penScore) score.p = penScore
}
const ground = extractGround($, $box) || undefined
// Conditional spreads keep absent optional fields out of the JSON output.
return {
round,
...(group ? { group } : {}),
...(dateStr ? { date: dateStr } : {}),
...(timeStr ? { time: timeStr } : {}),
team1,
team2,
...(score ? { score } : {}),
...(goals1.length ? { goals1 } : {}),
...(goals2.length ? { goals2 } : {}),
...(ground ? { ground } : {}),
}
}
// ── Collect matches from a pre-loaded page ─────────────────────────────────
/**
 * Parse every `.footballbox` on an already-loaded page, tagging each match
 * with the given round and group. Boxes that parseBox rejects are dropped.
 */
function collectBoxes(
  $: CheerioAPI,
  round: string,
  group: string | null,
): Match[] {
  return $('.footballbox')
    .toArray()
    .map(box => parseBox($, $(box), round, group))
    .filter((parsed): parsed is Match => Boolean(parsed))
}
// ── Section heading state machine ──────────────────────────────────────────
/**
 * Cursor kept while walking a tournament page's headings.
 * `active` says whether the current level-2 section contains matches;
 * `round` and `group` label the matches found under the current heading.
 */
type State = {
  active: boolean
  round: string
  group: string | null
}

/**
 * Update `state` for one section heading.
 *
 * Level-2 headings switch the walker on or off: a recognised stage heading
 * activates it and sets the round (with the group cleared); any other
 * level-2 heading deactivates it and leaves round/group untouched.
 * Level-3/4 headings are only considered while active: "Group X" sets the
 * group (original casing kept), a knockout-round heading sets the round and
 * clears the group, and anything else (bracket, draw, seeding, replay, ...)
 * leaves the state as is.
 *
 * Group labels accept letters A–L as well as digits 1–9, so tournaments with
 * twelve groups are recognised; the letter range stays bounded so that
 * "Group stage" is never mistaken for a group label.
 *
 * @param text  Heading text.
 * @param level Heading level (2 for <h2>, 3 for <h3>, ...).
 * @param state Walker state, mutated in place.
 */
function processHeading(text: string, level: number, state: State): void {
  // Lower-cased once, so the patterns below need no case-insensitive flag.
  const t = text.toLowerCase().trim()

  if (level === 2) {
    let round: string | undefined
    if (/group stage/.test(t) && !/second/.test(t)) round = 'Group stage'
    else if (/first group stage/.test(t)) round = 'Group stage'
    else if (/second group stage/.test(t)) round = 'Second group stage'
    else if (t === 'final round') round = 'Final round'
    // Container sections: the round comes from the sub-headings (or the group).
    else if (/final tournament/.test(t)) round = ''
    else if (/knock.?out stage/.test(t)) round = ''
    else if (/round of 16/.test(t)) round = 'Round of 16'
    else if (/quarter.final/.test(t)) round = 'Quarter-finals'
    else if (/semi.final/.test(t)) round = 'Semi-finals'
    else if (/third.place|match for third|play.off for third/.test(t)) round = 'Third-place match'
    else if (t === 'final') round = 'Final'

    if (round === undefined) {
      state.active = false
      return
    }
    state.active = true
    state.round = round
    state.group = null
    return
  }

  if (!state.active) return
  if (level !== 3 && level !== 4) return

  if (/^group [a-l1-9]+$/.test(t)) {
    state.group = text.trim()
    return
  }

  let knockout: string | undefined
  if (/round of 32/.test(t)) knockout = 'Round of 32'
  else if (/round of 16/.test(t)) knockout = 'Round of 16'
  else if (/quarter.final/.test(t)) knockout = 'Quarter-finals'
  else if (/semi.final/.test(t)) knockout = 'Semi-finals'
  else if (/third.place|match for third|play.off for third/.test(t)) knockout = 'Third-place match'
  else if (t === 'final') knockout = 'Final'

  if (knockout !== undefined) {
    state.round = knockout
    state.group = null
  }
}
// ── Main year scraper ──────────────────────────────────────────────────────
// ── Infobox parsing ────────────────────────────────────────────────────────
/**
 * Read tournament metadata from the first `table.infobox` on the page.
 *
 * Rows are matched on their header text: host countries, number of teams,
 * champions, runners-up, third place and fourth place. Only fields whose row
 * is present (and has a <td>) are set on the result.
 */
function parseInfobox($: CheerioAPI): Partial<Meta> {
  const meta: Partial<Meta> = {}

  // Cell text with <br> turned into " / ", footnotes/images removed, whitespace collapsed.
  const plainText = ($td: Cheerio<Element>): string => {
    const $copy = $td.clone()
    $copy.find('br').replaceWith(' / ')
    $copy.find('sup, img').remove()
    return $copy.text().replace(/\[\d+\]/g, '').replace(/\s+/g, ' ').trim()
  }

  // Text of every link in the cell, ignoring images, empty links and "[n]" citations.
  const linkLabels = ($td: Cheerio<Element>): string[] =>
    $td
      .find('a')
      .toArray()
      .map(anchor => $(anchor).clone().find('img').remove().end().text().trim())
      .filter(label => label !== '' && !/\[\d+\]/.test(label))

  // First link label, else the plain cell text, else null.
  const firstName = ($td: Cheerio<Element>): string | null => {
    const labels = linkLabels($td)
    if (labels.length > 0) return labels[0]
    return plainText($td) || null
  }

  // All link labels joined with " / ", else the plain cell text.
  const allNames = ($td: Cheerio<Element>): string => {
    const labels = linkLabels($td)
    return labels.length ? labels.join(' / ') : plainText($td)
  }

  const rows = $('table.infobox').first().find('tr').toArray()
  for (const row of rows) {
    const $row = $(row)
    const label = $row.find('th').text().trim().toLowerCase()
    const $value = $row.find('td').first()
    if (!$value.length) continue

    if (/host countr/i.test(label)) {
      meta.host = allNames($value)
    } else if (/^teams$/i.test(label)) {
      const digits = $value.text().match(/\d+/)
      if (digits) meta.teams_count = parseInt(digits[0])
    } else if (/champion/i.test(label)) {
      meta.winner = firstName($value)
    } else if (/runners?.up/i.test(label)) {
      meta.runner_up = firstName($value)
    } else if (/third.place/i.test(label)) {
      meta.third_place = firstName($value)
    } else if (/fourth.place/i.test(label)) {
      meta.fourth_place = firstName($value)
    }
  }

  return meta
}
// ── Placement derivation ───────────────────────────────────────────────────
/**
 * Derive the top four placements from the 'Final' and 'Third-place match'
 * results. A match is decided by its extra-time score if present, otherwise
 * its full-time score, and by the penalty shoot-out when those are level.
 * Placements that cannot be decided stay null; when several matches share a
 * round, the last decided one wins.
 */
function derivePlacements(matches: Match[]): Pick<Meta, 'winner' | 'runner_up' | 'third_place' | 'fourth_place'> {
  // Returns [winner, loser], or null when the match has no score or no decision.
  const decide = (m: Match): [string, string] | null => {
    const score = m.score
    if (!score) return null

    const [home, away] = score.et ?? score.ft ?? [0, 0]
    if (home > away) return [m.team1, m.team2]
    if (away > home) return [m.team2, m.team1]

    if (!score.p) return null
    const [penHome, penAway] = score.p
    if (penHome > penAway) return [m.team1, m.team2]
    if (penAway > penHome) return [m.team2, m.team1]
    return null
  }

  const placements: Pick<Meta, 'winner' | 'runner_up' | 'third_place' | 'fourth_place'> = {
    winner: null,
    runner_up: null,
    third_place: null,
    fourth_place: null,
  }

  for (const m of matches) {
    const isFinal = m.round === 'Final'
    if (!isFinal && m.round !== 'Third-place match') continue

    const outcome = decide(m)
    if (!outcome) continue

    const [victor, loser] = outcome
    if (isFinal) {
      placements.winner = victor
      placements.runner_up = loser
    } else {
      placements.third_place = victor
      placements.fourth_place = loser
    }
  }

  return placements
}
// ── Year result ────────────────────────────────────────────────────────────
/** Everything scrapeYear extracts for one tournament. */
type YearResult = {
// All recorded matches, in the order they were found.
matches: Match[]
// Stadiums keyed by name; the first venue text seen for a name is kept.
stadiums: Map<string, Stadium>
// Group name → set of teams seen in that group's matches.
groups: Map<string, Set<string>>
// Summary built from the infobox and the derived placements.
meta: Meta
}
/**
 * Scrape one tournament's matches, stadiums, groups and metadata.
 *
 * The main page is walked in document order; headings drive a small state
 * machine (processHeading) that labels each footballbox with a round and
 * group. For groups that have a "Main article" sub-page link but produced no
 * matches on the main page, the sub-page is fetched and all of its
 * footballboxes are recorded for that group.
 *
 * @param year     Tournament year (not read in this body).
 * @param mainHtml Rendered HTML of the tournament's main page.
 * @returns Matches, stadiums, groups and meta for the tournament.
 */
async function scrapeYear(year: number, mainHtml: string): Promise<YearResult> {
const $ = load(mainHtml)
const matches: Match[] = []
const stadiums = new Map<string, Stadium>()
const groups = new Map<string, Set<string>>()
const state: State = { active: false, round: '', group: null }
// Maps group name → sub-page to fetch (if main page has no matches for that group)
const groupSubpages = new Map<string, string>()
// Groups that got at least one match from the main page
const groupsOnMainPage = new Set<string>()
// Store a match and update the stadium / group indexes derived from it.
// Note: sub-page matches also pass through here, so they add to groupsOnMainPage too.
function recordMatch(m: Match) {
matches.push(m)
if (m.group) groupsOnMainPage.add(m.group)
if (m.ground) {
const { name, city } = parseGroundParts(m.ground)
if (name && !stadiums.has(name)) stadiums.set(name, { name, city })
}
if (m.group) {
if (!groups.has(m.group)) groups.set(m.group, new Set())
groups.get(m.group)!.add(m.team1)
groups.get(m.group)!.add(m.team2)
}
}
// Walk elements in document order: headings, hatnotes, footballboxes
$('.mw-parser-output').find('div.mw-heading, .footballbox, .hatnote').each((_, el) => {
const $el = $(el)
if ($el.hasClass('mw-heading')) {
const $h = $el.find('h2, h3, h4').first()
// Heading level from the tag name ("H3" → 3); 9 when no h2–h4 is found.
const level = parseInt($h.prop('tagName')?.slice(1) ?? '9')
const text = $h.text().replace(/\[edit\]/g, '').trim()
processHeading(text, level, state)
} else if ($el.hasClass('hatnote') && $el.text().includes('Main article')) {
// Record sub-page link for current group context (for fallback if no main-page matches)
if (state.active && state.group) {
const link = $el.find('a[href^="/wiki/"]').first().attr('href')
if (link) {
// Strip the "/wiki/" prefix and any "#fragment" to get the page title.
const page = link.replace('/wiki/', '').split('#')[0]
if (/World_Cup_Group/i.test(page) && !groupSubpages.has(state.group)) {
groupSubpages.set(state.group, page)
}
}
}
} else if ($el.hasClass('footballbox')) {
if (!state.active) return
// Container sections leave round empty; fall back to the group label.
const round = state.round || state.group || 'Unknown'
const m = parseBox($, $el, round, state.group)
if (m) recordMatch(m)
}
})
// Fetch group sub-pages for any group that got 0 matches from main page
for (const [group, page] of groupSubpages) {
if (groupsOnMainPage.has(group)) continue
await delay(1200)
const subHtml = await fetchWikiHtml(page)
if (!subHtml) { process.stdout.write(`(failed: ${page}) `); continue }
// The round that was active when this group's heading was seen is not kept
// from the first walk, so the headings are re-walked with a fresh state
// until this group is reached. Defaults to 'Group stage'.
let round = 'Group stage'
// NOTE(review): foundGroup is assigned but never read.
let foundGroup = false
const stateTemp: State = { active: false, round: '', group: null }
$('.mw-parser-output').find('div.mw-heading').each((_, el) => {
const $h = $(el).find('h2, h3, h4').first()
const level = parseInt($h.prop('tagName')?.slice(1) ?? '9')
const text = $h.text().replace(/\[edit\]/g, '').trim()
processHeading(text, level, stateTemp)
if (stateTemp.group === group) {
round = stateTemp.round || 'Group stage'
foundGroup = true
return false
}
})
const $sub = load(subHtml)
const subMatches = collectBoxes($sub, round || 'Group stage', group)
for (const m of subMatches) {
recordMatch(m)
}
// Progress marker: last 8 characters of the sub-page title.
process.stdout.write(`[+${page.slice(-8)}] `)
}
const infobox = parseInfobox($)
const placements = derivePlacements(matches)
// Placements derived from match results take precedence over the infobox.
const meta: Meta = {
host: infobox.host ?? '',
teams_count: infobox.teams_count ?? null,
winner: placements.winner ?? infobox.winner ?? null,
runner_up: placements.runner_up ?? infobox.runner_up ?? null,
third_place: placements.third_place ?? infobox.third_place ?? null,
fourth_place:placements.fourth_place?? infobox.fourth_place?? null,
}
return { matches, stadiums, groups, meta }
}
// ── Squad page scraper ─────────────────────────────────────────────────────
/**
 * Parse a tournament squads page into one Squad per team.
 *
 * Level-3 headings (other than "Group ..." headings) start a new team; each
 * following `tr.nat-fs-player` row adds a player to it. Teams that end up
 * with no players are dropped from the result.
 *
 * @param html Rendered HTML of the squads page.
 * @returns Squads in document order.
 */
function scrapeSquads(html: string): Squad[] {
const $ = load(html)
const squads: Squad[] = []
let currentTeam: Squad | null = null
$('.mw-parser-output').find('div.mw-heading, tr.nat-fs-player').each((_, el) => {
const $el = $(el)
if ($el.hasClass('mw-heading')) {
const $h = $el.find('h3, h4').first()
if (!$h.length) return
const level = parseInt($h.prop('tagName')?.slice(1) ?? '9')
// Only level-3 headings name a team; level-4 headings are ignored.
if (level !== 3) return
const name = $h.text().replace(/\[edit\]/g, '').trim()
if (/^group /i.test(name)) return // skip group headers
currentTeam = { name, players: [] }
squads.push(currentTeam)
return
}
// Player rows before the first team heading are ignored.
if (!currentTeam) return
let number: number | undefined
let pos: string | undefined
let playerName = ''
let dob: string | undefined
// i is the index among the row's matched cells (<td> and the row-header <th>) in document order.
$el.find('td, th[scope="row"]').each((i, td) => {
const $td = $(td)
const text = $td.text().trim()
if ($td.is('th[scope="row"]')) {
// The row header holds the player's name; prefer its first link's text.
playerName = $td.find('a').first().text().trim() || text
} else if (i === 0 && !playerName) {
// First cell, before the name has been seen: shirt number.
const n = parseInt(text)
if (!isNaN(n)) number = n
} else if (i === 1 && !playerName && !pos) {
// Second cell, before the name has been seen: position code from its first link.
const posLink = $td.find('a').first().text().trim()
if (['GK', 'DF', 'MF', 'FW'].includes(posLink)) pos = posLink
}
// Any cell may carry the date of birth in a .bday element; the last one found wins.
const $bday = $td.find('.bday')
if ($bday.length) dob = $bday.text().trim()
})
if (!playerName) return
const player: Player = { name: playerName }
if (number !== undefined) player.number = number
if (pos) player.pos = pos
if (dob) player.date_of_birth = dob
currentTeam.players.push(player)
})
return squads.filter(s => s.players.length > 0)
}
// ── Output ─────────────────────────────────────────────────────────────────
function writeOutput(
function writeMatches(
year: number,
matches: Match[],
stadiums: Map<string, Stadium>,
groups: Map<string, Set<string>>,
squads: Squad[],
meta: Meta,
): void {
const dir = path.join(DATA_DIR, String(year))
mkdirSync(dir, { recursive: true })
writeFileSync(
path.join(dir, 'worldcup.meta.json'),
JSON.stringify(meta, null, 2),
'utf-8',
)
writeFileSync(path.join(dir, 'worldcup.meta.json'), JSON.stringify(meta, null, 2), 'utf-8')
writeFileSync(path.join(dir, 'worldcup.json'), JSON.stringify({ matches }, null, 2), 'utf-8')
writeFileSync(
path.join(dir, 'worldcup.json'),
JSON.stringify({ matches }, null, 2),
'utf-8',
)
if (stadiums.size > 0) {
writeFileSync(
path.join(dir, 'worldcup.stadiums.json'),
JSON.stringify({ stadiums: Array.from(stadiums.values()) }, null, 2),
'utf-8',
)
}
if (stadiums.size > 0)
writeFileSync(path.join(dir, 'worldcup.stadiums.json'),
JSON.stringify({ stadiums: Array.from(stadiums.values()) }, null, 2), 'utf-8')
const groupList: Group[] = []
groups.forEach((teams, name) => {
groupList.push({ name, teams: Array.from(teams) })
})
if (groupList.length > 0) {
writeFileSync(
path.join(dir, 'worldcup.groups.json'),
JSON.stringify({ groups: groupList }, null, 2),
'utf-8',
)
}
groups.forEach((teams, name) => groupList.push({ name, teams: Array.from(teams) }))
if (groupList.length > 0)
writeFileSync(path.join(dir, 'worldcup.groups.json'),
JSON.stringify({ groups: groupList }, null, 2), 'utf-8')
}
if (squads.length > 0) {
writeFileSync(
path.join(dir, 'worldcup.squads.json'),
JSON.stringify(squads, null, 2),
'utf-8',
)
}
/**
 * Write `worldcup.squads.json` for a year under DATA_DIR, creating the year
 * directory if needed. Does nothing when there are no squads.
 */
function writeSquads(year: number, squads: Squad[]): void {
  if (!squads.length) return

  const yearDir = path.join(DATA_DIR, String(year))
  mkdirSync(yearDir, { recursive: true })

  const target = path.join(yearDir, 'worldcup.squads.json')
  const payload = JSON.stringify(squads, null, 2)
  writeFileSync(target, payload, 'utf-8')
}
// ── Entry point ────────────────────────────────────────────────────────────
async function main() {
const onlyYear = process.argv[2] ? parseInt(process.argv[2]) : null
const yearsToScrape = onlyYear ? [onlyYear] : YEARS
const args = process.argv.slice(2)
const yearArg = args.find(a => /^\d{4}$/.test(a))
const doMatches = args.includes('--matches') || !args.includes('--squads')
const doSquads = args.includes('--squads') || !args.includes('--matches')
console.log(`Scraping ${yearsToScrape.length} World Cup(s) from Wikipedia...`)
const yearsToScrape = yearArg ? [parseInt(yearArg)] : YEARS
const modeLabel = [doMatches && 'matches', doSquads && 'squads'].filter(Boolean).join(' + ')
console.log(`Scraping ${yearsToScrape.length} World Cup(s) from Wikipedia [${modeLabel}]...`)
for (const year of yearsToScrape) {
process.stdout.write(` ${year}... `)
const mainHtml = await fetchWikiHtml(`${year}_FIFA_World_Cup`)
if (!mainHtml) { console.log('FAILED'); continue }
if (doMatches) {
const mainHtml = await fetchWikiHtml(`${year}_FIFA_World_Cup`)
if (!mainHtml) { console.log('FAILED (main page)'); continue }
const { matches, stadiums, groups, meta } = await scrapeYear(year, mainHtml)
writeMatches(year, matches, stadiums, groups, meta)
process.stdout.write(`${matches.length} matches`)
await delay(600)
}
const { matches, stadiums, groups, meta } = await scrapeYear(year, mainHtml)
if (doSquads) {
const squadHtml = await fetchWikiHtml(`${year}_FIFA_World_Cup_squads`)
const squads = squadHtml ? scrapeSquads(squadHtml) : []
writeSquads(year, squads)
process.stdout.write(`${doMatches ? ', ' : ''}${squads.length} squads`)
await delay(600)
}
await delay(600)
const squadHtml = await fetchWikiHtml(`${year}_FIFA_World_Cup_squads`)
const squads = squadHtml ? scrapeSquads(squadHtml) : []
writeOutput(year, matches, stadiums, groups, squads, meta)
console.log(`${matches.length} matches, ${stadiums.size} stadiums, ${groups.size} groups, ${squads.length} teams`)
await delay(600)
console.log()
}
console.log('\nDone! Files written to app/data/wikipedia/{year}/')