refactor: extract lib/wiki-scraper.ts, make scraper composable, sync from Wikipedia

Move all scraping logic (fetchWikiHtml, scrapeYear, scrapeSquads and all
helpers) into lib/wiki-scraper.ts as exported functions shared by both scripts.

scrape-wikipedia.ts becomes a composable CLI:
  pnpm scrape [year]             — matches + squads (default)
  pnpm scrape [year] --matches   — matches/meta/stadiums only
  pnpm scrape [year] --squads    — squads only

sync.ts drops the openfootball GitHub dependency entirely and scrapes
Wikipedia directly. Incremental: completed groups (all matches have FT
scores) are detected via DB query and their sub-pages are skipped each run.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-06-15 17:23:17 +02:00
parent d1171267a8
commit f885e4312c
3 changed files with 635 additions and 757 deletions
+467
View File
@@ -0,0 +1,467 @@
import { load } from 'cheerio'
import type { CheerioAPI, Cheerio } from 'cheerio'
import type { Element } from 'domhandler'
// ── Types ──────────────────────────────────────────────────────────────────
/** One goal credited to a scorer, as built by `parseGoals`. */
export type Goal = {
// Scorer name: text of the first non-empty link outside the `.fb-goal` marker.
name: string
// Minute number in front of any `+N` suffix.
minute?: number
// The `+N` part of a minute such as `45+2`; left out when absent or zero.
offset?: number
// Set to true when the goal text contains `pen.`.
penalty?: boolean
// Set to true when the goal text contains `o.g.`.
owngoal?: boolean
}
/** Score of one match as [team1, team2] pairs; `parseBox` fills only the parts it finds. */
export type ScoreObj = {
// Displayed score, or for an a.e.t. match the score rebuilt from goals up to minute 90.
ft?: [number, number]
// Displayed score of a match marked `a.e.t.`.
et?: [number, number]
// Score read from the row that follows the "Penalties" header row.
p?: [number, number]
}
/** One parsed football box. Optional keys are omitted rather than set to empty values. */
export type Match = {
round: string
group?: string
// Text of the box's `.bday` / `.dtstart` element.
date?: string
// First `HH:MM` found in the `.ftime` cell.
time?: string
team1: string
team2: string
score?: ScoreObj
goals1?: Goal[]
goals2?: Goal[]
// Raw venue text; `parseGroundParts` splits it into name and city.
ground?: string
}
/** Venue split at the first comma of a `Match.ground` string. */
export type Stadium = { name: string; city: string }
// NOTE(review): not referenced by the code visible in this module; presumably the
// serialised form of a group used by the calling scripts — confirm against callers.
export type Group = { name: string; teams: string[] }
/** Tournament summary assembled by `scrapeYear` from the infobox and the derived placements. */
export type Meta = {
host: string
teams_count: number | null
winner: string | null
runner_up: string | null
third_place: string | null
fourth_place: string | null
}
/** One squad-table row as read by `scrapeSquads`; `pos` is one of GK, DF, MF, FW when present. */
export type Player = { name: string; number?: number; pos?: string; date_of_birth?: string }
/** A team heading together with the player rows that follow it. */
export type Squad = { name: string; players: Player[] }
/** Everything `scrapeYear` returns for one tournament page. */
export type YearResult = {
matches: Match[]
// Keyed by stadium name; the first city seen for a name is kept.
stadiums: Map<string, Stadium>
// Group name -> team names seen in that group's matches.
groups: Map<string, Set<string>>
meta: Meta
}
// Cursor mutated by `processHeading` while headings are walked in document order.
type State = { active: boolean; round: string; group: string | null }
// ── Fetch ──────────────────────────────────────────────────────────────────
/** Resolves (with no value) once `ms` milliseconds have elapsed. */
const delay = (ms: number) =>
  new Promise(resolve => {
    setTimeout(resolve, ms)
  })
/**
 * Fetches the rendered HTML of an English Wikipedia page through the
 * MediaWiki `action=parse` API.
 *
 * Network errors, non-2xx responses and bodies without parsed text are
 * retried after a pause of 3 s × attempt number. An API error saying the
 * title itself is missing or invalid cannot be cured by retrying, so it ends
 * the loop at once instead of sleeping through every remaining attempt.
 *
 * @param page    Page title (e.g. `2022_FIFA_World_Cup`); URL-encoded here.
 * @param retries Maximum number of attempts.
 * @returns The page HTML, or `null` when it could not be obtained.
 */
export async function fetchWikiHtml(page: string, retries = 5): Promise<string | null> {
  const url = `https://en.wikipedia.org/w/api.php?action=parse&page=${encodeURIComponent(page)}&format=json&prop=text&disabletoc=1`
  for (let attempt = 0; attempt < retries; attempt++) {
    try {
      if (attempt > 0) await delay(3000 * attempt)
      const res = await fetch(url, { headers: { 'User-Agent': 'WorldCupScraper/1.0' } })
      if (!res.ok) continue
      const data = await res.json() as {
        parse?: { text?: { '*': string } }
        error?: { code?: string }
      }
      const html = data?.parse?.text?.['*']
      if (html) return html
      // The API reports a missing page as HTTP 200 plus an `error` object.
      const code = data?.error?.code
      if (code === 'missingtitle' || code === 'invalidtitle') return null
    } catch {
      // Network or JSON failure: fall through to the next attempt.
    }
  }
  return null
}
// ── Parsing helpers ────────────────────────────────────────────────────────
/**
 * Extracts the first `N<dash>M` pair from a score cell.
 *
 * Scores are usually typeset with an en dash ("2–1"), so the separator
 * accepts the ASCII hyphen, the Unicode dashes U+2010–U+2015 and the minus
 * sign U+2212. Surrounding text such as "(a.e.t.)" is ignored.
 *
 * @returns `[left, right]` as integers, or `null` when no score is present.
 */
function parseScoreText(text: string): [number, number] | null {
  const m = text.match(/(\d+)\s*[-\u2010-\u2015\u2212]\s*(\d+)/)
  if (!m) return null
  return [Number.parseInt(m[1], 10), Number.parseInt(m[2], 10)]
}
/**
 * Returns the team name shown in a football-box team cell: the trimmed text
 * of the first link that has text and does not wrap an image (image links
 * are skipped). Yields `''` when no such link exists.
 */
function extractTeam($: CheerioAPI, $cell: Cheerio<Element>): string {
  for (const anchor of $cell.find('a').toArray()) {
    const $anchor = $(anchor)
    const label = $anchor.text().trim()
    if ($anchor.find('img').length === 0 && label) return label
  }
  return ''
}
/**
 * Reads the goal list of one side of a football box.
 *
 * Every `<li>` names one scorer (first non-empty link outside `.fb-goal`);
 * each direct `<span>` child of its `.fb-goal` marker that carries a minute
 * such as `23'` or `45+2'` becomes one `Goal`. Spans with a `typeof`
 * attribute are skipped, as are items without a scorer or marker.
 */
function parseGoals($: CheerioAPI, $td: Cheerio<Element>): Goal[] {
  const goals: Goal[] = []

  for (const item of $td.find('li').toArray()) {
    const $item = $(item)

    let scorer = ''
    for (const link of $item.find('a').toArray()) {
      const $link = $(link)
      if ($link.closest('.fb-goal').length) continue
      const label = $link.text().trim()
      if (label) {
        scorer = label
        break
      }
    }
    if (!scorer) continue

    const $marker = $item.find('.fb-goal')
    if (!$marker.length) continue

    for (const span of $marker.children('span').toArray()) {
      const $span = $(span)
      if ($span.attr('typeof')) continue

      const text = $span.text()
      const timing = text.match(/(\d+)(?:\+(\d+))?[']/)
      if (!timing) continue

      const minute = parseInt(timing[1])
      const offset = timing[2] ? parseInt(timing[2]) : 0

      const goal: Goal = { name: scorer }
      if (!isNaN(minute)) goal.minute = minute
      if (offset) goal.offset = offset
      if (text.includes('pen.')) goal.penalty = true
      if (text.includes('o.g.')) goal.owngoal = true
      goals.push(goal)
    }
  }

  return goals
}
/**
 * Venue text of a football box: the `itemprop="name address"` element when
 * present, otherwise the first line of the first `.fright` element.
 */
function extractGround($: CheerioAPI, $box: Cheerio<Element>): string {
  const $address = $box.find('[itemprop="name address"]').first()
  if ($address.length > 0) {
    return $address.text().trim()
  }
  const fallbackLines = $box.find('.fright').first().text().split('\n')
  return fallbackLines[0].trim()
}
/**
 * Splits "Stadium, City" at the first comma into trimmed name and city.
 * Without a comma the whole string is returned as the name, untouched, and
 * the city is empty.
 */
function parseGroundParts(ground: string): { name: string; city: string } {
  const separator = ground.indexOf(',')
  if (separator === -1) {
    return { name: ground, city: '' }
  }
  const name = ground.slice(0, separator).trim()
  const city = ground.slice(separator + 1).trim()
  return { name, city }
}
/**
 * Turns one `.footballbox` element into a `Match`.
 *
 * Returns `null` when either team name is missing. For a score marked
 * `a.e.t.` the displayed score is stored as `et` and `ft` is rebuilt from
 * the goals scored up to minute 90 (own goals count for the opponent; goals
 * without a minute count as regular time). Only the first `tr.fgoals` row
 * is read for goals. A shoot-out score is taken from the row following the
 * "Penalties" header row. Empty optional fields are left off the result.
 */
function parseBox($: CheerioAPI, $box: Cheerio<Element>, round: string, group: string | null): Match | null {
  const team1 = extractTeam($, $box.find('.fhome'))
  const team2 = extractTeam($, $box.find('.faway'))
  if (!team1 || !team2) return null

  const date = $box.find('.bday, .dtstart').first().text().trim() || undefined
  const time = $box.find('.ftime').first().text().trim().match(/(\d{2}:\d{2})/)?.[1]

  const rawScore = $box.find('.fscore').first().text().trim()
  const afterExtraTime = rawScore.toLowerCase().includes('a.e.t.')
  const displayed = parseScoreText(rawScore)

  const $goalsRow = $box.find('tr.fgoals').first()
  const goals1 = parseGoals($, $goalsRow.find('.fhgoal'))
  const goals2 = parseGoals($, $goalsRow.find('.fagoal'))

  // Only the first "Penalties" header row is considered, parsed or not.
  let shootout: [number, number] | undefined
  for (const row of $box.find('tr').toArray()) {
    const $row = $(row)
    if (!$row.find('th[colspan]').text().toLowerCase().includes('penalt')) continue
    const cellText = $row.next('tr').find('th').not('.fhome,.faway').first().text().trim()
    shootout = parseScoreText(cellText) ?? undefined
    break
  }

  let score: ScoreObj | undefined
  if (displayed) {
    if (afterExtraTime) {
      const inRegularTime = (g: Goal) => g.minute === undefined || g.minute <= 90
      const scoredBy = (gs: Goal[]) => gs.filter(g => !g.owngoal && inRegularTime(g)).length
      const ownGoalsBy = (gs: Goal[]) => gs.filter(g => g.owngoal === true && inRegularTime(g)).length
      score = {
        ft: [scoredBy(goals1) + ownGoalsBy(goals2), scoredBy(goals2) + ownGoalsBy(goals1)],
        et: displayed,
      }
    } else {
      score = { ft: displayed }
    }
    if (shootout) score.p = shootout
  }

  const ground = extractGround($, $box) || undefined

  // Key order is kept stable because the result is serialised by callers.
  return {
    round,
    ...(group ? { group } : {}),
    ...(date ? { date } : {}),
    ...(time ? { time } : {}),
    team1,
    team2,
    ...(score ? { score } : {}),
    ...(goals1.length ? { goals1 } : {}),
    ...(goals2.length ? { goals2 } : {}),
    ...(ground ? { ground } : {}),
  }
}
/**
 * Parses every `.footballbox` of a loaded page with the same round and
 * group, dropping boxes that `parseBox` rejects.
 */
function collectBoxes($: CheerioAPI, round: string, group: string | null): Match[] {
  return $('.footballbox')
    .toArray()
    .map(box => parseBox($, $(box), round, group))
    .filter((parsed): parsed is Match => parsed !== null)
}
/** Knock-out round headings, tried in order against the lower-cased heading text. */
const KNOCKOUT_ROUND_PATTERNS: ReadonlyArray<readonly [RegExp, string]> = [
  [/round of 32/, 'Round of 32'],
  [/round of 16/, 'Round of 16'],
  [/quarter.final/, 'Quarter-finals'],
  [/semi.final/, 'Semi-finals'],
  [/third.place|match for third|play.off for third/, 'Third-place match'],
  [/^final$/, 'Final'],
]

/** Canonical knock-out round for a lower-cased heading, or `null` when it is not one. */
function knockoutRoundFor(heading: string): string | null {
  for (const [pattern, round] of KNOCKOUT_ROUND_PATTERNS) {
    if (pattern.test(heading)) return round
  }
  return null
}

/**
 * Advances the section cursor for one heading met while walking a page.
 *
 * Level 2 headings decide whether the section holds matches at all: a
 * recognised stage activates the cursor, sets its round and clears the
 * group; anything else deactivates it, leaving round and group untouched.
 * Level 3/4 headings are honoured only while active: "Group X" sets the
 * group, a knock-out round name sets the round and clears the group, and
 * every other heading (bracket, draw, replay, …) keeps the current state.
 *
 * Group headings may be a single letter (so Groups I–L of a 48-team
 * tournament are recognised) or a number of any length.
 *
 * @param text  Heading text without the "[edit]" suffix.
 * @param level Heading level (2 for `<h2>`, 3 for `<h3>`, …).
 * @param state Cursor to mutate in place.
 */
function processHeading(text: string, level: number, state: State): void {
  const t = text.toLowerCase().trim()

  if (level === 2) {
    let round: string | null
    if (/group stage/.test(t) && !/second/.test(t)) round = 'Group stage'
    else if (/second group stage/.test(t)) round = 'Second group stage'
    else if (t === 'final round') round = 'Final round'
    // Container sections: the round comes from the sub-headings below them.
    else if (/final tournament|knock.?out stage/.test(t)) round = ''
    else round = knockoutRoundFor(t)

    if (round === null) {
      state.active = false
      return
    }
    state.active = true
    state.round = round
    state.group = null
    return
  }

  if (!state.active || (level !== 3 && level !== 4)) return

  if (/^group (?:[a-z]|\d+)$/.test(t)) {
    state.group = text.trim()
    return
  }
  const round = knockoutRoundFor(t)
  if (round !== null) {
    state.round = round
    state.group = null
  }
}
// ── Infobox ────────────────────────────────────────────────────────────────
/**
 * Reads host, team count and the four placements from the first
 * `table.infobox` of the page. Rows are matched on the lower-cased header
 * text; rows without a `<td>` are ignored. Only fields that were found are
 * set on the result.
 */
function parseInfobox($: CheerioAPI): Partial<Meta> {
  const meta: Partial<Meta> = {}

  /** Cell text with `<br>` shown as " / ", footnotes and images removed, whitespace collapsed. */
  const plainText = ($cell: Cheerio<Element>): string => {
    const $copy = $cell.clone()
    $copy.find('br').replaceWith(' / ')
    $copy.find('sup, img').remove()
    return $copy.text().replace(/\[\d+\]/g, '').replace(/\s+/g, ' ').trim()
  }

  /** Non-empty link texts of a cell (images stripped), footnote links excluded. */
  const linkLabels = ($cell: Cheerio<Element>): string[] =>
    $cell
      .find('a')
      .toArray()
      .map(anchor => $(anchor).clone().find('img').remove().end().text().trim())
      .filter(label => label !== '' && !/\[\d+\]/.test(label))

  const firstName = ($cell: Cheerio<Element>): string | null => {
    const labels = linkLabels($cell)
    return labels.length > 0 ? labels[0] : (plainText($cell) || null)
  }

  const allNames = ($cell: Cheerio<Element>): string => {
    const labels = linkLabels($cell)
    return labels.length > 0 ? labels.join(' / ') : plainText($cell)
  }

  for (const row of $('table.infobox').first().find('tr').toArray()) {
    const $row = $(row)
    const label = $row.find('th').text().trim().toLowerCase()
    const $value = $row.find('td').first()
    if (!$value.length) continue

    if (/host countr/i.test(label)) {
      meta.host = allNames($value)
    } else if (/^teams$/i.test(label)) {
      const digits = $value.text().match(/\d+/)
      if (digits) meta.teams_count = parseInt(digits[0])
    } else if (/champion/i.test(label)) {
      meta.winner = firstName($value)
    } else if (/runners?.up/i.test(label)) {
      meta.runner_up = firstName($value)
    } else if (/third.place/i.test(label)) {
      meta.third_place = firstName($value)
    } else if (/fourth.place/i.test(label)) {
      meta.fourth_place = firstName($value)
    }
  }

  return meta
}
/**
 * Derives the top four from the matches whose round is exactly `Final` or
 * `Third-place match`. The extra-time score takes precedence over full
 * time; a level score is settled by the shoot-out when one is recorded.
 * Undecided or unscored matches leave the placements untouched, and a later
 * decided match of the same round overwrites an earlier one.
 */
function derivePlacements(matches: Match[]): Pick<Meta, 'winner' | 'runner_up' | 'third_place' | 'fourth_place'> {
  /** Returns [winner, loser], or null when the match has no decisive score. */
  const decide = (m: Match): [string, string] | null => {
    const { score } = m
    if (!score) return null

    const [home, away] = score.et ?? score.ft ?? [0, 0]
    if (home > away) return [m.team1, m.team2]
    if (away > home) return [m.team2, m.team1]

    if (score.p) {
      const [penHome, penAway] = score.p
      if (penHome > penAway) return [m.team1, m.team2]
      if (penAway > penHome) return [m.team2, m.team1]
    }
    return null
  }

  let winner: string | null = null
  let runner_up: string | null = null
  let third_place: string | null = null
  let fourth_place: string | null = null

  for (const m of matches) {
    const isFinal = m.round === 'Final'
    if (!isFinal && m.round !== 'Third-place match') continue

    const outcome = decide(m)
    if (!outcome) continue

    if (isFinal) {
      winner = outcome[0]
      runner_up = outcome[1]
    } else {
      third_place = outcome[0]
      fourth_place = outcome[1]
    }
  }

  return { winner, runner_up, third_place, fourth_place }
}
// ── Main year scraper ──────────────────────────────────────────────────────
/**
 * Scrapes one tournament from its already-fetched main page.
 *
 * Headings, "Main article" hatnotes and football boxes are walked in
 * document order: headings drive the round/group cursor, hatnotes remember
 * the sub-page of the current group, and boxes inside an active section
 * become matches. Groups that produced no match on the main page are then
 * fetched from their sub-page (after a 1.2 s pause each), unless listed in
 * `opts.skipGroups`. Progress markers are written to stdout.
 *
 * @param year     Tournament year; not read by this function body.
 * @param mainHtml HTML of the tournament's main page.
 * @param opts     `skipGroups`: group names whose sub-pages must not be fetched.
 * @returns Matches, stadiums by name, teams by group, and tournament meta
 *          (placements derived from matches win over infobox values).
 */
export async function scrapeYear(
  year: number,
  mainHtml: string,
  opts?: { skipGroups?: Set<string> },
): Promise<YearResult> {
  const $ = load(mainHtml)
  const matches: Match[] = []
  const stadiums = new Map<string, Stadium>()
  const groups = new Map<string, Set<string>>()
  const groupSubpages = new Map<string, string>()
  const groupsOnMainPage = new Set<string>()

  /** Level and cleaned title of the heading wrapped by a `div.mw-heading`. */
  const readHeading = (wrapper: Element) => {
    const $h = $(wrapper).find('h2, h3, h4').first()
    const level = parseInt($h.prop('tagName')?.slice(1) ?? '9')
    const title = $h.text().replace(/\[edit\]/g, '').trim()
    return { level, title }
  }

  const recordMatch = (m: Match): void => {
    matches.push(m)
    if (m.group) {
      groupsOnMainPage.add(m.group)
      let members = groups.get(m.group)
      if (!members) {
        members = new Set()
        groups.set(m.group, members)
      }
      members.add(m.team1)
      members.add(m.team2)
    }
    if (m.ground) {
      const { name, city } = parseGroundParts(m.ground)
      if (name && !stadiums.has(name)) stadiums.set(name, { name, city })
    }
  }

  const cursor: State = { active: false, round: '', group: null }
  const walk = $('.mw-parser-output').find('div.mw-heading, .footballbox, .hatnote').toArray()
  for (const node of walk) {
    const $node = $(node)
    if ($node.hasClass('mw-heading')) {
      const { level, title } = readHeading(node)
      processHeading(title, level, cursor)
    } else if ($node.hasClass('hatnote') && $node.text().includes('Main article')) {
      if (!cursor.active || !cursor.group) continue
      const href = $node.find('a[href^="/wiki/"]').first().attr('href')
      if (!href) continue
      const page = href.replace('/wiki/', '').split('#')[0]
      // Keep only the first sub-page seen for a group.
      if (/World_Cup_Group/i.test(page) && !groupSubpages.has(cursor.group)) {
        groupSubpages.set(cursor.group, page)
      }
    } else if ($node.hasClass('footballbox')) {
      if (!cursor.active) continue
      const parsed = parseBox($, $node, cursor.round || cursor.group || 'Unknown', cursor.group)
      if (parsed) recordMatch(parsed)
    }
  }

  for (const [group, page] of groupSubpages) {
    if (groupsOnMainPage.has(group)) continue
    if (opts?.skipGroups?.has(group)) {
      process.stdout.write(`[skip ${group}] `)
      continue
    }

    await delay(1200)
    const subHtml = await fetchWikiHtml(page)
    if (!subHtml) {
      process.stdout.write(`(failed: ${page}) `)
      continue
    }

    // Replay the main page's headings to learn which round this group sits in.
    const probe: State = { active: false, round: '', group: null }
    let round = 'Group stage'
    for (const wrapper of $('.mw-parser-output').find('div.mw-heading').toArray()) {
      const { level, title } = readHeading(wrapper)
      processHeading(title, level, probe)
      if (probe.group === group) {
        round = probe.round || 'Group stage'
        break
      }
    }

    const $sub = load(subHtml)
    for (const m of collectBoxes($sub, round, group)) recordMatch(m)
    process.stdout.write(`[+${page.slice(-8)}] `)
  }

  const infobox = parseInfobox($)
  const placements = derivePlacements(matches)
  const meta: Meta = {
    host: infobox.host ?? '',
    teams_count: infobox.teams_count ?? null,
    winner: placements.winner ?? infobox.winner ?? null,
    runner_up: placements.runner_up ?? infobox.runner_up ?? null,
    third_place: placements.third_place ?? infobox.third_place ?? null,
    fourth_place: placements.fourth_place ?? infobox.fourth_place ?? null,
  }

  return { matches, stadiums, groups, meta }
}
// ── Squad page scraper ─────────────────────────────────────────────────────
/**
 * Extracts squads from a squads page.
 *
 * Level-3 headings (other than "Group …") start a new team; every
 * following `tr.nat-fs-player` row adds a player to it. Within a row the
 * first cell before the name supplies the shirt number, the second the
 * position (GK/DF/MF/FW link text), the `th[scope="row"]` the name, and any
 * `.bday` element the date of birth. Teams without players are dropped.
 */
export function scrapeSquads(html: string): Squad[] {
  const $ = load(html)
  const squads: Squad[] = []
  let current: Squad | null = null

  for (const node of $('.mw-parser-output').find('div.mw-heading, tr.nat-fs-player').toArray()) {
    const $node = $(node)

    if ($node.hasClass('mw-heading')) {
      const $h = $node.find('h3, h4').first()
      if (!$h.length) continue
      if (parseInt($h.prop('tagName')?.slice(1) ?? '9') !== 3) continue
      const name = $h.text().replace(/\[edit\]/g, '').trim()
      if (/^group /i.test(name)) continue
      current = { name, players: [] }
      squads.push(current)
      continue
    }

    if (!current) continue

    let number: number | undefined
    let pos: string | undefined
    let playerName = ''
    let dob: string | undefined

    const cells = $node.find('td, th[scope="row"]').toArray()
    for (let idx = 0; idx < cells.length; idx++) {
      const $cell = $(cells[idx])
      const text = $cell.text().trim()

      if ($cell.is('th[scope="row"]')) {
        playerName = $cell.find('a').first().text().trim() || text
      } else if (idx === 0 && !playerName) {
        const parsed = parseInt(text)
        if (!isNaN(parsed)) number = parsed
      } else if (idx === 1 && !playerName && !pos) {
        const code = $cell.find('a').first().text().trim()
        if (['GK', 'DF', 'MF', 'FW'].includes(code)) pos = code
      }

      const $bday = $cell.find('.bday')
      if ($bday.length) dob = $bday.text().trim()
    }

    if (!playerName) continue

    const player: Player = { name: playerName }
    if (number !== undefined) player.number = number
    if (pos) player.pos = pos
    if (dob) player.date_of_birth = dob
    current.players.push(player)
  }

  return squads.filter(squad => squad.players.length > 0)
}
+50 -613
View File
@@ -1,10 +1,20 @@
import { load } from 'cheerio' /**
import type { CheerioAPI } from 'cheerio' * Scrape English Wikipedia for World Cup data and write JSON files to
import type { Cheerio } from 'cheerio' * app/data/wikipedia/{year}/.
import type { Element } from 'domhandler' *
* Usage:
* pnpm scrape # all years, matches + squads
* pnpm scrape 2022 # single year, matches + squads
* pnpm scrape 2022 --matches # matches + meta + stadiums only
* pnpm scrape 2022 --squads # squads only
*/
import { mkdirSync, writeFileSync } from 'fs' import { mkdirSync, writeFileSync } from 'fs'
import path from 'path' import path from 'path'
import { fileURLToPath } from 'url' import { fileURLToPath } from 'url'
import {
fetchWikiHtml, scrapeYear, scrapeSquads,
type Match, type Stadium, type Group, type Meta, type Squad,
} from '../lib/wiki-scraper'
const __dirname = path.dirname(fileURLToPath(import.meta.url)) const __dirname = path.dirname(fileURLToPath(import.meta.url))
const DATA_DIR = path.join(__dirname, '../app/data/wikipedia') const DATA_DIR = path.join(__dirname, '../app/data/wikipedia')
@@ -16,648 +26,75 @@ const YEARS = [
const delay = (ms: number) => new Promise(r => setTimeout(r, ms)) const delay = (ms: number) => new Promise(r => setTimeout(r, ms))
// ── Types ────────────────────────────────────────────────────────────────── // ── File output ────────────────────────────────────────────────────────────
type Goal = { function writeMatches(
name: string
minute?: number
offset?: number
penalty?: boolean
owngoal?: boolean
}
/** Score of one match as [team1, team2] pairs; `parseBox` fills only the parts it finds. */
type ScoreObj = {
// Displayed score, or for an a.e.t. match the score rebuilt from goals up to minute 90.
ft?: [number, number]
// Displayed score of a match marked `a.e.t.`.
et?: [number, number]
// Score read from the row that follows the "Penalties" header row.
p?: [number, number]
}
/** One parsed football box. Optional keys are omitted rather than set to empty values. */
type Match = {
round: string
group?: string
// Text of the box's `.bday` / `.dtstart` element.
date?: string
// First `HH:MM` found in the `.ftime` cell.
time?: string
team1: string
team2: string
score?: ScoreObj
goals1?: Goal[]
goals2?: Goal[]
// Raw venue text; `parseGroundParts` splits it into name and city.
ground?: string
}
/** Venue split at the first comma of a `Match.ground` string. */
type Stadium = { name: string; city: string }
/** One squad-table row as read by `scrapeSquads`; `pos` is one of GK, DF, MF, FW when present. */
type Player = { name: string; number?: number; pos?: string; date_of_birth?: string }
/** A team heading together with the player rows that follow it. */
type Squad = { name: string; players: Player[] }
// NOTE(review): not referenced by the scraping functions visible here; presumably the
// serialised form of a group written by the file-output code — confirm there.
type Group = { name: string; teams: string[] }
/** Tournament summary assembled by `scrapeYear` from the infobox and the derived placements. */
type Meta = {
host: string
teams_count: number | null
winner: string | null
runner_up: string | null
third_place: string | null
fourth_place: string | null
}
// ── Fetch ──────────────────────────────────────────────────────────────────
/**
 * Fetches the rendered HTML of an English Wikipedia page through the
 * MediaWiki `action=parse` API.
 *
 * Network errors, non-2xx responses and bodies without parsed text are
 * retried after a pause of 3 s × attempt number. An API error saying the
 * title itself is missing or invalid cannot be cured by retrying, so it ends
 * the loop at once instead of sleeping through every remaining attempt.
 *
 * @param page    Page title (e.g. `2022_FIFA_World_Cup`); URL-encoded here.
 * @param retries Maximum number of attempts.
 * @returns The page HTML, or `null` when it could not be obtained.
 */
async function fetchWikiHtml(page: string, retries = 5): Promise<string | null> {
  const url = `https://en.wikipedia.org/w/api.php?action=parse&page=${encodeURIComponent(page)}&format=json&prop=text&disabletoc=1`
  for (let attempt = 0; attempt < retries; attempt++) {
    try {
      if (attempt > 0) await delay(3000 * attempt)
      const res = await fetch(url, { headers: { 'User-Agent': 'WorldCupScraper/1.0 (github.com/worldcup)' } })
      if (!res.ok) continue
      const data = await res.json() as {
        parse?: { text?: { '*': string } }
        error?: { code?: string }
      }
      const html = data?.parse?.text?.['*']
      if (html) return html
      // The API reports a missing page as HTTP 200 plus an `error` object.
      const code = data?.error?.code
      if (code === 'missingtitle' || code === 'invalidtitle') return null
    } catch {
      // Network or JSON failure: fall through to the next attempt.
    }
  }
  return null
}
// ── Score parsing ──────────────────────────────────────────────────────────
/**
 * Extracts the first `N<dash>M` pair from a score cell.
 *
 * Scores are usually typeset with an en dash ("2–1"), so the separator
 * accepts the ASCII hyphen, the Unicode dashes U+2010–U+2015 and the minus
 * sign U+2212. Surrounding text such as "(a.e.t.)" is ignored.
 *
 * @returns `[left, right]` as integers, or `null` when no score is present.
 */
function parseScoreText(text: string): [number, number] | null {
  const m = text.match(/(\d+)\s*[-\u2010-\u2015\u2212]\s*(\d+)/)
  if (!m) return null
  return [Number.parseInt(m[1], 10), Number.parseInt(m[2], 10)]
}
// ── Team name extraction ───────────────────────────────────────────────────
/**
 * Returns the team name shown in a football-box team cell: the trimmed text
 * of the first link that has text and does not wrap an image (image links
 * are skipped). Yields `''` when no such link exists.
 */
function extractTeam($: CheerioAPI, $cell: Cheerio<Element>): string {
  for (const anchor of $cell.find('a').toArray()) {
    const $anchor = $(anchor)
    const label = $anchor.text().trim()
    if ($anchor.find('img').length === 0 && label) return label
  }
  return ''
}
// ── Goal parsing ───────────────────────────────────────────────────────────
/**
 * Reads the goal list of one side of a football box.
 *
 * Every `<li>` names one scorer (first non-empty link outside `.fb-goal`);
 * each direct `<span>` child of its `.fb-goal` marker that carries a minute
 * such as `23'` or `45+2'` becomes one `Goal`. Spans with a `typeof`
 * attribute (the image wrapper) are skipped, as are items without a scorer
 * or marker.
 */
function parseGoals($: CheerioAPI, $td: Cheerio<Element>): Goal[] {
  const goals: Goal[] = []

  for (const item of $td.find('li').toArray()) {
    const $item = $(item)

    let scorer = ''
    for (const link of $item.find('a').toArray()) {
      const $link = $(link)
      if ($link.closest('.fb-goal').length) continue
      const label = $link.text().trim()
      if (label) {
        scorer = label
        break
      }
    }
    if (!scorer) continue

    const $marker = $item.find('.fb-goal')
    if (!$marker.length) continue

    for (const span of $marker.children('span').toArray()) {
      const $span = $(span)
      if ($span.attr('typeof')) continue

      const text = $span.text()
      const timing = text.match(/(\d+)(?:\+(\d+))?[']/)
      if (!timing) continue

      const minute = parseInt(timing[1])
      const offset = timing[2] ? parseInt(timing[2]) : 0

      const goal: Goal = { name: scorer }
      if (!isNaN(minute)) goal.minute = minute
      if (offset) goal.offset = offset
      if (text.includes('pen.')) goal.penalty = true
      if (text.includes('o.g.')) goal.owngoal = true
      goals.push(goal)
    }
  }

  return goals
}
// ── Ground extraction ──────────────────────────────────────────────────────
/**
 * Venue text of a football box: the `itemprop="name address"` element when
 * present, otherwise the first line of the first `.fright` element.
 */
function extractGround($: CheerioAPI, $box: Cheerio<Element>): string {
  const $address = $box.find('[itemprop="name address"]').first()
  if ($address.length > 0) {
    return $address.text().trim()
  }
  const fallbackLines = $box.find('.fright').first().text().split('\n')
  return fallbackLines[0].trim()
}
/**
 * Splits "Stadium, City" at the first comma into trimmed name and city.
 * Without a comma the whole string is returned as the name, untouched, and
 * the city is empty.
 */
function parseGroundParts(ground: string): { name: string; city: string } {
  const separator = ground.indexOf(',')
  if (separator === -1) {
    return { name: ground, city: '' }
  }
  const name = ground.slice(0, separator).trim()
  const city = ground.slice(separator + 1).trim()
  return { name, city }
}
// ── Footballbox parsing ────────────────────────────────────────────────────
/**
 * Turns one `.footballbox` element into a `Match`.
 *
 * Returns `null` when either team name is missing. For a score marked
 * `a.e.t.` the displayed score is stored as `et` and `ft` is rebuilt from
 * the goals scored up to minute 90 (own goals count for the opponent; goals
 * without a minute count as regular time). Only the first `tr.fgoals` row
 * is read for goals, which leaves out the shoot-out row. A shoot-out score
 * is taken from the row following the "Penalties" header row. Empty
 * optional fields are left off the result.
 */
function parseBox(
  $: CheerioAPI,
  $box: Cheerio<Element>,
  round: string,
  group: string | null,
): Match | null {
  const team1 = extractTeam($, $box.find('.fhome'))
  const team2 = extractTeam($, $box.find('.faway'))
  if (!team1 || !team2) return null

  const date = $box.find('.bday, .dtstart').first().text().trim() || undefined
  const time = $box.find('.ftime').first().text().trim().match(/(\d{2}:\d{2})/)?.[1]

  const rawScore = $box.find('.fscore').first().text().trim()
  const afterExtraTime = rawScore.toLowerCase().includes('a.e.t.')
  const displayed = parseScoreText(rawScore)

  const $goalsRow = $box.find('tr.fgoals').first()
  const goals1 = parseGoals($, $goalsRow.find('.fhgoal'))
  const goals2 = parseGoals($, $goalsRow.find('.fagoal'))

  // Only the first "Penalties" header row is considered, parsed or not.
  let shootout: [number, number] | undefined
  for (const row of $box.find('tr').toArray()) {
    const $row = $(row)
    if (!$row.find('th[colspan]').text().toLowerCase().includes('penalt')) continue
    const cellText = $row.next('tr').find('th').not('.fhome,.faway').first().text().trim()
    shootout = parseScoreText(cellText) ?? undefined
    break
  }

  let score: ScoreObj | undefined
  if (displayed) {
    if (afterExtraTime) {
      const inRegularTime = (g: Goal) => g.minute === undefined || g.minute <= 90
      const scoredBy = (gs: Goal[]) => gs.filter(g => !g.owngoal && inRegularTime(g)).length
      const ownGoalsBy = (gs: Goal[]) => gs.filter(g => g.owngoal === true && inRegularTime(g)).length
      score = {
        ft: [scoredBy(goals1) + ownGoalsBy(goals2), scoredBy(goals2) + ownGoalsBy(goals1)],
        et: displayed,
      }
    } else {
      score = { ft: displayed }
    }
    if (shootout) score.p = shootout
  }

  const ground = extractGround($, $box) || undefined

  // Key order is kept stable because the result is serialised by callers.
  return {
    round,
    ...(group ? { group } : {}),
    ...(date ? { date } : {}),
    ...(time ? { time } : {}),
    team1,
    team2,
    ...(score ? { score } : {}),
    ...(goals1.length ? { goals1 } : {}),
    ...(goals2.length ? { goals2 } : {}),
    ...(ground ? { ground } : {}),
  }
}
// ── Collect matches from a pre-loaded page ─────────────────────────────────
/**
 * Parses every `.footballbox` of a loaded page with the same round and
 * group, dropping boxes that `parseBox` rejects.
 */
function collectBoxes(
  $: CheerioAPI,
  round: string,
  group: string | null,
): Match[] {
  return $('.footballbox')
    .toArray()
    .map(box => parseBox($, $(box), round, group))
    .filter((parsed): parsed is Match => parsed !== null)
}
// ── Section heading state machine ──────────────────────────────────────────
/** Cursor mutated by `processHeading` while headings are walked in document order. */
type State = {
// True while the current level-2 section is one that holds matches.
active: boolean
// Round name for matches in the current section; '' under container sections.
round: string
// Current "Group X" heading text, or null outside a group.
group: string | null
}
/** Knock-out round headings, tried in order against the lower-cased heading text. */
const KNOCKOUT_ROUND_PATTERNS: ReadonlyArray<readonly [RegExp, string]> = [
  [/round of 32/, 'Round of 32'],
  [/round of 16/, 'Round of 16'],
  [/quarter.final/, 'Quarter-finals'],
  [/semi.final/, 'Semi-finals'],
  [/third.place|match for third|play.off for third/, 'Third-place match'],
  [/^final$/, 'Final'],
]

/** Canonical knock-out round for a lower-cased heading, or `null` when it is not one. */
function knockoutRoundFor(heading: string): string | null {
  for (const [pattern, round] of KNOCKOUT_ROUND_PATTERNS) {
    if (pattern.test(heading)) return round
  }
  return null
}

/**
 * Advances the section cursor for one heading met while walking a page.
 *
 * Level 2 headings decide whether the section holds matches at all: a
 * recognised stage activates the cursor, sets its round and clears the
 * group; anything else deactivates it, leaving round and group untouched.
 * Level 3/4 headings are honoured only while active: "Group X" sets the
 * group, a knock-out round name sets the round and clears the group, and
 * every other heading (bracket, draw, seeding, replay, …) keeps the
 * current state.
 *
 * Group headings may be a single letter (so Groups I–L of a 48-team
 * tournament are recognised) or a number of any length.
 *
 * @param text  Heading text without the "[edit]" suffix.
 * @param level Heading level (2 for `<h2>`, 3 for `<h3>`, …).
 * @param state Cursor to mutate in place.
 */
function processHeading(text: string, level: number, state: State): void {
  const t = text.toLowerCase().trim()

  if (level === 2) {
    let round: string | null
    if (/group stage/.test(t) && !/second/.test(t)) round = 'Group stage'
    else if (/second group stage/.test(t)) round = 'Second group stage'
    else if (t === 'final round') round = 'Final round'
    // Container sections: the round comes from the sub-headings below them.
    else if (/final tournament|knock.?out stage/.test(t)) round = ''
    else round = knockoutRoundFor(t)

    if (round === null) {
      state.active = false
      return
    }
    state.active = true
    state.round = round
    state.group = null
    return
  }

  if (!state.active || (level !== 3 && level !== 4)) return

  if (/^group (?:[a-z]|\d+)$/.test(t)) {
    state.group = text.trim()
    return
  }
  const round = knockoutRoundFor(t)
  if (round !== null) {
    state.round = round
    state.group = null
  }
}
// ── Main year scraper ──────────────────────────────────────────────────────
// ── Infobox parsing ────────────────────────────────────────────────────────
/**
 * Reads host, team count and the four placements from the first
 * `table.infobox` of the page. Rows are matched on the lower-cased header
 * text; rows without a `<td>` are ignored. Only fields that were found are
 * set on the result.
 */
function parseInfobox($: CheerioAPI): Partial<Meta> {
  const meta: Partial<Meta> = {}

  /** Cell text with `<br>` shown as " / ", footnotes and images removed, whitespace collapsed. */
  const plainText = ($cell: Cheerio<Element>): string => {
    const $copy = $cell.clone()
    $copy.find('br').replaceWith(' / ')
    $copy.find('sup, img').remove()
    return $copy.text().replace(/\[\d+\]/g, '').replace(/\s+/g, ' ').trim()
  }

  /** Non-empty link texts of a cell (images stripped), footnote links excluded. */
  const linkLabels = ($cell: Cheerio<Element>): string[] =>
    $cell
      .find('a')
      .toArray()
      .map(anchor => $(anchor).clone().find('img').remove().end().text().trim())
      .filter(label => label !== '' && !/\[\d+\]/.test(label))

  const firstName = ($cell: Cheerio<Element>): string | null => {
    const labels = linkLabels($cell)
    return labels.length > 0 ? labels[0] : (plainText($cell) || null)
  }

  const allNames = ($cell: Cheerio<Element>): string => {
    const labels = linkLabels($cell)
    return labels.length > 0 ? labels.join(' / ') : plainText($cell)
  }

  for (const row of $('table.infobox').first().find('tr').toArray()) {
    const $row = $(row)
    const label = $row.find('th').text().trim().toLowerCase()
    const $value = $row.find('td').first()
    if (!$value.length) continue

    if (/host countr/i.test(label)) {
      meta.host = allNames($value)
    } else if (/^teams$/i.test(label)) {
      const digits = $value.text().match(/\d+/)
      if (digits) meta.teams_count = parseInt(digits[0])
    } else if (/champion/i.test(label)) {
      meta.winner = firstName($value)
    } else if (/runners?.up/i.test(label)) {
      meta.runner_up = firstName($value)
    } else if (/third.place/i.test(label)) {
      meta.third_place = firstName($value)
    } else if (/fourth.place/i.test(label)) {
      meta.fourth_place = firstName($value)
    }
  }

  return meta
}
// ── Placement derivation ───────────────────────────────────────────────────
/**
 * Derives the top four from the matches whose round is exactly `Final` or
 * `Third-place match`. The extra-time score takes precedence over full
 * time; a level score is settled by the shoot-out when one is recorded.
 * Undecided or unscored matches leave the placements untouched, and a later
 * decided match of the same round overwrites an earlier one.
 */
function derivePlacements(matches: Match[]): Pick<Meta, 'winner' | 'runner_up' | 'third_place' | 'fourth_place'> {
  /** Returns [winner, loser], or null when the match has no decisive score. */
  const decide = (m: Match): [string, string] | null => {
    const { score } = m
    if (!score) return null

    const [home, away] = score.et ?? score.ft ?? [0, 0]
    if (home > away) return [m.team1, m.team2]
    if (away > home) return [m.team2, m.team1]

    if (score.p) {
      const [penHome, penAway] = score.p
      if (penHome > penAway) return [m.team1, m.team2]
      if (penAway > penHome) return [m.team2, m.team1]
    }
    return null
  }

  let winner: string | null = null
  let runner_up: string | null = null
  let third_place: string | null = null
  let fourth_place: string | null = null

  for (const m of matches) {
    const isFinal = m.round === 'Final'
    if (!isFinal && m.round !== 'Third-place match') continue

    const outcome = decide(m)
    if (!outcome) continue

    if (isFinal) {
      winner = outcome[0]
      runner_up = outcome[1]
    } else {
      third_place = outcome[0]
      fourth_place = outcome[1]
    }
  }

  return { winner, runner_up, third_place, fourth_place }
}
// ── Year result ────────────────────────────────────────────────────────────
/** Everything `scrapeYear` returns for one tournament page. */
type YearResult = {
matches: Match[]
// Keyed by stadium name; the first city seen for a name is kept.
stadiums: Map<string, Stadium>
// Group name -> team names seen in that group's matches.
groups: Map<string, Set<string>>
meta: Meta
}
/**
 * Scrapes one tournament from its already-fetched main page.
 *
 * Headings, "Main article" hatnotes and football boxes are walked in
 * document order: headings drive the round/group cursor, hatnotes remember
 * the sub-page of the current group, and boxes inside an active section
 * become matches. Groups that produced no match on the main page are then
 * fetched from their sub-page (after a 1.2 s pause each). Progress markers
 * are written to stdout.
 *
 * @param year     Tournament year; not read by this function body.
 * @param mainHtml HTML of the tournament's main page.
 * @returns Matches, stadiums by name, teams by group, and tournament meta
 *          (placements derived from matches win over infobox values).
 */
async function scrapeYear(year: number, mainHtml: string): Promise<YearResult> {
  const $ = load(mainHtml)
  const matches: Match[] = []
  const stadiums = new Map<string, Stadium>()
  const groups = new Map<string, Set<string>>()
  // Group name -> sub-page to fall back to when the main page has no boxes for it.
  const groupSubpages = new Map<string, string>()
  // Groups for which at least one match has been recorded.
  const groupsOnMainPage = new Set<string>()

  /** Level and cleaned title of the heading wrapped by a `div.mw-heading`. */
  const readHeading = (wrapper: Element) => {
    const $h = $(wrapper).find('h2, h3, h4').first()
    const level = parseInt($h.prop('tagName')?.slice(1) ?? '9')
    const title = $h.text().replace(/\[edit\]/g, '').trim()
    return { level, title }
  }

  const recordMatch = (m: Match): void => {
    matches.push(m)
    if (m.group) {
      groupsOnMainPage.add(m.group)
      let members = groups.get(m.group)
      if (!members) {
        members = new Set()
        groups.set(m.group, members)
      }
      members.add(m.team1)
      members.add(m.team2)
    }
    if (m.ground) {
      const { name, city } = parseGroundParts(m.ground)
      if (name && !stadiums.has(name)) stadiums.set(name, { name, city })
    }
  }

  const cursor: State = { active: false, round: '', group: null }
  const walk = $('.mw-parser-output').find('div.mw-heading, .footballbox, .hatnote').toArray()
  for (const node of walk) {
    const $node = $(node)
    if ($node.hasClass('mw-heading')) {
      const { level, title } = readHeading(node)
      processHeading(title, level, cursor)
    } else if ($node.hasClass('hatnote') && $node.text().includes('Main article')) {
      if (!cursor.active || !cursor.group) continue
      const href = $node.find('a[href^="/wiki/"]').first().attr('href')
      if (!href) continue
      const page = href.replace('/wiki/', '').split('#')[0]
      // Keep only the first sub-page seen for a group.
      if (/World_Cup_Group/i.test(page) && !groupSubpages.has(cursor.group)) {
        groupSubpages.set(cursor.group, page)
      }
    } else if ($node.hasClass('footballbox')) {
      if (!cursor.active) continue
      const parsed = parseBox($, $node, cursor.round || cursor.group || 'Unknown', cursor.group)
      if (parsed) recordMatch(parsed)
    }
  }

  for (const [group, page] of groupSubpages) {
    if (groupsOnMainPage.has(group)) continue

    await delay(1200)
    const subHtml = await fetchWikiHtml(page)
    if (!subHtml) {
      process.stdout.write(`(failed: ${page}) `)
      continue
    }

    // Replay the main page's headings to learn which round this group sits in.
    const probe: State = { active: false, round: '', group: null }
    let round = 'Group stage'
    for (const wrapper of $('.mw-parser-output').find('div.mw-heading').toArray()) {
      const { level, title } = readHeading(wrapper)
      processHeading(title, level, probe)
      if (probe.group === group) {
        round = probe.round || 'Group stage'
        break
      }
    }

    const $sub = load(subHtml)
    for (const m of collectBoxes($sub, round || 'Group stage', group)) recordMatch(m)
    process.stdout.write(`[+${page.slice(-8)}] `)
  }

  const infobox = parseInfobox($)
  const placements = derivePlacements(matches)
  const meta: Meta = {
    host: infobox.host ?? '',
    teams_count: infobox.teams_count ?? null,
    winner: placements.winner ?? infobox.winner ?? null,
    runner_up: placements.runner_up ?? infobox.runner_up ?? null,
    third_place: placements.third_place ?? infobox.third_place ?? null,
    fourth_place: placements.fourth_place ?? infobox.fourth_place ?? null,
  }

  return { matches, stadiums, groups, meta }
}
// ── Squad page scraper ─────────────────────────────────────────────────────
/**
 * Extract per-team squads from the HTML of a World Cup squads page.
 *
 * Headings (`div.mw-heading`) and player rows (`tr.nat-fs-player`) are visited
 * in document order. Every level-3 heading whose text does not start with
 * "Group " opens a new squad; player rows are appended to the most recently
 * opened squad. Rows that appear before any team heading are ignored, and
 * squads that end up with no players are dropped from the result.
 *
 * @param html Raw HTML of the squads page.
 * @returns One entry per team that has at least one parsed player.
 */
function scrapeSquads(html: string): Squad[] {
  const $ = load(html)
  const POSITIONS = ['GK', 'DF', 'MF', 'FW']
  const collected: Squad[] = []
  let team: Squad | null = null

  const nodes = $('.mw-parser-output').find('div.mw-heading, tr.nat-fs-player')
  nodes.each((_, node) => {
    const $node = $(node)

    // ── Heading: possibly the start of a new team ──
    if ($node.hasClass('mw-heading')) {
      const $heading = $node.find('h3, h4').first()
      if ($heading.length === 0) return
      const tag = $heading.prop('tagName')
      const depth = parseInt(tag?.slice(1) ?? '9')
      if (depth !== 3) return
      const title = $heading.text().replace(/\[edit\]/g, '').trim()
      if (/^group /i.test(title)) return // group headers are not teams
      team = { name: title, players: [] }
      collected.push(team)
      return
    }

    // ── Player row: needs an open team ──
    if (!team) return

    const seen: { shirt?: number; role?: string; name: string; born?: string } = { name: '' }
    $node.find('td, th[scope="row"]').each((col, cell) => {
      const $cell = $(cell)
      const raw = $cell.text().trim()
      const nameKnown = seen.name !== ''

      if ($cell.is('th[scope="row"]')) {
        // Prefer the linked name; fall back to the cell's plain text.
        const linked = $cell.find('a').first().text().trim()
        seen.name = linked || raw
      } else if (!nameKnown) {
        // Cells ahead of the name cell: index 0 = shirt number, index 1 = position.
        if (col === 0) {
          const parsed = parseInt(raw)
          if (!Number.isNaN(parsed)) seen.shirt = parsed
        } else if (col === 1 && seen.role === undefined) {
          const label = $cell.find('a').first().text().trim()
          if (POSITIONS.includes(label)) seen.role = label
        }
      }

      // Any cell may carry the machine-readable birth date.
      const $born = $cell.find('.bday')
      if ($born.length > 0) seen.born = $born.text().trim()
    })

    if (!seen.name) return

    // Assemble in a fixed key order so serialized output stays stable.
    const player: Player = { name: seen.name }
    if (seen.shirt !== undefined) player.number = seen.shirt
    if (seen.role) player.pos = seen.role
    if (seen.born) player.date_of_birth = seen.born
    team.players.push(player)
  })

  return collected.filter(squad => squad.players.length > 0)
}
// ── Output ─────────────────────────────────────────────────────────────────
function writeOutput(
year: number, year: number,
matches: Match[], matches: Match[],
stadiums: Map<string, Stadium>, stadiums: Map<string, Stadium>,
groups: Map<string, Set<string>>, groups: Map<string, Set<string>>,
squads: Squad[],
meta: Meta, meta: Meta,
): void { ): void {
const dir = path.join(DATA_DIR, String(year)) const dir = path.join(DATA_DIR, String(year))
mkdirSync(dir, { recursive: true }) mkdirSync(dir, { recursive: true })
writeFileSync( writeFileSync(path.join(dir, 'worldcup.meta.json'), JSON.stringify(meta, null, 2), 'utf-8')
path.join(dir, 'worldcup.meta.json'), writeFileSync(path.join(dir, 'worldcup.json'), JSON.stringify({ matches }, null, 2), 'utf-8')
JSON.stringify(meta, null, 2),
'utf-8',
)
writeFileSync( if (stadiums.size > 0)
path.join(dir, 'worldcup.json'), writeFileSync(path.join(dir, 'worldcup.stadiums.json'),
JSON.stringify({ matches }, null, 2), JSON.stringify({ stadiums: Array.from(stadiums.values()) }, null, 2), 'utf-8')
'utf-8',
)
if (stadiums.size > 0) {
writeFileSync(
path.join(dir, 'worldcup.stadiums.json'),
JSON.stringify({ stadiums: Array.from(stadiums.values()) }, null, 2),
'utf-8',
)
}
const groupList: Group[] = [] const groupList: Group[] = []
groups.forEach((teams, name) => { groups.forEach((teams, name) => groupList.push({ name, teams: Array.from(teams) }))
groupList.push({ name, teams: Array.from(teams) }) if (groupList.length > 0)
}) writeFileSync(path.join(dir, 'worldcup.groups.json'),
if (groupList.length > 0) { JSON.stringify({ groups: groupList }, null, 2), 'utf-8')
writeFileSync( }
path.join(dir, 'worldcup.groups.json'),
JSON.stringify({ groups: groupList }, null, 2),
'utf-8',
)
}
if (squads.length > 0) { function writeSquads(year: number, squads: Squad[]): void {
writeFileSync( if (squads.length === 0) return
path.join(dir, 'worldcup.squads.json'), const dir = path.join(DATA_DIR, String(year))
JSON.stringify(squads, null, 2), mkdirSync(dir, { recursive: true })
'utf-8', writeFileSync(path.join(dir, 'worldcup.squads.json'), JSON.stringify(squads, null, 2), 'utf-8')
)
}
} }
// ── Entry point ──────────────────────────────────────────────────────────── // ── Entry point ────────────────────────────────────────────────────────────
async function main() { async function main() {
const onlyYear = process.argv[2] ? parseInt(process.argv[2]) : null const args = process.argv.slice(2)
const yearsToScrape = onlyYear ? [onlyYear] : YEARS const yearArg = args.find(a => /^\d{4}$/.test(a))
const doMatches = args.includes('--matches') || !args.includes('--squads')
const doSquads = args.includes('--squads') || !args.includes('--matches')
console.log(`Scraping ${yearsToScrape.length} World Cup(s) from Wikipedia...`) const yearsToScrape = yearArg ? [parseInt(yearArg)] : YEARS
const modeLabel = [doMatches && 'matches', doSquads && 'squads'].filter(Boolean).join(' + ')
console.log(`Scraping ${yearsToScrape.length} World Cup(s) from Wikipedia [${modeLabel}]...`)
for (const year of yearsToScrape) { for (const year of yearsToScrape) {
process.stdout.write(` ${year}... `) process.stdout.write(` ${year}... `)
if (doMatches) {
const mainHtml = await fetchWikiHtml(`${year}_FIFA_World_Cup`) const mainHtml = await fetchWikiHtml(`${year}_FIFA_World_Cup`)
if (!mainHtml) { console.log('FAILED'); continue } if (!mainHtml) { console.log('FAILED (main page)'); continue }
const { matches, stadiums, groups, meta } = await scrapeYear(year, mainHtml) const { matches, stadiums, groups, meta } = await scrapeYear(year, mainHtml)
writeMatches(year, matches, stadiums, groups, meta)
process.stdout.write(`${matches.length} matches`)
await delay(600) await delay(600)
}
if (doSquads) {
const squadHtml = await fetchWikiHtml(`${year}_FIFA_World_Cup_squads`) const squadHtml = await fetchWikiHtml(`${year}_FIFA_World_Cup_squads`)
const squads = squadHtml ? scrapeSquads(squadHtml) : [] const squads = squadHtml ? scrapeSquads(squadHtml) : []
writeSquads(year, squads)
writeOutput(year, matches, stadiums, groups, squads, meta) process.stdout.write(`${doMatches ? ', ' : ''}${squads.length} squads`)
console.log(`${matches.length} matches, ${stadiums.size} stadiums, ${groups.size} groups, ${squads.length} teams`)
await delay(600) await delay(600)
} }
console.log()
}
console.log('\nDone! Files written to app/data/wikipedia/{year}/') console.log('\nDone! Files written to app/data/wikipedia/{year}/')
} }
+101 -127
View File
@@ -1,40 +1,16 @@
import postgres from 'postgres' import postgres from 'postgres'
import { drizzle } from 'drizzle-orm/postgres-js' import { drizzle } from 'drizzle-orm/postgres-js'
import { sql } from 'drizzle-orm' import { sql } from 'drizzle-orm'
import { TEAM_ISO, getIso } from '../lib/iso-codes' import { fetchWikiHtml, scrapeYear, scrapeSquads } from '../lib/wiki-scraper'
import { getIso } from '../lib/iso-codes'
const DATABASE_URL = process.env.DATABASE_URL const DATABASE_URL = process.env.DATABASE_URL
if (!DATABASE_URL) { if (!DATABASE_URL) {
console.error('ERROR: DATABASE_URL environment variable is not set') console.error('ERROR: DATABASE_URL environment variable is not set')
process.exit(1) process.exit(1)
} }
const BASE = 'https://raw.githubusercontent.com/openfootball/worldcup.json/master'
async function fetchJson(url: string): Promise<unknown> { // ── DB helpers ─────────────────────────────────────────────────────────────
try {
const res = await fetch(url)
if (!res.ok) return null
return res.json()
} catch {
return null
}
}
type RawGoal = { name: string; minute?: string | number; offset?: number; penalty?: boolean; owngoal?: boolean }
type RawScore = { ft?: number[]; ht?: number[]; et?: number[]; p?: number[] } | number[]
type RawMatch = {
round?: string; date?: string; time?: string;
team1: string; team2: string; score?: RawScore;
goals1?: RawGoal[]; goals2?: RawGoal[];
group?: string; ground?: string;
}
type RawData = { matches: RawMatch[] }
function parseScore(score: RawScore | undefined) {
if (!score) return {}
if (Array.isArray(score)) return { ft: score }
return { ft: score.ft, ht: score.ht, et: score.et, p: score.p }
}
async function run() { async function run() {
const client = postgres(DATABASE_URL!, { max: 2 }) const client = postgres(DATABASE_URL!, { max: 2 })
@@ -42,17 +18,13 @@ async function run() {
const teamCache = new Map<string, number>() const teamCache = new Map<string, number>()
async function upsertTeam(rawName: string, extra?: { iso2?: string | null; fifaCode?: string; continent?: string; confederation?: string }) { async function upsertTeam(rawName: string) {
if (teamCache.has(rawName)) return teamCache.get(rawName)! if (teamCache.has(rawName)) return teamCache.get(rawName)!
const iso2 = (extra && 'iso2' in extra) ? extra.iso2 : getIso(rawName) const iso2 = getIso(rawName)
const [row] = await db.execute(sql` const [row] = await db.execute(sql`
INSERT INTO teams (name, iso2, fifa_code, continent, confederation) INSERT INTO teams (name, iso2)
VALUES (${rawName}, ${iso2 ?? null}, ${extra?.fifaCode ?? null}, ${extra?.continent ?? null}, ${extra?.confederation ?? null}) VALUES (${rawName}, ${iso2 ?? null})
ON CONFLICT (name) DO UPDATE SET ON CONFLICT (name) DO UPDATE SET iso2 = COALESCE(EXCLUDED.iso2, teams.iso2)
iso2 = COALESCE(EXCLUDED.iso2, teams.iso2),
fifa_code = COALESCE(EXCLUDED.fifa_code, teams.fifa_code),
continent = COALESCE(EXCLUDED.continent, teams.continent),
confederation = COALESCE(EXCLUDED.confederation, teams.confederation)
RETURNING id RETURNING id
`) `)
const id = (row as { id: number }).id const id = (row as { id: number }).id
@@ -62,20 +34,19 @@ async function run() {
async function upsertMatch( async function upsertMatch(
year: number, round: string, group: string | null, dateStr: string | null, year: number, round: string, group: string | null, dateStr: string | null,
timeStr: string | null, team1Id: number, team2Id: number, score: ReturnType<typeof parseScore>, timeStr: string | null, team1Id: number, team2Id: number,
isQuali: boolean ft: [number, number] | undefined, et: [number, number] | undefined, p: [number, number] | undefined,
isQuali: boolean,
) { ) {
const rows = await db.execute(sql` const rows = await db.execute(sql`
INSERT INTO matches (tournament_year, round, group_name, date, time_local, team1_id, team2_id, INSERT INTO matches (tournament_year, round, group_name, date, time_local, team1_id, team2_id,
score_ft_home, score_ft_away, score_ht_home, score_ht_away, score_ft_home, score_ft_away, score_et_home, score_et_away,
score_et_home, score_et_away, score_p_home, score_p_away, is_quali_playoff) score_p_home, score_p_away, is_quali_playoff)
VALUES ( VALUES (
${year}, ${round}, ${group}, ${dateStr ?? null}, ${timeStr ?? null}, ${year}, ${round}, ${group}, ${dateStr}, ${timeStr}, ${team1Id}, ${team2Id},
${team1Id}, ${team2Id}, ${ft?.[0] ?? null}, ${ft?.[1] ?? null},
${score.ft?.[0] ?? null}, ${score.ft?.[1] ?? null}, ${et?.[0] ?? null}, ${et?.[1] ?? null},
${score.ht?.[0] ?? null}, ${score.ht?.[1] ?? null}, ${p?.[0] ?? null}, ${p?.[1] ?? null},
${score.et?.[0] ?? null}, ${score.et?.[1] ?? null},
${score.p?.[0] ?? null}, ${score.p?.[1] ?? null},
${isQuali} ${isQuali}
) )
ON CONFLICT (tournament_year, team1_id, team2_id, date, is_quali_playoff) DO UPDATE SET ON CONFLICT (tournament_year, team1_id, team2_id, date, is_quali_playoff) DO UPDATE SET
@@ -83,8 +54,6 @@ async function run() {
time_local = COALESCE(EXCLUDED.time_local, matches.time_local), time_local = COALESCE(EXCLUDED.time_local, matches.time_local),
score_ft_home = COALESCE(EXCLUDED.score_ft_home, matches.score_ft_home), score_ft_home = COALESCE(EXCLUDED.score_ft_home, matches.score_ft_home),
score_ft_away = COALESCE(EXCLUDED.score_ft_away, matches.score_ft_away), score_ft_away = COALESCE(EXCLUDED.score_ft_away, matches.score_ft_away),
score_ht_home = COALESCE(EXCLUDED.score_ht_home, matches.score_ht_home),
score_ht_away = COALESCE(EXCLUDED.score_ht_away, matches.score_ht_away),
score_et_home = COALESCE(EXCLUDED.score_et_home, matches.score_et_home), score_et_home = COALESCE(EXCLUDED.score_et_home, matches.score_et_home),
score_et_away = COALESCE(EXCLUDED.score_et_away, matches.score_et_away), score_et_away = COALESCE(EXCLUDED.score_et_away, matches.score_et_away),
score_p_home = COALESCE(EXCLUDED.score_p_home, matches.score_p_home), score_p_home = COALESCE(EXCLUDED.score_p_home, matches.score_p_home),
@@ -94,24 +63,13 @@ async function run() {
return (rows[0] as { id: number }).id return (rows[0] as { id: number }).id
} }
type GoalRow = { teamId: number; name: string; minute: number | null; offset: number; penalty: boolean; owngoal: boolean } async function replaceGoals(matchId: number, goals: Array<{
teamId: number; name: string; minute: number | null; offset: number; penalty: boolean; owngoal: boolean
function collectGoals(teamId: number, rawGoals: RawGoal[], isOwnGoalTeamId: number): GoalRow[] { }>) {
return rawGoals.flatMap(g => {
if (!g.name) return []
const minute = g.minute != null ? parseInt(String(g.minute)) : null
return [{ teamId: g.owngoal ? isOwnGoalTeamId : teamId, name: g.name,
minute: isNaN(minute!) ? null : minute, offset: g.offset ?? 0,
penalty: g.penalty ?? false, owngoal: g.owngoal ?? false }]
})
}
async function replaceGoals(matchId: number, rows: GoalRow[]) {
await db.transaction(async tx => { await db.transaction(async tx => {
await tx.execute(sql`DELETE FROM goals WHERE match_id = ${matchId}`) await tx.execute(sql`DELETE FROM goals WHERE match_id = ${matchId}`)
if (rows.length > 0) { if (goals.length > 0) {
// Single bulk INSERT — readers see old goals until commit, never an empty window const vals = goals.map(g =>
const vals = rows.map(g =>
sql`(${matchId}, ${g.teamId}, ${g.name}, ${g.minute}, ${g.offset}, ${g.penalty}, ${g.owngoal})` sql`(${matchId}, ${g.teamId}, ${g.name}, ${g.minute}, ${g.offset}, ${g.penalty}, ${g.owngoal})`
) )
await tx.execute(sql` await tx.execute(sql`
@@ -122,101 +80,117 @@ async function run() {
}) })
} }
console.log('\nSyncing 2026...') // ── Incremental group detection ────────────────────────────────────────────
/**
 * Return the names of the 2026 groups whose group stage is fully recorded,
 * so the caller can skip re-fetching their sub-pages.
 *
 * A group is treated as complete only when BOTH hold:
 *   - at least 6 non-playoff matches are stored for it (2026 uses four-team
 *     groups playing a single round robin: 4 * 3 / 2 = 6 fixtures), and
 *   - every stored match has a full-time score.
 *
 * The match-count floor matters: without it, a group for which only some
 * fixtures have been stored (e.g. after an interrupted sync) would look
 * "complete" as soon as those few had scores, and its remaining fixtures
 * would never be fetched. Erring on the side of "not complete" only costs an
 * extra sub-page fetch.
 *
 * @returns Set of `group_name` values considered complete.
 */
async function getCompletedGroups(): Promise<Set<string>> {
  const rows = await db.execute(sql`
    SELECT group_name
    FROM matches
    WHERE tournament_year = 2026
      AND group_name IS NOT NULL
      AND is_quali_playoff = false
    GROUP BY group_name
    HAVING COUNT(*) >= 6
       AND COUNT(*) = COUNT(score_ft_home)
  `)
  return new Set(rows.map(r => (r as { group_name: string }).group_name))
}
// ── Sync 2026 from Wikipedia ───────────────────────────────────────────────
console.log('\nSyncing 2026 from Wikipedia...')
// Upsert 2026 tournament row (no winner yet)
await db.execute(sql` await db.execute(sql`
INSERT INTO tournaments (year, host) INSERT INTO tournaments (year, host)
VALUES (2026, 'USA / Canada / Mexico') VALUES (2026, 'USA / Canada / Mexico')
ON CONFLICT (year) DO NOTHING ON CONFLICT (year) DO NOTHING
`) `)
// Teams enrichment const mainHtml = await fetchWikiHtml('2026_FIFA_World_Cup')
const teamsData = await fetchJson(`${BASE}/2026/worldcup.teams.json`) as Record<string, unknown>[] | null if (!mainHtml) {
if (teamsData && Array.isArray(teamsData)) { console.error(' FAILED to fetch 2026 Wikipedia page')
for (const t of teamsData) { await client.end()
const name = (t.name ?? t.name_normalised) as string process.exit(1)
await upsertTeam(name, {
iso2: TEAM_ISO[name] ?? getIso(name),
fifaCode: t.fifa_code as string,
continent: t.continent as string,
confederation: t.confed as string,
})
}
} }
const completedGroups = await getCompletedGroups()
if (completedGroups.size > 0)
console.log(` Skipping completed groups: ${[...completedGroups].sort().join(', ')}`)
process.stdout.write(' ')
const { matches, stadiums, meta } = await scrapeYear(2026, mainHtml, { skipGroups: completedGroups })
console.log()
// Stadiums // Stadiums
const stadiumsData = await fetchJson(`${BASE}/2026/worldcup.stadiums.json`) as { stadiums?: Record<string, unknown>[] } | null for (const s of stadiums.values()) {
if (stadiumsData?.stadiums) {
for (const s of stadiumsData.stadiums) {
await db.execute(sql` await db.execute(sql`
INSERT INTO stadiums (tournament_year, name, city, country_code, capacity, timezone, coordinates) INSERT INTO stadiums (tournament_year, name, city)
VALUES (2026, ${s.name as string}, ${s.city as string}, ${(s.cc as string | undefined) ?? null}, VALUES (2026, ${s.name}, ${s.city ?? null})
${(s.capacity as number | undefined) ?? null}, ${(s.timezone as string | undefined) ?? null}, ${(s.coords as string | undefined) ?? null})
ON CONFLICT DO NOTHING ON CONFLICT DO NOTHING
`) `)
} }
}
// Main matches // Matches + goals
const mainData = await fetchJson(`${BASE}/2026/worldcup.json`) as RawData | null
let matchCount = 0, goalCount = 0 let matchCount = 0, goalCount = 0
if (mainData?.matches) { for (const m of matches) {
for (const m of mainData.matches) {
const t1Id = await upsertTeam(m.team1) const t1Id = await upsertTeam(m.team1)
const t2Id = await upsertTeam(m.team2) const t2Id = await upsertTeam(m.team2)
const score = parseScore(m.score) const matchId = await upsertMatch(
const matchId = await upsertMatch(2026, m.round ?? 'Unknown', m.group ?? null, m.date ?? null, m.time ?? null, t1Id, t2Id, score, false) 2026, m.round, m.group ?? null, m.date ?? null, m.time ?? null,
if (m.goals1?.length || m.goals2?.length) { t1Id, t2Id, m.score?.ft, m.score?.et, m.score?.p, false,
const goalRows = [ )
...(m.goals1?.length ? collectGoals(t1Id, m.goals1, t2Id) : []), const goals = [
...(m.goals2?.length ? collectGoals(t2Id, m.goals2, t1Id) : []), ...(m.goals1 ?? []).map(g => ({
teamId: g.owngoal ? t2Id : t1Id, name: g.name,
minute: g.minute ?? null, offset: g.offset ?? 0,
penalty: g.penalty ?? false, owngoal: g.owngoal ?? false,
})),
...(m.goals2 ?? []).map(g => ({
teamId: g.owngoal ? t1Id : t2Id, name: g.name,
minute: g.minute ?? null, offset: g.offset ?? 0,
penalty: g.penalty ?? false, owngoal: g.owngoal ?? false,
})),
] ]
await replaceGoals(matchId, goalRows) if (goals.length > 0) await replaceGoals(matchId, goals)
}
matchCount++ matchCount++
goalCount += (m.goals1?.length ?? 0) + (m.goals2?.length ?? 0) goalCount += goals.length
}
} }
// Squads // Squads (fetch once; idempotent upsert so safe to re-run)
const squadsData = await fetchJson(`${BASE}/2026/worldcup.squads.json`) as Record<string, unknown>[] | null const squadHtml = await fetchWikiHtml('2026_FIFA_World_Cup_squads')
if (squadsData && Array.isArray(squadsData)) { if (squadHtml) {
for (const sq of squadsData) { const squads = scrapeSquads(squadHtml)
const teamId = await upsertTeam(sq.name as string) for (const sq of squads) {
for (const p of (sq.players as Record<string, unknown>[])) { const teamId = await upsertTeam(sq.name)
for (const p of sq.players) {
const dob = p.date_of_birth ? p.date_of_birth.replace(/\s/g, '') : null
await db.execute(sql` await db.execute(sql`
INSERT INTO squads (tournament_year, team_id, player_name, shirt_number, position, date_of_birth) INSERT INTO squads (tournament_year, team_id, player_name, shirt_number, position, date_of_birth)
VALUES (2026, ${teamId}, ${p.name as string}, ${p.number as number ?? null}, VALUES (2026, ${teamId}, ${p.name}, ${p.number ?? null}, ${p.pos ?? null}, ${dob})
${p.pos as string ?? null}, ${p.date_of_birth as string ?? null})
ON CONFLICT (tournament_year, team_id, shirt_number) DO UPDATE SET ON CONFLICT (tournament_year, team_id, shirt_number) DO UPDATE SET
player_name = EXCLUDED.player_name, position = EXCLUDED.position, date_of_birth = EXCLUDED.date_of_birth player_name = EXCLUDED.player_name,
position = EXCLUDED.position,
date_of_birth = EXCLUDED.date_of_birth
`) `)
} }
} }
console.log(' Squads loaded for 2026') console.log(` Squads: ${squads.length} teams`)
} }
// Quali playoffs // Tournament winner (once the final is played)
const qualiData = await fetchJson(`${BASE}/2026/worldcup.quali_playoffs.json`) as RawData | null if (meta.winner) {
if (qualiData?.matches) { await db.execute(sql`
for (const m of qualiData.matches) { UPDATE tournaments SET
const t1Id = await upsertTeam(m.team1) winner = ${meta.winner},
const t2Id = await upsertTeam(m.team2) runner_up = ${meta.runner_up},
const score = parseScore(m.score) third_place = ${meta.third_place},
const matchId = await upsertMatch(2026, m.round ?? 'Qualifier', null, m.date ?? null, m.time ?? null, t1Id, t2Id, score, true) fourth_place = ${meta.fourth_place}
if (m.goals1?.length || m.goals2?.length) { WHERE year = 2026
const goalRows = [ `)
...(m.goals1?.length ? collectGoals(t1Id, m.goals1, t2Id) : []),
...(m.goals2?.length ? collectGoals(t2Id, m.goals2, t1Id) : []),
]
await replaceGoals(matchId, goalRows)
}
}
console.log(` Quali playoffs: ${qualiData.matches.length} matches`)
} }
// Group standings from match results // Group standings
await db.execute(sql` await db.execute(sql`
WITH match_results AS ( WITH match_results AS (
SELECT tournament_year, group_name, team1_id AS team_id, score_ft_home AS gf, score_ft_away AS ga SELECT tournament_year, group_name, team1_id AS team_id, score_ft_home AS gf, score_ft_away AS ga