From f885e4312c4b40e48bcb4a7c6a303d5280ea1e20 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sebastian=20Kr=C3=BCger?= Date: Mon, 15 Jun 2026 17:23:17 +0200 Subject: [PATCH] refactor: extract lib/wiki-scraper.ts, make scraper composable, sync from Wikipedia MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move all scraping logic (fetchWikiHtml, scrapeYear, scrapeSquads and all helpers) into lib/wiki-scraper.ts as exported functions shared by both scripts. scrape-wikipedia.ts becomes a composable CLI: pnpm scrape [year] — matches + squads (default) pnpm scrape [year] --matches — matches/meta/stadiums only pnpm scrape [year] --squads — squads only sync.ts drops the openfootball GitHub dependency entirely and scrapes Wikipedia directly. Incremental: completed groups (all matches have FT scores) are detected via DB query and their sub-pages are skipped each run. Co-Authored-By: Claude Sonnet 4.6 --- lib/wiki-scraper.ts | 467 +++++++++++++++++++++++++ scripts/scrape-wikipedia.ts | 673 +++--------------------------------- scripts/sync.ts | 252 ++++++-------- 3 files changed, 635 insertions(+), 757 deletions(-) create mode 100644 lib/wiki-scraper.ts diff --git a/lib/wiki-scraper.ts b/lib/wiki-scraper.ts new file mode 100644 index 0000000..78c596c --- /dev/null +++ b/lib/wiki-scraper.ts @@ -0,0 +1,467 @@ +import { load } from 'cheerio' +import type { CheerioAPI, Cheerio } from 'cheerio' +import type { Element } from 'domhandler' + +// ── Types ────────────────────────────────────────────────────────────────── + +export type Goal = { + name: string + minute?: number + offset?: number + penalty?: boolean + owngoal?: boolean +} + +export type ScoreObj = { + ft?: [number, number] + et?: [number, number] + p?: [number, number] +} + +export type Match = { + round: string + group?: string + date?: string + time?: string + team1: string + team2: string + score?: ScoreObj + goals1?: Goal[] + goals2?: Goal[] + ground?: string +} + +export type Stadium = { name: string; city: string } +export type Group = { name: string; teams: string[] } +export type Meta = { + host: string + teams_count: number | null + winner: string | null + runner_up: string | null + third_place: string | null + fourth_place: string | null +} + +export type Player = { name: string; number?: number; pos?: string; date_of_birth?: string } +export type Squad = { name: string; players: Player[] } + +export type YearResult = { + matches: Match[] + stadiums: Map + groups: Map> + meta: Meta +} + +type State = { active: boolean; round: string; group: string | null } + +// ── Fetch ────────────────────────────────────────────────────────────────── + +const delay = (ms: number) => new Promise(r => setTimeout(r, ms)) + +export async function fetchWikiHtml(page: string, retries = 5): Promise { + const url = `https://en.wikipedia.org/w/api.php?action=parse&page=${encodeURIComponent(page)}&format=json&prop=text&disabletoc=1` + for (let attempt = 0; attempt < retries; attempt++) { + try { + if (attempt > 0) await delay(3000 * attempt) + const res = await fetch(url, { headers: { 'User-Agent': 'WorldCupScraper/1.0' } }) + if (!res.ok) continue + const data = await res.json() as { parse?: { text?: { '*': string } } } + const html = data?.parse?.text?.['*'] + if (html) return html + } catch { + // retry + } + } + return null +} + +// ── Parsing helpers ──────────────────────────────────────────────────────── + +function parseScoreText(text: string): [number, number] | null { + const m = text.match(/(\d+)\s*[–\-]\s*(\d+)/) + if (!m) return null + return [parseInt(m[1]), parseInt(m[2])] +} + +function extractTeam($: CheerioAPI, $cell: Cheerio): string { + let name = '' + $cell.find('a').each((_, a) => { + const $a = $(a) + if (!$a.find('img').length && $a.text().trim()) { + name = $a.text().trim() + return false + } + }) + return name +} + +function parseGoals($: CheerioAPI, $td: Cheerio): Goal[] { + const goals: Goal[] = [] + $td.find('li').each((_, li) => { + const $li = $(li) + let playerName = '' + $li.find('a').each((_, a) => { + if (!$(a).closest('.fb-goal').length) { + const t = $(a).text().trim() + if (t) { playerName = t; return false } + } + }) + if (!playerName) return + const $fbGoal = $li.find('.fb-goal') + if (!$fbGoal.length) return + $fbGoal.children('span').each((_, span) => { + const $span = $(span) + if ($span.attr('typeof')) return + const text = $span.text() + const minMatch = text.match(/(\d+)(?:\+(\d+))?['′]/) + if (!minMatch) return + const minute = parseInt(minMatch[1]) + const offset = minMatch[2] ? parseInt(minMatch[2]) : 0 + const goal: Goal = { name: playerName } + if (!isNaN(minute)) goal.minute = minute + if (offset) goal.offset = offset + if (text.includes('pen.')) goal.penalty = true + if (text.includes('o.g.')) goal.owngoal = true + goals.push(goal) + }) + }) + return goals +} + +function extractGround($: CheerioAPI, $box: Cheerio): string { + const $loc = $box.find('[itemprop="name address"]').first() + if ($loc.length) return $loc.text().trim() + return $box.find('.fright').first().text().split('\n')[0].trim() +} + +function parseGroundParts(ground: string): { name: string; city: string } { + const commaIdx = ground.indexOf(',') + if (commaIdx !== -1) return { name: ground.slice(0, commaIdx).trim(), city: ground.slice(commaIdx + 1).trim() } + return { name: ground, city: '' } +} + +function parseBox($: CheerioAPI, $box: Cheerio, round: string, group: string | null): Match | null { + const team1 = extractTeam($, $box.find('.fhome')) + const team2 = extractTeam($, $box.find('.faway')) + if (!team1 || !team2) return null + + const dateStr = $box.find('.bday, .dtstart').first().text().trim() || undefined + const timeText = $box.find('.ftime').first().text().trim() + const timeStr = timeText.match(/(\d{2}:\d{2})/)?.[1] + const scoreText = $box.find('.fscore').first().text().trim() + const hasAET = scoreText.toLowerCase().includes('a.e.t.') + const scoreArr = parseScoreText(scoreText) + + const $regularRow = $box.find('tr.fgoals').first() + const goals1 = parseGoals($, $regularRow.find('.fhgoal')) + const goals2 = parseGoals($, $regularRow.find('.fagoal')) + + let penScore: [number, number] | undefined + $box.find('tr').each((_, tr) => { + const $tr = $(tr) + if ($tr.find('th[colspan]').text().toLowerCase().includes('penalt')) { + const ps = parseScoreText($tr.next('tr').find('th').not('.fhome,.faway').first().text().trim()) + if (ps) penScore = ps + return false + } + }) + + let score: ScoreObj | undefined + if (scoreArr) { + if (hasAET) { + const ftGoals = (gs: Goal[], includeOG = false) => + gs.filter(g => { + const w90 = g.minute === undefined || g.minute <= 90 + return includeOG ? g.owngoal === true && w90 : !g.owngoal && w90 + }).length + score = { ft: [ftGoals(goals1) + ftGoals(goals2, true), ftGoals(goals2) + ftGoals(goals1, true)], et: scoreArr } + } else { + score = { ft: scoreArr } + } + if (penScore) score.p = penScore + } + + const ground = extractGround($, $box) || undefined + return { + round, + ...(group ? { group } : {}), + ...(dateStr ? { date: dateStr } : {}), + ...(timeStr ? { time: timeStr } : {}), + team1, team2, + ...(score ? { score } : {}), + ...(goals1.length ? { goals1 } : {}), + ...(goals2.length ? { goals2 } : {}), + ...(ground ? { ground } : {}), + } +} + +function collectBoxes($: CheerioAPI, round: string, group: string | null): Match[] { + const matches: Match[] = [] + $('.footballbox').each((_, el) => { + const m = parseBox($, $(el), round, group) + if (m) matches.push(m) + }) + return matches +} + +function processHeading(text: string, level: number, state: State): void { + const t = text.toLowerCase().trim() + if (level === 2) { + if (/group stage/i.test(t) && !/second/i.test(t)) { + state.active = true; state.round = 'Group stage'; state.group = null + } else if (/first group stage/i.test(t)) { + state.active = true; state.round = 'Group stage'; state.group = null + } else if (/second group stage/i.test(t)) { + state.active = true; state.round = 'Second group stage'; state.group = null + } else if (t === 'final round') { + state.active = true; state.round = 'Final round'; state.group = null + } else if (/final tournament/i.test(t)) { + state.active = true; state.round = ''; state.group = null + } else if (/knock.?out stage/i.test(t)) { + state.active = true; state.round = ''; state.group = null + } else if (/round of 16/i.test(t)) { + state.active = true; state.round = 'Round of 16'; state.group = null + } else if (/quarter.final/i.test(t)) { + state.active = true; state.round = 'Quarter-finals'; state.group = null + } else if (/semi.final/i.test(t)) { + state.active = true; state.round = 'Semi-finals'; state.group = null + } else if (/third.place|match for third|play.off for third/i.test(t)) { + state.active = true; state.round = 'Third-place match'; state.group = null + } else if (t === 'final') { + state.active = true; state.round = 'Final'; state.group = null + } else { + state.active = false + } + return + } + if (!state.active) return + if (level === 3 || level === 4) { + if (/^group [a-h1-9]+$/i.test(t)) { + state.group = text.trim() + } else if (/round of 32/i.test(t)) { + state.round = 'Round of 32'; state.group = null + } else if (/round of 16/i.test(t)) { + state.round = 'Round of 16'; state.group = null + } else if (/quarter.final/i.test(t)) { + state.round = 'Quarter-finals'; state.group = null + } else if (/semi.final/i.test(t)) { + state.round = 'Semi-finals'; state.group = null + } else if (/third.place|match for third|play.off for third/i.test(t)) { + state.round = 'Third-place match'; state.group = null + } else if (t === 'final') { + state.round = 'Final'; state.group = null + } + } +} + +// ── Infobox ──────────────────────────────────────────────────────────────── + +function parseInfobox($: CheerioAPI): Partial { + const result: Partial = {} + + function tdText($td: Cheerio): string { + const $clone = $td.clone() + $clone.find('br').replaceWith(' / ') + $clone.find('sup, img').remove() + return $clone.text().replace(/\[\d+\]/g, '').replace(/\s+/g, ' ').trim() + } + function tdFirstLink($td: Cheerio): string | null { + let name: string | null = null + $td.find('a').each((_, a) => { + const t = $(a).clone().find('img').remove().end().text().trim() + if (t && !/\[\d+\]/.test(t)) { name = t; return false } + }) + return name ?? (tdText($td) || null) + } + function tdAllLinks($td: Cheerio): string { + const names: string[] = [] + $td.find('a').each((_, a) => { + const t = $(a).clone().find('img').remove().end().text().trim() + if (t && !/\[\d+\]/.test(t)) names.push(t) + }) + return names.length ? names.join(' / ') : tdText($td) + } + + $('table.infobox').first().find('tr').each((_, tr) => { + const $tr = $(tr) + const label = $tr.find('th').text().trim().toLowerCase() + const $td = $tr.find('td').first() + if (!$td.length) return + if (/host countr/i.test(label)) result.host = tdAllLinks($td) + else if (/^teams$/i.test(label)) { const m = $td.text().match(/\d+/); if (m) result.teams_count = parseInt(m[0]) } + else if (/champion/i.test(label)) result.winner = tdFirstLink($td) + else if (/runners?.up/i.test(label)) result.runner_up = tdFirstLink($td) + else if (/third.place/i.test(label)) result.third_place = tdFirstLink($td) + else if (/fourth.place/i.test(label)) result.fourth_place = tdFirstLink($td) + }) + + return result +} + +function derivePlacements(matches: Match[]): Pick { + function matchWinner(m: Match): [string, string] | null { + if (!m.score) return null + const [h, a] = m.score.et ?? m.score.ft ?? [0, 0] + if (h > a) return [m.team1, m.team2] + if (a > h) return [m.team2, m.team1] + if (m.score.p) { + const [ph, pa] = m.score.p + if (ph > pa) return [m.team1, m.team2] + if (pa > ph) return [m.team2, m.team1] + } + return null + } + let winner: string | null = null, runner_up: string | null = null + let third_place: string | null = null, fourth_place: string | null = null + for (const m of matches) { + if (m.round === 'Final') { + const r = matchWinner(m); if (r) [winner, runner_up] = r + } else if (m.round === 'Third-place match') { + const r = matchWinner(m); if (r) [third_place, fourth_place] = r + } + } + return { winner, runner_up, third_place, fourth_place } +} + +// ── Main year scraper ────────────────────────────────────────────────────── + +export async function scrapeYear( + year: number, + mainHtml: string, + opts?: { skipGroups?: Set }, +): Promise { + const $ = load(mainHtml) + const matches: Match[] = [] + const stadiums = new Map() + const groups = new Map>() + const state: State = { active: false, round: '', group: null } + const groupSubpages = new Map() + const groupsOnMainPage = new Set() + + function recordMatch(m: Match) { + matches.push(m) + if (m.group) groupsOnMainPage.add(m.group) + if (m.ground) { + const { name, city } = parseGroundParts(m.ground) + if (name && !stadiums.has(name)) stadiums.set(name, { name, city }) + } + if (m.group) { + if (!groups.has(m.group)) groups.set(m.group, new Set()) + groups.get(m.group)!.add(m.team1) + groups.get(m.group)!.add(m.team2) + } + } + + $('.mw-parser-output').find('div.mw-heading, .footballbox, .hatnote').each((_, el) => { + const $el = $(el) + if ($el.hasClass('mw-heading')) { + const $h = $el.find('h2, h3, h4').first() + const level = parseInt($h.prop('tagName')?.slice(1) ?? '9') + processHeading($h.text().replace(/\[edit\]/g, '').trim(), level, state) + } else if ($el.hasClass('hatnote') && $el.text().includes('Main article')) { + if (state.active && state.group) { + const link = $el.find('a[href^="/wiki/"]').first().attr('href') + if (link) { + const page = link.replace('/wiki/', '').split('#')[0] + if (/World_Cup_Group/i.test(page) && !groupSubpages.has(state.group)) + groupSubpages.set(state.group, page) + } + } + } else if ($el.hasClass('footballbox')) { + if (!state.active) return + const m = parseBox($, $el, state.round || state.group || 'Unknown', state.group) + if (m) recordMatch(m) + } + }) + + for (const [group, page] of groupSubpages) { + if (groupsOnMainPage.has(group)) continue + if (opts?.skipGroups?.has(group)) { + process.stdout.write(`[skip ${group}] `) + continue + } + await delay(1200) + const subHtml = await fetchWikiHtml(page) + if (!subHtml) { process.stdout.write(`(failed: ${page}) `); continue } + + const stateTemp: State = { active: false, round: '', group: null } + let round = 'Group stage' + $('.mw-parser-output').find('div.mw-heading').each((_, el) => { + const $h = $(el).find('h2, h3, h4').first() + const level = parseInt($h.prop('tagName')?.slice(1) ?? '9') + processHeading($h.text().replace(/\[edit\]/g, '').trim(), level, stateTemp) + if (stateTemp.group === group) { round = stateTemp.round || 'Group stage'; return false } + }) + + const $sub = load(subHtml) + for (const m of collectBoxes($sub, round, group)) recordMatch(m) + process.stdout.write(`[+${page.slice(-8)}] `) + } + + const infobox = parseInfobox($) + const placements = derivePlacements(matches) + const meta: Meta = { + host: infobox.host ?? '', + teams_count: infobox.teams_count ?? null, + winner: placements.winner ?? infobox.winner ?? null, + runner_up: placements.runner_up ?? infobox.runner_up ?? null, + third_place: placements.third_place ?? infobox.third_place ?? null, + fourth_place: placements.fourth_place?? infobox.fourth_place?? null, + } + + return { matches, stadiums, groups, meta } +} + +// ── Squad page scraper ───────────────────────────────────────────────────── + +export function scrapeSquads(html: string): Squad[] { + const $ = load(html) + const squads: Squad[] = [] + let currentTeam: Squad | null = null + + $('.mw-parser-output').find('div.mw-heading, tr.nat-fs-player').each((_, el) => { + const $el = $(el) + + if ($el.hasClass('mw-heading')) { + const $h = $el.find('h3, h4').first() + if (!$h.length) return + if (parseInt($h.prop('tagName')?.slice(1) ?? '9') !== 3) return + const name = $h.text().replace(/\[edit\]/g, '').trim() + if (/^group /i.test(name)) return + currentTeam = { name, players: [] } + squads.push(currentTeam) + return + } + + if (!currentTeam) return + + let number: number | undefined + let pos: string | undefined + let playerName = '' + let dob: string | undefined + + $el.find('td, th[scope="row"]').each((i, td) => { + const $td = $(td) + const text = $td.text().trim() + if ($td.is('th[scope="row"]')) { + playerName = $td.find('a').first().text().trim() || text + } else if (i === 0 && !playerName) { + const n = parseInt(text); if (!isNaN(n)) number = n + } else if (i === 1 && !playerName && !pos) { + const p = $td.find('a').first().text().trim() + if (['GK', 'DF', 'MF', 'FW'].includes(p)) pos = p + } + const $bday = $td.find('.bday') + if ($bday.length) dob = $bday.text().trim() + }) + + if (!playerName) return + const player: Player = { name: playerName } + if (number !== undefined) player.number = number + if (pos) player.pos = pos + if (dob) player.date_of_birth = dob + currentTeam.players.push(player) + }) + + return squads.filter(s => s.players.length > 0) +} diff --git a/scripts/scrape-wikipedia.ts b/scripts/scrape-wikipedia.ts index 1679843..3d7922b 100644 --- a/scripts/scrape-wikipedia.ts +++ b/scripts/scrape-wikipedia.ts @@ -1,10 +1,20 @@ -import { load } from 'cheerio' -import type { CheerioAPI } from 'cheerio' -import type { Cheerio } from 'cheerio' -import type { Element } from 'domhandler' +/** + * Scrape English Wikipedia for World Cup data and write JSON files to + * app/data/wikipedia/{year}/. + * + * Usage: + * pnpm scrape # all years, matches + squads + * pnpm scrape 2022 # single year, matches + squads + * pnpm scrape 2022 --matches # matches + meta + stadiums only + * pnpm scrape 2022 --squads # squads only + */ import { mkdirSync, writeFileSync } from 'fs' import path from 'path' import { fileURLToPath } from 'url' +import { + fetchWikiHtml, scrapeYear, scrapeSquads, + type Match, type Stadium, type Group, type Meta, type Squad, +} from '../lib/wiki-scraper' const __dirname = path.dirname(fileURLToPath(import.meta.url)) const DATA_DIR = path.join(__dirname, '../app/data/wikipedia') @@ -16,646 +26,73 @@ const YEARS = [ const delay = (ms: number) => new Promise(r => setTimeout(r, ms)) -// ── Types ────────────────────────────────────────────────────────────────── +// ── File output ──────────────────────────────────────────────────────────── -type Goal = { - name: string - minute?: number - offset?: number - penalty?: boolean - owngoal?: boolean -} - -type ScoreObj = { - ft?: [number, number] - et?: [number, number] - p?: [number, number] -} - -type Match = { - round: string - group?: string - date?: string - time?: string - team1: string - team2: string - score?: ScoreObj - goals1?: Goal[] - goals2?: Goal[] - ground?: string -} - -type Stadium = { name: string; city: string } -type Player = { name: string; number?: number; pos?: string; date_of_birth?: string } -type Squad = { name: string; players: Player[] } -type Group = { name: string; teams: string[] } -type Meta = { - host: string - teams_count: number | null - winner: string | null - runner_up: string | null - third_place: string | null - fourth_place: string | null -} - -// ── Fetch ────────────────────────────────────────────────────────────────── - -async function fetchWikiHtml(page: string, retries = 5): Promise { - const url = `https://en.wikipedia.org/w/api.php?action=parse&page=${encodeURIComponent(page)}&format=json&prop=text&disabletoc=1` - for (let attempt = 0; attempt < retries; attempt++) { - try { - if (attempt > 0) await delay(3000 * attempt) - const res = await fetch(url, { headers: { 'User-Agent': 'WorldCupScraper/1.0 (github.com/worldcup)' } }) - if (!res.ok) continue - const data = await res.json() as { parse?: { text?: { '*': string } } } - const html = data?.parse?.text?.['*'] - if (html) return html - } catch { - // retry - } - } - return null -} - -// ── Score parsing ────────────────────────────────────────────────────────── - -function parseScoreText(text: string): [number, number] | null { - const m = text.match(/(\d+)\s*[–\-]\s*(\d+)/) - if (!m) return null - return [parseInt(m[1]), parseInt(m[2])] -} - -// ── Team name extraction ─────────────────────────────────────────────────── - -function extractTeam($: CheerioAPI, $cell: Cheerio): string { - let name = '' - $cell.find('a').each((_, a) => { - const $a = $(a) - if (!$a.find('img').length && $a.text().trim()) { - name = $a.text().trim() - return false - } - }) - return name -} - -// ── Goal parsing ─────────────────────────────────────────────────────────── - -function parseGoals($: CheerioAPI, $td: Cheerio): Goal[] { - const goals: Goal[] = [] - - $td.find('li').each((_, li) => { - const $li = $(li) - - // Player name: first NOT inside .fb-goal - let playerName = '' - $li.find('a').each((_, a) => { - if (!$(a).closest('.fb-goal').length) { - const t = $(a).text().trim() - if (t) { playerName = t; return false } - } - }) - if (!playerName) return - - const $fbGoal = $li.find('.fb-goal') - if (!$fbGoal.length) return - - // Each direct child inside .fb-goal (excluding image wrapper) - $fbGoal.children('span').each((_, span) => { - const $span = $(span) - if ($span.attr('typeof')) return // image wrapper - - const text = $span.text() - const minMatch = text.match(/(\d+)(?:\+(\d+))?['′]/) - if (!minMatch) return - - const minute = parseInt(minMatch[1]) - const offset = minMatch[2] ? parseInt(minMatch[2]) : 0 - const isPen = text.includes('pen.') - const isOG = text.includes('o.g.') - - const goal: Goal = { name: playerName } - if (!isNaN(minute)) goal.minute = minute - if (offset) goal.offset = offset - if (isPen) goal.penalty = true - if (isOG) goal.owngoal = true - goals.push(goal) - }) - }) - - return goals -} - -// ── Ground extraction ────────────────────────────────────────────────────── - -function extractGround($: CheerioAPI, $box: Cheerio): string { - const $loc = $box.find('[itemprop="name address"]').first() - if ($loc.length) return $loc.text().trim() - return $box.find('.fright').first().text().split('\n')[0].trim() -} - -function parseGroundParts(ground: string): { name: string; city: string } { - const commaIdx = ground.indexOf(',') - if (commaIdx !== -1) { - return { - name: ground.slice(0, commaIdx).trim(), - city: ground.slice(commaIdx + 1).trim(), - } - } - return { name: ground, city: '' } -} - -// ── Footballbox parsing ──────────────────────────────────────────────────── - -function parseBox( - $: CheerioAPI, - $box: Cheerio, - round: string, - group: string | null, -): Match | null { - const team1 = extractTeam($, $box.find('.fhome')) - const team2 = extractTeam($, $box.find('.faway')) - if (!team1 || !team2) return null - - const dateStr = $box.find('.bday, .dtstart').first().text().trim() || undefined - - const timeText = $box.find('.ftime').first().text().trim() - const timeMatch = timeText.match(/(\d{2}:\d{2})/) - const timeStr = timeMatch?.[1] - - const scoreText = $box.find('.fscore').first().text().trim() - const hasAET = scoreText.toLowerCase().includes('a.e.t.') - const scoreArr = parseScoreText(scoreText) - - // Use first fgoals row only (exclude penalty shootout row) - const $regularRow = $box.find('tr.fgoals').first() - const goals1 = parseGoals($, $regularRow.find('.fhgoal')) - const goals2 = parseGoals($, $regularRow.find('.fagoal')) - - // Penalty shootout score: row after "Penalties" header tr - let penScore: [number, number] | undefined - $box.find('tr').each((_, tr) => { - const $tr = $(tr) - if ($tr.find('th[colspan]').text().toLowerCase().includes('penalt')) { - const penText = $tr.next('tr').find('th').not('.fhome,.faway').first().text().trim() - const ps = parseScoreText(penText) - if (ps) penScore = ps - return false - } - }) - - let score: ScoreObj | undefined - if (scoreArr) { - if (hasAET) { - // scoreArr is ET total; compute FT from goals in ≤90 min - const ftGoals = (gs: Goal[], includeOG = false) => - gs.filter(g => { - const w90 = g.minute === undefined || g.minute <= 90 - return includeOG ? g.owngoal === true && w90 : !g.owngoal && w90 - }).length - const ftHome = ftGoals(goals1) + ftGoals(goals2, true) - const ftAway = ftGoals(goals2) + ftGoals(goals1, true) - score = { ft: [ftHome, ftAway], et: scoreArr } - } else { - score = { ft: scoreArr } - } - if (penScore) score.p = penScore - } - - const ground = extractGround($, $box) || undefined - - return { - round, - ...(group ? { group } : {}), - ...(dateStr ? { date: dateStr } : {}), - ...(timeStr ? { time: timeStr } : {}), - team1, - team2, - ...(score ? { score } : {}), - ...(goals1.length ? { goals1 } : {}), - ...(goals2.length ? { goals2 } : {}), - ...(ground ? { ground } : {}), - } -} - -// ── Collect matches from a pre-loaded page ───────────────────────────────── - -function collectBoxes( - $: CheerioAPI, - round: string, - group: string | null, -): Match[] { - const matches: Match[] = [] - $('.footballbox').each((_, el) => { - const m = parseBox($, $(el), round, group) - if (m) matches.push(m) - }) - return matches -} - -// ── Section heading state machine ────────────────────────────────────────── - -type State = { - active: boolean - round: string - group: string | null -} - -function processHeading(text: string, level: number, state: State): void { - const t = text.toLowerCase().trim() - - if (level === 2) { - if (/group stage/i.test(t) && !/second/i.test(t)) { - state.active = true; state.round = 'Group stage'; state.group = null - } else if (/first group stage/i.test(t)) { - state.active = true; state.round = 'Group stage'; state.group = null - } else if (/second group stage/i.test(t)) { - state.active = true; state.round = 'Second group stage'; state.group = null - } else if (t === 'final round') { - state.active = true; state.round = 'Final round'; state.group = null - } else if (/final tournament/i.test(t)) { - state.active = true; state.round = ''; state.group = null - } else if (/knock.?out stage/i.test(t)) { - state.active = true; state.round = ''; state.group = null - } else if (/round of 16/i.test(t)) { - state.active = true; state.round = 'Round of 16'; state.group = null - } else if (/quarter.final/i.test(t)) { - state.active = true; state.round = 'Quarter-finals'; state.group = null - } else if (/semi.final/i.test(t)) { - state.active = true; state.round = 'Semi-finals'; state.group = null - } else if (/third.place|match for third|play.off for third/i.test(t)) { - state.active = true; state.round = 'Third-place match'; state.group = null - } else if (t === 'final') { - state.active = true; state.round = 'Final'; state.group = null - } else { - state.active = false - } - return - } - - if (!state.active) return - - if (level === 3 || level === 4) { - if (/^group [a-h1-9]+$/i.test(t)) { - state.group = text.trim() - } else if (/round of 32/i.test(t)) { - state.round = 'Round of 32'; state.group = null - } else if (/round of 16/i.test(t)) { - state.round = 'Round of 16'; state.group = null - } else if (/quarter.final/i.test(t)) { - state.round = 'Quarter-finals'; state.group = null - } else if (/semi.final/i.test(t)) { - state.round = 'Semi-finals'; state.group = null - } else if (/third.place|match for third|play.off for third/i.test(t)) { - state.round = 'Third-place match'; state.group = null - } else if (t === 'final') { - state.round = 'Final'; state.group = null - } - // bracket, draw, seeding, replay → keep current state - } -} - -// ── Main year scraper ────────────────────────────────────────────────────── - -// ── Infobox parsing ──────────────────────────────────────────────────────── - -function parseInfobox($: CheerioAPI): Partial { - const result: Partial = {} - - function tdText($td: Cheerio): string { - const $clone = $td.clone() - $clone.find('br').replaceWith(' / ') - $clone.find('sup, img').remove() - return $clone.text().replace(/\[\d+\]/g, '').replace(/\s+/g, ' ').trim() - } - - function tdFirstLink($td: Cheerio): string | null { - let name: string | null = null - $td.find('a').each((_, a) => { - const t = $(a).clone().find('img').remove().end().text().trim() - if (t && !/\[\d+\]/.test(t)) { name = t; return false } - }) - return name ?? (tdText($td) || null) - } - - function tdAllLinks($td: Cheerio): string { - const names: string[] = [] - $td.find('a').each((_, a) => { - const t = $(a).clone().find('img').remove().end().text().trim() - if (t && !/\[\d+\]/.test(t)) names.push(t) - }) - return names.length ? names.join(' / ') : tdText($td) - } - - $('table.infobox').first().find('tr').each((_, tr) => { - const $tr = $(tr) - const label = $tr.find('th').text().trim().toLowerCase() - const $td = $tr.find('td').first() - if (!$td.length) return - if (/host countr/i.test(label)) { - result.host = tdAllLinks($td) - } else if (/^teams$/i.test(label)) { - const m = $td.text().match(/\d+/) - if (m) result.teams_count = parseInt(m[0]) - } else if (/champion/i.test(label)) { - result.winner = tdFirstLink($td) - } else if (/runners?.up/i.test(label)) { - result.runner_up = tdFirstLink($td) - } else if (/third.place/i.test(label)) { - result.third_place = tdFirstLink($td) - } else if (/fourth.place/i.test(label)) { - result.fourth_place = tdFirstLink($td) - } - }) - - return result -} - -// ── Placement derivation ─────────────────────────────────────────────────── - -function derivePlacements(matches: Match[]): Pick { - function matchWinner(m: Match): [string, string] | null { - if (!m.score) return null - const [h, a] = m.score.et ?? m.score.ft ?? [0, 0] - if (h > a) return [m.team1, m.team2] - if (a > h) return [m.team2, m.team1] - if (m.score.p) { - const [ph, pa] = m.score.p - if (ph > pa) return [m.team1, m.team2] - if (pa > ph) return [m.team2, m.team1] - } - return null - } - - let winner: string | null = null, runner_up: string | null = null - let third_place: string | null = null, fourth_place: string | null = null - - for (const m of matches) { - if (m.round === 'Final') { - const result = matchWinner(m) - if (result) { [winner, runner_up] = result } - } else if (m.round === 'Third-place match') { - const result = matchWinner(m) - if (result) { [third_place, fourth_place] = result } - } - } - - return { winner, runner_up, third_place, fourth_place } -} - -// ── Year result ──────────────────────────────────────────────────────────── - -type YearResult = { - matches: Match[] - stadiums: Map - groups: Map> - meta: Meta -} - -async function scrapeYear(year: number, mainHtml: string): Promise { - const $ = load(mainHtml) - const matches: Match[] = [] - const stadiums = new Map() - const groups = new Map>() - - const state: State = { active: false, round: '', group: null } - - // Maps group name → sub-page to fetch (if main page has no matches for that group) - const groupSubpages = new Map() - // Groups that got at least one match from the main page - const groupsOnMainPage = new Set() - - function recordMatch(m: Match) { - matches.push(m) - if (m.group) groupsOnMainPage.add(m.group) - if (m.ground) { - const { name, city } = parseGroundParts(m.ground) - if (name && !stadiums.has(name)) stadiums.set(name, { name, city }) - } - if (m.group) { - if (!groups.has(m.group)) groups.set(m.group, new Set()) - groups.get(m.group)!.add(m.team1) - groups.get(m.group)!.add(m.team2) - } - } - - // Walk elements in document order: headings, hatnotes, footballboxes - $('.mw-parser-output').find('div.mw-heading, .footballbox, .hatnote').each((_, el) => { - const $el = $(el) - - if ($el.hasClass('mw-heading')) { - const $h = $el.find('h2, h3, h4').first() - const level = parseInt($h.prop('tagName')?.slice(1) ?? '9') - const text = $h.text().replace(/\[edit\]/g, '').trim() - processHeading(text, level, state) - - } else if ($el.hasClass('hatnote') && $el.text().includes('Main article')) { - // Record sub-page link for current group context (for fallback if no main-page matches) - if (state.active && state.group) { - const link = $el.find('a[href^="/wiki/"]').first().attr('href') - if (link) { - const page = link.replace('/wiki/', '').split('#')[0] - if (/World_Cup_Group/i.test(page) && !groupSubpages.has(state.group)) { - groupSubpages.set(state.group, page) - } - } - } - - } else if ($el.hasClass('footballbox')) { - if (!state.active) return - const round = state.round || state.group || 'Unknown' - const m = parseBox($, $el, round, state.group) - if (m) recordMatch(m) - } - }) - - // Fetch group sub-pages for any group that got 0 matches from main page - for (const [group, page] of groupSubpages) { - if (groupsOnMainPage.has(group)) continue - - await delay(1200) - const subHtml = await fetchWikiHtml(page) - if (!subHtml) { process.stdout.write(`(failed: ${page}) `); continue } - - // Determine the round for this group from the state machine result - // (we'll reconstruct from the main-page walk state — use the round that was active when this group was seen) - // Since we can't easily recover state here, we re-walk to find the round for this group - let round = 'Group stage' - let foundGroup = false - const stateTemp: State = { active: false, round: '', group: null } - $('.mw-parser-output').find('div.mw-heading').each((_, el) => { - const $h = $(el).find('h2, h3, h4').first() - const level = parseInt($h.prop('tagName')?.slice(1) ?? '9') - const text = $h.text().replace(/\[edit\]/g, '').trim() - processHeading(text, level, stateTemp) - if (stateTemp.group === group) { - round = stateTemp.round || 'Group stage' - foundGroup = true - return false - } - }) - - const $sub = load(subHtml) - const subMatches = collectBoxes($sub, round || 'Group stage', group) - for (const m of subMatches) { - recordMatch(m) - } - process.stdout.write(`[+${page.slice(-8)}] `) - } - - const infobox = parseInfobox($) - const placements = derivePlacements(matches) - const meta: Meta = { - host: infobox.host ?? '', - teams_count: infobox.teams_count ?? null, - winner: placements.winner ?? infobox.winner ?? null, - runner_up: placements.runner_up ?? infobox.runner_up ?? null, - third_place: placements.third_place ?? infobox.third_place ?? null, - fourth_place:placements.fourth_place?? infobox.fourth_place?? null, - } - - return { matches, stadiums, groups, meta } -} - -// ── Squad page scraper ───────────────────────────────────────────────────── - -function scrapeSquads(html: string): Squad[] { - const $ = load(html) - const squads: Squad[] = [] - let currentTeam: Squad | null = null - - $('.mw-parser-output').find('div.mw-heading, tr.nat-fs-player').each((_, el) => { - const $el = $(el) - - if ($el.hasClass('mw-heading')) { - const $h = $el.find('h3, h4').first() - if (!$h.length) return - const level = parseInt($h.prop('tagName')?.slice(1) ?? '9') - if (level !== 3) return - const name = $h.text().replace(/\[edit\]/g, '').trim() - if (/^group /i.test(name)) return // skip group headers - currentTeam = { name, players: [] } - squads.push(currentTeam) - return - } - - if (!currentTeam) return - - let number: number | undefined - let pos: string | undefined - let playerName = '' - let dob: string | undefined - - $el.find('td, th[scope="row"]').each((i, td) => { - const $td = $(td) - const text = $td.text().trim() - - if ($td.is('th[scope="row"]')) { - playerName = $td.find('a').first().text().trim() || text - } else if (i === 0 && !playerName) { - const n = parseInt(text) - if (!isNaN(n)) number = n - } else if (i === 1 && !playerName && !pos) { - const posLink = $td.find('a').first().text().trim() - if (['GK', 'DF', 'MF', 'FW'].includes(posLink)) pos = posLink - } - - const $bday = $td.find('.bday') - if ($bday.length) dob = $bday.text().trim() - }) - - if (!playerName) return - - const player: Player = { name: playerName } - if (number !== undefined) player.number = number - if (pos) player.pos = pos - if (dob) player.date_of_birth = dob - currentTeam.players.push(player) - }) - - return squads.filter(s => s.players.length > 0) -} - -// ── Output ───────────────────────────────────────────────────────────────── - -function writeOutput( +function writeMatches( year: number, matches: Match[], stadiums: Map, groups: Map>, - squads: Squad[], meta: Meta, ): void { const dir = path.join(DATA_DIR, String(year)) mkdirSync(dir, { recursive: true }) - writeFileSync( - path.join(dir, 'worldcup.meta.json'), - JSON.stringify(meta, null, 2), - 'utf-8', - ) + writeFileSync(path.join(dir, 'worldcup.meta.json'), JSON.stringify(meta, null, 2), 'utf-8') + writeFileSync(path.join(dir, 'worldcup.json'), JSON.stringify({ matches }, null, 2), 'utf-8') - writeFileSync( - path.join(dir, 'worldcup.json'), - JSON.stringify({ matches }, null, 2), - 'utf-8', - ) - - if (stadiums.size > 0) { - writeFileSync( - path.join(dir, 'worldcup.stadiums.json'), - JSON.stringify({ stadiums: Array.from(stadiums.values()) }, null, 2), - 'utf-8', - ) - } + if (stadiums.size > 0) + writeFileSync(path.join(dir, 'worldcup.stadiums.json'), + JSON.stringify({ stadiums: Array.from(stadiums.values()) }, null, 2), 'utf-8') const groupList: Group[] = [] - groups.forEach((teams, name) => { - groupList.push({ name, teams: Array.from(teams) }) - }) - if (groupList.length > 0) { - writeFileSync( - path.join(dir, 'worldcup.groups.json'), - JSON.stringify({ groups: groupList }, null, 2), - 'utf-8', - ) - } + groups.forEach((teams, name) => groupList.push({ name, teams: Array.from(teams) })) + if (groupList.length > 0) + writeFileSync(path.join(dir, 'worldcup.groups.json'), + JSON.stringify({ groups: groupList }, null, 2), 'utf-8') +} - if (squads.length > 0) { - writeFileSync( - path.join(dir, 'worldcup.squads.json'), - JSON.stringify(squads, null, 2), - 'utf-8', - ) - } +function writeSquads(year: number, squads: Squad[]): void { + if (squads.length === 0) return + const dir = path.join(DATA_DIR, String(year)) + mkdirSync(dir, { recursive: true }) + writeFileSync(path.join(dir, 'worldcup.squads.json'), JSON.stringify(squads, null, 2), 'utf-8') } // ── Entry point ──────────────────────────────────────────────────────────── async function main() { - const onlyYear = process.argv[2] ? parseInt(process.argv[2]) : null - const yearsToScrape = onlyYear ? [onlyYear] : YEARS + const args = process.argv.slice(2) + const yearArg = args.find(a => /^\d{4}$/.test(a)) + const doMatches = args.includes('--matches') || !args.includes('--squads') + const doSquads = args.includes('--squads') || !args.includes('--matches') - console.log(`Scraping ${yearsToScrape.length} World Cup(s) from Wikipedia...`) + const yearsToScrape = yearArg ? [parseInt(yearArg)] : YEARS + const modeLabel = [doMatches && 'matches', doSquads && 'squads'].filter(Boolean).join(' + ') + + console.log(`Scraping ${yearsToScrape.length} World Cup(s) from Wikipedia [${modeLabel}]...`) for (const year of yearsToScrape) { process.stdout.write(` ${year}... `) - const mainHtml = await fetchWikiHtml(`${year}_FIFA_World_Cup`) - if (!mainHtml) { console.log('FAILED'); continue } + if (doMatches) { + const mainHtml = await fetchWikiHtml(`${year}_FIFA_World_Cup`) + if (!mainHtml) { console.log('FAILED (main page)'); continue } + const { matches, stadiums, groups, meta } = await scrapeYear(year, mainHtml) + writeMatches(year, matches, stadiums, groups, meta) + process.stdout.write(`${matches.length} matches`) + await delay(600) + } - const { matches, stadiums, groups, meta } = await scrapeYear(year, mainHtml) + if (doSquads) { + const squadHtml = await fetchWikiHtml(`${year}_FIFA_World_Cup_squads`) + const squads = squadHtml ? scrapeSquads(squadHtml) : [] + writeSquads(year, squads) + process.stdout.write(`${doMatches ? ', ' : ''}${squads.length} squads`) + await delay(600) + } - await delay(600) - - const squadHtml = await fetchWikiHtml(`${year}_FIFA_World_Cup_squads`) - const squads = squadHtml ? scrapeSquads(squadHtml) : [] - - writeOutput(year, matches, stadiums, groups, squads, meta) - - console.log(`${matches.length} matches, ${stadiums.size} stadiums, ${groups.size} groups, ${squads.length} teams`) - - await delay(600) + console.log() } console.log('\nDone! Files written to app/data/wikipedia/{year}/') diff --git a/scripts/sync.ts b/scripts/sync.ts index 7b80bb3..a6e61e8 100644 --- a/scripts/sync.ts +++ b/scripts/sync.ts @@ -1,40 +1,16 @@ import postgres from 'postgres' import { drizzle } from 'drizzle-orm/postgres-js' import { sql } from 'drizzle-orm' -import { TEAM_ISO, getIso } from '../lib/iso-codes' +import { fetchWikiHtml, scrapeYear, scrapeSquads } from '../lib/wiki-scraper' +import { getIso } from '../lib/iso-codes' const DATABASE_URL = process.env.DATABASE_URL if (!DATABASE_URL) { console.error('ERROR: DATABASE_URL environment variable is not set') process.exit(1) } -const BASE = 'https://raw.githubusercontent.com/openfootball/worldcup.json/master' -async function fetchJson(url: string): Promise { - try { - const res = await fetch(url) - if (!res.ok) return null - return res.json() - } catch { - return null - } -} - -type RawGoal = { name: string; minute?: string | number; offset?: number; penalty?: boolean; owngoal?: boolean } -type RawScore = { ft?: number[]; ht?: number[]; et?: number[]; p?: number[] } | number[] -type RawMatch = { - round?: string; date?: string; time?: string; - team1: string; team2: string; score?: RawScore; - goals1?: RawGoal[]; goals2?: RawGoal[]; - group?: string; ground?: string; -} -type RawData = { matches: RawMatch[] } - -function parseScore(score: RawScore | undefined) { - if (!score) return {} - if (Array.isArray(score)) return { ft: score } - return { ft: score.ft, ht: score.ht, et: score.et, p: score.p } -} +// ── DB helpers ───────────────────────────────────────────────────────────── async function run() { const client = postgres(DATABASE_URL!, { max: 2 }) @@ -42,17 +18,13 @@ async function run() { const teamCache = new Map() - async function upsertTeam(rawName: string, extra?: { iso2?: string | null; fifaCode?: string; continent?: string; confederation?: string }) { + async function upsertTeam(rawName: string) { if (teamCache.has(rawName)) return teamCache.get(rawName)! - const iso2 = (extra && 'iso2' in extra) ? extra.iso2 : getIso(rawName) + const iso2 = getIso(rawName) const [row] = await db.execute(sql` - INSERT INTO teams (name, iso2, fifa_code, continent, confederation) - VALUES (${rawName}, ${iso2 ?? null}, ${extra?.fifaCode ?? null}, ${extra?.continent ?? null}, ${extra?.confederation ?? null}) - ON CONFLICT (name) DO UPDATE SET - iso2 = COALESCE(EXCLUDED.iso2, teams.iso2), - fifa_code = COALESCE(EXCLUDED.fifa_code, teams.fifa_code), - continent = COALESCE(EXCLUDED.continent, teams.continent), - confederation = COALESCE(EXCLUDED.confederation, teams.confederation) + INSERT INTO teams (name, iso2) + VALUES (${rawName}, ${iso2 ?? null}) + ON CONFLICT (name) DO UPDATE SET iso2 = COALESCE(EXCLUDED.iso2, teams.iso2) RETURNING id `) const id = (row as { id: number }).id @@ -62,56 +34,42 @@ async function run() { async function upsertMatch( year: number, round: string, group: string | null, dateStr: string | null, - timeStr: string | null, team1Id: number, team2Id: number, score: ReturnType, - isQuali: boolean + timeStr: string | null, team1Id: number, team2Id: number, + ft: [number, number] | undefined, et: [number, number] | undefined, p: [number, number] | undefined, + isQuali: boolean, ) { const rows = await db.execute(sql` INSERT INTO matches (tournament_year, round, group_name, date, time_local, team1_id, team2_id, - score_ft_home, score_ft_away, score_ht_home, score_ht_away, - score_et_home, score_et_away, score_p_home, score_p_away, is_quali_playoff) + score_ft_home, score_ft_away, score_et_home, score_et_away, + score_p_home, score_p_away, is_quali_playoff) VALUES ( - ${year}, ${round}, ${group}, ${dateStr ?? null}, ${timeStr ?? null}, - ${team1Id}, ${team2Id}, - ${score.ft?.[0] ?? null}, ${score.ft?.[1] ?? null}, - ${score.ht?.[0] ?? null}, ${score.ht?.[1] ?? null}, - ${score.et?.[0] ?? null}, ${score.et?.[1] ?? null}, - ${score.p?.[0] ?? null}, ${score.p?.[1] ?? null}, + ${year}, ${round}, ${group}, ${dateStr}, ${timeStr}, ${team1Id}, ${team2Id}, + ${ft?.[0] ?? null}, ${ft?.[1] ?? null}, + ${et?.[0] ?? null}, ${et?.[1] ?? null}, + ${p?.[0] ?? null}, ${p?.[1] ?? null}, ${isQuali} ) ON CONFLICT (tournament_year, team1_id, team2_id, date, is_quali_playoff) DO UPDATE SET round = EXCLUDED.round, - time_local = COALESCE(EXCLUDED.time_local, matches.time_local), + time_local = COALESCE(EXCLUDED.time_local, matches.time_local), score_ft_home = COALESCE(EXCLUDED.score_ft_home, matches.score_ft_home), score_ft_away = COALESCE(EXCLUDED.score_ft_away, matches.score_ft_away), - score_ht_home = COALESCE(EXCLUDED.score_ht_home, matches.score_ht_home), - score_ht_away = COALESCE(EXCLUDED.score_ht_away, matches.score_ht_away), score_et_home = COALESCE(EXCLUDED.score_et_home, matches.score_et_home), score_et_away = COALESCE(EXCLUDED.score_et_away, matches.score_et_away), - score_p_home = COALESCE(EXCLUDED.score_p_home, matches.score_p_home), - score_p_away = COALESCE(EXCLUDED.score_p_away, matches.score_p_away) + score_p_home = COALESCE(EXCLUDED.score_p_home, matches.score_p_home), + score_p_away = COALESCE(EXCLUDED.score_p_away, matches.score_p_away) RETURNING id `) return (rows[0] as { id: number }).id } - type GoalRow = { teamId: number; name: string; minute: number | null; offset: number; penalty: boolean; owngoal: boolean } - - function collectGoals(teamId: number, rawGoals: RawGoal[], isOwnGoalTeamId: number): GoalRow[] { - return rawGoals.flatMap(g => { - if (!g.name) return [] - const minute = g.minute != null ? parseInt(String(g.minute)) : null - return [{ teamId: g.owngoal ? isOwnGoalTeamId : teamId, name: g.name, - minute: isNaN(minute!) ? null : minute, offset: g.offset ?? 0, - penalty: g.penalty ?? false, owngoal: g.owngoal ?? false }] - }) - } - - async function replaceGoals(matchId: number, rows: GoalRow[]) { + async function replaceGoals(matchId: number, goals: Array<{ + teamId: number; name: string; minute: number | null; offset: number; penalty: boolean; owngoal: boolean + }>) { await db.transaction(async tx => { await tx.execute(sql`DELETE FROM goals WHERE match_id = ${matchId}`) - if (rows.length > 0) { - // Single bulk INSERT — readers see old goals until commit, never an empty window - const vals = rows.map(g => + if (goals.length > 0) { + const vals = goals.map(g => sql`(${matchId}, ${g.teamId}, ${g.name}, ${g.minute}, ${g.offset}, ${g.penalty}, ${g.owngoal})` ) await tx.execute(sql` @@ -122,101 +80,117 @@ async function run() { }) } - console.log('\nSyncing 2026...') + // ── Incremental group detection ──────────────────────────────────────────── + // Groups where every known match already has a FT score — no need to re-fetch their sub-page. + + async function getCompletedGroups(): Promise> { + const rows = await db.execute(sql` + SELECT group_name + FROM matches + WHERE tournament_year = 2026 + AND group_name IS NOT NULL + AND is_quali_playoff = false + GROUP BY group_name + HAVING COUNT(*) > 0 + AND COUNT(*) = SUM(CASE WHEN score_ft_home IS NOT NULL THEN 1 ELSE 0 END) + `) + return new Set(rows.map(r => (r as { group_name: string }).group_name)) + } + + // ── Sync 2026 from Wikipedia ─────────────────────────────────────────────── + + console.log('\nSyncing 2026 from Wikipedia...') - // Upsert 2026 tournament row (no winner yet) await db.execute(sql` INSERT INTO tournaments (year, host) VALUES (2026, 'USA / Canada / Mexico') ON CONFLICT (year) DO NOTHING `) - // Teams enrichment - const teamsData = await fetchJson(`${BASE}/2026/worldcup.teams.json`) as Record[] | null - if (teamsData && Array.isArray(teamsData)) { - for (const t of teamsData) { - const name = (t.name ?? t.name_normalised) as string - await upsertTeam(name, { - iso2: TEAM_ISO[name] ?? getIso(name), - fifaCode: t.fifa_code as string, - continent: t.continent as string, - confederation: t.confed as string, - }) - } + const mainHtml = await fetchWikiHtml('2026_FIFA_World_Cup') + if (!mainHtml) { + console.error(' FAILED to fetch 2026 Wikipedia page') + await client.end() + process.exit(1) } + const completedGroups = await getCompletedGroups() + if (completedGroups.size > 0) + console.log(` Skipping completed groups: ${[...completedGroups].sort().join(', ')}`) + + process.stdout.write(' ') + const { matches, stadiums, meta } = await scrapeYear(2026, mainHtml, { skipGroups: completedGroups }) + console.log() + // Stadiums - const stadiumsData = await fetchJson(`${BASE}/2026/worldcup.stadiums.json`) as { stadiums?: Record[] } | null - if (stadiumsData?.stadiums) { - for (const s of stadiumsData.stadiums) { - await db.execute(sql` - INSERT INTO stadiums (tournament_year, name, city, country_code, capacity, timezone, coordinates) - VALUES (2026, ${s.name as string}, ${s.city as string}, ${(s.cc as string | undefined) ?? null}, - ${(s.capacity as number | undefined) ?? null}, ${(s.timezone as string | undefined) ?? null}, ${(s.coords as string | undefined) ?? null}) - ON CONFLICT DO NOTHING - `) - } + for (const s of stadiums.values()) { + await db.execute(sql` + INSERT INTO stadiums (tournament_year, name, city) + VALUES (2026, ${s.name}, ${s.city ?? null}) + ON CONFLICT DO NOTHING + `) } - // Main matches - const mainData = await fetchJson(`${BASE}/2026/worldcup.json`) as RawData | null + // Matches + goals let matchCount = 0, goalCount = 0 - if (mainData?.matches) { - for (const m of mainData.matches) { - const t1Id = await upsertTeam(m.team1) - const t2Id = await upsertTeam(m.team2) - const score = parseScore(m.score) - const matchId = await upsertMatch(2026, m.round ?? 'Unknown', m.group ?? null, m.date ?? null, m.time ?? null, t1Id, t2Id, score, false) - if (m.goals1?.length || m.goals2?.length) { - const goalRows = [ - ...(m.goals1?.length ? collectGoals(t1Id, m.goals1, t2Id) : []), - ...(m.goals2?.length ? collectGoals(t2Id, m.goals2, t1Id) : []), - ] - await replaceGoals(matchId, goalRows) - } - matchCount++ - goalCount += (m.goals1?.length ?? 0) + (m.goals2?.length ?? 0) - } + for (const m of matches) { + const t1Id = await upsertTeam(m.team1) + const t2Id = await upsertTeam(m.team2) + const matchId = await upsertMatch( + 2026, m.round, m.group ?? null, m.date ?? null, m.time ?? null, + t1Id, t2Id, m.score?.ft, m.score?.et, m.score?.p, false, + ) + const goals = [ + ...(m.goals1 ?? []).map(g => ({ + teamId: g.owngoal ? t2Id : t1Id, name: g.name, + minute: g.minute ?? null, offset: g.offset ?? 0, + penalty: g.penalty ?? false, owngoal: g.owngoal ?? false, + })), + ...(m.goals2 ?? []).map(g => ({ + teamId: g.owngoal ? t1Id : t2Id, name: g.name, + minute: g.minute ?? null, offset: g.offset ?? 0, + penalty: g.penalty ?? false, owngoal: g.owngoal ?? false, + })), + ] + if (goals.length > 0) await replaceGoals(matchId, goals) + matchCount++ + goalCount += goals.length } - // Squads - const squadsData = await fetchJson(`${BASE}/2026/worldcup.squads.json`) as Record[] | null - if (squadsData && Array.isArray(squadsData)) { - for (const sq of squadsData) { - const teamId = await upsertTeam(sq.name as string) - for (const p of (sq.players as Record[])) { + // Squads (fetch once; idempotent upsert so safe to re-run) + const squadHtml = await fetchWikiHtml('2026_FIFA_World_Cup_squads') + if (squadHtml) { + const squads = scrapeSquads(squadHtml) + for (const sq of squads) { + const teamId = await upsertTeam(sq.name) + for (const p of sq.players) { + const dob = p.date_of_birth ? p.date_of_birth.replace(/\s/g, '') : null await db.execute(sql` INSERT INTO squads (tournament_year, team_id, player_name, shirt_number, position, date_of_birth) - VALUES (2026, ${teamId}, ${p.name as string}, ${p.number as number ?? null}, - ${p.pos as string ?? null}, ${p.date_of_birth as string ?? null}) + VALUES (2026, ${teamId}, ${p.name}, ${p.number ?? null}, ${p.pos ?? null}, ${dob}) ON CONFLICT (tournament_year, team_id, shirt_number) DO UPDATE SET - player_name = EXCLUDED.player_name, position = EXCLUDED.position, date_of_birth = EXCLUDED.date_of_birth + player_name = EXCLUDED.player_name, + position = EXCLUDED.position, + date_of_birth = EXCLUDED.date_of_birth `) } } - console.log(' Squads loaded for 2026') + console.log(` Squads: ${squads.length} teams`) } - // Quali playoffs - const qualiData = await fetchJson(`${BASE}/2026/worldcup.quali_playoffs.json`) as RawData | null - if (qualiData?.matches) { - for (const m of qualiData.matches) { - const t1Id = await upsertTeam(m.team1) - const t2Id = await upsertTeam(m.team2) - const score = parseScore(m.score) - const matchId = await upsertMatch(2026, m.round ?? 'Qualifier', null, m.date ?? null, m.time ?? null, t1Id, t2Id, score, true) - if (m.goals1?.length || m.goals2?.length) { - const goalRows = [ - ...(m.goals1?.length ? collectGoals(t1Id, m.goals1, t2Id) : []), - ...(m.goals2?.length ? collectGoals(t2Id, m.goals2, t1Id) : []), - ] - await replaceGoals(matchId, goalRows) - } - } - console.log(` Quali playoffs: ${qualiData.matches.length} matches`) + // Tournament winner (once the final is played) + if (meta.winner) { + await db.execute(sql` + UPDATE tournaments SET + winner = ${meta.winner}, + runner_up = ${meta.runner_up}, + third_place = ${meta.third_place}, + fourth_place = ${meta.fourth_place} + WHERE year = 2026 + `) } - // Group standings from match results + // Group standings await db.execute(sql` WITH match_results AS ( SELECT tournament_year, group_name, team1_id AS team_id, score_ft_home AS gf, score_ft_away AS ga @@ -244,8 +218,8 @@ async function run() { // Tournament aggregates await db.execute(sql` UPDATE tournaments SET - matches_count = (SELECT COUNT(*)::int FROM matches WHERE tournament_year = 2026 AND is_quali_playoff = false), - total_goals = (SELECT COALESCE(SUM(score_ft_home + score_ft_away), 0)::int FROM matches WHERE tournament_year = 2026 AND is_quali_playoff = false AND score_ft_home IS NOT NULL), + matches_count = (SELECT COUNT(*)::int FROM matches WHERE tournament_year = 2026 AND is_quali_playoff = false), + total_goals = (SELECT COALESCE(SUM(score_ft_home + score_ft_away), 0)::int FROM matches WHERE tournament_year = 2026 AND is_quali_playoff = false AND score_ft_home IS NOT NULL), avg_goals_per_game = ( SELECT ROUND(COALESCE(SUM(score_ft_home + score_ft_away), 0)::numeric / NULLIF(COUNT(*), 0), 2) FROM matches WHERE tournament_year = 2026 AND is_quali_playoff = false AND score_ft_home IS NOT NULL