refactor: extract lib/wiki-scraper.ts, make scraper composable, sync from Wikipedia
Move all scraping logic (fetchWikiHtml, scrapeYear, scrapeSquads and all helpers) into lib/wiki-scraper.ts, which exports the functions shared by both scripts.

scrape-wikipedia.ts becomes a composable CLI:
  pnpm scrape [year]            — matches + squads (default)
  pnpm scrape [year] --matches  — matches/meta/stadiums only
  pnpm scrape [year] --squads   — squads only

sync.ts drops the openfootball GitHub dependency entirely and scrapes Wikipedia directly. It is incremental: completed groups (all matches have FT scores) are detected via a DB query, and their sub-pages are skipped on each run.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,467 @@
|
|||||||
|
import { load } from 'cheerio'
|
||||||
|
import type { CheerioAPI, Cheerio } from 'cheerio'
|
||||||
|
import type { Element } from 'domhandler'
|
||||||
|
|
||||||
|
// ── Types ──────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/** One goal entry parsed from a match box. */
export type Goal = {
  /** Scorer name, taken from the link text in the goal list. */
  name: string
  /** Minute printed before any "+N" stoppage-time part. */
  minute?: number
  /** The N of "M+N"; left out when there is no stoppage-time part. */
  offset?: number
  /** Present (true) when the goal text contains "pen.". */
  penalty?: boolean
  /** Present (true) when the goal text contains "o.g.". */
  owngoal?: boolean
}

/** Score of a match as [team1, team2] pairs. */
export type ScoreObj = {
  /** Score after regular time. */
  ft?: [number, number]
  /** Score after extra time, when the box is marked "a.e.t.". */
  et?: [number, number]
  /** Penalty shoot-out score. */
  p?: [number, number]
}

/** A single match; optional fields are omitted when not found on the page. */
export type Match = {
  round: string
  group?: string
  date?: string
  time?: string
  team1: string
  team2: string
  score?: ScoreObj
  goals1?: Goal[]
  goals2?: Goal[]
  ground?: string
}

/** Venue split into its name and the text following the first comma. */
export type Stadium = {
  name: string
  city: string
}

/** A named group and its member teams. */
export type Group = {
  name: string
  teams: string[]
}

/** Tournament-level facts; `null` marks a value that could not be determined. */
export type Meta = {
  host: string
  teams_count: number | null
  winner: string | null
  runner_up: string | null
  third_place: string | null
  fourth_place: string | null
}

/** One squad member. */
export type Player = {
  name: string
  number?: number
  pos?: string
  date_of_birth?: string
}

/** A team name together with its listed players. */
export type Squad = {
  name: string
  players: Player[]
}

/** Everything `scrapeYear` returns for one tournament. */
export type YearResult = {
  matches: Match[]
  /** Keyed by stadium name. */
  stadiums: Map<string, Stadium>
  /** Group name -> set of team names seen in that group's matches. */
  groups: Map<string, Set<string>>
  meta: Meta
}

/** Cursor of the heading state machine used while walking a page. */
type State = {
  /** Whether the current section is one that match boxes are read from. */
  active: boolean
  round: string
  group: string | null
}
|
||||||
|
|
||||||
|
// ── Fetch ──────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/** Returns a promise that settles after `ms` milliseconds. */
function delay(ms: number): Promise<unknown> {
  return new Promise(resolve => {
    setTimeout(resolve, ms)
  })
}
|
||||||
|
|
||||||
|
/**
 * Fetches the rendered HTML of an English Wikipedia page through the
 * MediaWiki `action=parse` API.
 *
 * Transient failures (network errors, non-2xx responses, unparsable or empty
 * payloads) are retried with a linearly growing pause of `3000 * attempt` ms.
 * An API error that cannot change on retry (the title does not exist or is
 * invalid) ends the loop immediately instead of burning the whole back-off
 * schedule on a page that will never appear.
 *
 * @param page    Page title as used in the wiki URL; it is URI-encoded here.
 * @param retries Maximum number of requests to make.
 * @returns The page HTML, or `null` when it could not be obtained.
 */
export async function fetchWikiHtml(page: string, retries = 5): Promise<string | null> {
  // API error codes for which asking again cannot produce a different answer.
  const permanentErrors = new Set(['missingtitle', 'invalidtitle'])
  const url = `https://en.wikipedia.org/w/api.php?action=parse&page=${encodeURIComponent(page)}&format=json&prop=text&disabletoc=1`

  for (let attempt = 0; attempt < retries; attempt++) {
    try {
      if (attempt > 0) await delay(3000 * attempt)
      const res = await fetch(url, { headers: { 'User-Agent': 'WorldCupScraper/1.0' } })
      if (!res.ok) continue
      const data = await res.json() as {
        parse?: { text?: { '*': string } }
        error?: { code?: string }
      }
      const html = data?.parse?.text?.['*']
      if (html) return html

      // The API reports missing pages with HTTP 200 and an `error` object.
      const code = data?.error?.code
      if (code !== undefined && permanentErrors.has(code)) return null
    } catch {
      // Network or JSON failure: fall through to the next attempt.
    }
  }
  return null
}
|
||||||
|
|
||||||
|
// ── Parsing helpers ────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/**
 * Pulls the first "N–M" pair out of a piece of text.
 * Accepts an en dash or a hyphen, with optional whitespace around it.
 *
 * @returns `[N, M]` as numbers, or `null` when no such pair is present.
 */
function parseScoreText(text: string): [number, number] | null {
  const found = /(\d+)\s*[–\-]\s*(\d+)/.exec(text)
  if (found === null) return null
  const [, left, right] = found
  return [parseInt(left), parseInt(right)]
}
|
||||||
|
|
||||||
|
/**
 * Reads a team name from a match-box cell: the trimmed text of the first
 * link that contains no `<img>` and has non-empty text.
 *
 * @returns The name, or `''` when the cell has no such link.
 */
function extractTeam($: CheerioAPI, $cell: Cheerio<Element>): string {
  for (const anchor of $cell.find('a').toArray()) {
    const $anchor = $(anchor)
    if ($anchor.find('img').length > 0) continue
    const label = $anchor.text().trim()
    if (label) return label
  }
  return ''
}
|
||||||
|
|
||||||
|
/**
 * Parses the goal list of one side of a match box.
 *
 * Each `<li>` contributes one scorer (first non-empty link outside `.fb-goal`)
 * and one `Goal` per minute-bearing `<span>` directly under `.fb-goal`, so a
 * player with several goals yields several entries.
 *
 * @param $td Cell holding the `<li>` goal entries.
 * @returns Goals in document order; empty when nothing could be parsed.
 */
function parseGoals($: CheerioAPI, $td: Cheerio<Element>): Goal[] {
  const scored: Goal[] = []

  for (const item of $td.find('li').toArray()) {
    const $item = $(item)

    // Scorer: first link with text that is not part of the goal marker.
    let scorer = ''
    for (const link of $item.find('a').toArray()) {
      if ($(link).closest('.fb-goal').length > 0) continue
      const label = $(link).text().trim()
      if (label) {
        scorer = label
        break
      }
    }
    if (scorer === '') continue

    const $marker = $item.find('.fb-goal')
    if ($marker.length === 0) continue

    for (const span of $marker.children('span').toArray()) {
      const $span = $(span)
      // Spans carrying a `typeof` attribute are not read for minute text.
      if ($span.attr('typeof')) continue

      const text = $span.text()
      const timing = text.match(/(\d+)(?:\+(\d+))?['′]/)
      if (!timing) continue

      const minute = parseInt(timing[1])
      const offset = timing[2] ? parseInt(timing[2]) : 0

      const goal: Goal = { name: scorer }
      if (!isNaN(minute)) goal.minute = minute
      if (offset) goal.offset = offset
      if (text.includes('pen.')) goal.penalty = true
      if (text.includes('o.g.')) goal.owngoal = true
      scored.push(goal)
    }
  }

  return scored
}
|
||||||
|
|
||||||
|
/**
 * Returns the venue text of a match box.
 *
 * Prefers the first element whose `itemprop` is exactly "name address";
 * otherwise falls back to the first line of the first `.fright` element.
 * The result is trimmed and may be an empty string.
 */
function extractGround($: CheerioAPI, $box: Cheerio<Element>): string {
  const $venue = $box.find('[itemprop="name address"]').first()
  if ($venue.length > 0) {
    return $venue.text().trim()
  }
  const rightColumn = $box.find('.fright').first().text()
  const [firstLine] = rightColumn.split('\n')
  return firstLine.trim()
}
|
||||||
|
|
||||||
|
/**
 * Splits a venue string at its first comma.
 *
 * @returns `name` = trimmed text before the comma, `city` = trimmed text after
 *          it (further commas stay inside `city`). Without a comma the input
 *          is returned unchanged as `name` and `city` is `''`.
 */
function parseGroundParts(ground: string): { name: string; city: string } {
  const [head, ...rest] = ground.split(',')
  if (rest.length === 0) {
    return { name: ground, city: '' }
  }
  return { name: head.trim(), city: rest.join(',').trim() }
}
|
||||||
|
|
||||||
|
/**
 * Converts one `.footballbox` element into a `Match`.
 *
 * @param $box  The match box element.
 * @param round Round label to store on the match.
 * @param group Group label, or `null` for matches outside a group.
 * @returns The match, or `null` when either team name cannot be read.
 *          Optional fields are left out (not set to `undefined`) when absent.
 */
function parseBox($: CheerioAPI, $box: Cheerio<Element>, round: string, group: string | null): Match | null {
  const home = extractTeam($, $box.find('.fhome'))
  const away = extractTeam($, $box.find('.faway'))
  if (home === '' || away === '') return null

  const date = $box.find('.bday, .dtstart').first().text().trim() || undefined
  const time = $box.find('.ftime').first().text().trim().match(/(\d{2}:\d{2})/)?.[1]

  const scoreText = $box.find('.fscore').first().text().trim()
  const afterExtraTime = scoreText.toLowerCase().includes('a.e.t.')
  const tally = parseScoreText(scoreText)

  // Only the first goals row is read; later `tr.fgoals` rows are ignored.
  const $goalsRow = $box.find('tr.fgoals').first()
  const goals1 = parseGoals($, $goalsRow.find('.fhgoal'))
  const goals2 = parseGoals($, $goalsRow.find('.fagoal'))

  // Shoot-out score sits in the row following the first "penalt…" header row.
  let shootout: [number, number] | undefined
  const penaltyHeader = $box
    .find('tr')
    .toArray()
    .find(tr => $(tr).find('th[colspan]').text().toLowerCase().includes('penalt'))
  if (penaltyHeader) {
    const shootoutText = $(penaltyHeader).next('tr').find('th').not('.fhome,.faway').first().text().trim()
    shootout = parseScoreText(shootoutText) ?? undefined
  }

  let score: ScoreObj | undefined
  if (tally) {
    if (afterExtraTime) {
      // The displayed score is the extra-time total, so the 90-minute score is
      // rebuilt from the goal lists. Own goals found in one column are
      // credited to the other side.
      // NOTE(review): this assumes own goals are listed under the scorer's own
      // team — confirm against the football box layout being scraped.
      const within90 = (g: Goal) => g.minute === undefined || g.minute <= 90
      const regular = (gs: Goal[]) => gs.filter(g => !g.owngoal && within90(g)).length
      const own = (gs: Goal[]) => gs.filter(g => g.owngoal === true && within90(g)).length
      score = {
        ft: [regular(goals1) + own(goals2), regular(goals2) + own(goals1)],
        et: tally,
      }
    } else {
      score = { ft: tally }
    }
    if (shootout) score.p = shootout
  }

  const ground = extractGround($, $box) || undefined

  // Optional parts are spread in a fixed order so serialized key order is stable.
  const groupPart = group ? { group } : {}
  const datePart = date ? { date } : {}
  const timePart = time ? { time } : {}
  const scorePart = score ? { score } : {}
  const goals1Part = goals1.length ? { goals1 } : {}
  const goals2Part = goals2.length ? { goals2 } : {}
  const groundPart = ground ? { ground } : {}

  return {
    round,
    ...groupPart,
    ...datePart,
    ...timePart,
    team1: home,
    team2: away,
    ...scorePart,
    ...goals1Part,
    ...goals2Part,
    ...groundPart,
  }
}
|
||||||
|
|
||||||
|
/**
 * Parses every `.footballbox` on an already loaded page, tagging each match
 * with the given round and group. Boxes that cannot be parsed are dropped.
 */
function collectBoxes($: CheerioAPI, round: string, group: string | null): Match[] {
  return $('.footballbox')
    .toArray()
    .map(box => parseBox($, $(box), round, group))
    .filter((parsed): parsed is Match => parsed !== null)
}
|
||||||
|
|
||||||
|
/**
 * Advances the section state machine for one page heading.
 *
 * Level-2 headings decide whether the section is scraped at all
 * (`state.active`) and reset round and group. Level-3/4 headings inside an
 * active section refine the group or the knockout round. Other levels are
 * ignored.
 *
 * Group headings accept letters A–L (12-group format) as well as digits 1–9.
 * "Round of 32" is recognised at level 2 like the other knockout rounds.
 *
 * @param text  Heading text (without the "[edit]" suffix).
 * @param level Heading level, e.g. 2 for `<h2>`.
 * @param state Mutated in place.
 */
function processHeading(text: string, level: number, state: State): void {
  // All patterns below run against the lower-cased text, so no `i` flag is needed.
  const t = text.toLowerCase().trim()

  if (level === 2) {
    let round: string | null = null
    if (/group stage/.test(t) && !/second/.test(t)) round = 'Group stage'
    else if (/first group stage/.test(t)) round = 'Group stage'
    else if (/second group stage/.test(t)) round = 'Second group stage'
    else if (t === 'final round') round = 'Final round'
    // Container sections: the concrete round comes from their sub-headings.
    else if (/final tournament/.test(t)) round = ''
    else if (/knock.?out stage/.test(t)) round = ''
    else if (/round of 32/.test(t)) round = 'Round of 32'
    else if (/round of 16/.test(t)) round = 'Round of 16'
    else if (/quarter.final/.test(t)) round = 'Quarter-finals'
    else if (/semi.final/.test(t)) round = 'Semi-finals'
    else if (/third.place|match for third|play.off for third/.test(t)) round = 'Third-place match'
    else if (t === 'final') round = 'Final'

    if (round === null) {
      // Unrelated section: stop collecting, but keep the last round and group.
      state.active = false
    } else {
      state.active = true
      state.round = round
      state.group = null
    }
    return
  }

  if (!state.active) return
  if (level !== 3 && level !== 4) return

  if (/^group [a-l1-9]+$/.test(t)) {
    // Keep the heading's original casing for the group label.
    state.group = text.trim()
    return
  }

  let knockout: string | null = null
  if (/round of 32/.test(t)) knockout = 'Round of 32'
  else if (/round of 16/.test(t)) knockout = 'Round of 16'
  else if (/quarter.final/.test(t)) knockout = 'Quarter-finals'
  else if (/semi.final/.test(t)) knockout = 'Semi-finals'
  else if (/third.place|match for third|play.off for third/.test(t)) knockout = 'Third-place match'
  else if (t === 'final') knockout = 'Final'

  if (knockout !== null) {
    state.round = knockout
    state.group = null
  }
}
|
||||||
|
|
||||||
|
// ── Infobox ────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/**
 * Reads tournament facts from the first `table.infobox` on the page.
 *
 * Rows are matched by their header text: host countries, number of teams and
 * the four placements. Only fields whose row is found are set on the result.
 */
function parseInfobox($: CheerioAPI): Partial<Meta> {
  const meta: Partial<Meta> = {}

  // Plain text of a cell: <br> becomes " / ", <sup>/<img> and "[n]" markers go.
  const cellText = ($td: Cheerio<Element>): string => {
    const $copy = $td.clone()
    $copy.find('br').replaceWith(' / ')
    $copy.find('sup, img').remove()
    return $copy.text().replace(/\[\d+\]/g, '').replace(/\s+/g, ' ').trim()
  }

  // Texts of all links in a cell, skipping empty ones and "[n]" reference links.
  const linkLabels = ($td: Cheerio<Element>): string[] => {
    const labels: string[] = []
    for (const anchor of $td.find('a').toArray()) {
      const label = $(anchor).clone().find('img').remove().end().text().trim()
      if (label && !/\[\d+\]/.test(label)) labels.push(label)
    }
    return labels
  }

  // First usable link text, else the cell's plain text, else null.
  const firstLinkOrText = ($td: Cheerio<Element>): string | null => {
    const [first] = linkLabels($td)
    return first ?? (cellText($td) || null)
  }

  // All usable link texts joined by " / ", else the cell's plain text.
  const allLinksOrText = ($td: Cheerio<Element>): string => {
    const labels = linkLabels($td)
    return labels.length ? labels.join(' / ') : cellText($td)
  }

  // Checked in this order, after the host and team-count rows.
  const placementRows: Array<[RegExp, 'winner' | 'runner_up' | 'third_place' | 'fourth_place']> = [
    [/champion/i, 'winner'],
    [/runners?.up/i, 'runner_up'],
    [/third.place/i, 'third_place'],
    [/fourth.place/i, 'fourth_place'],
  ]

  for (const row of $('table.infobox').first().find('tr').toArray()) {
    const $row = $(row)
    const label = $row.find('th').text().trim().toLowerCase()
    const $td = $row.find('td').first()
    if ($td.length === 0) continue

    if (/host countr/i.test(label)) {
      meta.host = allLinksOrText($td)
      continue
    }
    if (/^teams$/i.test(label)) {
      const digits = $td.text().match(/\d+/)
      if (digits) meta.teams_count = parseInt(digits[0])
      continue
    }
    const placement = placementRows.find(([pattern]) => pattern.test(label))
    if (placement) meta[placement[1]] = firstLinkOrText($td)
  }

  return meta
}
|
||||||
|
|
||||||
|
/**
 * Works out the top four from the "Final" and "Third-place match" results.
 *
 * A match is decided by its extra-time score if present, otherwise its
 * full-time score; a level score is settled by the shoot-out when there is
 * one. Undecided or missing matches leave the corresponding fields `null`.
 * When several matches share a round, the last decided one wins.
 */
function derivePlacements(matches: Match[]): Pick<Meta, 'winner' | 'runner_up' | 'third_place' | 'fourth_place'> {
  // Returns [winner, loser] or null when the match has no decisive result.
  const decide = (m: Match): [string, string] | null => {
    if (!m.score) return null
    const [home, away] = m.score.et ?? m.score.ft ?? [0, 0]
    if (home !== away) {
      return home > away ? [m.team1, m.team2] : [m.team2, m.team1]
    }
    const pens = m.score.p
    if (pens && pens[0] !== pens[1]) {
      return pens[0] > pens[1] ? [m.team1, m.team2] : [m.team2, m.team1]
    }
    return null
  }

  const placed: Pick<Meta, 'winner' | 'runner_up' | 'third_place' | 'fourth_place'> = {
    winner: null,
    runner_up: null,
    third_place: null,
    fourth_place: null,
  }

  for (const m of matches) {
    const isFinal = m.round === 'Final'
    if (!isFinal && m.round !== 'Third-place match') continue
    const outcome = decide(m)
    if (!outcome) continue
    const [victor, loser] = outcome
    if (isFinal) {
      placed.winner = victor
      placed.runner_up = loser
    } else {
      placed.third_place = victor
      placed.fourth_place = loser
    }
  }

  return placed
}
|
||||||
|
|
||||||
|
// ── Main year scraper ──────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/**
 * Scrapes one tournament from its main Wikipedia page.
 *
 * Walks headings, "Main article" hatnotes and match boxes in document order.
 * Match boxes in active sections are parsed directly. For groups whose
 * matches are not on the main page, the linked group sub-page is fetched
 * (with a pause before each request) and all its match boxes are collected.
 * Progress markers are written to stdout.
 *
 * @param year     Tournament year; not read by this function body.
 * @param mainHtml Rendered HTML of the tournament's main page.
 * @param opts     `skipGroups`: group names whose sub-pages are not fetched.
 * @returns Matches, stadiums keyed by name, team sets per group and meta data.
 *          Placements derived from match results take precedence over the
 *          infobox values.
 */
export async function scrapeYear(
  year: number,
  mainHtml: string,
  opts?: { skipGroups?: Set<string> },
): Promise<YearResult> {
  const $ = load(mainHtml)
  const matches: Match[] = []
  const stadiums = new Map<string, Stadium>()
  const groups = new Map<string, Set<string>>()
  const groupSubpages = new Map<string, string>()
  const groupsOnMainPage = new Set<string>()
  const state: State = { active: false, round: '', group: null }

  // Feeds the heading found inside a `div.mw-heading` wrapper to the state machine.
  const applyHeading = ($wrapper: Cheerio<Element>, target: State): void => {
    const $h = $wrapper.find('h2, h3, h4').first()
    const level = parseInt($h.prop('tagName')?.slice(1) ?? '9')
    processHeading($h.text().replace(/\[edit\]/g, '').trim(), level, target)
  }

  // Stores a match and updates the stadium and group indexes.
  const recordMatch = (m: Match): void => {
    matches.push(m)
    if (m.ground) {
      const { name, city } = parseGroundParts(m.ground)
      if (name && !stadiums.has(name)) stadiums.set(name, { name, city })
    }
    if (m.group) {
      groupsOnMainPage.add(m.group)
      let members = groups.get(m.group)
      if (!members) {
        members = new Set()
        groups.set(m.group, members)
      }
      members.add(m.team1)
      members.add(m.team2)
    }
  }

  for (const node of $('.mw-parser-output').find('div.mw-heading, .footballbox, .hatnote').toArray()) {
    const $node = $(node)

    if ($node.hasClass('mw-heading')) {
      applyHeading($node, state)
      continue
    }

    if ($node.hasClass('hatnote') && $node.text().includes('Main article')) {
      // Remember the first group sub-page linked from within a group section.
      if (!state.active || !state.group) continue
      const href = $node.find('a[href^="/wiki/"]').first().attr('href')
      if (!href) continue
      const page = href.replace('/wiki/', '').split('#')[0]
      if (/World_Cup_Group/i.test(page) && !groupSubpages.has(state.group)) {
        groupSubpages.set(state.group, page)
      }
      continue
    }

    if ($node.hasClass('footballbox')) {
      if (!state.active) continue
      const parsed = parseBox($, $node, state.round || state.group || 'Unknown', state.group)
      if (parsed) recordMatch(parsed)
    }
  }

  for (const [group, page] of groupSubpages) {
    // Groups already seen in recorded matches need no sub-page.
    if (groupsOnMainPage.has(group)) continue
    if (opts?.skipGroups?.has(group)) {
      process.stdout.write(`[skip ${group}] `)
      continue
    }

    await delay(1200)
    const subHtml = await fetchWikiHtml(page)
    if (!subHtml) {
      process.stdout.write(`(failed: ${page}) `)
      continue
    }

    // Replay the main page's headings to learn which round this group sits under.
    const probe: State = { active: false, round: '', group: null }
    let round = 'Group stage'
    for (const wrapper of $('.mw-parser-output').find('div.mw-heading').toArray()) {
      applyHeading($(wrapper), probe)
      if (probe.group === group) {
        round = probe.round || 'Group stage'
        break
      }
    }

    const $sub = load(subHtml)
    for (const m of collectBoxes($sub, round, group)) recordMatch(m)
    process.stdout.write(`[+${page.slice(-8)}] `)
  }

  const infobox = parseInfobox($)
  const placements = derivePlacements(matches)
  const meta: Meta = {
    host: infobox.host ?? '',
    teams_count: infobox.teams_count ?? null,
    winner: placements.winner ?? infobox.winner ?? null,
    runner_up: placements.runner_up ?? infobox.runner_up ?? null,
    third_place: placements.third_place ?? infobox.third_place ?? null,
    fourth_place: placements.fourth_place ?? infobox.fourth_place ?? null,
  }

  return { matches, stadiums, groups, meta }
}
|
||||||
|
|
||||||
|
// ── Squad page scraper ─────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/**
 * Parses a squads page into one `Squad` per team.
 *
 * Every level-3 heading that does not start with "Group " opens a new team;
 * each following `tr.nat-fs-player` row adds a player to it. Teams that end
 * up without players are dropped from the result.
 *
 * @param html Rendered HTML of the squads page.
 */
export function scrapeSquads(html: string): Squad[] {
  const $ = load(html)
  const squads: Squad[] = []
  let current: Squad | null = null

  for (const node of $('.mw-parser-output').find('div.mw-heading, tr.nat-fs-player').toArray()) {
    const $node = $(node)

    if ($node.hasClass('mw-heading')) {
      const $h = $node.find('h3, h4').first()
      if (!$h.length) continue
      if (parseInt($h.prop('tagName')?.slice(1) ?? '9') !== 3) continue
      const title = $h.text().replace(/\[edit\]/g, '').trim()
      if (/^group /i.test(title)) continue
      current = { name: title, players: [] }
      squads.push(current)
      continue
    }

    // Player rows before the first team heading are ignored.
    if (!current) continue

    let shirt: number | undefined
    let pos: string | undefined
    let playerName = ''
    let dob: string | undefined

    const cells = $node.find('td, th[scope="row"]').toArray()
    for (let i = 0; i < cells.length; i++) {
      const $cell = $(cells[i])
      const text = $cell.text().trim()

      if ($cell.is('th[scope="row"]')) {
        // Row header holds the name; prefer the link text over the raw cell text.
        playerName = $cell.find('a').first().text().trim() || text
      } else if (i === 0 && !playerName) {
        const parsed = parseInt(text)
        if (!isNaN(parsed)) shirt = parsed
      } else if (i === 1 && !playerName && !pos) {
        const code = $cell.find('a').first().text().trim()
        if (['GK', 'DF', 'MF', 'FW'].includes(code)) pos = code
      }

      // A date of birth may sit in any cell; the last `.bday` found wins.
      const $bday = $cell.find('.bday')
      if ($bday.length) dob = $bday.text().trim()
    }

    if (!playerName) continue
    const player: Player = { name: playerName }
    if (shirt !== undefined) player.number = shirt
    if (pos) player.pos = pos
    if (dob) player.date_of_birth = dob
    current.players.push(player)
  }

  return squads.filter(squad => squad.players.length > 0)
}
|
||||||
+50
-613
@@ -1,10 +1,20 @@
|
|||||||
import { load } from 'cheerio'
|
/**
|
||||||
import type { CheerioAPI } from 'cheerio'
|
* Scrape English Wikipedia for World Cup data and write JSON files to
|
||||||
import type { Cheerio } from 'cheerio'
|
* app/data/wikipedia/{year}/.
|
||||||
import type { Element } from 'domhandler'
|
*
|
||||||
|
* Usage:
|
||||||
|
* pnpm scrape # all years, matches + squads
|
||||||
|
* pnpm scrape 2022 # single year, matches + squads
|
||||||
|
* pnpm scrape 2022 --matches # matches + meta + stadiums only
|
||||||
|
* pnpm scrape 2022 --squads # squads only
|
||||||
|
*/
|
||||||
import { mkdirSync, writeFileSync } from 'fs'
|
import { mkdirSync, writeFileSync } from 'fs'
|
||||||
import path from 'path'
|
import path from 'path'
|
||||||
import { fileURLToPath } from 'url'
|
import { fileURLToPath } from 'url'
|
||||||
|
import {
|
||||||
|
fetchWikiHtml, scrapeYear, scrapeSquads,
|
||||||
|
type Match, type Stadium, type Group, type Meta, type Squad,
|
||||||
|
} from '../lib/wiki-scraper'
|
||||||
|
|
||||||
const __dirname = path.dirname(fileURLToPath(import.meta.url))
|
const __dirname = path.dirname(fileURLToPath(import.meta.url))
|
||||||
const DATA_DIR = path.join(__dirname, '../app/data/wikipedia')
|
const DATA_DIR = path.join(__dirname, '../app/data/wikipedia')
|
||||||
@@ -16,648 +26,75 @@ const YEARS = [
|
|||||||
|
|
||||||
/** Returns a promise that settles after `ms` milliseconds. */
function delay(ms: number): Promise<unknown> {
  return new Promise(resolve => {
    setTimeout(resolve, ms)
  })
}
|
||||||
|
|
||||||
// ── Types ──────────────────────────────────────────────────────────────────
|
// ── File output ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
type Goal = {
|
function writeMatches(
|
||||||
name: string
|
|
||||||
minute?: number
|
|
||||||
offset?: number
|
|
||||||
penalty?: boolean
|
|
||||||
owngoal?: boolean
|
|
||||||
}
|
|
||||||
|
|
||||||
/** Score of a match as [team1, team2] pairs. */
type ScoreObj = {
  /** Score after regular time. */
  ft?: [number, number]
  /** Score after extra time. */
  et?: [number, number]
  /** Penalty shoot-out score. */
  p?: [number, number]
}

/** A single match; optional fields are omitted when not found on the page. */
type Match = {
  round: string
  group?: string
  date?: string
  time?: string
  team1: string
  team2: string
  score?: ScoreObj
  goals1?: Goal[]
  goals2?: Goal[]
  ground?: string
}

/** Venue split into name and city. */
type Stadium = {
  name: string
  city: string
}

/** One squad member. */
type Player = {
  name: string
  number?: number
  pos?: string
  date_of_birth?: string
}

/** A team name together with its listed players. */
type Squad = {
  name: string
  players: Player[]
}

/** A named group and its member teams. */
type Group = {
  name: string
  teams: string[]
}

/** Tournament-level facts; `null` marks a value that could not be determined. */
type Meta = {
  host: string
  teams_count: number | null
  winner: string | null
  runner_up: string | null
  third_place: string | null
  fourth_place: string | null
}
|
|
||||||
|
|
||||||
// ── Fetch ──────────────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
/**
 * Fetches the rendered HTML of an English Wikipedia page through the
 * MediaWiki `action=parse` API.
 *
 * Transient failures (network errors, non-2xx responses, unparsable or empty
 * payloads) are retried with a linearly growing pause of `3000 * attempt` ms.
 * An API error that cannot change on retry (the title does not exist or is
 * invalid) ends the loop immediately instead of burning the whole back-off
 * schedule on a page that will never appear.
 *
 * @param page    Page title as used in the wiki URL; it is URI-encoded here.
 * @param retries Maximum number of requests to make.
 * @returns The page HTML, or `null` when it could not be obtained.
 */
async function fetchWikiHtml(page: string, retries = 5): Promise<string | null> {
  // API error codes for which asking again cannot produce a different answer.
  const permanentErrors = new Set(['missingtitle', 'invalidtitle'])
  const url = `https://en.wikipedia.org/w/api.php?action=parse&page=${encodeURIComponent(page)}&format=json&prop=text&disabletoc=1`

  for (let attempt = 0; attempt < retries; attempt++) {
    try {
      if (attempt > 0) await delay(3000 * attempt)
      const res = await fetch(url, { headers: { 'User-Agent': 'WorldCupScraper/1.0 (github.com/worldcup)' } })
      if (!res.ok) continue
      const data = await res.json() as {
        parse?: { text?: { '*': string } }
        error?: { code?: string }
      }
      const html = data?.parse?.text?.['*']
      if (html) return html

      // The API reports missing pages with HTTP 200 and an `error` object.
      const code = data?.error?.code
      if (code !== undefined && permanentErrors.has(code)) return null
    } catch {
      // Network or JSON failure: fall through to the next attempt.
    }
  }
  return null
}
|
|
||||||
|
|
||||||
// ── Score parsing ──────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
/**
 * Pulls the first "N–M" pair out of a piece of text.
 * Accepts an en dash or a hyphen, with optional whitespace around it.
 *
 * @returns `[N, M]` as numbers, or `null` when no such pair is present.
 */
function parseScoreText(text: string): [number, number] | null {
  const found = /(\d+)\s*[–\-]\s*(\d+)/.exec(text)
  if (found === null) return null
  const [, left, right] = found
  return [parseInt(left), parseInt(right)]
}
|
|
||||||
|
|
||||||
// ── Team name extraction ───────────────────────────────────────────────────
|
|
||||||
|
|
||||||
/**
 * Reads a team name from a match-box cell: the trimmed text of the first
 * link that contains no `<img>` and has non-empty text.
 *
 * @returns The name, or `''` when the cell has no such link.
 */
function extractTeam($: CheerioAPI, $cell: Cheerio<Element>): string {
  for (const anchor of $cell.find('a').toArray()) {
    const $anchor = $(anchor)
    if ($anchor.find('img').length > 0) continue
    const label = $anchor.text().trim()
    if (label) return label
  }
  return ''
}
|
|
||||||
|
|
||||||
// ── Goal parsing ───────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
/**
 * Parses the goal list of one side of a match box.
 *
 * Each `<li>` contributes one scorer (first non-empty link outside `.fb-goal`)
 * and one `Goal` per minute-bearing `<span>` directly under `.fb-goal`, so a
 * player with several goals yields several entries.
 *
 * @param $td Cell holding the `<li>` goal entries.
 * @returns Goals in document order; empty when nothing could be parsed.
 */
function parseGoals($: CheerioAPI, $td: Cheerio<Element>): Goal[] {
  const scored: Goal[] = []

  for (const item of $td.find('li').toArray()) {
    const $item = $(item)

    // Scorer: first link with text that is NOT inside .fb-goal.
    let scorer = ''
    for (const link of $item.find('a').toArray()) {
      if ($(link).closest('.fb-goal').length > 0) continue
      const label = $(link).text().trim()
      if (label) {
        scorer = label
        break
      }
    }
    if (scorer === '') continue

    const $marker = $item.find('.fb-goal')
    if ($marker.length === 0) continue

    // Each direct child <span> of .fb-goal is inspected for a minute.
    for (const span of $marker.children('span').toArray()) {
      const $span = $(span)
      // A span with a `typeof` attribute is the image wrapper, not a minute.
      if ($span.attr('typeof')) continue

      const text = $span.text()
      const timing = text.match(/(\d+)(?:\+(\d+))?['′]/)
      if (!timing) continue

      const minute = parseInt(timing[1])
      const offset = timing[2] ? parseInt(timing[2]) : 0

      const goal: Goal = { name: scorer }
      if (!isNaN(minute)) goal.minute = minute
      if (offset) goal.offset = offset
      if (text.includes('pen.')) goal.penalty = true
      if (text.includes('o.g.')) goal.owngoal = true
      scored.push(goal)
    }
  }

  return scored
}
|
|
||||||
|
|
||||||
// ── Ground extraction ──────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
/**
 * Returns the venue text of a match box.
 *
 * Prefers the first element whose `itemprop` is exactly "name address";
 * otherwise falls back to the first line of the first `.fright` element.
 * The result is trimmed and may be an empty string.
 */
function extractGround($: CheerioAPI, $box: Cheerio<Element>): string {
  const $venue = $box.find('[itemprop="name address"]').first()
  if ($venue.length > 0) {
    return $venue.text().trim()
  }
  const rightColumn = $box.find('.fright').first().text()
  const [firstLine] = rightColumn.split('\n')
  return firstLine.trim()
}
|
|
||||||
|
|
||||||
/**
 * Splits a venue string at its first comma.
 *
 * @returns `name` = trimmed text before the comma, `city` = trimmed text after
 *          it (further commas stay inside `city`). Without a comma the input
 *          is returned unchanged as `name` and `city` is `''`.
 */
function parseGroundParts(ground: string): { name: string; city: string } {
  const [head, ...rest] = ground.split(',')
  if (rest.length === 0) {
    return { name: ground, city: '' }
  }
  return { name: head.trim(), city: rest.join(',').trim() }
}
|
|
||||||
|
|
||||||
// ── Footballbox parsing ────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
/**
 * Converts one `.footballbox` element into a `Match`.
 *
 * @param $box  The match box element.
 * @param round Round label to store on the match.
 * @param group Group label, or `null` for matches outside a group.
 * @returns The match, or `null` when either team name cannot be read.
 *          Optional fields are left out (not set to `undefined`) when absent.
 */
function parseBox(
  $: CheerioAPI,
  $box: Cheerio<Element>,
  round: string,
  group: string | null,
): Match | null {
  const home = extractTeam($, $box.find('.fhome'))
  const away = extractTeam($, $box.find('.faway'))
  if (home === '' || away === '') return null

  const date = $box.find('.bday, .dtstart').first().text().trim() || undefined
  const time = $box.find('.ftime').first().text().trim().match(/(\d{2}:\d{2})/)?.[1]

  const scoreText = $box.find('.fscore').first().text().trim()
  const afterExtraTime = scoreText.toLowerCase().includes('a.e.t.')
  const tally = parseScoreText(scoreText)

  // Use the first goals row only, which excludes the penalty shoot-out row.
  const $goalsRow = $box.find('tr.fgoals').first()
  const goals1 = parseGoals($, $goalsRow.find('.fhgoal'))
  const goals2 = parseGoals($, $goalsRow.find('.fagoal'))

  // Shoot-out score sits in the row following the "Penalties" header row.
  let shootout: [number, number] | undefined
  const penaltyHeader = $box
    .find('tr')
    .toArray()
    .find(tr => $(tr).find('th[colspan]').text().toLowerCase().includes('penalt'))
  if (penaltyHeader) {
    const shootoutText = $(penaltyHeader).next('tr').find('th').not('.fhome,.faway').first().text().trim()
    shootout = parseScoreText(shootoutText) ?? undefined
  }

  let score: ScoreObj | undefined
  if (tally) {
    if (afterExtraTime) {
      // The displayed score is the extra-time total, so the 90-minute score is
      // rebuilt from the goal lists. Own goals found in one column are
      // credited to the other side.
      // NOTE(review): this assumes own goals are listed under the scorer's own
      // team — confirm against the football box layout being scraped.
      const within90 = (g: Goal) => g.minute === undefined || g.minute <= 90
      const regular = (gs: Goal[]) => gs.filter(g => !g.owngoal && within90(g)).length
      const own = (gs: Goal[]) => gs.filter(g => g.owngoal === true && within90(g)).length
      score = {
        ft: [regular(goals1) + own(goals2), regular(goals2) + own(goals1)],
        et: tally,
      }
    } else {
      score = { ft: tally }
    }
    if (shootout) score.p = shootout
  }

  const ground = extractGround($, $box) || undefined

  // Optional parts are spread in a fixed order so serialized key order is stable.
  const groupPart = group ? { group } : {}
  const datePart = date ? { date } : {}
  const timePart = time ? { time } : {}
  const scorePart = score ? { score } : {}
  const goals1Part = goals1.length ? { goals1 } : {}
  const goals2Part = goals2.length ? { goals2 } : {}
  const groundPart = ground ? { ground } : {}

  return {
    round,
    ...groupPart,
    ...datePart,
    ...timePart,
    team1: home,
    team2: away,
    ...scorePart,
    ...goals1Part,
    ...goals2Part,
    ...groundPart,
  }
}
|
|
||||||
|
|
||||||
// ── Collect matches from a pre-loaded page ─────────────────────────────────
|
|
||||||
|
|
||||||
/**
 * Parses every `.footballbox` on an already loaded page, tagging each match
 * with the given round and group. Boxes that cannot be parsed are dropped.
 */
function collectBoxes(
  $: CheerioAPI,
  round: string,
  group: string | null,
): Match[] {
  return $('.footballbox')
    .toArray()
    .map(box => parseBox($, $(box), round, group))
    .filter((parsed): parsed is Match => parsed !== null)
}
|
|
||||||
|
|
||||||
// ── Section heading state machine ──────────────────────────────────────────
|
|
||||||
|
|
||||||
/** Cursor of the section-heading state machine. */
type State = {
  /** Whether the current section is one that match boxes are read from. */
  active: boolean
  /** Round label of the current section; may be `''`. */
  round: string
  /** Current group heading, or `null` outside a group. */
  group: string | null
}
|
|
||||||
|
|
||||||
function processHeading(text: string, level: number, state: State): void {
|
|
||||||
const t = text.toLowerCase().trim()
|
|
||||||
|
|
||||||
if (level === 2) {
|
|
||||||
if (/group stage/i.test(t) && !/second/i.test(t)) {
|
|
||||||
state.active = true; state.round = 'Group stage'; state.group = null
|
|
||||||
} else if (/first group stage/i.test(t)) {
|
|
||||||
state.active = true; state.round = 'Group stage'; state.group = null
|
|
||||||
} else if (/second group stage/i.test(t)) {
|
|
||||||
state.active = true; state.round = 'Second group stage'; state.group = null
|
|
||||||
} else if (t === 'final round') {
|
|
||||||
state.active = true; state.round = 'Final round'; state.group = null
|
|
||||||
} else if (/final tournament/i.test(t)) {
|
|
||||||
state.active = true; state.round = ''; state.group = null
|
|
||||||
} else if (/knock.?out stage/i.test(t)) {
|
|
||||||
state.active = true; state.round = ''; state.group = null
|
|
||||||
} else if (/round of 16/i.test(t)) {
|
|
||||||
state.active = true; state.round = 'Round of 16'; state.group = null
|
|
||||||
} else if (/quarter.final/i.test(t)) {
|
|
||||||
state.active = true; state.round = 'Quarter-finals'; state.group = null
|
|
||||||
} else if (/semi.final/i.test(t)) {
|
|
||||||
state.active = true; state.round = 'Semi-finals'; state.group = null
|
|
||||||
} else if (/third.place|match for third|play.off for third/i.test(t)) {
|
|
||||||
state.active = true; state.round = 'Third-place match'; state.group = null
|
|
||||||
} else if (t === 'final') {
|
|
||||||
state.active = true; state.round = 'Final'; state.group = null
|
|
||||||
} else {
|
|
||||||
state.active = false
|
|
||||||
}
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!state.active) return
|
|
||||||
|
|
||||||
if (level === 3 || level === 4) {
|
|
||||||
if (/^group [a-h1-9]+$/i.test(t)) {
|
|
||||||
state.group = text.trim()
|
|
||||||
} else if (/round of 32/i.test(t)) {
|
|
||||||
state.round = 'Round of 32'; state.group = null
|
|
||||||
} else if (/round of 16/i.test(t)) {
|
|
||||||
state.round = 'Round of 16'; state.group = null
|
|
||||||
} else if (/quarter.final/i.test(t)) {
|
|
||||||
state.round = 'Quarter-finals'; state.group = null
|
|
||||||
} else if (/semi.final/i.test(t)) {
|
|
||||||
state.round = 'Semi-finals'; state.group = null
|
|
||||||
} else if (/third.place|match for third|play.off for third/i.test(t)) {
|
|
||||||
state.round = 'Third-place match'; state.group = null
|
|
||||||
} else if (t === 'final') {
|
|
||||||
state.round = 'Final'; state.group = null
|
|
||||||
}
|
|
||||||
// bracket, draw, seeding, replay → keep current state
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// ── Main year scraper ──────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
// ── Infobox parsing ────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
/**
 * Extracts tournament metadata from the first `table.infobox` on the page.
 *
 * Rows are matched by their header text: host countries, team count,
 * champions, runners-up, third place and fourth place. Rows without a `td`
 * cell are ignored.
 *
 * @param $ - Cheerio document of the tournament page.
 * @returns Only the fields that were found in the infobox.
 */
function parseInfobox($: CheerioAPI): Partial<Meta> {
  const result: Partial<Meta> = {}

  /** Plain text of a cell: `<br>` becomes " / ", sup/img and "[n]" markers are dropped. */
  const tdText = ($td: Cheerio<Element>): string => {
    const $copy = $td.clone()
    $copy.find('br').replaceWith(' / ')
    $copy.find('sup, img').remove()
    return $copy.text().replace(/\[\d+\]/g, '').replace(/\s+/g, ' ').trim()
  }

  /** Texts of the cell's links, skipping empty ones and "[n]" reference links. */
  const linkNames = ($td: Cheerio<Element>): string[] => {
    const names: string[] = []
    $td.find('a').each((_, a) => {
      const label = $(a).clone().find('img').remove().end().text().trim()
      if (label && !/\[\d+\]/.test(label)) names.push(label)
    })
    return names
  }

  /** First usable link text, falling back to the cell text (null when empty). */
  const tdFirstLink = ($td: Cheerio<Element>): string | null => {
    const first = linkNames($td)[0]
    // linkNames never yields empty strings, so a truthiness test is sufficient.
    return first ? first : (tdText($td) || null)
  }

  /** All usable link texts joined with " / ", falling back to the cell text. */
  const tdAllLinks = ($td: Cheerio<Element>): string => {
    const names = linkNames($td)
    return names.length ? names.join(' / ') : tdText($td)
  }

  $('table.infobox').first().find('tr').each((_, tr) => {
    const $row = $(tr)
    const label = $row.find('th').text().trim().toLowerCase()
    const $td = $row.find('td').first()
    if (!$td.length) return

    if (/host countr/i.test(label)) {
      result.host = tdAllLinks($td)
    } else if (/^teams$/i.test(label)) {
      const digits = $td.text().match(/\d+/)
      if (digits) result.teams_count = parseInt(digits[0], 10)
    } else if (/champion/i.test(label)) {
      result.winner = tdFirstLink($td)
    } else if (/runners?.up/i.test(label)) {
      result.runner_up = tdFirstLink($td)
    } else if (/third.place/i.test(label)) {
      result.third_place = tdFirstLink($td)
    } else if (/fourth.place/i.test(label)) {
      result.fourth_place = tdFirstLink($td)
    }
  })

  return result
}
|
|
||||||
|
|
||||||
// ── Placement derivation ───────────────────────────────────────────────────
|
|
||||||
|
|
||||||
/**
 * Derives the top-four placements from the 'Final' and 'Third-place match'
 * results.
 *
 * A match is decided by its extra-time score when present, otherwise by the
 * full-time score, and by the penalty score when that is level. Matches with
 * no score, or with no decisive result, contribute nothing. If several
 * matches share a round label, the last decisive one wins.
 *
 * @param matches - All scraped matches of the tournament.
 * @returns Placements, each `null` when it could not be derived.
 */
function derivePlacements(matches: Match[]): Pick<Meta, 'winner' | 'runner_up' | 'third_place' | 'fourth_place'> {
  /** Returns `[winner, loser]`, or `null` when the match has no decisive result. */
  const decide = (m: Match): [string, string] | null => {
    const score = m.score
    if (!score) return null

    const [home, away] = score.et ?? score.ft ?? [0, 0]
    let margin = home - away
    // Level after regular/extra time: fall back to the shoot-out, if any.
    if (margin === 0 && score.p) margin = score.p[0] - score.p[1]

    if (margin > 0) return [m.team1, m.team2]
    if (margin < 0) return [m.team2, m.team1]
    return null
  }

  let winner: string | null = null
  let runner_up: string | null = null
  let third_place: string | null = null
  let fourth_place: string | null = null

  for (const m of matches) {
    if (m.round !== 'Final' && m.round !== 'Third-place match') continue
    const outcome = decide(m)
    if (!outcome) continue
    if (m.round === 'Final') {
      [winner, runner_up] = outcome
    } else {
      [third_place, fourth_place] = outcome
    }
  }

  return { winner, runner_up, third_place, fourth_place }
}
|
|
||||||
|
|
||||||
// ── Year result ────────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
/** Everything `scrapeYear` extracts for one tournament. */
type YearResult = {
  matches: Match[]
  /** Stadiums keyed by name; the first occurrence of a name wins. */
  stadiums: Map<string, Stadium>
  /** Team names seen per group label. */
  groups: Map<string, Set<string>>
  meta: Meta
}

/**
 * Scrapes matches, stadiums, groups and metadata from a tournament main page.
 *
 * Headings, hatnotes and match boxes are walked in document order while
 * `processHeading` tracks the current round/group. For any group that has a
 * "Main article" hatnote pointing at a `World_Cup_Group` sub-page but yielded
 * no matches on the main page, the sub-page is fetched (after a delay) and its
 * match boxes are collected instead.
 *
 * @param year - Tournament year. Not read by this function.
 * @param mainHtml - HTML of the tournament's main page.
 * @returns Collected matches, stadiums, groups and metadata. Placements
 *          derived from match results take precedence over infobox values.
 */
async function scrapeYear(year: number, mainHtml: string): Promise<YearResult> {
  const $ = load(mainHtml)
  const matches: Match[] = []
  const stadiums = new Map<string, Stadium>()
  const groups = new Map<string, Set<string>>()

  const state: State = { active: false, round: '', group: null }

  // Group name → sub-page to fetch if the main page has no matches for it,
  // plus the round that was active when the hatnote was seen. Capturing the
  // round here avoids re-walking all headings later, and keeps it correct
  // when a group name occurs in more than one stage.
  const groupSubpages = new Map<string, { page: string; round: string }>()
  // Groups that got at least one match from the main page
  const groupsOnMainPage = new Set<string>()

  function recordMatch(m: Match): void {
    matches.push(m)
    if (m.ground) {
      const { name, city } = parseGroundParts(m.ground)
      if (name && !stadiums.has(name)) stadiums.set(name, { name, city })
    }
    if (m.group) {
      let teams = groups.get(m.group)
      if (!teams) {
        teams = new Set<string>()
        groups.set(m.group, teams)
      }
      teams.add(m.team1)
      teams.add(m.team2)
    }
  }

  // Walk elements in document order: headings, hatnotes, footballboxes
  $('.mw-parser-output').find('div.mw-heading, .footballbox, .hatnote').each((_, el) => {
    const $el = $(el)

    if ($el.hasClass('mw-heading')) {
      const $h = $el.find('h2, h3, h4').first()
      // No h2–h4 inside → level 9, which processHeading ignores.
      const level = parseInt($h.prop('tagName')?.slice(1) ?? '9', 10)
      const text = $h.text().replace(/\[edit\]/g, '').trim()
      processHeading(text, level, state)

    } else if ($el.hasClass('hatnote') && $el.text().includes('Main article')) {
      // Remember the sub-page link for the current group (fallback source).
      if (!state.active || !state.group) return
      const link = $el.find('a[href^="/wiki/"]').first().attr('href')
      if (!link) return
      const page = link.replace('/wiki/', '').split('#')[0]
      if (/World_Cup_Group/i.test(page) && !groupSubpages.has(state.group)) {
        groupSubpages.set(state.group, { page, round: state.round || 'Group stage' })
      }

    } else if ($el.hasClass('footballbox')) {
      if (!state.active) return
      const round = state.round || state.group || 'Unknown'
      const m = parseBox($, $el, round, state.group)
      if (m) {
        if (m.group) groupsOnMainPage.add(m.group)
        recordMatch(m)
      }
    }
  })

  // Fetch group sub-pages for any group that got 0 matches from main page
  for (const [group, { page, round }] of groupSubpages) {
    if (groupsOnMainPage.has(group)) continue

    await delay(1200)
    const subHtml = await fetchWikiHtml(page)
    if (!subHtml) { process.stdout.write(`(failed: ${page}) `); continue }

    for (const m of collectBoxes(load(subHtml), round, group)) {
      recordMatch(m)
    }
    process.stdout.write(`[+${page.slice(-8)}] `)
  }

  const infobox = parseInfobox($)
  const placements = derivePlacements(matches)
  const meta: Meta = {
    host: infobox.host ?? '',
    teams_count: infobox.teams_count ?? null,
    winner: placements.winner ?? infobox.winner ?? null,
    runner_up: placements.runner_up ?? infobox.runner_up ?? null,
    third_place: placements.third_place ?? infobox.third_place ?? null,
    fourth_place: placements.fourth_place ?? infobox.fourth_place ?? null,
  }

  return { matches, stadiums, groups, meta }
}
|
|
||||||
|
|
||||||
// ── Squad page scraper ─────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
/**
 * Extracts team squads from a squads page.
 *
 * Each level-3 heading (other than "Group …" headings) starts a new team;
 * every following `tr.nat-fs-player` row is added to that team as a player.
 * Rows seen before the first team heading, and rows without a player name,
 * are skipped.
 *
 * @param html - HTML of the squads page.
 * @returns Teams that ended up with at least one player, in document order.
 */
function scrapeSquads(html: string): Squad[] {
  const $ = load(html)
  const squads: Squad[] = []
  let currentTeam: Squad | null = null

  $('.mw-parser-output').find('div.mw-heading, tr.nat-fs-player').each((_, el) => {
    const $el = $(el)

    if ($el.hasClass('mw-heading')) {
      const $h = $el.find('h3, h4').first()
      if (!$h.length) return
      const level = parseInt($h.prop('tagName')?.slice(1) ?? '9', 10)
      if (level !== 3) return
      const name = $h.text().replace(/\[edit\]/g, '').trim()
      if (/^group /i.test(name)) return // skip group headers
      currentTeam = { name, players: [] }
      squads.push(currentTeam)
      return
    }

    const team = currentTeam
    if (!team) return

    let number: number | undefined
    let pos: string | undefined
    let playerName = ''
    let dob: string | undefined

    $el.find('td, th[scope="row"]').each((i, cell) => {
      const $cell = $(cell)
      const text = $cell.text().trim()

      if ($cell.is('th[scope="row"]')) {
        // Name cell: prefer the link text, fall back to the raw cell text.
        playerName = $cell.find('a').first().text().trim() || text
      } else if (i === 0 && !playerName) {
        const n = parseInt(text, 10)
        if (!isNaN(n)) number = n
      } else if (i === 1 && !playerName && !pos) {
        const posLink = $cell.find('a').first().text().trim()
        if (['GK', 'DF', 'MF', 'FW'].includes(posLink)) pos = posLink
      }

      // The birth date may sit in any cell, so every cell is checked.
      const $bday = $cell.find('.bday')
      if ($bday.length) dob = $bday.text().trim()
    })

    if (!playerName) return

    const player: Player = { name: playerName }
    if (number !== undefined) player.number = number
    if (pos) player.pos = pos
    if (dob) player.date_of_birth = dob
    team.players.push(player)
  })

  return squads.filter(s => s.players.length > 0)
}
|
|
||||||
|
|
||||||
// ── Output ─────────────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
function writeOutput(
|
|
||||||
year: number,
|
year: number,
|
||||||
matches: Match[],
|
matches: Match[],
|
||||||
stadiums: Map<string, Stadium>,
|
stadiums: Map<string, Stadium>,
|
||||||
groups: Map<string, Set<string>>,
|
groups: Map<string, Set<string>>,
|
||||||
squads: Squad[],
|
|
||||||
meta: Meta,
|
meta: Meta,
|
||||||
): void {
|
): void {
|
||||||
const dir = path.join(DATA_DIR, String(year))
|
const dir = path.join(DATA_DIR, String(year))
|
||||||
mkdirSync(dir, { recursive: true })
|
mkdirSync(dir, { recursive: true })
|
||||||
|
|
||||||
writeFileSync(
|
writeFileSync(path.join(dir, 'worldcup.meta.json'), JSON.stringify(meta, null, 2), 'utf-8')
|
||||||
path.join(dir, 'worldcup.meta.json'),
|
writeFileSync(path.join(dir, 'worldcup.json'), JSON.stringify({ matches }, null, 2), 'utf-8')
|
||||||
JSON.stringify(meta, null, 2),
|
|
||||||
'utf-8',
|
|
||||||
)
|
|
||||||
|
|
||||||
writeFileSync(
|
if (stadiums.size > 0)
|
||||||
path.join(dir, 'worldcup.json'),
|
writeFileSync(path.join(dir, 'worldcup.stadiums.json'),
|
||||||
JSON.stringify({ matches }, null, 2),
|
JSON.stringify({ stadiums: Array.from(stadiums.values()) }, null, 2), 'utf-8')
|
||||||
'utf-8',
|
|
||||||
)
|
|
||||||
|
|
||||||
if (stadiums.size > 0) {
|
|
||||||
writeFileSync(
|
|
||||||
path.join(dir, 'worldcup.stadiums.json'),
|
|
||||||
JSON.stringify({ stadiums: Array.from(stadiums.values()) }, null, 2),
|
|
||||||
'utf-8',
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
const groupList: Group[] = []
|
const groupList: Group[] = []
|
||||||
groups.forEach((teams, name) => {
|
groups.forEach((teams, name) => groupList.push({ name, teams: Array.from(teams) }))
|
||||||
groupList.push({ name, teams: Array.from(teams) })
|
if (groupList.length > 0)
|
||||||
})
|
writeFileSync(path.join(dir, 'worldcup.groups.json'),
|
||||||
if (groupList.length > 0) {
|
JSON.stringify({ groups: groupList }, null, 2), 'utf-8')
|
||||||
writeFileSync(
|
}
|
||||||
path.join(dir, 'worldcup.groups.json'),
|
|
||||||
JSON.stringify({ groups: groupList }, null, 2),
|
|
||||||
'utf-8',
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
if (squads.length > 0) {
|
function writeSquads(year: number, squads: Squad[]): void {
|
||||||
writeFileSync(
|
if (squads.length === 0) return
|
||||||
path.join(dir, 'worldcup.squads.json'),
|
const dir = path.join(DATA_DIR, String(year))
|
||||||
JSON.stringify(squads, null, 2),
|
mkdirSync(dir, { recursive: true })
|
||||||
'utf-8',
|
writeFileSync(path.join(dir, 'worldcup.squads.json'), JSON.stringify(squads, null, 2), 'utf-8')
|
||||||
)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Entry point ────────────────────────────────────────────────────────────
|
// ── Entry point ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
async function main() {
|
async function main() {
|
||||||
const onlyYear = process.argv[2] ? parseInt(process.argv[2]) : null
|
const args = process.argv.slice(2)
|
||||||
const yearsToScrape = onlyYear ? [onlyYear] : YEARS
|
const yearArg = args.find(a => /^\d{4}$/.test(a))
|
||||||
|
const doMatches = args.includes('--matches') || !args.includes('--squads')
|
||||||
|
const doSquads = args.includes('--squads') || !args.includes('--matches')
|
||||||
|
|
||||||
console.log(`Scraping ${yearsToScrape.length} World Cup(s) from Wikipedia...`)
|
const yearsToScrape = yearArg ? [parseInt(yearArg)] : YEARS
|
||||||
|
const modeLabel = [doMatches && 'matches', doSquads && 'squads'].filter(Boolean).join(' + ')
|
||||||
|
|
||||||
|
console.log(`Scraping ${yearsToScrape.length} World Cup(s) from Wikipedia [${modeLabel}]...`)
|
||||||
|
|
||||||
for (const year of yearsToScrape) {
|
for (const year of yearsToScrape) {
|
||||||
process.stdout.write(` ${year}... `)
|
process.stdout.write(` ${year}... `)
|
||||||
|
|
||||||
|
if (doMatches) {
|
||||||
const mainHtml = await fetchWikiHtml(`${year}_FIFA_World_Cup`)
|
const mainHtml = await fetchWikiHtml(`${year}_FIFA_World_Cup`)
|
||||||
if (!mainHtml) { console.log('FAILED'); continue }
|
if (!mainHtml) { console.log('FAILED (main page)'); continue }
|
||||||
|
|
||||||
const { matches, stadiums, groups, meta } = await scrapeYear(year, mainHtml)
|
const { matches, stadiums, groups, meta } = await scrapeYear(year, mainHtml)
|
||||||
|
writeMatches(year, matches, stadiums, groups, meta)
|
||||||
|
process.stdout.write(`${matches.length} matches`)
|
||||||
await delay(600)
|
await delay(600)
|
||||||
|
}
|
||||||
|
|
||||||
|
if (doSquads) {
|
||||||
const squadHtml = await fetchWikiHtml(`${year}_FIFA_World_Cup_squads`)
|
const squadHtml = await fetchWikiHtml(`${year}_FIFA_World_Cup_squads`)
|
||||||
const squads = squadHtml ? scrapeSquads(squadHtml) : []
|
const squads = squadHtml ? scrapeSquads(squadHtml) : []
|
||||||
|
writeSquads(year, squads)
|
||||||
writeOutput(year, matches, stadiums, groups, squads, meta)
|
process.stdout.write(`${doMatches ? ', ' : ''}${squads.length} squads`)
|
||||||
|
|
||||||
console.log(`${matches.length} matches, ${stadiums.size} stadiums, ${groups.size} groups, ${squads.length} teams`)
|
|
||||||
|
|
||||||
await delay(600)
|
await delay(600)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
console.log()
|
||||||
|
}
|
||||||
|
|
||||||
console.log('\nDone! Files written to app/data/wikipedia/{year}/')
|
console.log('\nDone! Files written to app/data/wikipedia/{year}/')
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
+101
-127
@@ -1,40 +1,16 @@
|
|||||||
import postgres from 'postgres'
|
import postgres from 'postgres'
|
||||||
import { drizzle } from 'drizzle-orm/postgres-js'
|
import { drizzle } from 'drizzle-orm/postgres-js'
|
||||||
import { sql } from 'drizzle-orm'
|
import { sql } from 'drizzle-orm'
|
||||||
import { TEAM_ISO, getIso } from '../lib/iso-codes'
|
import { fetchWikiHtml, scrapeYear, scrapeSquads } from '../lib/wiki-scraper'
|
||||||
|
import { getIso } from '../lib/iso-codes'
|
||||||
|
|
||||||
const DATABASE_URL = process.env.DATABASE_URL
|
const DATABASE_URL = process.env.DATABASE_URL
|
||||||
if (!DATABASE_URL) {
|
if (!DATABASE_URL) {
|
||||||
console.error('ERROR: DATABASE_URL environment variable is not set')
|
console.error('ERROR: DATABASE_URL environment variable is not set')
|
||||||
process.exit(1)
|
process.exit(1)
|
||||||
}
|
}
|
||||||
const BASE = 'https://raw.githubusercontent.com/openfootball/worldcup.json/master'
|
|
||||||
|
|
||||||
async function fetchJson(url: string): Promise<unknown> {
|
// ── DB helpers ─────────────────────────────────────────────────────────────
|
||||||
try {
|
|
||||||
const res = await fetch(url)
|
|
||||||
if (!res.ok) return null
|
|
||||||
return res.json()
|
|
||||||
} catch {
|
|
||||||
return null
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
type RawGoal = { name: string; minute?: string | number; offset?: number; penalty?: boolean; owngoal?: boolean }
|
|
||||||
type RawScore = { ft?: number[]; ht?: number[]; et?: number[]; p?: number[] } | number[]
|
|
||||||
type RawMatch = {
|
|
||||||
round?: string; date?: string; time?: string;
|
|
||||||
team1: string; team2: string; score?: RawScore;
|
|
||||||
goals1?: RawGoal[]; goals2?: RawGoal[];
|
|
||||||
group?: string; ground?: string;
|
|
||||||
}
|
|
||||||
type RawData = { matches: RawMatch[] }
|
|
||||||
|
|
||||||
function parseScore(score: RawScore | undefined) {
|
|
||||||
if (!score) return {}
|
|
||||||
if (Array.isArray(score)) return { ft: score }
|
|
||||||
return { ft: score.ft, ht: score.ht, et: score.et, p: score.p }
|
|
||||||
}
|
|
||||||
|
|
||||||
async function run() {
|
async function run() {
|
||||||
const client = postgres(DATABASE_URL!, { max: 2 })
|
const client = postgres(DATABASE_URL!, { max: 2 })
|
||||||
@@ -42,17 +18,13 @@ async function run() {
|
|||||||
|
|
||||||
const teamCache = new Map<string, number>()
|
const teamCache = new Map<string, number>()
|
||||||
|
|
||||||
async function upsertTeam(rawName: string, extra?: { iso2?: string | null; fifaCode?: string; continent?: string; confederation?: string }) {
|
async function upsertTeam(rawName: string) {
|
||||||
if (teamCache.has(rawName)) return teamCache.get(rawName)!
|
if (teamCache.has(rawName)) return teamCache.get(rawName)!
|
||||||
const iso2 = (extra && 'iso2' in extra) ? extra.iso2 : getIso(rawName)
|
const iso2 = getIso(rawName)
|
||||||
const [row] = await db.execute(sql`
|
const [row] = await db.execute(sql`
|
||||||
INSERT INTO teams (name, iso2, fifa_code, continent, confederation)
|
INSERT INTO teams (name, iso2)
|
||||||
VALUES (${rawName}, ${iso2 ?? null}, ${extra?.fifaCode ?? null}, ${extra?.continent ?? null}, ${extra?.confederation ?? null})
|
VALUES (${rawName}, ${iso2 ?? null})
|
||||||
ON CONFLICT (name) DO UPDATE SET
|
ON CONFLICT (name) DO UPDATE SET iso2 = COALESCE(EXCLUDED.iso2, teams.iso2)
|
||||||
iso2 = COALESCE(EXCLUDED.iso2, teams.iso2),
|
|
||||||
fifa_code = COALESCE(EXCLUDED.fifa_code, teams.fifa_code),
|
|
||||||
continent = COALESCE(EXCLUDED.continent, teams.continent),
|
|
||||||
confederation = COALESCE(EXCLUDED.confederation, teams.confederation)
|
|
||||||
RETURNING id
|
RETURNING id
|
||||||
`)
|
`)
|
||||||
const id = (row as { id: number }).id
|
const id = (row as { id: number }).id
|
||||||
@@ -62,20 +34,19 @@ async function run() {
|
|||||||
|
|
||||||
async function upsertMatch(
|
async function upsertMatch(
|
||||||
year: number, round: string, group: string | null, dateStr: string | null,
|
year: number, round: string, group: string | null, dateStr: string | null,
|
||||||
timeStr: string | null, team1Id: number, team2Id: number, score: ReturnType<typeof parseScore>,
|
timeStr: string | null, team1Id: number, team2Id: number,
|
||||||
isQuali: boolean
|
ft: [number, number] | undefined, et: [number, number] | undefined, p: [number, number] | undefined,
|
||||||
|
isQuali: boolean,
|
||||||
) {
|
) {
|
||||||
const rows = await db.execute(sql`
|
const rows = await db.execute(sql`
|
||||||
INSERT INTO matches (tournament_year, round, group_name, date, time_local, team1_id, team2_id,
|
INSERT INTO matches (tournament_year, round, group_name, date, time_local, team1_id, team2_id,
|
||||||
score_ft_home, score_ft_away, score_ht_home, score_ht_away,
|
score_ft_home, score_ft_away, score_et_home, score_et_away,
|
||||||
score_et_home, score_et_away, score_p_home, score_p_away, is_quali_playoff)
|
score_p_home, score_p_away, is_quali_playoff)
|
||||||
VALUES (
|
VALUES (
|
||||||
${year}, ${round}, ${group}, ${dateStr ?? null}, ${timeStr ?? null},
|
${year}, ${round}, ${group}, ${dateStr}, ${timeStr}, ${team1Id}, ${team2Id},
|
||||||
${team1Id}, ${team2Id},
|
${ft?.[0] ?? null}, ${ft?.[1] ?? null},
|
||||||
${score.ft?.[0] ?? null}, ${score.ft?.[1] ?? null},
|
${et?.[0] ?? null}, ${et?.[1] ?? null},
|
||||||
${score.ht?.[0] ?? null}, ${score.ht?.[1] ?? null},
|
${p?.[0] ?? null}, ${p?.[1] ?? null},
|
||||||
${score.et?.[0] ?? null}, ${score.et?.[1] ?? null},
|
|
||||||
${score.p?.[0] ?? null}, ${score.p?.[1] ?? null},
|
|
||||||
${isQuali}
|
${isQuali}
|
||||||
)
|
)
|
||||||
ON CONFLICT (tournament_year, team1_id, team2_id, date, is_quali_playoff) DO UPDATE SET
|
ON CONFLICT (tournament_year, team1_id, team2_id, date, is_quali_playoff) DO UPDATE SET
|
||||||
@@ -83,8 +54,6 @@ async function run() {
|
|||||||
time_local = COALESCE(EXCLUDED.time_local, matches.time_local),
|
time_local = COALESCE(EXCLUDED.time_local, matches.time_local),
|
||||||
score_ft_home = COALESCE(EXCLUDED.score_ft_home, matches.score_ft_home),
|
score_ft_home = COALESCE(EXCLUDED.score_ft_home, matches.score_ft_home),
|
||||||
score_ft_away = COALESCE(EXCLUDED.score_ft_away, matches.score_ft_away),
|
score_ft_away = COALESCE(EXCLUDED.score_ft_away, matches.score_ft_away),
|
||||||
score_ht_home = COALESCE(EXCLUDED.score_ht_home, matches.score_ht_home),
|
|
||||||
score_ht_away = COALESCE(EXCLUDED.score_ht_away, matches.score_ht_away),
|
|
||||||
score_et_home = COALESCE(EXCLUDED.score_et_home, matches.score_et_home),
|
score_et_home = COALESCE(EXCLUDED.score_et_home, matches.score_et_home),
|
||||||
score_et_away = COALESCE(EXCLUDED.score_et_away, matches.score_et_away),
|
score_et_away = COALESCE(EXCLUDED.score_et_away, matches.score_et_away),
|
||||||
score_p_home = COALESCE(EXCLUDED.score_p_home, matches.score_p_home),
|
score_p_home = COALESCE(EXCLUDED.score_p_home, matches.score_p_home),
|
||||||
@@ -94,24 +63,13 @@ async function run() {
|
|||||||
return (rows[0] as { id: number }).id
|
return (rows[0] as { id: number }).id
|
||||||
}
|
}
|
||||||
|
|
||||||
type GoalRow = { teamId: number; name: string; minute: number | null; offset: number; penalty: boolean; owngoal: boolean }
|
async function replaceGoals(matchId: number, goals: Array<{
|
||||||
|
teamId: number; name: string; minute: number | null; offset: number; penalty: boolean; owngoal: boolean
|
||||||
function collectGoals(teamId: number, rawGoals: RawGoal[], isOwnGoalTeamId: number): GoalRow[] {
|
}>) {
|
||||||
return rawGoals.flatMap(g => {
|
|
||||||
if (!g.name) return []
|
|
||||||
const minute = g.minute != null ? parseInt(String(g.minute)) : null
|
|
||||||
return [{ teamId: g.owngoal ? isOwnGoalTeamId : teamId, name: g.name,
|
|
||||||
minute: isNaN(minute!) ? null : minute, offset: g.offset ?? 0,
|
|
||||||
penalty: g.penalty ?? false, owngoal: g.owngoal ?? false }]
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
async function replaceGoals(matchId: number, rows: GoalRow[]) {
|
|
||||||
await db.transaction(async tx => {
|
await db.transaction(async tx => {
|
||||||
await tx.execute(sql`DELETE FROM goals WHERE match_id = ${matchId}`)
|
await tx.execute(sql`DELETE FROM goals WHERE match_id = ${matchId}`)
|
||||||
if (rows.length > 0) {
|
if (goals.length > 0) {
|
||||||
// Single bulk INSERT — readers see old goals until commit, never an empty window
|
const vals = goals.map(g =>
|
||||||
const vals = rows.map(g =>
|
|
||||||
sql`(${matchId}, ${g.teamId}, ${g.name}, ${g.minute}, ${g.offset}, ${g.penalty}, ${g.owngoal})`
|
sql`(${matchId}, ${g.teamId}, ${g.name}, ${g.minute}, ${g.offset}, ${g.penalty}, ${g.owngoal})`
|
||||||
)
|
)
|
||||||
await tx.execute(sql`
|
await tx.execute(sql`
|
||||||
@@ -122,101 +80,117 @@ async function run() {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
console.log('\nSyncing 2026...')
|
// ── Incremental group detection ────────────────────────────────────────────
|
||||||
|
// Groups where every known match already has a FT score — no need to re-fetch their sub-page.
|
||||||
|
|
||||||
|
async function getCompletedGroups(): Promise<Set<string>> {
|
||||||
|
const rows = await db.execute(sql`
|
||||||
|
SELECT group_name
|
||||||
|
FROM matches
|
||||||
|
WHERE tournament_year = 2026
|
||||||
|
AND group_name IS NOT NULL
|
||||||
|
AND is_quali_playoff = false
|
||||||
|
GROUP BY group_name
|
||||||
|
HAVING COUNT(*) > 0
|
||||||
|
AND COUNT(*) = SUM(CASE WHEN score_ft_home IS NOT NULL THEN 1 ELSE 0 END)
|
||||||
|
`)
|
||||||
|
return new Set(rows.map(r => (r as { group_name: string }).group_name))
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── Sync 2026 from Wikipedia ───────────────────────────────────────────────
|
||||||
|
|
||||||
|
console.log('\nSyncing 2026 from Wikipedia...')
|
||||||
|
|
||||||
// Upsert 2026 tournament row (no winner yet)
|
|
||||||
await db.execute(sql`
|
await db.execute(sql`
|
||||||
INSERT INTO tournaments (year, host)
|
INSERT INTO tournaments (year, host)
|
||||||
VALUES (2026, 'USA / Canada / Mexico')
|
VALUES (2026, 'USA / Canada / Mexico')
|
||||||
ON CONFLICT (year) DO NOTHING
|
ON CONFLICT (year) DO NOTHING
|
||||||
`)
|
`)
|
||||||
|
|
||||||
// Teams enrichment
|
const mainHtml = await fetchWikiHtml('2026_FIFA_World_Cup')
|
||||||
const teamsData = await fetchJson(`${BASE}/2026/worldcup.teams.json`) as Record<string, unknown>[] | null
|
if (!mainHtml) {
|
||||||
if (teamsData && Array.isArray(teamsData)) {
|
console.error(' FAILED to fetch 2026 Wikipedia page')
|
||||||
for (const t of teamsData) {
|
await client.end()
|
||||||
const name = (t.name ?? t.name_normalised) as string
|
process.exit(1)
|
||||||
await upsertTeam(name, {
|
|
||||||
iso2: TEAM_ISO[name] ?? getIso(name),
|
|
||||||
fifaCode: t.fifa_code as string,
|
|
||||||
continent: t.continent as string,
|
|
||||||
confederation: t.confed as string,
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const completedGroups = await getCompletedGroups()
|
||||||
|
if (completedGroups.size > 0)
|
||||||
|
console.log(` Skipping completed groups: ${[...completedGroups].sort().join(', ')}`)
|
||||||
|
|
||||||
|
process.stdout.write(' ')
|
||||||
|
const { matches, stadiums, meta } = await scrapeYear(2026, mainHtml, { skipGroups: completedGroups })
|
||||||
|
console.log()
|
||||||
|
|
||||||
// Stadiums
|
// Stadiums
|
||||||
const stadiumsData = await fetchJson(`${BASE}/2026/worldcup.stadiums.json`) as { stadiums?: Record<string, unknown>[] } | null
|
for (const s of stadiums.values()) {
|
||||||
if (stadiumsData?.stadiums) {
|
|
||||||
for (const s of stadiumsData.stadiums) {
|
|
||||||
await db.execute(sql`
|
await db.execute(sql`
|
||||||
INSERT INTO stadiums (tournament_year, name, city, country_code, capacity, timezone, coordinates)
|
INSERT INTO stadiums (tournament_year, name, city)
|
||||||
VALUES (2026, ${s.name as string}, ${s.city as string}, ${(s.cc as string | undefined) ?? null},
|
VALUES (2026, ${s.name}, ${s.city ?? null})
|
||||||
${(s.capacity as number | undefined) ?? null}, ${(s.timezone as string | undefined) ?? null}, ${(s.coords as string | undefined) ?? null})
|
|
||||||
ON CONFLICT DO NOTHING
|
ON CONFLICT DO NOTHING
|
||||||
`)
|
`)
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
// Main matches
|
// Matches + goals
|
||||||
const mainData = await fetchJson(`${BASE}/2026/worldcup.json`) as RawData | null
|
|
||||||
let matchCount = 0, goalCount = 0
|
let matchCount = 0, goalCount = 0
|
||||||
if (mainData?.matches) {
|
for (const m of matches) {
|
||||||
for (const m of mainData.matches) {
|
|
||||||
const t1Id = await upsertTeam(m.team1)
|
const t1Id = await upsertTeam(m.team1)
|
||||||
const t2Id = await upsertTeam(m.team2)
|
const t2Id = await upsertTeam(m.team2)
|
||||||
const score = parseScore(m.score)
|
const matchId = await upsertMatch(
|
||||||
const matchId = await upsertMatch(2026, m.round ?? 'Unknown', m.group ?? null, m.date ?? null, m.time ?? null, t1Id, t2Id, score, false)
|
2026, m.round, m.group ?? null, m.date ?? null, m.time ?? null,
|
||||||
if (m.goals1?.length || m.goals2?.length) {
|
t1Id, t2Id, m.score?.ft, m.score?.et, m.score?.p, false,
|
||||||
const goalRows = [
|
)
|
||||||
...(m.goals1?.length ? collectGoals(t1Id, m.goals1, t2Id) : []),
|
const goals = [
|
||||||
...(m.goals2?.length ? collectGoals(t2Id, m.goals2, t1Id) : []),
|
...(m.goals1 ?? []).map(g => ({
|
||||||
|
teamId: g.owngoal ? t2Id : t1Id, name: g.name,
|
||||||
|
minute: g.minute ?? null, offset: g.offset ?? 0,
|
||||||
|
penalty: g.penalty ?? false, owngoal: g.owngoal ?? false,
|
||||||
|
})),
|
||||||
|
...(m.goals2 ?? []).map(g => ({
|
||||||
|
teamId: g.owngoal ? t1Id : t2Id, name: g.name,
|
||||||
|
minute: g.minute ?? null, offset: g.offset ?? 0,
|
||||||
|
penalty: g.penalty ?? false, owngoal: g.owngoal ?? false,
|
||||||
|
})),
|
||||||
]
|
]
|
||||||
await replaceGoals(matchId, goalRows)
|
if (goals.length > 0) await replaceGoals(matchId, goals)
|
||||||
}
|
|
||||||
matchCount++
|
matchCount++
|
||||||
goalCount += (m.goals1?.length ?? 0) + (m.goals2?.length ?? 0)
|
goalCount += goals.length
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Squads
|
// Squads (fetch once; idempotent upsert so safe to re-run)
|
||||||
const squadsData = await fetchJson(`${BASE}/2026/worldcup.squads.json`) as Record<string, unknown>[] | null
|
const squadHtml = await fetchWikiHtml('2026_FIFA_World_Cup_squads')
|
||||||
if (squadsData && Array.isArray(squadsData)) {
|
if (squadHtml) {
|
||||||
for (const sq of squadsData) {
|
const squads = scrapeSquads(squadHtml)
|
||||||
const teamId = await upsertTeam(sq.name as string)
|
for (const sq of squads) {
|
||||||
for (const p of (sq.players as Record<string, unknown>[])) {
|
const teamId = await upsertTeam(sq.name)
|
||||||
|
for (const p of sq.players) {
|
||||||
|
const dob = p.date_of_birth ? p.date_of_birth.replace(/\s/g, '') : null
|
||||||
await db.execute(sql`
|
await db.execute(sql`
|
||||||
INSERT INTO squads (tournament_year, team_id, player_name, shirt_number, position, date_of_birth)
|
INSERT INTO squads (tournament_year, team_id, player_name, shirt_number, position, date_of_birth)
|
||||||
VALUES (2026, ${teamId}, ${p.name as string}, ${p.number as number ?? null},
|
VALUES (2026, ${teamId}, ${p.name}, ${p.number ?? null}, ${p.pos ?? null}, ${dob})
|
||||||
${p.pos as string ?? null}, ${p.date_of_birth as string ?? null})
|
|
||||||
ON CONFLICT (tournament_year, team_id, shirt_number) DO UPDATE SET
|
ON CONFLICT (tournament_year, team_id, shirt_number) DO UPDATE SET
|
||||||
player_name = EXCLUDED.player_name, position = EXCLUDED.position, date_of_birth = EXCLUDED.date_of_birth
|
player_name = EXCLUDED.player_name,
|
||||||
|
position = EXCLUDED.position,
|
||||||
|
date_of_birth = EXCLUDED.date_of_birth
|
||||||
`)
|
`)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
console.log(' Squads loaded for 2026')
|
console.log(` Squads: ${squads.length} teams`)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Quali playoffs
|
// Tournament winner (once the final is played)
|
||||||
const qualiData = await fetchJson(`${BASE}/2026/worldcup.quali_playoffs.json`) as RawData | null
|
if (meta.winner) {
|
||||||
if (qualiData?.matches) {
|
await db.execute(sql`
|
||||||
for (const m of qualiData.matches) {
|
UPDATE tournaments SET
|
||||||
const t1Id = await upsertTeam(m.team1)
|
winner = ${meta.winner},
|
||||||
const t2Id = await upsertTeam(m.team2)
|
runner_up = ${meta.runner_up},
|
||||||
const score = parseScore(m.score)
|
third_place = ${meta.third_place},
|
||||||
const matchId = await upsertMatch(2026, m.round ?? 'Qualifier', null, m.date ?? null, m.time ?? null, t1Id, t2Id, score, true)
|
fourth_place = ${meta.fourth_place}
|
||||||
if (m.goals1?.length || m.goals2?.length) {
|
WHERE year = 2026
|
||||||
const goalRows = [
|
`)
|
||||||
...(m.goals1?.length ? collectGoals(t1Id, m.goals1, t2Id) : []),
|
|
||||||
...(m.goals2?.length ? collectGoals(t2Id, m.goals2, t1Id) : []),
|
|
||||||
]
|
|
||||||
await replaceGoals(matchId, goalRows)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
console.log(` Quali playoffs: ${qualiData.matches.length} matches`)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Group standings from match results
|
// Group standings
|
||||||
await db.execute(sql`
|
await db.execute(sql`
|
||||||
WITH match_results AS (
|
WITH match_results AS (
|
||||||
SELECT tournament_year, group_name, team1_id AS team_id, score_ft_home AS gf, score_ft_away AS ga
|
SELECT tournament_year, group_name, team1_id AS team_id, score_ft_home AS gf, score_ft_away AS ga
|
||||||
|
|||||||
Reference in New Issue
Block a user