refactor: extract lib/wiki-scraper.ts, make scraper composable, sync from Wikipedia
Move all scraping logic (fetchWikiHtml, scrapeYear, scrapeSquads and all helpers) into lib/wiki-scraper.ts as exported functions shared by both scripts.

scrape-wikipedia.ts becomes a composable CLI:
  pnpm scrape [year]            — matches + squads (default)
  pnpm scrape [year] --matches  — matches/meta/stadiums only
  pnpm scrape [year] --squads   — squads only

sync.ts drops the openfootball GitHub dependency entirely and scrapes Wikipedia directly. Incremental: completed groups (all matches have FT scores) are detected via a DB query, and their sub-pages are skipped on each run.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
+113
-139
@@ -1,40 +1,16 @@
|
||||
import postgres from 'postgres'
|
||||
import { drizzle } from 'drizzle-orm/postgres-js'
|
||||
import { sql } from 'drizzle-orm'
|
||||
import { TEAM_ISO, getIso } from '../lib/iso-codes'
|
||||
import { fetchWikiHtml, scrapeYear, scrapeSquads } from '../lib/wiki-scraper'
|
||||
import { getIso } from '../lib/iso-codes'
|
||||
|
||||
const DATABASE_URL = process.env.DATABASE_URL
|
||||
if (!DATABASE_URL) {
|
||||
console.error('ERROR: DATABASE_URL environment variable is not set')
|
||||
process.exit(1)
|
||||
}
|
||||
const BASE = 'https://raw.githubusercontent.com/openfootball/worldcup.json/master'
|
||||
|
||||
async function fetchJson(url: string): Promise<unknown> {
|
||||
try {
|
||||
const res = await fetch(url)
|
||||
if (!res.ok) return null
|
||||
return res.json()
|
||||
} catch {
|
||||
return null
|
||||
}
|
||||
}
|
||||
|
||||
type RawGoal = { name: string; minute?: string | number; offset?: number; penalty?: boolean; owngoal?: boolean }
|
||||
type RawScore = { ft?: number[]; ht?: number[]; et?: number[]; p?: number[] } | number[]
|
||||
type RawMatch = {
|
||||
round?: string; date?: string; time?: string;
|
||||
team1: string; team2: string; score?: RawScore;
|
||||
goals1?: RawGoal[]; goals2?: RawGoal[];
|
||||
group?: string; ground?: string;
|
||||
}
|
||||
type RawData = { matches: RawMatch[] }
|
||||
|
||||
function parseScore(score: RawScore | undefined) {
|
||||
if (!score) return {}
|
||||
if (Array.isArray(score)) return { ft: score }
|
||||
return { ft: score.ft, ht: score.ht, et: score.et, p: score.p }
|
||||
}
|
||||
// ── DB helpers ─────────────────────────────────────────────────────────────
|
||||
|
||||
async function run() {
|
||||
const client = postgres(DATABASE_URL!, { max: 2 })
|
||||
@@ -42,17 +18,13 @@ async function run() {
|
||||
|
||||
const teamCache = new Map<string, number>()
|
||||
|
||||
async function upsertTeam(rawName: string, extra?: { iso2?: string | null; fifaCode?: string; continent?: string; confederation?: string }) {
|
||||
async function upsertTeam(rawName: string) {
|
||||
if (teamCache.has(rawName)) return teamCache.get(rawName)!
|
||||
const iso2 = (extra && 'iso2' in extra) ? extra.iso2 : getIso(rawName)
|
||||
const iso2 = getIso(rawName)
|
||||
const [row] = await db.execute(sql`
|
||||
INSERT INTO teams (name, iso2, fifa_code, continent, confederation)
|
||||
VALUES (${rawName}, ${iso2 ?? null}, ${extra?.fifaCode ?? null}, ${extra?.continent ?? null}, ${extra?.confederation ?? null})
|
||||
ON CONFLICT (name) DO UPDATE SET
|
||||
iso2 = COALESCE(EXCLUDED.iso2, teams.iso2),
|
||||
fifa_code = COALESCE(EXCLUDED.fifa_code, teams.fifa_code),
|
||||
continent = COALESCE(EXCLUDED.continent, teams.continent),
|
||||
confederation = COALESCE(EXCLUDED.confederation, teams.confederation)
|
||||
INSERT INTO teams (name, iso2)
|
||||
VALUES (${rawName}, ${iso2 ?? null})
|
||||
ON CONFLICT (name) DO UPDATE SET iso2 = COALESCE(EXCLUDED.iso2, teams.iso2)
|
||||
RETURNING id
|
||||
`)
|
||||
const id = (row as { id: number }).id
|
||||
@@ -62,56 +34,42 @@ async function run() {
|
||||
|
||||
async function upsertMatch(
|
||||
year: number, round: string, group: string | null, dateStr: string | null,
|
||||
timeStr: string | null, team1Id: number, team2Id: number, score: ReturnType<typeof parseScore>,
|
||||
isQuali: boolean
|
||||
timeStr: string | null, team1Id: number, team2Id: number,
|
||||
ft: [number, number] | undefined, et: [number, number] | undefined, p: [number, number] | undefined,
|
||||
isQuali: boolean,
|
||||
) {
|
||||
const rows = await db.execute(sql`
|
||||
INSERT INTO matches (tournament_year, round, group_name, date, time_local, team1_id, team2_id,
|
||||
score_ft_home, score_ft_away, score_ht_home, score_ht_away,
|
||||
score_et_home, score_et_away, score_p_home, score_p_away, is_quali_playoff)
|
||||
score_ft_home, score_ft_away, score_et_home, score_et_away,
|
||||
score_p_home, score_p_away, is_quali_playoff)
|
||||
VALUES (
|
||||
${year}, ${round}, ${group}, ${dateStr ?? null}, ${timeStr ?? null},
|
||||
${team1Id}, ${team2Id},
|
||||
${score.ft?.[0] ?? null}, ${score.ft?.[1] ?? null},
|
||||
${score.ht?.[0] ?? null}, ${score.ht?.[1] ?? null},
|
||||
${score.et?.[0] ?? null}, ${score.et?.[1] ?? null},
|
||||
${score.p?.[0] ?? null}, ${score.p?.[1] ?? null},
|
||||
${year}, ${round}, ${group}, ${dateStr}, ${timeStr}, ${team1Id}, ${team2Id},
|
||||
${ft?.[0] ?? null}, ${ft?.[1] ?? null},
|
||||
${et?.[0] ?? null}, ${et?.[1] ?? null},
|
||||
${p?.[0] ?? null}, ${p?.[1] ?? null},
|
||||
${isQuali}
|
||||
)
|
||||
ON CONFLICT (tournament_year, team1_id, team2_id, date, is_quali_playoff) DO UPDATE SET
|
||||
round = EXCLUDED.round,
|
||||
time_local = COALESCE(EXCLUDED.time_local, matches.time_local),
|
||||
time_local = COALESCE(EXCLUDED.time_local, matches.time_local),
|
||||
score_ft_home = COALESCE(EXCLUDED.score_ft_home, matches.score_ft_home),
|
||||
score_ft_away = COALESCE(EXCLUDED.score_ft_away, matches.score_ft_away),
|
||||
score_ht_home = COALESCE(EXCLUDED.score_ht_home, matches.score_ht_home),
|
||||
score_ht_away = COALESCE(EXCLUDED.score_ht_away, matches.score_ht_away),
|
||||
score_et_home = COALESCE(EXCLUDED.score_et_home, matches.score_et_home),
|
||||
score_et_away = COALESCE(EXCLUDED.score_et_away, matches.score_et_away),
|
||||
score_p_home = COALESCE(EXCLUDED.score_p_home, matches.score_p_home),
|
||||
score_p_away = COALESCE(EXCLUDED.score_p_away, matches.score_p_away)
|
||||
score_p_home = COALESCE(EXCLUDED.score_p_home, matches.score_p_home),
|
||||
score_p_away = COALESCE(EXCLUDED.score_p_away, matches.score_p_away)
|
||||
RETURNING id
|
||||
`)
|
||||
return (rows[0] as { id: number }).id
|
||||
}
|
||||
|
||||
type GoalRow = { teamId: number; name: string; minute: number | null; offset: number; penalty: boolean; owngoal: boolean }
|
||||
|
||||
function collectGoals(teamId: number, rawGoals: RawGoal[], isOwnGoalTeamId: number): GoalRow[] {
|
||||
return rawGoals.flatMap(g => {
|
||||
if (!g.name) return []
|
||||
const minute = g.minute != null ? parseInt(String(g.minute)) : null
|
||||
return [{ teamId: g.owngoal ? isOwnGoalTeamId : teamId, name: g.name,
|
||||
minute: isNaN(minute!) ? null : minute, offset: g.offset ?? 0,
|
||||
penalty: g.penalty ?? false, owngoal: g.owngoal ?? false }]
|
||||
})
|
||||
}
|
||||
|
||||
async function replaceGoals(matchId: number, rows: GoalRow[]) {
|
||||
async function replaceGoals(matchId: number, goals: Array<{
|
||||
teamId: number; name: string; minute: number | null; offset: number; penalty: boolean; owngoal: boolean
|
||||
}>) {
|
||||
await db.transaction(async tx => {
|
||||
await tx.execute(sql`DELETE FROM goals WHERE match_id = ${matchId}`)
|
||||
if (rows.length > 0) {
|
||||
// Single bulk INSERT — readers see old goals until commit, never an empty window
|
||||
const vals = rows.map(g =>
|
||||
if (goals.length > 0) {
|
||||
const vals = goals.map(g =>
|
||||
sql`(${matchId}, ${g.teamId}, ${g.name}, ${g.minute}, ${g.offset}, ${g.penalty}, ${g.owngoal})`
|
||||
)
|
||||
await tx.execute(sql`
|
||||
@@ -122,101 +80,117 @@ async function run() {
|
||||
})
|
||||
}
|
||||
|
||||
console.log('\nSyncing 2026...')
|
||||
// ── Incremental group detection ────────────────────────────────────────────
|
||||
// Groups where every known match already has a FT score — no need to re-fetch their sub-page.
|
||||
|
||||
/**
 * Returns the names of the 2026 groups whose stored matches all have a result.
 *
 * A group is included when every non-playoff row in `matches` for
 * tournament year 2026 with that `group_name` has a non-null `score_ft_home`.
 * Only matches already present in the database are considered.
 *
 * @returns A set of `group_name` values taken from the query result.
 */
async function getCompletedGroups(): Promise<Set<string>> {
  const result = await db.execute(sql`
    SELECT group_name
    FROM matches
    WHERE tournament_year = 2026
      AND group_name IS NOT NULL
      AND is_quali_playoff = false
    GROUP BY group_name
    HAVING COUNT(*) > 0
      AND COUNT(*) = SUM(CASE WHEN score_ft_home IS NOT NULL THEN 1 ELSE 0 END)
  `)

  // Collect one entry per returned row; the query already groups by name.
  const completed = new Set<string>()
  for (const record of result) {
    completed.add((record as { group_name: string }).group_name)
  }
  return completed
}
|
||||
|
||||
// ── Sync 2026 from Wikipedia ───────────────────────────────────────────────
|
||||
|
||||
console.log('\nSyncing 2026 from Wikipedia...')
|
||||
|
||||
// Upsert 2026 tournament row (no winner yet)
|
||||
await db.execute(sql`
|
||||
INSERT INTO tournaments (year, host)
|
||||
VALUES (2026, 'USA / Canada / Mexico')
|
||||
ON CONFLICT (year) DO NOTHING
|
||||
`)
|
||||
|
||||
// Teams enrichment
|
||||
const teamsData = await fetchJson(`${BASE}/2026/worldcup.teams.json`) as Record<string, unknown>[] | null
|
||||
if (teamsData && Array.isArray(teamsData)) {
|
||||
for (const t of teamsData) {
|
||||
const name = (t.name ?? t.name_normalised) as string
|
||||
await upsertTeam(name, {
|
||||
iso2: TEAM_ISO[name] ?? getIso(name),
|
||||
fifaCode: t.fifa_code as string,
|
||||
continent: t.continent as string,
|
||||
confederation: t.confed as string,
|
||||
})
|
||||
}
|
||||
const mainHtml = await fetchWikiHtml('2026_FIFA_World_Cup')
|
||||
if (!mainHtml) {
|
||||
console.error(' FAILED to fetch 2026 Wikipedia page')
|
||||
await client.end()
|
||||
process.exit(1)
|
||||
}
|
||||
|
||||
const completedGroups = await getCompletedGroups()
|
||||
if (completedGroups.size > 0)
|
||||
console.log(` Skipping completed groups: ${[...completedGroups].sort().join(', ')}`)
|
||||
|
||||
process.stdout.write(' ')
|
||||
const { matches, stadiums, meta } = await scrapeYear(2026, mainHtml, { skipGroups: completedGroups })
|
||||
console.log()
|
||||
|
||||
// Stadiums
|
||||
const stadiumsData = await fetchJson(`${BASE}/2026/worldcup.stadiums.json`) as { stadiums?: Record<string, unknown>[] } | null
|
||||
if (stadiumsData?.stadiums) {
|
||||
for (const s of stadiumsData.stadiums) {
|
||||
await db.execute(sql`
|
||||
INSERT INTO stadiums (tournament_year, name, city, country_code, capacity, timezone, coordinates)
|
||||
VALUES (2026, ${s.name as string}, ${s.city as string}, ${(s.cc as string | undefined) ?? null},
|
||||
${(s.capacity as number | undefined) ?? null}, ${(s.timezone as string | undefined) ?? null}, ${(s.coords as string | undefined) ?? null})
|
||||
ON CONFLICT DO NOTHING
|
||||
`)
|
||||
}
|
||||
for (const s of stadiums.values()) {
|
||||
await db.execute(sql`
|
||||
INSERT INTO stadiums (tournament_year, name, city)
|
||||
VALUES (2026, ${s.name}, ${s.city ?? null})
|
||||
ON CONFLICT DO NOTHING
|
||||
`)
|
||||
}
|
||||
|
||||
// Main matches
|
||||
const mainData = await fetchJson(`${BASE}/2026/worldcup.json`) as RawData | null
|
||||
// Matches + goals
|
||||
let matchCount = 0, goalCount = 0
|
||||
if (mainData?.matches) {
|
||||
for (const m of mainData.matches) {
|
||||
const t1Id = await upsertTeam(m.team1)
|
||||
const t2Id = await upsertTeam(m.team2)
|
||||
const score = parseScore(m.score)
|
||||
const matchId = await upsertMatch(2026, m.round ?? 'Unknown', m.group ?? null, m.date ?? null, m.time ?? null, t1Id, t2Id, score, false)
|
||||
if (m.goals1?.length || m.goals2?.length) {
|
||||
const goalRows = [
|
||||
...(m.goals1?.length ? collectGoals(t1Id, m.goals1, t2Id) : []),
|
||||
...(m.goals2?.length ? collectGoals(t2Id, m.goals2, t1Id) : []),
|
||||
]
|
||||
await replaceGoals(matchId, goalRows)
|
||||
}
|
||||
matchCount++
|
||||
goalCount += (m.goals1?.length ?? 0) + (m.goals2?.length ?? 0)
|
||||
}
|
||||
for (const m of matches) {
|
||||
const t1Id = await upsertTeam(m.team1)
|
||||
const t2Id = await upsertTeam(m.team2)
|
||||
const matchId = await upsertMatch(
|
||||
2026, m.round, m.group ?? null, m.date ?? null, m.time ?? null,
|
||||
t1Id, t2Id, m.score?.ft, m.score?.et, m.score?.p, false,
|
||||
)
|
||||
const goals = [
|
||||
...(m.goals1 ?? []).map(g => ({
|
||||
teamId: g.owngoal ? t2Id : t1Id, name: g.name,
|
||||
minute: g.minute ?? null, offset: g.offset ?? 0,
|
||||
penalty: g.penalty ?? false, owngoal: g.owngoal ?? false,
|
||||
})),
|
||||
...(m.goals2 ?? []).map(g => ({
|
||||
teamId: g.owngoal ? t1Id : t2Id, name: g.name,
|
||||
minute: g.minute ?? null, offset: g.offset ?? 0,
|
||||
penalty: g.penalty ?? false, owngoal: g.owngoal ?? false,
|
||||
})),
|
||||
]
|
||||
if (goals.length > 0) await replaceGoals(matchId, goals)
|
||||
matchCount++
|
||||
goalCount += goals.length
|
||||
}
|
||||
|
||||
// Squads
|
||||
const squadsData = await fetchJson(`${BASE}/2026/worldcup.squads.json`) as Record<string, unknown>[] | null
|
||||
if (squadsData && Array.isArray(squadsData)) {
|
||||
for (const sq of squadsData) {
|
||||
const teamId = await upsertTeam(sq.name as string)
|
||||
for (const p of (sq.players as Record<string, unknown>[])) {
|
||||
// Squads (fetch once; idempotent upsert so safe to re-run)
|
||||
const squadHtml = await fetchWikiHtml('2026_FIFA_World_Cup_squads')
|
||||
if (squadHtml) {
|
||||
const squads = scrapeSquads(squadHtml)
|
||||
for (const sq of squads) {
|
||||
const teamId = await upsertTeam(sq.name)
|
||||
for (const p of sq.players) {
|
||||
const dob = p.date_of_birth ? p.date_of_birth.replace(/\s/g, '') : null
|
||||
await db.execute(sql`
|
||||
INSERT INTO squads (tournament_year, team_id, player_name, shirt_number, position, date_of_birth)
|
||||
VALUES (2026, ${teamId}, ${p.name as string}, ${p.number as number ?? null},
|
||||
${p.pos as string ?? null}, ${p.date_of_birth as string ?? null})
|
||||
VALUES (2026, ${teamId}, ${p.name}, ${p.number ?? null}, ${p.pos ?? null}, ${dob})
|
||||
ON CONFLICT (tournament_year, team_id, shirt_number) DO UPDATE SET
|
||||
player_name = EXCLUDED.player_name, position = EXCLUDED.position, date_of_birth = EXCLUDED.date_of_birth
|
||||
player_name = EXCLUDED.player_name,
|
||||
position = EXCLUDED.position,
|
||||
date_of_birth = EXCLUDED.date_of_birth
|
||||
`)
|
||||
}
|
||||
}
|
||||
console.log(' Squads loaded for 2026')
|
||||
console.log(` Squads: ${squads.length} teams`)
|
||||
}
|
||||
|
||||
// Quali playoffs
|
||||
const qualiData = await fetchJson(`${BASE}/2026/worldcup.quali_playoffs.json`) as RawData | null
|
||||
if (qualiData?.matches) {
|
||||
for (const m of qualiData.matches) {
|
||||
const t1Id = await upsertTeam(m.team1)
|
||||
const t2Id = await upsertTeam(m.team2)
|
||||
const score = parseScore(m.score)
|
||||
const matchId = await upsertMatch(2026, m.round ?? 'Qualifier', null, m.date ?? null, m.time ?? null, t1Id, t2Id, score, true)
|
||||
if (m.goals1?.length || m.goals2?.length) {
|
||||
const goalRows = [
|
||||
...(m.goals1?.length ? collectGoals(t1Id, m.goals1, t2Id) : []),
|
||||
...(m.goals2?.length ? collectGoals(t2Id, m.goals2, t1Id) : []),
|
||||
]
|
||||
await replaceGoals(matchId, goalRows)
|
||||
}
|
||||
}
|
||||
console.log(` Quali playoffs: ${qualiData.matches.length} matches`)
|
||||
// Tournament winner (once the final is played)
|
||||
if (meta.winner) {
|
||||
await db.execute(sql`
|
||||
UPDATE tournaments SET
|
||||
winner = ${meta.winner},
|
||||
runner_up = ${meta.runner_up},
|
||||
third_place = ${meta.third_place},
|
||||
fourth_place = ${meta.fourth_place}
|
||||
WHERE year = 2026
|
||||
`)
|
||||
}
|
||||
|
||||
// Group standings from match results
|
||||
// Group standings
|
||||
await db.execute(sql`
|
||||
WITH match_results AS (
|
||||
SELECT tournament_year, group_name, team1_id AS team_id, score_ft_home AS gf, score_ft_away AS ga
|
||||
@@ -244,8 +218,8 @@ async function run() {
|
||||
// Tournament aggregates
|
||||
await db.execute(sql`
|
||||
UPDATE tournaments SET
|
||||
matches_count = (SELECT COUNT(*)::int FROM matches WHERE tournament_year = 2026 AND is_quali_playoff = false),
|
||||
total_goals = (SELECT COALESCE(SUM(score_ft_home + score_ft_away), 0)::int FROM matches WHERE tournament_year = 2026 AND is_quali_playoff = false AND score_ft_home IS NOT NULL),
|
||||
matches_count = (SELECT COUNT(*)::int FROM matches WHERE tournament_year = 2026 AND is_quali_playoff = false),
|
||||
total_goals = (SELECT COALESCE(SUM(score_ft_home + score_ft_away), 0)::int FROM matches WHERE tournament_year = 2026 AND is_quali_playoff = false AND score_ft_home IS NOT NULL),
|
||||
avg_goals_per_game = (
|
||||
SELECT ROUND(COALESCE(SUM(score_ft_home + score_ft_away), 0)::numeric / NULLIF(COUNT(*), 0), 2)
|
||||
FROM matches WHERE tournament_year = 2026 AND is_quali_playoff = false AND score_ft_home IS NOT NULL
|
||||
|
||||
Reference in New Issue
Block a user