f885e4312c
Move all scraping logic (fetchWikiHtml, scrapeYear, scrapeSquads and all helpers) into lib/wiki-scraper.ts as exported functions shared by both scripts.

scrape-wikipedia.ts becomes a composable CLI:
  pnpm scrape [year]            — matches + squads (default)
  pnpm scrape [year] --matches  — matches/meta/stadiums only
  pnpm scrape [year] --squads   — squads only

sync.ts drops the openfootball GitHub dependency entirely and scrapes Wikipedia directly. Incremental: completed groups (all matches have FT scores) are detected via DB query and their sub-pages are skipped each run.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
236 lines
9.6 KiB
TypeScript
236 lines
9.6 KiB
TypeScript
import postgres from 'postgres'
|
|
import { drizzle } from 'drizzle-orm/postgres-js'
|
|
import { sql } from 'drizzle-orm'
|
|
import { fetchWikiHtml, scrapeYear, scrapeSquads } from '../lib/wiki-scraper'
|
|
import { getIso } from '../lib/iso-codes'
|
|
|
|
// Postgres connection string, read once at module load. The script cannot do
// anything without it, so a missing or empty value aborts with exit code 1.
const { DATABASE_URL } = process.env

if (DATABASE_URL === undefined || DATABASE_URL.length === 0) {
  console.error('ERROR: DATABASE_URL environment variable is not set')
  process.exit(1)
}
|
|
|
|
// ── DB helpers ─────────────────────────────────────────────────────────────
|
|
|
|
/**
 * Syncs the 2026 FIFA World Cup from Wikipedia into Postgres.
 *
 * In order: ensures the tournament row exists, scrapes the main page (skipping
 * groups whose stored matches all have a full-time score), upserts stadiums,
 * teams, matches, goals and squads, stores the final placements once the
 * scraper reports a winner, then recomputes group standings and tournament
 * aggregates from the `matches` table.
 *
 * The database client is closed on every path, including when a step throws.
 * If the main page cannot be fetched, the failure is logged and the process
 * exit code is set to 1.
 */
async function run(): Promise<void> {
  // Single source of truth for the tournament being synced.
  const YEAR = 2026
  const HOST = 'USA / Canada / Mexico'

  // DATABASE_URL is validated at module load; `!` only satisfies the type checker here.
  const client = postgres(DATABASE_URL!, { max: 2 })
  const db = drizzle(client)

  // Raw team name -> teams.id, so each team is upserted at most once per run.
  const teamCache = new Map<string, number>()

  /** Reads `id` from the first row of a `... RETURNING id` result, failing loudly if absent. */
  function returnedId(rows: ArrayLike<unknown>, context: string): number {
    const first = rows[0] as { id: number } | undefined
    if (first === undefined) throw new Error(`${context}: statement returned no row`)
    return first.id
  }

  /**
   * Inserts the team, or refreshes its ISO code when a new one is known, and
   * returns its id. Results are memoised by raw name for the rest of the run.
   */
  async function upsertTeam(rawName: string): Promise<number> {
    const cached = teamCache.get(rawName)
    if (cached !== undefined) return cached

    const iso2 = getIso(rawName)
    const rows = await db.execute(sql`
      INSERT INTO teams (name, iso2)
      VALUES (${rawName}, ${iso2 ?? null})
      ON CONFLICT (name) DO UPDATE SET iso2 = COALESCE(EXCLUDED.iso2, teams.iso2)
      RETURNING id
    `)
    const id = returnedId(rows, `upsert team "${rawName}"`)
    teamCache.set(rawName, id)
    return id
  }

  /**
   * Inserts a match or updates the existing one and returns its id.
   *
   * On conflict the round is overwritten, while time and scores are only
   * filled in (COALESCE), so a scrape that lacks a value never erases a stored one.
   *
   * NOTE(review): the conflict target includes `date`. If that column is
   * nullable and the index treats NULLs as distinct, matches scraped without a
   * date would be inserted again on every run — confirm against the schema.
   */
  async function upsertMatch(
    year: number, round: string, group: string | null, dateStr: string | null,
    timeStr: string | null, team1Id: number, team2Id: number,
    ft: [number, number] | undefined, et: [number, number] | undefined, p: [number, number] | undefined,
    isQuali: boolean,
  ): Promise<number> {
    const rows = await db.execute(sql`
      INSERT INTO matches (tournament_year, round, group_name, date, time_local, team1_id, team2_id,
        score_ft_home, score_ft_away, score_et_home, score_et_away,
        score_p_home, score_p_away, is_quali_playoff)
      VALUES (
        ${year}, ${round}, ${group}, ${dateStr}, ${timeStr}, ${team1Id}, ${team2Id},
        ${ft?.[0] ?? null}, ${ft?.[1] ?? null},
        ${et?.[0] ?? null}, ${et?.[1] ?? null},
        ${p?.[0] ?? null}, ${p?.[1] ?? null},
        ${isQuali}
      )
      ON CONFLICT (tournament_year, team1_id, team2_id, date, is_quali_playoff) DO UPDATE SET
        round = EXCLUDED.round,
        time_local = COALESCE(EXCLUDED.time_local, matches.time_local),
        score_ft_home = COALESCE(EXCLUDED.score_ft_home, matches.score_ft_home),
        score_ft_away = COALESCE(EXCLUDED.score_ft_away, matches.score_ft_away),
        score_et_home = COALESCE(EXCLUDED.score_et_home, matches.score_et_home),
        score_et_away = COALESCE(EXCLUDED.score_et_away, matches.score_et_away),
        score_p_home = COALESCE(EXCLUDED.score_p_home, matches.score_p_home),
        score_p_away = COALESCE(EXCLUDED.score_p_away, matches.score_p_away)
      RETURNING id
    `)
    return returnedId(rows, `upsert match ${team1Id} v ${team2Id}`)
  }

  /**
   * Replaces every stored goal of a match with the given list, inside one
   * transaction so readers never see the match with its goals half-written.
   */
  async function replaceGoals(matchId: number, goals: Array<{
    teamId: number; name: string; minute: number | null; offset: number; penalty: boolean; owngoal: boolean
  }>): Promise<void> {
    await db.transaction(async tx => {
      await tx.execute(sql`DELETE FROM goals WHERE match_id = ${matchId}`)
      if (goals.length === 0) return

      const vals = goals.map(g =>
        sql`(${matchId}, ${g.teamId}, ${g.name}, ${g.minute}, ${g.offset}, ${g.penalty}, ${g.owngoal})`
      )
      await tx.execute(sql`
        INSERT INTO goals (match_id, team_id, player_name, minute, minute_offset, is_penalty, is_own_goal)
        VALUES ${sql.join(vals, sql`, `)}
      `)
    })
  }

  // ── Incremental group detection ────────────────────────────────────────────

  /**
   * Returns the names of groups in which every stored (non-playoff) match
   * already has a full-time score; their sub-pages need not be fetched again.
   */
  async function getCompletedGroups(): Promise<Set<string>> {
    const rows = await db.execute(sql`
      SELECT group_name
      FROM matches
      WHERE tournament_year = ${YEAR}
        AND group_name IS NOT NULL
        AND is_quali_playoff = false
      GROUP BY group_name
      HAVING COUNT(*) > 0
        AND COUNT(*) = SUM(CASE WHEN score_ft_home IS NOT NULL THEN 1 ELSE 0 END)
    `)
    return new Set(rows.map(r => (r as { group_name: string }).group_name))
  }

  // ── Sync from Wikipedia ────────────────────────────────────────────────────

  try {
    console.log(`\nSyncing ${YEAR} from Wikipedia...`)

    await db.execute(sql`
      INSERT INTO tournaments (year, host)
      VALUES (${YEAR}, ${HOST})
      ON CONFLICT (year) DO NOTHING
    `)

    const mainHtml = await fetchWikiHtml(`${YEAR}_FIFA_World_Cup`)
    if (!mainHtml) {
      console.error(` FAILED to fetch ${YEAR} Wikipedia page`)
      // Signal failure via the exit code and return, so the `finally` below
      // still closes the connection before the process ends.
      process.exitCode = 1
      return
    }

    const completedGroups = await getCompletedGroups()
    if (completedGroups.size > 0) {
      console.log(` Skipping completed groups: ${[...completedGroups].sort().join(', ')}`)
    }

    // Written before scraping and terminated by the bare console.log() after it.
    process.stdout.write(' ')
    const { matches, stadiums, meta } = await scrapeYear(YEAR, mainHtml, { skipGroups: completedGroups })
    console.log()

    // Stadiums
    for (const s of stadiums.values()) {
      await db.execute(sql`
        INSERT INTO stadiums (tournament_year, name, city)
        VALUES (${YEAR}, ${s.name}, ${s.city ?? null})
        ON CONFLICT DO NOTHING
      `)
    }

    // Matches + goals
    let matchCount = 0
    let goalCount = 0
    for (const m of matches) {
      const t1Id = await upsertTeam(m.team1)
      const t2Id = await upsertTeam(m.team2)
      const matchId = await upsertMatch(
        YEAR, m.round, m.group ?? null, m.date ?? null, m.time ?? null,
        t1Id, t2Id, m.score?.ft, m.score?.et, m.score?.p, false,
      )

      // A goal is stored under the team of the player who scored it, so an
      // own goal listed for one side is attributed to the opposing team.
      const goals = [
        ...(m.goals1 ?? []).map(g => ({
          teamId: g.owngoal ? t2Id : t1Id, name: g.name,
          minute: g.minute ?? null, offset: g.offset ?? 0,
          penalty: g.penalty ?? false, owngoal: g.owngoal ?? false,
        })),
        ...(m.goals2 ?? []).map(g => ({
          teamId: g.owngoal ? t1Id : t2Id, name: g.name,
          minute: g.minute ?? null, offset: g.offset ?? 0,
          penalty: g.penalty ?? false, owngoal: g.owngoal ?? false,
        })),
      ]

      // Only replace when the scrape produced goals, so an empty scrape never
      // wipes goals that are already stored.
      if (goals.length > 0) await replaceGoals(matchId, goals)
      matchCount++
      goalCount += goals.length
    }

    // Squads (fetched once per run; skipped silently when the page is unavailable).
    // NOTE(review): the upsert is keyed on shirt_number. If that column is
    // nullable, players without a number may be re-inserted on each run —
    // confirm against the schema.
    const squadHtml = await fetchWikiHtml(`${YEAR}_FIFA_World_Cup_squads`)
    if (squadHtml) {
      const squads = scrapeSquads(squadHtml)
      for (const sq of squads) {
        const teamId = await upsertTeam(sq.name)
        for (const player of sq.players) {
          // Strip all whitespace from the scraped date of birth before storing it.
          const dob = player.date_of_birth ? player.date_of_birth.replace(/\s/g, '') : null
          await db.execute(sql`
            INSERT INTO squads (tournament_year, team_id, player_name, shirt_number, position, date_of_birth)
            VALUES (${YEAR}, ${teamId}, ${player.name}, ${player.number ?? null}, ${player.pos ?? null}, ${dob})
            ON CONFLICT (tournament_year, team_id, shirt_number) DO UPDATE SET
              player_name = EXCLUDED.player_name,
              position = EXCLUDED.position,
              date_of_birth = EXCLUDED.date_of_birth
          `)
        }
      }
      console.log(` Squads: ${squads.length} teams`)
    }

    // Tournament winner (once the final is played).
    // The other placements are coalesced to null like every other optional
    // value here, because an undefined interpolation does not bind as SQL NULL.
    if (meta.winner) {
      await db.execute(sql`
        UPDATE tournaments SET
          winner = ${meta.winner},
          runner_up = ${meta.runner_up ?? null},
          third_place = ${meta.third_place ?? null},
          fourth_place = ${meta.fourth_place ?? null}
        WHERE year = ${YEAR}
      `)
    }

    // Group standings: each scored group match contributes one row per team
    // (home and away perspective), then the rows are aggregated per team.
    await db.execute(sql`
      WITH match_results AS (
        SELECT tournament_year, group_name, team1_id AS team_id, score_ft_home AS gf, score_ft_away AS ga
        FROM matches WHERE tournament_year = ${YEAR} AND group_name IS NOT NULL AND is_quali_playoff = false AND score_ft_home IS NOT NULL
        UNION ALL
        SELECT tournament_year, group_name, team2_id, score_ft_away, score_ft_home
        FROM matches WHERE tournament_year = ${YEAR} AND group_name IS NOT NULL AND is_quali_playoff = false AND score_ft_home IS NOT NULL
      )
      INSERT INTO group_standings (tournament_year, group_name, team_id, played, won, drawn, lost, goals_for, goals_against, goal_diff, pts)
      SELECT tournament_year, group_name, team_id,
        COUNT(*)::int,
        SUM(CASE WHEN gf > ga THEN 1 ELSE 0 END)::int,
        SUM(CASE WHEN gf = ga THEN 1 ELSE 0 END)::int,
        SUM(CASE WHEN gf < ga THEN 1 ELSE 0 END)::int,
        SUM(gf)::int, SUM(ga)::int, SUM(gf - ga)::int,
        SUM(CASE WHEN gf > ga THEN 3 WHEN gf = ga THEN 1 ELSE 0 END)::int
      FROM match_results
      GROUP BY tournament_year, group_name, team_id
      ON CONFLICT (tournament_year, group_name, team_id) DO UPDATE SET
        played = EXCLUDED.played, won = EXCLUDED.won, drawn = EXCLUDED.drawn,
        lost = EXCLUDED.lost, goals_for = EXCLUDED.goals_for, goals_against = EXCLUDED.goals_against,
        goal_diff = EXCLUDED.goal_diff, pts = EXCLUDED.pts
    `)

    // Tournament aggregates
    await db.execute(sql`
      UPDATE tournaments SET
        matches_count = (SELECT COUNT(*)::int FROM matches WHERE tournament_year = ${YEAR} AND is_quali_playoff = false),
        total_goals = (SELECT COALESCE(SUM(score_ft_home + score_ft_away), 0)::int FROM matches WHERE tournament_year = ${YEAR} AND is_quali_playoff = false AND score_ft_home IS NOT NULL),
        avg_goals_per_game = (
          SELECT ROUND(COALESCE(SUM(score_ft_home + score_ft_away), 0)::numeric / NULLIF(COUNT(*), 0), 2)
          FROM matches WHERE tournament_year = ${YEAR} AND is_quali_playoff = false AND score_ft_home IS NOT NULL
        )
      WHERE year = ${YEAR}
    `)

    console.log(` ✓ ${matchCount} matches, ${goalCount} goals`)
    console.log('\n✅ Sync complete!')
  } finally {
    // Close the connection on success, early return and thrown errors alike.
    await client.end()
  }
}
|
|
|
|
// Entry point: any rejection from run() is logged and mapped to exit code 1.
run().catch((err: unknown) => {
  console.error(err)
  process.exit(1)
})
|