feat: replace Kaggle CSV with Wikipedia scraper for historical match data

Add scripts/scrape-wikipedia.ts that fetches all 22 World Cups (1930–2022)
from English Wikipedia via MediaWiki API, handles group sub-pages, AET/penalty
detection, and goal parsing, writing openfootball-format JSON to app/data/openfootball/.

Rewrite scripts/seed.ts to read these local JSON files instead of the Kaggle
CSV, producing 965 matches and 2716 goals with per-group assignments for all
historical tournaments (enabling group standings on tournament pages).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-06-15 11:39:53 +02:00
parent 83b1ad3e35
commit 5dcd22ad22
88 changed files with 95625 additions and 127 deletions
+193 -127
View File
@@ -1,16 +1,23 @@
import postgres from 'postgres'
import { drizzle } from 'drizzle-orm/postgres-js'
import { sql } from 'drizzle-orm'
import { readFileSync } from 'fs'
import { readFileSync, existsSync } from 'fs'
import path from 'path'
import { fileURLToPath } from 'url'
import { getIso } from '../lib/iso-codes'
const DATABASE_URL = process.env.DATABASE_URL ?? 'postgres://wc:wc@localhost:5432/worldcup'
const __dirname = path.dirname(fileURLToPath(import.meta.url))
const DATA_DIR = path.join(__dirname, '../app/data/kaggle')
const DATA_DIR = path.join(__dirname, '../app/data')
const KAGGLE_DIR = path.join(DATA_DIR, 'kaggle')
const WC_DIR = path.join(DATA_DIR, 'openfootball')
// Third/fourth place not present in Kaggle world_cup.csv
// Tournament years to seed: every fourth year from 1930 through 2022,
// skipping 1942 and 1946 (22 editions in total).
const YEARS = Array.from({ length: 24 }, (_, i) => 1930 + 4 * i)
  .filter(year => year !== 1942 && year !== 1946)
// Third/fourth place not reliably in source data for older years
const PLACEMENTS: Record<number, { third?: string; fourth?: string }> = {
1930: { third: 'USA', fourth: 'Yugoslavia' },
1934: { third: 'Germany', fourth: 'Austria' },
@@ -35,7 +42,7 @@ const PLACEMENTS: Record<number, { third?: string; fourth?: string }> = {
2022: { third: 'Croatia', fourth: 'Morocco' },
}
// Normalize Kaggle team names to match openfootball / our canonical names
// Normalize team names from Wikipedia to canonical DB names
const TEAM_ALIASES: Record<string, string> = {
'West Germany': 'Germany',
'Korea Republic': 'South Korea',
@@ -46,7 +53,7 @@ function normTeam(name: string): string {
return TEAM_ALIASES[name] ?? name
}
// Minimal RFC-4180 CSV parser — no external dependency needed
// Minimal RFC-4180 CSV parser
function parseCsv(content: string): Record<string, string>[] {
const rows: string[][] = []
let row: string[] = []
@@ -78,38 +85,36 @@ function parseCsv(content: string): Record<string, string>[] {
.map(r => Object.fromEntries(headers.map((h, i) => [h.trim(), (r[i] ?? '').trim()])))
}
type GoalEntry = { name: string; minute: number | null; offset: number; isPenalty: boolean; isOwnGoal: boolean }

/**
 * Parse a single goal entry such as "Player Name · 57" or "Player (OG) · 90+3".
 *
 * The text before the last '·' is the scorer (a trailing "(P)" / "(OG)" marker
 * is stripped); the text after it is the minute, optionally followed by
 * "+<stoppage offset>".
 *
 * @param entry      Raw goal text.
 * @param isPenalty  Flag copied verbatim onto the result.
 * @param isOwnGoal  Flag copied verbatim onto the result.
 * @returns The parsed goal, or null when there is no '·' separator or the
 *          scorer name is empty. `minute` is null whenever the minute part is
 *          not numeric — including in the "+offset" form, which previously
 *          leaked NaN.
 */
function parseGoalStr(entry: string, isPenalty = false, isOwnGoal = false): GoalEntry | null {
  const dot = entry.lastIndexOf('·')
  if (dot === -1) return null

  const name = entry.slice(0, dot).trim()
    .replace(/\s*\(P\)\s*$/, '').replace(/\s*\(OG\)\s*$/, '').trim()
  if (!name) return null

  const minRaw = entry.slice(dot + 1).trim()
  const plusIdx = minRaw.indexOf('+')
  const hasOffset = plusIdx !== -1

  // Base minute is everything before the '+', or the whole string without one.
  const parsedMinute = parseInt(hasOffset ? minRaw.slice(0, plusIdx) : minRaw, 10)
  const minute = Number.isNaN(parsedMinute) ? null : parsedMinute

  // A missing or non-numeric offset ("90+") counts as 0.
  const offset = hasOffset ? (parseInt(minRaw.slice(plusIdx + 1), 10) || 0) : 0

  return { name, minute, offset, isPenalty, isOwnGoal }
}
/**
 * Read and parse a JSON file, treating the file as optional.
 *
 * @param filePath  Path of the JSON file.
 * @returns The parsed content cast to `T` (the shape is not validated), or
 *          null when the file does not exist, cannot be read, or is not valid
 *          JSON. Read/parse failures are logged with the path so that a
 *          corrupt file is distinguishable from a missing one.
 */
function readJson<T>(filePath: string): T | null {
  if (!existsSync(filePath)) return null
  try {
    return JSON.parse(readFileSync(filePath, 'utf-8')) as T
  } catch (err: unknown) {
    const reason = err instanceof Error ? err.message : String(err)
    console.warn(`readJson: could not read or parse ${filePath}: ${reason}`)
    return null
  }
}
/**
 * Parse a '|'-separated column of goal entries.
 *
 * Each piece is trimmed and handed to parseGoalStr with the given flags;
 * pieces that parseGoalStr rejects (null) are dropped.
 *
 * @returns The parsed goals in input order; an empty array for a missing or
 *          blank column.
 */
function parseGoalCol(col: string, isPenalty = false, isOwnGoal = false): GoalEntry[] {
  if (!col?.trim()) return []

  const entries: GoalEntry[] = []
  for (const piece of col.split('|')) {
    const parsed = parseGoalStr(piece.trim(), isPenalty, isOwnGoal)
    if (parsed) entries.push(parsed)
  }
  return entries
}
// ── Types matching scrape-wikipedia.ts output ──────────────────────────────

/** One scorer entry from a match's `goals1` / `goals2` list. */
type RawGoal = {
  name: string
  /** Minute of the goal; may arrive as a number or as a string. */
  minute?: string | number
  offset?: number
  penalty?: boolean
  owngoal?: boolean
}

/** Score pairs per phase: full time, half time, extra time, penalties. */
type RawScore = {
  ft?: number[]
  ht?: number[]
  et?: number[]
  p?: number[]
}

/** One match record; only the two team names are mandatory. */
type RawMatch = {
  round?: string
  date?: string
  time?: string
  team1: string
  team2: string
  score?: RawScore
  goals1?: RawGoal[]
  goals2?: RawGoal[]
  group?: string
  ground?: string
}

/** Top-level shape of a per-year worldcup.json file. */
type RawData = { matches: RawMatch[] }

/** Top-level shape of a per-year worldcup.stadiums.json file. */
type RawStadiums = {
  stadiums: {
    name: string
    city: string
    cc?: string
    capacity?: number
    timezone?: string
    coords?: string
  }[]
}

/** One team's squad from a per-year worldcup.squads.json file. */
type RawSquad = {
  name: string
  players: {
    name: string
    number?: number
    pos?: string
    date_of_birth?: string
  }[]
}

/**
 * Normalise a raw score value into an object with optional
 * `ft` / `ht` / `et` / `p` pairs.
 *
 * - a falsy score yields an empty object;
 * - a bare array is taken to be the full-time pair;
 * - otherwise the four known fields are copied (absent ones stay undefined).
 */
function parseScore(score: RawScore | undefined) {
  if (score && !Array.isArray(score)) {
    const { ft, ht, et, p } = score
    return { ft, ht, et, p }
  }
  return score ? { ft: score as number[] } : {}
}
async function run() {
const client = postgres(DATABASE_URL, { max: 5 })
const db = drizzle(client)
// Create tables (mirrors sync.ts DDL — runs first on a fresh DB)
// Create tables
await db.execute(sql`
CREATE TABLE IF NOT EXISTS tournaments (
year INTEGER PRIMARY KEY,
@@ -203,7 +208,6 @@ async function run() {
const force = process.argv.includes('--force') || process.argv.includes('-f')
// Skip if already seeded (idempotency check)
if (!force) {
const existing = await db.execute(sql`SELECT COUNT(*)::int AS cnt FROM tournaments WHERE year < 2026`)
if ((existing[0] as { cnt: number }).cnt > 0) {
@@ -216,20 +220,24 @@ async function run() {
if (force) {
console.log('--force: clearing historical data...')
await db.execute(sql`DELETE FROM goals WHERE match_id IN (SELECT id FROM matches WHERE tournament_year < 2026)`)
await db.execute(sql`DELETE FROM squads WHERE tournament_year < 2026`)
await db.execute(sql`DELETE FROM group_standings WHERE tournament_year < 2026`)
await db.execute(sql`DELETE FROM stadiums WHERE tournament_year < 2026`)
await db.execute(sql`DELETE FROM matches WHERE tournament_year < 2026`)
await db.execute(sql`DELETE FROM tournaments WHERE year < 2026`)
}
console.log('Seeding from Kaggle data (19302022)...')
console.log('Seeding historical data (19302022)...')
const teamCache = new Map<string, number>()
async function upsertTeam(rawName: string): Promise<number> {
const name = normTeam(rawName)
if (teamCache.has(name)) return teamCache.get(name)!
const iso2 = getIso(name)
const [row] = await db.execute(sql`
INSERT INTO teams (name, iso2)
VALUES (${name}, ${getIso(name) ?? null})
VALUES (${name}, ${iso2 ?? null})
ON CONFLICT (name) DO UPDATE SET name = EXCLUDED.name
RETURNING id
`)
@@ -238,8 +246,8 @@ async function run() {
return id
}
// 1. Tournaments from world_cup.csv
const wcRows = parseCsv(readFileSync(path.join(DATA_DIR, 'world_cup.csv'), 'utf-8'))
// 1. Tournaments from world_cup.csv (host, winner, runner_up)
const wcRows = parseCsv(readFileSync(path.join(KAGGLE_DIR, 'world_cup.csv'), 'utf-8'))
for (const r of wcRows) {
const year = parseInt(r['Year'])
if (isNaN(year)) continue
@@ -247,135 +255,193 @@ async function run() {
const runnerUp = normTeam(r['Runner-Up'] || '')
const p = PLACEMENTS[year] ?? {}
await db.execute(sql`
INSERT INTO tournaments (year, host, winner, runner_up, third_place, fourth_place, teams_count, matches_count)
INSERT INTO tournaments (year, host, winner, runner_up, third_place, fourth_place, teams_count)
VALUES (
${year}, ${r['Host']},
${winner || null}, ${runnerUp || null},
${p.third ?? null}, ${p.fourth ?? null},
${parseInt(r['Teams']) || null}, ${parseInt(r['Matches']) || null}
${parseInt(r['Teams']) || null}
)
ON CONFLICT (year) DO UPDATE SET
host = EXCLUDED.host,
winner = EXCLUDED.winner,
runner_up = EXCLUDED.runner_up,
host = EXCLUDED.host,
winner = EXCLUDED.winner,
runner_up = EXCLUDED.runner_up,
third_place = EXCLUDED.third_place,
fourth_place = EXCLUDED.fourth_place,
teams_count = EXCLUDED.teams_count,
matches_count = EXCLUDED.matches_count
teams_count = EXCLUDED.teams_count
`)
}
// 2. Matches + goals from matches_1930_2022.csv
const matchRows = parseCsv(readFileSync(path.join(DATA_DIR, 'matches_1930_2022.csv'), 'utf-8'))
// 2. Per-year match/stadium/squad data from openfootball JSON files
let totalMatches = 0
let totalGoals = 0
let totalMatches = 0, totalGoals = 0
for (const r of matchRows) {
const year = parseInt(r['Year'])
if (isNaN(year)) continue
const t1Id = await upsertTeam(r['home_team'])
const t2Id = await upsertTeam(r['away_team'])
const homeScore = r['home_score'] !== '' ? parseInt(r['home_score']) : null
const awayScore = r['away_score'] !== '' ? parseInt(r['away_score']) : null
const homePen = r['home_penalty'] !== '' ? parseInt(r['home_penalty']) : null
const awayPen = r['away_penalty'] !== '' ? parseInt(r['away_penalty']) : null
const dateStr = r['Date'] || null
// Parse all goal columns
const homeGoals = parseGoalCol(r['home_goal'])
const awayGoals = parseGoalCol(r['away_goal'])
const homePenGoals = parseGoalCol(r['home_penalty_goal'], true)
const awayPenGoals = parseGoalCol(r['away_penalty_goal'], true)
// home_own_goal = home player scored OG → goal credited to AWAY team
const homeOgGoals = parseGoalCol(r['home_own_goal'], false, true)
// away_own_goal = away player scored OG → goal credited to HOME team
const awayOgGoals = parseGoalCol(r['away_own_goal'], false, true)
// Determine FT vs ET score split from goal minutes
const allGoals = [...homeGoals, ...awayGoals, ...homePenGoals, ...awayPenGoals]
const hasEt = allGoals.some(g => g.minute !== null && g.minute > 90)
let scoreFtHome: number | null, scoreFtAway: number | null
let scoreEtHome: number | null = null, scoreEtAway: number | null = null
if (hasEt) {
// Compute FT from goals in minutes 1–90
const ftGoalCount = (goals: GoalEntry[]) =>
goals.filter(g => g.minute === null || g.minute <= 90).length
scoreFtHome = ftGoalCount(homeGoals) + ftGoalCount(homePenGoals) + ftGoalCount(awayOgGoals)
scoreFtAway = ftGoalCount(awayGoals) + ftGoalCount(awayPenGoals) + ftGoalCount(homeOgGoals)
scoreEtHome = homeScore
scoreEtAway = awayScore
} else {
scoreFtHome = homeScore
scoreFtAway = awayScore
for (const year of YEARS) {
const yearDir = path.join(WC_DIR, String(year))
const mainData = readJson<RawData>(path.join(yearDir, 'worldcup.json'))
if (!mainData?.matches) {
console.log(` ${year}: no data file, skipping`)
continue
}
const [matchRow] = await db.execute(sql`
INSERT INTO matches (
tournament_year, round, date, team1_id, team2_id,
score_ft_home, score_ft_away, score_et_home, score_et_away,
score_p_home, score_p_away, is_quali_playoff
) VALUES (
${year}, ${r['Round'] || 'Unknown'}, ${dateStr},
${t1Id}, ${t2Id},
${scoreFtHome}, ${scoreFtAway}, ${scoreEtHome}, ${scoreEtAway},
${homePen}, ${awayPen}, false
)
ON CONFLICT (tournament_year, team1_id, team2_id, date, is_quali_playoff) DO UPDATE SET
round = EXCLUDED.round,
score_ft_home = EXCLUDED.score_ft_home,
score_ft_away = EXCLUDED.score_ft_away,
score_et_home = EXCLUDED.score_et_home,
score_et_away = EXCLUDED.score_et_away,
score_p_home = EXCLUDED.score_p_home,
score_p_away = EXCLUDED.score_p_away
RETURNING id
`)
const matchId = (matchRow as { id: number }).id
let matchCount = 0, goalCount = 0
await db.execute(sql`DELETE FROM goals WHERE match_id = ${matchId}`)
// Stadiums
const stadiumsData = readJson<RawStadiums>(path.join(yearDir, 'worldcup.stadiums.json'))
if (stadiumsData?.stadiums) {
for (const s of stadiumsData.stadiums) {
await db.execute(sql`
INSERT INTO stadiums (tournament_year, name, city)
VALUES (${year}, ${s.name}, ${s.city ?? null})
ON CONFLICT DO NOTHING
`)
}
}
// home team goals (+ away player own goals that benefit home)
for (const g of [...homeGoals, ...homePenGoals, ...awayOgGoals]) {
await db.execute(sql`
INSERT INTO goals (match_id, team_id, player_name, minute, minute_offset, is_penalty, is_own_goal)
VALUES (${matchId}, ${t1Id}, ${g.name}, ${g.minute}, ${g.offset}, ${g.isPenalty}, ${g.isOwnGoal})
// Matches and goals
for (const m of mainData.matches) {
const t1Id = await upsertTeam(m.team1)
const t2Id = await upsertTeam(m.team2)
const score = parseScore(m.score)
const [matchRow] = await db.execute(sql`
INSERT INTO matches (
tournament_year, round, group_name, date, time_local,
team1_id, team2_id,
score_ft_home, score_ft_away,
score_ht_home, score_ht_away,
score_et_home, score_et_away,
score_p_home, score_p_away,
is_quali_playoff
) VALUES (
${year}, ${m.round ?? 'Unknown'}, ${m.group ?? null},
${m.date ?? null}, ${m.time ?? null},
${t1Id}, ${t2Id},
${score.ft?.[0] ?? null}, ${score.ft?.[1] ?? null},
${score.ht?.[0] ?? null}, ${score.ht?.[1] ?? null},
${score.et?.[0] ?? null}, ${score.et?.[1] ?? null},
${score.p?.[0] ?? null}, ${score.p?.[1] ?? null},
false
)
ON CONFLICT (tournament_year, team1_id, team2_id, date, is_quali_playoff) DO UPDATE SET
round = EXCLUDED.round,
group_name = COALESCE(EXCLUDED.group_name, matches.group_name),
time_local = COALESCE(EXCLUDED.time_local, matches.time_local),
score_ft_home = COALESCE(EXCLUDED.score_ft_home, matches.score_ft_home),
score_ft_away = COALESCE(EXCLUDED.score_ft_away, matches.score_ft_away),
score_ht_home = COALESCE(EXCLUDED.score_ht_home, matches.score_ht_home),
score_ht_away = COALESCE(EXCLUDED.score_ht_away, matches.score_ht_away),
score_et_home = COALESCE(EXCLUDED.score_et_home, matches.score_et_home),
score_et_away = COALESCE(EXCLUDED.score_et_away, matches.score_et_away),
score_p_home = COALESCE(EXCLUDED.score_p_home, matches.score_p_home),
score_p_away = COALESCE(EXCLUDED.score_p_away, matches.score_p_away)
RETURNING id
`)
totalGoals++
const matchId = (matchRow as { id: number }).id
// Goals (delete + re-insert)
await db.execute(sql`DELETE FROM goals WHERE match_id = ${matchId}`)
for (const [rawGoals, teamId, ogTeamId] of [
[m.goals1 ?? [], t1Id, t2Id],
[m.goals2 ?? [], t2Id, t1Id],
] as [RawGoal[], number, number][]) {
for (const g of rawGoals) {
if (!g.name) continue
const minute = g.minute != null ? parseInt(String(g.minute)) : null
const actualTeamId = g.owngoal ? ogTeamId : teamId
await db.execute(sql`
INSERT INTO goals (match_id, team_id, player_name, minute, minute_offset, is_penalty, is_own_goal)
VALUES (${matchId}, ${actualTeamId}, ${g.name}, ${!minute || isNaN(minute) ? null : minute},
${g.offset ?? 0}, ${g.penalty ?? false}, ${g.owngoal ?? false})
`)
goalCount++
}
}
matchCount++
}
// away team goals (+ home player own goals that benefit away)
for (const g of [...awayGoals, ...awayPenGoals, ...homeOgGoals]) {
await db.execute(sql`
INSERT INTO goals (match_id, team_id, player_name, minute, minute_offset, is_penalty, is_own_goal)
VALUES (${matchId}, ${t2Id}, ${g.name}, ${g.minute}, ${g.offset}, ${g.isPenalty}, ${g.isOwnGoal})
`)
totalGoals++
// Squads
const squadsData = readJson<RawSquad[]>(path.join(yearDir, 'worldcup.squads.json'))
if (squadsData && Array.isArray(squadsData)) {
for (const sq of squadsData) {
const teamId = await upsertTeam(sq.name)
for (const p of sq.players) {
if (!p.name) continue
const dob = p.date_of_birth ? p.date_of_birth.replace(/\s/g, '') : null
await db.execute(sql`
INSERT INTO squads (tournament_year, team_id, player_name, shirt_number, position, date_of_birth)
VALUES (${year}, ${teamId}, ${p.name}, ${p.number ?? null},
${p.pos ?? null}, ${dob})
ON CONFLICT (tournament_year, team_id, shirt_number) DO UPDATE SET
player_name = EXCLUDED.player_name,
position = EXCLUDED.position,
date_of_birth = EXCLUDED.date_of_birth
`)
}
}
}
totalMatches++
console.log(` ${year}: ${matchCount} matches, ${goalCount} goals`)
totalMatches += matchCount
totalGoals += goalCount
}
// 3. Update tournament aggregates
// 3. Group standings (computed from match results)
// Historical rows are wiped and rebuilt from the matches table. Every group
// match with a full-time score contributes one row per participating team
// (home perspective UNION ALL away perspective), which are then aggregated.
// Points follow the rule in force at the time: a win was worth 2 points up to
// and including 1990 and 3 points from 1994 onward; a draw is always 1.
console.log('Computing group standings...')
await db.execute(sql`
  DELETE FROM group_standings WHERE tournament_year < 2026
`)
await db.execute(sql`
  INSERT INTO group_standings (tournament_year, group_name, team_id, played, won, drawn, lost,
    goals_for, goals_against, goal_diff, pts)
  WITH match_results AS (
    SELECT tournament_year, group_name, team1_id AS team_id, score_ft_home AS gf, score_ft_away AS ga
    FROM matches WHERE tournament_year < 2026 AND group_name IS NOT NULL
      AND is_quali_playoff = false AND score_ft_home IS NOT NULL
    UNION ALL
    SELECT tournament_year, group_name, team2_id, score_ft_away, score_ft_home
    FROM matches WHERE tournament_year < 2026 AND group_name IS NOT NULL
      AND is_quali_playoff = false AND score_ft_home IS NOT NULL
  )
  SELECT tournament_year, group_name, team_id,
    COUNT(*)::int,
    SUM(CASE WHEN gf > ga THEN 1 ELSE 0 END)::int,
    SUM(CASE WHEN gf = ga THEN 1 ELSE 0 END)::int,
    SUM(CASE WHEN gf < ga THEN 1 ELSE 0 END)::int,
    SUM(gf)::int, SUM(ga)::int, SUM(gf - ga)::int,
    SUM(CASE
          WHEN gf > ga THEN CASE WHEN tournament_year >= 1994 THEN 3 ELSE 2 END
          WHEN gf = ga THEN 1
          ELSE 0
        END)::int
  FROM match_results
  GROUP BY tournament_year, group_name, team_id
  ON CONFLICT (tournament_year, group_name, team_id) DO UPDATE SET
    played = EXCLUDED.played, won = EXCLUDED.won, drawn = EXCLUDED.drawn,
    lost = EXCLUDED.lost, goals_for = EXCLUDED.goals_for,
    goals_against = EXCLUDED.goals_against, goal_diff = EXCLUDED.goal_diff,
    pts = EXCLUDED.pts
`)
// 4. Tournament aggregates
// Recompute per-tournament totals from the seeded rows (qualification
// play-offs excluded). Each column is assigned exactly once — PostgreSQL
// rejects an UPDATE that sets the same column twice.
// avg_goals_per_game starts from matches and LEFT JOINs goals so that goalless
// matches still count in the denominator; starting from goals would drop every
// 0–0 game and inflate the average.
await db.execute(sql`
  UPDATE tournaments t SET
    matches_count = (
      SELECT COUNT(*)::int FROM matches WHERE tournament_year = t.year AND is_quali_playoff = false
    ),
    total_goals = (
      SELECT COUNT(g.id)::int
      FROM goals g JOIN matches m ON g.match_id = m.id
      WHERE m.tournament_year = t.year AND m.is_quali_playoff = false
    ),
    avg_goals_per_game = (
      SELECT ROUND(COUNT(g.id)::numeric / NULLIF(COUNT(DISTINCT m.id), 0), 2)
      FROM matches m LEFT JOIN goals g ON g.match_id = m.id
      WHERE m.tournament_year = t.year AND m.is_quali_playoff = false
        AND m.score_ft_home IS NOT NULL
    )
  WHERE t.year < 2026
`)
console.log(`✅ Seed complete: ${totalMatches} matches, ${totalGoals} goals (19302022)`)
console.log(`\n✅ Seed complete: ${totalMatches} matches, ${totalGoals} goals (19302022)`)
await client.end()
}