Files
worldcup/scripts/seed.ts
T
valknar ff4989f39f refactor: rename data/openfootball → data/wikipedia, drop data/kaggle
Move world_cup.csv to app/data/ directly (the only remaining Kaggle file
used by seed.ts for tournament metadata). Delete the rest of the Kaggle CSVs.
Update path constants in scrape-wikipedia.ts and seed.ts accordingly.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-15 16:10:21 +02:00

448 lines
16 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import postgres from 'postgres'
import { drizzle } from 'drizzle-orm/postgres-js'
import { sql } from 'drizzle-orm'
import { readFileSync, existsSync } from 'fs'
import path from 'path'
import { fileURLToPath } from 'url'
import { getIso } from '../lib/iso-codes'
// Connection string: taken from the environment, falling back to the local dev database.
const { DATABASE_URL = 'postgres://wc:wc@localhost:5432/worldcup' } = process.env

// ESM has no built-in __dirname, so derive it from this module's URL.
const __dirname = path.dirname(fileURLToPath(import.meta.url))

// Static input data lives in <repo>/app/data; per-year JSON sits in its "wikipedia" subfolder.
const DATA_DIR = path.join(__dirname, '..', 'app', 'data')
const WC_DIR = path.join(DATA_DIR, 'wikipedia')
// Tournament years to import: every fourth year from 1930 through 2022,
// except 1942 and 1946, which are not part of the list.
const YEARS = Array.from({ length: 24 }, (_, step) => 1930 + 4 * step)
  .filter(year => year !== 1942 && year !== 1946)
// Third/fourth place per tournament year. Hard-coded because these are not
// reliably present in the source data for older years. Names use the canonical
// DB spelling (e.g. 'Germany' rather than 'West Germany').
const PLACEMENTS: Record<number, { third?: string; fourth?: string }> = {
  1930: { third: 'USA', fourth: 'Yugoslavia' },
  1934: { third: 'Germany', fourth: 'Austria' },
  1938: { third: 'Brazil', fourth: 'Sweden' },
  // 1950 had no third-place match; these are the final-round standings.
  1950: { third: 'Sweden', fourth: 'Spain' },
  1954: { third: 'Austria', fourth: 'Uruguay' },
  1958: { third: 'France', fourth: 'Germany' },
  1962: { third: 'Chile', fourth: 'Yugoslavia' },
  1966: { third: 'Portugal', fourth: 'Soviet Union' },
  1970: { third: 'Germany', fourth: 'Uruguay' },
  1974: { third: 'Poland', fourth: 'Brazil' },
  1978: { third: 'Brazil', fourth: 'Italy' },
  1982: { third: 'Poland', fourth: 'France' },
  1986: { third: 'France', fourth: 'Belgium' },
  1990: { third: 'Italy', fourth: 'England' },
  1994: { third: 'Sweden', fourth: 'Bulgaria' },
  1998: { third: 'Croatia', fourth: 'Netherlands' },
  2002: { third: 'Turkey', fourth: 'South Korea' },
  2006: { third: 'Germany', fourth: 'Portugal' },
  2010: { third: 'Germany', fourth: 'Uruguay' },
  2014: { third: 'Netherlands', fourth: 'Brazil' },
  2018: { third: 'Belgium', fourth: 'England' },
  2022: { third: 'Croatia', fourth: 'Morocco' },
}
/** Team names as they appear in the Wikipedia data, mapped to the canonical DB name. */
const TEAM_ALIASES: Record<string, string> = {
  'IR Iran': 'Iran',
  'Korea Republic': 'South Korea',
  'West Germany': 'Germany',
}

/**
 * Resolves a team name to its canonical DB name.
 *
 * @param name Team name as found in the source data.
 * @returns The alias target when one is registered, otherwise `name` unchanged.
 */
function normTeam(name: string): string {
  const canonical = TEAM_ALIASES[name]
  return canonical ?? name
}
/**
 * Minimal RFC-4180 style CSV parser.
 *
 * Supports quoted fields (with `""` as an escaped quote, and embedded commas or
 * newlines), ignores `\r` outside quotes, and treats the first row as the header.
 * Rows whose fields are all blank are dropped. Header names and values are trimmed;
 * cells missing from a short row become the empty string.
 *
 * @param content Raw CSV text.
 * @returns One record per data row, keyed by header name.
 */
function parseCsv(content: string): Record<string, string>[] {
  const table: string[][] = []
  let current: string[] = []
  let cell = ''
  let quoted = false

  const endCell = (): void => {
    current.push(cell)
    cell = ''
  }
  const endRow = (): void => {
    endCell()
    table.push(current)
    current = []
  }

  let pos = 0
  while (pos < content.length) {
    const c = content[pos]
    pos++

    if (quoted) {
      if (c !== '"') {
        cell += c
      } else if (content[pos] === '"') {
        // A doubled quote inside a quoted field is a literal quote.
        cell += '"'
        pos++
      } else {
        quoted = false
      }
      continue
    }

    switch (c) {
      case '"':
        quoted = true
        break
      case ',':
        endCell()
        break
      case '\n':
        endRow()
        break
      case '\r':
        // Carriage returns outside quotes are discarded (CRLF line endings).
        break
      default:
        cell += c
    }
  }

  // Flush a final row that was not terminated by a newline.
  if (cell !== '' || current.length > 0) endRow()

  const [headerRow, ...dataRows] = table
  const keys = (headerRow ?? []).map(h => h.trim())

  const records: Record<string, string>[] = []
  for (const cells of dataRows) {
    const hasContent = cells.some(value => value.trim())
    if (!hasContent) continue
    records.push(Object.fromEntries(keys.map((key, col) => [key, (cells[col] ?? '').trim()])))
  }
  return records
}
/**
 * Reads a JSON file and parses it.
 *
 * Best-effort: never throws. A missing file yields `null` silently; a file that
 * exists but cannot be read or parsed also yields `null`, but a warning is logged
 * so that a corrupt file is distinguishable from an absent one.
 *
 * @param filePath Path of the JSON file.
 * @returns The parsed value cast to `T` (the shape is not validated), or `null`.
 */
function readJson<T>(filePath: string): T | null {
  if (!existsSync(filePath)) return null
  try {
    return JSON.parse(readFileSync(filePath, 'utf-8')) as T
  } catch (err: unknown) {
    const reason = err instanceof Error ? err.message : String(err)
    console.warn(`readJson: could not read or parse ${filePath}: ${reason}`)
    return null
  }
}
// ── Types matching scrape-wikipedia.ts output ──────────────────────────────
// These describe the JSON files read with readJson(); the shapes are asserted
// by a cast only, not validated at runtime.

// One goal entry from a match's goals1/goals2 list. `minute` may be a string or a
// number (the seeder runs it through parseInt); `offset` is stored as minute_offset.
// When `owngoal` is set, the goal row is stored with the other team's id.
type RawGoal = { name: string; minute?: string | number; offset?: number; penalty?: boolean; owngoal?: boolean }
// Score pairs: the seeder stores index 0 in the *_home column and index 1 in the
// *_away column of score_ft / score_ht / score_et / score_p
// (presumably full time, half time, extra time, penalties — confirm against the scraper).
type RawScore = { ft?: number[]; ht?: number[]; et?: number[]; p?: number[] }
// One match. goals1 belongs to team1 and goals2 to team2. `ground` is declared but
// not read by run().
type RawMatch = {
round?: string; date?: string; time?: string;
team1: string; team2: string; score?: RawScore;
goals1?: RawGoal[]; goals2?: RawGoal[];
group?: string; ground?: string;
}
// Top-level shape of <year>/worldcup.json.
type RawData = { matches: RawMatch[] }
// Top-level shape of <year>/worldcup.stadiums.json. Only `name` and `city` are inserted by run().
type RawStadiums = { stadiums: { name: string; city: string; cc?: string; capacity?: number; timezone?: string; coords?: string }[] }
// One element of the array in <year>/worldcup.squads.json; `name` is the team name.
type RawSquad = { name: string; players: { name: string; number?: number; pos?: string; date_of_birth?: string }[] }
/**
 * Normalizes a raw score into an object with optional ft/ht/et/p pairs.
 *
 * @param score Raw score from the match JSON. May be absent, or a bare array,
 *              in which case the array is taken as the `ft` pair.
 * @returns `{}` when there is no score, `{ ft }` for a bare array, otherwise the
 *          four pairs copied from the input (each possibly undefined).
 */
function parseScore(score: RawScore | undefined) {
  if (score) {
    if (Array.isArray(score)) return { ft: score as number[] }
    const { ft, ht, et, p } = score
    return { ft, ht, et, p }
  }
  return {}
}
/**
 * Seeds the database with historical World Cup data (tournament years before 2026).
 *
 * Flow:
 *  - create all tables and unique indexes if they do not exist yet;
 *  - without `--force` / `-f`: exit early when any tournament < 2026 already exists;
 *    with it: delete the historical rows first (teams are kept);
 *  1. upsert tournaments from world_cup.csv, enriched with PLACEMENTS;
 *  2. for each year in YEARS, import stadiums, matches, goals and squads from the
 *     JSON files under WC_DIR/<year>/ (years without worldcup.json are skipped);
 *  3. recompute group standings from the stored match results;
 *  4. recompute per-tournament aggregates (match count, goals, average goals).
 *
 * The database connection is closed on every exit path, including errors.
 *
 * NOTE(review): the statements are not wrapped in a transaction, so a failure part
 * way through leaves partial data, and a later run without --force will then skip.
 *
 * @throws Propagates any file-system or database error.
 */
async function run(): Promise<void> {
  const client = postgres(DATABASE_URL, { max: 5 })
  const db = drizzle(client)

  try {
    // Create tables
    await db.execute(sql`
      CREATE TABLE IF NOT EXISTS tournaments (
        year INTEGER PRIMARY KEY,
        host TEXT NOT NULL,
        winner TEXT,
        runner_up TEXT,
        third_place TEXT,
        fourth_place TEXT,
        teams_count INTEGER,
        matches_count INTEGER,
        total_goals INTEGER,
        avg_goals_per_game NUMERIC(4,2)
      );
      CREATE TABLE IF NOT EXISTS teams (
        id SERIAL PRIMARY KEY,
        name TEXT UNIQUE NOT NULL,
        iso2 TEXT,
        fifa_code TEXT,
        continent TEXT,
        confederation TEXT
      );
      CREATE TABLE IF NOT EXISTS stadiums (
        id SERIAL PRIMARY KEY,
        tournament_year INTEGER,
        name TEXT NOT NULL,
        city TEXT,
        country_code TEXT,
        capacity INTEGER,
        timezone TEXT,
        coordinates TEXT
      );
      CREATE TABLE IF NOT EXISTS matches (
        id SERIAL PRIMARY KEY,
        tournament_year INTEGER NOT NULL,
        round TEXT NOT NULL,
        group_name TEXT,
        date DATE,
        time_local TEXT,
        stadium_id INTEGER,
        team1_id INTEGER NOT NULL,
        team2_id INTEGER NOT NULL,
        score_ft_home INTEGER,
        score_ft_away INTEGER,
        score_ht_home INTEGER,
        score_ht_away INTEGER,
        score_et_home INTEGER,
        score_et_away INTEGER,
        score_p_home INTEGER,
        score_p_away INTEGER,
        is_quali_playoff BOOLEAN DEFAULT false
      );
      CREATE UNIQUE INDEX IF NOT EXISTS matches_unique
        ON matches (tournament_year, team1_id, team2_id, date, is_quali_playoff);
      CREATE TABLE IF NOT EXISTS goals (
        id SERIAL PRIMARY KEY,
        match_id INTEGER NOT NULL,
        team_id INTEGER NOT NULL,
        player_name TEXT NOT NULL,
        minute INTEGER,
        minute_offset INTEGER DEFAULT 0,
        is_penalty BOOLEAN DEFAULT false,
        is_own_goal BOOLEAN DEFAULT false
      );
      CREATE TABLE IF NOT EXISTS group_standings (
        tournament_year INTEGER NOT NULL,
        group_name TEXT NOT NULL,
        team_id INTEGER NOT NULL,
        pos INTEGER,
        played INTEGER DEFAULT 0,
        won INTEGER DEFAULT 0,
        drawn INTEGER DEFAULT 0,
        lost INTEGER DEFAULT 0,
        goals_for INTEGER DEFAULT 0,
        goals_against INTEGER DEFAULT 0,
        goal_diff INTEGER DEFAULT 0,
        pts INTEGER DEFAULT 0,
        PRIMARY KEY (tournament_year, group_name, team_id)
      );
      CREATE TABLE IF NOT EXISTS squads (
        id SERIAL PRIMARY KEY,
        tournament_year INTEGER NOT NULL,
        team_id INTEGER NOT NULL,
        player_name TEXT NOT NULL,
        shirt_number INTEGER,
        position TEXT,
        date_of_birth DATE
      );
      CREATE UNIQUE INDEX IF NOT EXISTS squads_unique
        ON squads (tournament_year, team_id, shirt_number);
    `)

    const force = process.argv.includes('--force') || process.argv.includes('-f')
    if (force) {
      // Goals reference matches by id, so they must go before the matches themselves.
      console.log('--force: clearing historical data...')
      await db.execute(sql`DELETE FROM goals WHERE match_id IN (SELECT id FROM matches WHERE tournament_year < 2026)`)
      await db.execute(sql`DELETE FROM squads WHERE tournament_year < 2026`)
      await db.execute(sql`DELETE FROM group_standings WHERE tournament_year < 2026`)
      await db.execute(sql`DELETE FROM stadiums WHERE tournament_year < 2026`)
      await db.execute(sql`DELETE FROM matches WHERE tournament_year < 2026`)
      await db.execute(sql`DELETE FROM tournaments WHERE year < 2026`)
    } else {
      const existing = await db.execute(sql`SELECT COUNT(*)::int AS cnt FROM tournaments WHERE year < 2026`)
      if ((existing[0] as { cnt: number }).cnt > 0) {
        console.log('✓ Already seeded (historical data present), skipping. Use --force to re-import.')
        return
      }
    }

    console.log('Seeding historical data (1930–2022)...')

    // Canonical team name -> teams.id, so each team is upserted at most once per run.
    const teamCache = new Map<string, number>()

    /** Inserts the team if needed and returns its id (name is normalized first). */
    const upsertTeam = async (rawName: string): Promise<number> => {
      const name = normTeam(rawName)
      const cached = teamCache.get(name)
      if (cached !== undefined) return cached

      const iso2 = getIso(name)
      // The DO UPDATE is what makes RETURNING yield the existing row on conflict.
      // It also back-fills iso2 when the stored value is still NULL, without ever
      // overwriting a value that is already set.
      const [row] = await db.execute(sql`
        INSERT INTO teams (name, iso2)
        VALUES (${name}, ${iso2 ?? null})
        ON CONFLICT (name) DO UPDATE SET iso2 = COALESCE(teams.iso2, EXCLUDED.iso2)
        RETURNING id
      `)
      const id = (row as { id: number }).id
      teamCache.set(name, id)
      return id
    }

    // 1. Tournaments from world_cup.csv (host, winner, runner_up)
    const wcRows = parseCsv(readFileSync(path.join(DATA_DIR, 'world_cup.csv'), 'utf-8'))
    for (const r of wcRows) {
      const year = parseInt(r['Year'], 10)
      if (isNaN(year)) continue
      const winner = normTeam(r['Champion'] || '')
      const runnerUp = normTeam(r['Runner-Up'] || '')
      const p = PLACEMENTS[year] ?? {}
      await db.execute(sql`
        INSERT INTO tournaments (year, host, winner, runner_up, third_place, fourth_place, teams_count)
        VALUES (
          ${year}, ${r['Host']},
          ${winner || null}, ${runnerUp || null},
          ${p.third ?? null}, ${p.fourth ?? null},
          ${parseInt(r['Teams'], 10) || null}
        )
        ON CONFLICT (year) DO UPDATE SET
          host = EXCLUDED.host,
          winner = EXCLUDED.winner,
          runner_up = EXCLUDED.runner_up,
          third_place = EXCLUDED.third_place,
          fourth_place = EXCLUDED.fourth_place,
          teams_count = EXCLUDED.teams_count
      `)
    }

    // 2. Per-year match/stadium/squad data from the scraped Wikipedia JSON files
    let totalMatches = 0
    let totalGoals = 0
    for (const year of YEARS) {
      const yearDir = path.join(WC_DIR, String(year))
      const mainData = readJson<RawData>(path.join(yearDir, 'worldcup.json'))
      if (!mainData?.matches) {
        console.log(` ${year}: no data file, skipping`)
        continue
      }
      let matchCount = 0
      let goalCount = 0

      // Stadiums
      // NOTE(review): the stadiums table as created above has no unique constraint
      // besides its serial id, so ON CONFLICT DO NOTHING never suppresses a duplicate.
      const stadiumsData = readJson<RawStadiums>(path.join(yearDir, 'worldcup.stadiums.json'))
      if (stadiumsData?.stadiums) {
        for (const s of stadiumsData.stadiums) {
          await db.execute(sql`
            INSERT INTO stadiums (tournament_year, name, city)
            VALUES (${year}, ${s.name}, ${s.city ?? null})
            ON CONFLICT DO NOTHING
          `)
        }
      }

      // Matches and goals
      for (const m of mainData.matches) {
        const t1Id = await upsertTeam(m.team1)
        const t2Id = await upsertTeam(m.team2)
        const score = parseScore(m.score)
        // On conflict, COALESCE keeps the stored value wherever the new data has none.
        const [matchRow] = await db.execute(sql`
          INSERT INTO matches (
            tournament_year, round, group_name, date, time_local,
            team1_id, team2_id,
            score_ft_home, score_ft_away,
            score_ht_home, score_ht_away,
            score_et_home, score_et_away,
            score_p_home, score_p_away,
            is_quali_playoff
          ) VALUES (
            ${year}, ${m.round ?? 'Unknown'}, ${m.group ?? null},
            ${m.date ?? null}, ${m.time ?? null},
            ${t1Id}, ${t2Id},
            ${score.ft?.[0] ?? null}, ${score.ft?.[1] ?? null},
            ${score.ht?.[0] ?? null}, ${score.ht?.[1] ?? null},
            ${score.et?.[0] ?? null}, ${score.et?.[1] ?? null},
            ${score.p?.[0] ?? null}, ${score.p?.[1] ?? null},
            false
          )
          ON CONFLICT (tournament_year, team1_id, team2_id, date, is_quali_playoff) DO UPDATE SET
            round = EXCLUDED.round,
            group_name = COALESCE(EXCLUDED.group_name, matches.group_name),
            time_local = COALESCE(EXCLUDED.time_local, matches.time_local),
            score_ft_home = COALESCE(EXCLUDED.score_ft_home, matches.score_ft_home),
            score_ft_away = COALESCE(EXCLUDED.score_ft_away, matches.score_ft_away),
            score_ht_home = COALESCE(EXCLUDED.score_ht_home, matches.score_ht_home),
            score_ht_away = COALESCE(EXCLUDED.score_ht_away, matches.score_ht_away),
            score_et_home = COALESCE(EXCLUDED.score_et_home, matches.score_et_home),
            score_et_away = COALESCE(EXCLUDED.score_et_away, matches.score_et_away),
            score_p_home = COALESCE(EXCLUDED.score_p_home, matches.score_p_home),
            score_p_away = COALESCE(EXCLUDED.score_p_away, matches.score_p_away)
          RETURNING id
        `)
        const matchId = (matchRow as { id: number }).id

        // Goals (delete + re-insert)
        await db.execute(sql`DELETE FROM goals WHERE match_id = ${matchId}`)
        // Each tuple: goals listed for a side, that side's team id, the opposing team id
        // (an own goal is stored against the opposing id).
        const sides: [RawGoal[], number, number][] = [
          [m.goals1 ?? [], t1Id, t2Id],
          [m.goals2 ?? [], t2Id, t1Id],
        ]
        for (const [rawGoals, teamId, ogTeamId] of sides) {
          for (const g of rawGoals) {
            if (!g.name) continue
            const minute = g.minute != null ? parseInt(String(g.minute), 10) : null
            const actualTeamId = g.owngoal ? ogTeamId : teamId
            // A missing, zero or unparseable minute is stored as NULL.
            await db.execute(sql`
              INSERT INTO goals (match_id, team_id, player_name, minute, minute_offset, is_penalty, is_own_goal)
              VALUES (${matchId}, ${actualTeamId}, ${g.name}, ${!minute || isNaN(minute) ? null : minute},
                ${g.offset ?? 0}, ${g.penalty ?? false}, ${g.owngoal ?? false})
            `)
            goalCount++
          }
        }
        matchCount++
      }

      // Squads
      const squadsData = readJson<RawSquad[]>(path.join(yearDir, 'worldcup.squads.json'))
      if (squadsData && Array.isArray(squadsData)) {
        for (const sq of squadsData) {
          const teamId = await upsertTeam(sq.name)
          for (const p of sq.players) {
            if (!p.name) continue
            // Strip all whitespace from the date string before handing it to the DATE column.
            const dob = p.date_of_birth ? p.date_of_birth.replace(/\s/g, '') : null
            await db.execute(sql`
              INSERT INTO squads (tournament_year, team_id, player_name, shirt_number, position, date_of_birth)
              VALUES (${year}, ${teamId}, ${p.name}, ${p.number ?? null},
                ${p.pos ?? null}, ${dob})
              ON CONFLICT (tournament_year, team_id, shirt_number) DO UPDATE SET
                player_name = EXCLUDED.player_name,
                position = EXCLUDED.position,
                date_of_birth = EXCLUDED.date_of_birth
            `)
          }
        }
      }

      console.log(` ${year}: ${matchCount} matches, ${goalCount} goals`)
      totalMatches += matchCount
      totalGoals += goalCount
    }

    // 3. Group standings (computed from match results)
    // Each group match contributes two rows (one per team) to match_results.
    // NOTE(review): 3 points per win is applied to every year; World Cups before
    // 1994 awarded 2 points for a win — confirm whether uniform scoring is intended.
    console.log('Computing group standings...')
    await db.execute(sql`
      DELETE FROM group_standings WHERE tournament_year < 2026
    `)
    await db.execute(sql`
      INSERT INTO group_standings (tournament_year, group_name, team_id, played, won, drawn, lost,
        goals_for, goals_against, goal_diff, pts)
      WITH match_results AS (
        SELECT tournament_year, group_name, team1_id AS team_id, score_ft_home AS gf, score_ft_away AS ga
        FROM matches WHERE tournament_year < 2026 AND group_name IS NOT NULL
          AND is_quali_playoff = false AND score_ft_home IS NOT NULL
        UNION ALL
        SELECT tournament_year, group_name, team2_id, score_ft_away, score_ft_home
        FROM matches WHERE tournament_year < 2026 AND group_name IS NOT NULL
          AND is_quali_playoff = false AND score_ft_home IS NOT NULL
      )
      SELECT tournament_year, group_name, team_id,
        COUNT(*)::int,
        SUM(CASE WHEN gf > ga THEN 1 ELSE 0 END)::int,
        SUM(CASE WHEN gf = ga THEN 1 ELSE 0 END)::int,
        SUM(CASE WHEN gf < ga THEN 1 ELSE 0 END)::int,
        SUM(gf)::int, SUM(ga)::int, SUM(gf - ga)::int,
        SUM(CASE WHEN gf > ga THEN 3 WHEN gf = ga THEN 1 ELSE 0 END)::int
      FROM match_results
      GROUP BY tournament_year, group_name, team_id
      ON CONFLICT (tournament_year, group_name, team_id) DO UPDATE SET
        played = EXCLUDED.played, won = EXCLUDED.won, drawn = EXCLUDED.drawn,
        lost = EXCLUDED.lost, goals_for = EXCLUDED.goals_for,
        goals_against = EXCLUDED.goals_against, goal_diff = EXCLUDED.goal_diff,
        pts = EXCLUDED.pts
    `)

    // 4. Tournament aggregates
    // avg_goals_per_game starts from matches and LEFT JOINs goals so that goalless
    // matches still count in the denominator (an inner join would drop them).
    await db.execute(sql`
      UPDATE tournaments t SET
        matches_count = (
          SELECT COUNT(*)::int FROM matches WHERE tournament_year = t.year AND is_quali_playoff = false
        ),
        total_goals = (
          SELECT COUNT(g.id)::int
          FROM goals g JOIN matches m ON g.match_id = m.id
          WHERE m.tournament_year = t.year AND m.is_quali_playoff = false
        ),
        avg_goals_per_game = (
          SELECT ROUND(COUNT(g.id)::numeric / NULLIF(COUNT(DISTINCT m.id), 0), 2)
          FROM matches m LEFT JOIN goals g ON g.match_id = m.id
          WHERE m.tournament_year = t.year AND m.is_quali_playoff = false
            AND m.score_ft_home IS NOT NULL
        )
      WHERE t.year < 2026
    `)

    console.log(`\n✅ Seed complete: ${totalMatches} matches, ${totalGoals} goals (1930–2022)`)
  } finally {
    // Close the pool on success, early return and failure alike.
    await client.end()
  }
}
// Script entry point: run the seed; on failure, report the error and exit non-zero.
run().catch((err) => {
  console.error('Seed failed:', err)
  process.exit(1)
})