Files
worldcup/scripts/seed.ts
T
valknar d37ebe201e refactor: consolidate data/ into single root directory, fix historical player names
Merge data/wikipedia/{year}/ into data/{year}/ so there is a single
canonical location for World Cup JSON files. Update scrape and seed
scripts to use data/ instead of data/wikipedia/.

Re-scraped all 22 years (1930-2022) with fixed player name extraction
(full name from <a title="..."> rather than abbreviated display text)
so historical goals now show e.g. "Thomas Müller" not "Müller".

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-15 18:27:35 +02:00

377 lines
14 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import postgres from 'postgres'
import { drizzle } from 'drizzle-orm/postgres-js'
import { sql } from 'drizzle-orm'
import { readFileSync, existsSync } from 'fs'
import path from 'path'
import { fileURLToPath } from 'url'
import { getIso } from '../lib/iso-codes'
import { normalizeTeam } from '../lib/wiki-scraper'
// Connection string: taken from the DATABASE_URL environment variable, falling
// back to a local default when the variable is unset.
const DATABASE_URL = process.env.DATABASE_URL ?? 'postgres://wc:wc@localhost:5432/worldcup'
// Directory of this script, derived from the module URL.
const __dirname = path.dirname(fileURLToPath(import.meta.url))
// Root directory of the per-year JSON files: `../data` relative to this script.
const WC_DIR = path.join(__dirname, '../data')
// Tournament years to import; each is looked up as a sub-directory of WC_DIR.
// The list has no entries for 1942 or 1946.
const YEARS = [
1930,1934,1938,1950,1954,1958,1962,1966,1970,1974,
1978,1982,1986,1990,1994,1998,2002,2006,2010,2014,2018,2022,
]
/**
 * Reads and parses a JSON file on a best-effort basis.
 *
 * @param filePath Path of the JSON file to load.
 * @returns The parsed content typed as `T` (the shape is not validated at
 *          runtime), or `null` when the file does not exist or cannot be
 *          read or parsed.
 */
function readJson<T>(filePath: string): T | null {
  if (!existsSync(filePath)) return null
  try {
    return JSON.parse(readFileSync(filePath, 'utf-8')) as T
  } catch (err: unknown) {
    // Callers treat null as "no data", so keep returning null, but report the
    // problem: a file that exists yet cannot be loaded must not be mistaken
    // for a missing one.
    const reason = err instanceof Error ? err.message : String(err)
    console.warn(`readJson: could not load ${filePath}: ${reason}`)
    return null
  }
}
// ── Types matching scrape-wikipedia.ts output ──────────────────────────────
// One entry of a match's goals1/goals2 list. `minute` is passed through
// parseInt before being stored, `offset` is stored as minute_offset (0 when
// absent), and `penalty`/`owngoal` are stored as false when absent.
type RawGoal = { name: string; minute?: string | number; offset?: number; penalty?: boolean; owngoal?: boolean }
// Score pairs; index 0 is written to the *_home column and index 1 to the
// *_away column. ft/ht/et/p feed the score_ft/score_ht/score_et/score_p
// columns (presumably full time, half time, extra time and penalties).
type RawScore = { ft?: number[]; ht?: number[]; et?: number[]; p?: number[] }
// One match record. `ground` is not read by the seeding code visible in this file.
type RawMatch = {
round?: string; date?: string; time?: string;
team1: string; team2: string; score?: RawScore;
goals1?: RawGoal[]; goals2?: RawGoal[];
group?: string; ground?: string;
}
// Shape read from <year>/worldcup.json.
type RawData = { matches: RawMatch[] }
// Shape read from <year>/worldcup.meta.json; feeds the tournaments row.
type RawMeta = { host: string; teams_count: number | null; winner: string | null; runner_up: string | null; third_place: string | null; fourth_place: string | null }
// Shape read from <year>/worldcup.stadiums.json; only `name` and `city` are inserted.
type RawStadiums = { stadiums: { name: string; city: string; cc?: string; capacity?: number; timezone?: string; coords?: string }[] }
// One element of the array read from <year>/worldcup.squads.json.
type RawSquad = { name: string; players: { name: string; number?: number; pos?: string; date_of_birth?: string }[] }
/**
 * Normalises a raw match score into an object with optional
 * `ft`/`ht`/`et`/`p` pairs.
 *
 * - object input: its four score fields are copied (absent ones stay undefined)
 * - array input: the array itself is used as the `ft` pair (presumably a
 *   bare-array variant of the scraped data — confirm against the scraper)
 * - missing input: an empty object
 */
function parseScore(score: RawScore | undefined) {
  if (score && !Array.isArray(score)) {
    const { ft, ht, et, p } = score
    return { ft, ht, et, p }
  }
  if (score) return { ft: score as number[] }
  return {}
}
/**
 * Seeds the database with the historical World Cup data stored as per-year
 * JSON files under WC_DIR.
 *
 * Steps:
 *  1. Create all tables and unique indexes if they do not exist yet.
 *  2. Without `--force`/`-f`: return early when a tournament row with
 *     year < 2026 already exists. With the flag: delete every row belonging
 *     to years < 2026 before importing.
 *  3. For each year in YEARS: upsert the tournament row (worldcup.meta.json),
 *     insert stadiums, upsert matches, replace their goals, upsert squads.
 *  4. Recompute group standings from the stored match results.
 *  5. Recompute the per-tournament aggregates.
 *
 * The database connection is closed on every exit path, including failures.
 *
 * @throws Rethrows any database error to the caller after the connection
 *         has been closed.
 */
async function run(): Promise<void> {
  const client = postgres(DATABASE_URL, { max: 5 })
  const db = drizzle(client)

  try {
    // 1. Create tables and unique indexes (idempotent).
    await db.execute(sql`
      CREATE TABLE IF NOT EXISTS tournaments (
        year INTEGER PRIMARY KEY,
        host TEXT NOT NULL,
        winner TEXT,
        runner_up TEXT,
        third_place TEXT,
        fourth_place TEXT,
        teams_count INTEGER,
        matches_count INTEGER,
        total_goals INTEGER,
        avg_goals_per_game NUMERIC(4,2)
      );
      CREATE TABLE IF NOT EXISTS teams (
        id SERIAL PRIMARY KEY,
        name TEXT UNIQUE NOT NULL,
        iso2 TEXT,
        fifa_code TEXT,
        continent TEXT,
        confederation TEXT
      );
      CREATE TABLE IF NOT EXISTS stadiums (
        id SERIAL PRIMARY KEY,
        tournament_year INTEGER,
        name TEXT NOT NULL,
        city TEXT,
        country_code TEXT,
        capacity INTEGER,
        timezone TEXT,
        coordinates TEXT
      );
      CREATE TABLE IF NOT EXISTS matches (
        id SERIAL PRIMARY KEY,
        tournament_year INTEGER NOT NULL,
        round TEXT NOT NULL,
        group_name TEXT,
        date DATE,
        time_local TEXT,
        stadium_id INTEGER,
        team1_id INTEGER NOT NULL,
        team2_id INTEGER NOT NULL,
        score_ft_home INTEGER,
        score_ft_away INTEGER,
        score_ht_home INTEGER,
        score_ht_away INTEGER,
        score_et_home INTEGER,
        score_et_away INTEGER,
        score_p_home INTEGER,
        score_p_away INTEGER,
        is_quali_playoff BOOLEAN DEFAULT false
      );
      CREATE UNIQUE INDEX IF NOT EXISTS matches_unique
        ON matches (tournament_year, team1_id, team2_id, date, is_quali_playoff);
      CREATE TABLE IF NOT EXISTS goals (
        id SERIAL PRIMARY KEY,
        match_id INTEGER NOT NULL,
        team_id INTEGER NOT NULL,
        player_name TEXT NOT NULL,
        minute INTEGER,
        minute_offset INTEGER DEFAULT 0,
        is_penalty BOOLEAN DEFAULT false,
        is_own_goal BOOLEAN DEFAULT false
      );
      CREATE TABLE IF NOT EXISTS group_standings (
        tournament_year INTEGER NOT NULL,
        group_name TEXT NOT NULL,
        team_id INTEGER NOT NULL,
        pos INTEGER,
        played INTEGER DEFAULT 0,
        won INTEGER DEFAULT 0,
        drawn INTEGER DEFAULT 0,
        lost INTEGER DEFAULT 0,
        goals_for INTEGER DEFAULT 0,
        goals_against INTEGER DEFAULT 0,
        goal_diff INTEGER DEFAULT 0,
        pts INTEGER DEFAULT 0,
        PRIMARY KEY (tournament_year, group_name, team_id)
      );
      CREATE TABLE IF NOT EXISTS squads (
        id SERIAL PRIMARY KEY,
        tournament_year INTEGER NOT NULL,
        team_id INTEGER NOT NULL,
        player_name TEXT NOT NULL,
        shirt_number INTEGER,
        position TEXT,
        date_of_birth DATE
      );
      CREATE UNIQUE INDEX IF NOT EXISTS squads_unique
        ON squads (tournament_year, team_id, shirt_number);
    `)

    // 2. Skip when already seeded, or wipe historical rows when forced.
    const force = process.argv.includes('--force') || process.argv.includes('-f')
    if (!force) {
      const existing = await db.execute(sql`SELECT COUNT(*)::int AS cnt FROM tournaments WHERE year < 2026`)
      if ((existing[0] as { cnt: number }).cnt > 0) {
        console.log('✓ Already seeded (historical data present), skipping. Use --force to re-import.')
        return // the finally block closes the connection
      }
    } else {
      console.log('--force: clearing historical data...')
      // Goals go first: they are selected through the matches deleted further down.
      await db.execute(sql`DELETE FROM goals WHERE match_id IN (SELECT id FROM matches WHERE tournament_year < 2026)`)
      await db.execute(sql`DELETE FROM squads WHERE tournament_year < 2026`)
      await db.execute(sql`DELETE FROM group_standings WHERE tournament_year < 2026`)
      await db.execute(sql`DELETE FROM stadiums WHERE tournament_year < 2026`)
      await db.execute(sql`DELETE FROM matches WHERE tournament_year < 2026`)
      await db.execute(sql`DELETE FROM tournaments WHERE year < 2026`)
    }

    console.log('Seeding historical data (1930–2022)...')

    // Normalised team name -> teams.id, so each team is upserted once per run.
    const teamCache = new Map<string, number>()

    /** Returns the id of the team with the normalised name, inserting it if needed. */
    const upsertTeam = async (rawName: string): Promise<number> => {
      const name = normalizeTeam(rawName)
      const cached = teamCache.get(name)
      if (cached !== undefined) return cached
      const iso2 = getIso(name)
      // DO UPDATE (rather than DO NOTHING) so RETURNING also yields the id of
      // an existing row; it backfills iso2 only where none is stored yet.
      const [row] = await db.execute(sql`
        INSERT INTO teams (name, iso2)
        VALUES (${name}, ${iso2 ?? null})
        ON CONFLICT (name) DO UPDATE SET iso2 = COALESCE(teams.iso2, EXCLUDED.iso2)
        RETURNING id
      `)
      const id = (row as { id: number }).id
      teamCache.set(name, id)
      return id
    }

    // Normalises an optional team name; empty results are stored as NULL.
    const teamOrNull = (raw: string | null): string | null => normalizeTeam(raw ?? '') || null

    // 3. Per-year data from the JSON files.
    let totalMatches = 0
    let totalGoals = 0
    for (const year of YEARS) {
      const yearDir = path.join(WC_DIR, String(year))
      const mainData = readJson<RawData>(path.join(yearDir, 'worldcup.json'))
      if (!mainData?.matches) {
        console.log(` ${year}: no data file, skipping`)
        continue
      }

      // Tournament row from worldcup.meta.json.
      // NOTE(review): tournaments.host is NOT NULL, so an empty meta.host makes
      // this insert fail — confirm every meta file provides a host.
      const meta = readJson<RawMeta>(path.join(yearDir, 'worldcup.meta.json'))
      if (meta) {
        await db.execute(sql`
          INSERT INTO tournaments (year, host, winner, runner_up, third_place, fourth_place, teams_count)
          VALUES (
            ${year}, ${meta.host || null},
            ${teamOrNull(meta.winner)}, ${teamOrNull(meta.runner_up)},
            ${teamOrNull(meta.third_place)}, ${teamOrNull(meta.fourth_place)},
            ${meta.teams_count ?? null}
          )
          ON CONFLICT (year) DO UPDATE SET
            host = EXCLUDED.host,
            winner = EXCLUDED.winner,
            runner_up = EXCLUDED.runner_up,
            third_place = EXCLUDED.third_place,
            fourth_place = EXCLUDED.fourth_place,
            teams_count = EXCLUDED.teams_count
        `)
      }

      let matchCount = 0
      let goalCount = 0

      // Stadiums. The table declares no unique constraint besides its serial
      // id, so ON CONFLICT DO NOTHING does not deduplicate rows here.
      const stadiumsData = readJson<RawStadiums>(path.join(yearDir, 'worldcup.stadiums.json'))
      if (stadiumsData?.stadiums) {
        for (const s of stadiumsData.stadiums) {
          await db.execute(sql`
            INSERT INTO stadiums (tournament_year, name, city)
            VALUES (${year}, ${s.name}, ${s.city ?? null})
            ON CONFLICT DO NOTHING
          `)
        }
      }

      // Matches and goals.
      for (const m of mainData.matches) {
        const t1Id = await upsertTeam(m.team1)
        const t2Id = await upsertTeam(m.team2)
        const score = parseScore(m.score)
        // On conflict, COALESCE keeps the stored value wherever the incoming one is NULL.
        const [matchRow] = await db.execute(sql`
          INSERT INTO matches (
            tournament_year, round, group_name, date, time_local,
            team1_id, team2_id,
            score_ft_home, score_ft_away,
            score_ht_home, score_ht_away,
            score_et_home, score_et_away,
            score_p_home, score_p_away,
            is_quali_playoff
          ) VALUES (
            ${year}, ${m.round ?? 'Unknown'}, ${m.group ?? null},
            ${m.date ?? null}, ${m.time ?? null},
            ${t1Id}, ${t2Id},
            ${score.ft?.[0] ?? null}, ${score.ft?.[1] ?? null},
            ${score.ht?.[0] ?? null}, ${score.ht?.[1] ?? null},
            ${score.et?.[0] ?? null}, ${score.et?.[1] ?? null},
            ${score.p?.[0] ?? null}, ${score.p?.[1] ?? null},
            false
          )
          ON CONFLICT (tournament_year, team1_id, team2_id, date, is_quali_playoff) DO UPDATE SET
            round = EXCLUDED.round,
            group_name = COALESCE(EXCLUDED.group_name, matches.group_name),
            time_local = COALESCE(EXCLUDED.time_local, matches.time_local),
            score_ft_home = COALESCE(EXCLUDED.score_ft_home, matches.score_ft_home),
            score_ft_away = COALESCE(EXCLUDED.score_ft_away, matches.score_ft_away),
            score_ht_home = COALESCE(EXCLUDED.score_ht_home, matches.score_ht_home),
            score_ht_away = COALESCE(EXCLUDED.score_ht_away, matches.score_ht_away),
            score_et_home = COALESCE(EXCLUDED.score_et_home, matches.score_et_home),
            score_et_away = COALESCE(EXCLUDED.score_et_away, matches.score_et_away),
            score_p_home = COALESCE(EXCLUDED.score_p_home, matches.score_p_home),
            score_p_away = COALESCE(EXCLUDED.score_p_away, matches.score_p_away)
          RETURNING id
        `)
        const matchId = (matchRow as { id: number }).id

        // Goals are replaced wholesale: delete the match's rows, then re-insert.
        await db.execute(sql`DELETE FROM goals WHERE match_id = ${matchId}`)

        // [goal list, id of the side the list belongs to, id of the opposite side]
        const sides: [RawGoal[], number, number][] = [
          [m.goals1 ?? [], t1Id, t2Id],
          [m.goals2 ?? [], t2Id, t1Id],
        ]
        for (const [rawGoals, teamId, ogTeamId] of sides) {
          for (const g of rawGoals) {
            if (!g.name) continue
            const parsedMinute = g.minute != null ? parseInt(String(g.minute), 10) : NaN
            // An unparsable minute, or a minute of 0, is stored as NULL.
            const minute = Number.isNaN(parsedMinute) || parsedMinute === 0 ? null : parsedMinute
            // An own goal is recorded against the opposite side's team id.
            const actualTeamId = g.owngoal ? ogTeamId : teamId
            await db.execute(sql`
              INSERT INTO goals (match_id, team_id, player_name, minute, minute_offset, is_penalty, is_own_goal)
              VALUES (${matchId}, ${actualTeamId}, ${g.name}, ${minute},
                ${g.offset ?? 0}, ${g.penalty ?? false}, ${g.owngoal ?? false})
            `)
            goalCount++
          }
        }
        matchCount++
      }

      // Squads.
      const squadsData = readJson<RawSquad[]>(path.join(yearDir, 'worldcup.squads.json'))
      if (Array.isArray(squadsData)) {
        for (const sq of squadsData) {
          const teamId = await upsertTeam(sq.name)
          // Tolerate a squad entry without a players list.
          for (const p of sq.players ?? []) {
            if (!p.name) continue
            // Strip all whitespace from the date string before handing it to the DATE column.
            const dob = p.date_of_birth ? p.date_of_birth.replace(/\s/g, '') : null
            await db.execute(sql`
              INSERT INTO squads (tournament_year, team_id, player_name, shirt_number, position, date_of_birth)
              VALUES (${year}, ${teamId}, ${p.name}, ${p.number ?? null},
                ${p.pos ?? null}, ${dob})
              ON CONFLICT (tournament_year, team_id, shirt_number) DO UPDATE SET
                player_name = EXCLUDED.player_name,
                position = EXCLUDED.position,
                date_of_birth = EXCLUDED.date_of_birth
            `)
          }
        }
      }

      console.log(` ${year}: ${matchCount} matches, ${goalCount} goals`)
      totalMatches += matchCount
      totalGoals += goalCount
    }

    // 4. Group standings, computed from the full-time scores of matches that
    //    carry a group name.
    // NOTE(review): a win is worth 3 points for every year; World Cups before
    // 1994 awarded 2 — confirm that uniform scoring is intended.
    console.log('Computing group standings...')
    await db.execute(sql`
      DELETE FROM group_standings WHERE tournament_year < 2026
    `)
    await db.execute(sql`
      INSERT INTO group_standings (tournament_year, group_name, team_id, played, won, drawn, lost,
        goals_for, goals_against, goal_diff, pts)
      WITH match_results AS (
        SELECT tournament_year, group_name, team1_id AS team_id, score_ft_home AS gf, score_ft_away AS ga
        FROM matches WHERE tournament_year < 2026 AND group_name IS NOT NULL
          AND is_quali_playoff = false AND score_ft_home IS NOT NULL
        UNION ALL
        SELECT tournament_year, group_name, team2_id, score_ft_away, score_ft_home
        FROM matches WHERE tournament_year < 2026 AND group_name IS NOT NULL
          AND is_quali_playoff = false AND score_ft_home IS NOT NULL
      )
      SELECT tournament_year, group_name, team_id,
        COUNT(*)::int,
        SUM(CASE WHEN gf > ga THEN 1 ELSE 0 END)::int,
        SUM(CASE WHEN gf = ga THEN 1 ELSE 0 END)::int,
        SUM(CASE WHEN gf < ga THEN 1 ELSE 0 END)::int,
        SUM(gf)::int, SUM(ga)::int, SUM(gf - ga)::int,
        SUM(CASE WHEN gf > ga THEN 3 WHEN gf = ga THEN 1 ELSE 0 END)::int
      FROM match_results
      GROUP BY tournament_year, group_name, team_id
      ON CONFLICT (tournament_year, group_name, team_id) DO UPDATE SET
        played = EXCLUDED.played, won = EXCLUDED.won, drawn = EXCLUDED.drawn,
        lost = EXCLUDED.lost, goals_for = EXCLUDED.goals_for,
        goals_against = EXCLUDED.goals_against, goal_diff = EXCLUDED.goal_diff,
        pts = EXCLUDED.pts
    `)

    // 5. Tournament aggregates.
    // The average starts from matches and LEFT JOINs goals so that matches
    // without any goal row still count in the denominator; an inner join would
    // drop goalless matches and inflate the average.
    await db.execute(sql`
      UPDATE tournaments t SET
        matches_count = (
          SELECT COUNT(*)::int FROM matches WHERE tournament_year = t.year AND is_quali_playoff = false
        ),
        total_goals = (
          SELECT COUNT(g.id)::int
          FROM goals g JOIN matches m ON g.match_id = m.id
          WHERE m.tournament_year = t.year AND m.is_quali_playoff = false
        ),
        avg_goals_per_game = (
          SELECT ROUND(COUNT(g.id)::numeric / NULLIF(COUNT(DISTINCT m.id), 0), 2)
          FROM matches m LEFT JOIN goals g ON g.match_id = m.id
          WHERE m.tournament_year = t.year AND m.is_quali_playoff = false
            AND m.score_ft_home IS NOT NULL
        )
      WHERE t.year < 2026
    `)

    console.log(`\n✅ Seed complete: ${totalMatches} matches, ${totalGoals} goals (1930–2022)`)
  } finally {
    // Always release the connection pool, including when a step above throws.
    await client.end()
  }
}
// Entry point: run the seed; on any failure log the error and exit with status 1.
run().catch(e => { console.error('Seed failed:', e); process.exit(1) })