Files

377 lines
14 KiB
TypeScript
Raw Permalink Normal View History

import postgres from 'postgres'
import { drizzle } from 'drizzle-orm/postgres-js'
import { sql } from 'drizzle-orm'
import { readFileSync, existsSync } from 'fs'
import path from 'path'
import { fileURLToPath } from 'url'
import { getIso } from '../lib/iso-codes'
import { normalizeTeam } from '../lib/wiki-scraper'
// Postgres connection string: taken from the DATABASE_URL environment variable,
// otherwise the localhost default below is used.
const DATABASE_URL = process.env.DATABASE_URL ?? 'postgres://wc:wc@localhost:5432/worldcup'
// Directory of this module, derived from import.meta.url (ES modules do not
// provide a built-in __dirname).
const __dirname = path.dirname(fileURLToPath(import.meta.url))
// Root folder of the per-year data directories, resolved as ../data relative to
// this module's directory.
const WC_DIR = path.join(__dirname, '../data')
// Tournament years to import: every fourth year from 1930 through 2022,
// leaving out 1942 and 1946, which have no edition in this data set.
const YEARS = Array.from({ length: 24 }, (_, slot) => 1930 + 4 * slot).filter(
  (candidate) => candidate !== 1942 && candidate !== 1946,
)
/**
 * Reads a JSON file from disk and parses it.
 *
 * The parsed value is cast to `T` without any runtime validation, so callers
 * must still treat individual fields defensively.
 *
 * @param filePath Path of the JSON file to load.
 * @returns The parsed content, or `null` when the file does not exist or when
 *   it cannot be read or parsed.
 */
function readJson<T>(filePath: string): T | null {
  // A missing file is an expected situation for callers, so it is not reported.
  if (!existsSync(filePath)) return null
  try {
    return JSON.parse(readFileSync(filePath, 'utf-8')) as T
  } catch (err: unknown) {
    // The file exists but is unreadable or malformed: still return null so the
    // caller can carry on, but say why instead of hiding the problem.
    const reason = err instanceof Error ? err.message : String(err)
    console.warn(`readJson: ignoring ${filePath}: ${reason}`)
    return null
  }
}
// ── Types matching scrape-wikipedia.ts output ──────────────────────────────
/** A single goal entry from a match's `goals1` / `goals2` list. */
type RawGoal = {
  name: string
  minute?: string | number
  offset?: number
  penalty?: boolean
  owngoal?: boolean
}

/**
 * Score arrays per phase. The importer reads index 0 into the `*_home` column
 * and index 1 into the `*_away` column of the matching `score_ft_*`,
 * `score_ht_*`, `score_et_*` and `score_p_*` fields.
 */
type RawScore = {
  ft?: number[]
  ht?: number[]
  et?: number[]
  p?: number[]
}

/** One match record from a year's `worldcup.json`. */
type RawMatch = {
  round?: string
  date?: string
  time?: string
  team1: string
  team2: string
  score?: RawScore
  goals1?: RawGoal[]
  goals2?: RawGoal[]
  group?: string
  ground?: string
}

/** Top-level shape of `worldcup.json`. */
type RawData = {
  matches: RawMatch[]
}

/** Shape of `worldcup.meta.json`: host and final placings of one tournament. */
type RawMeta = {
  host: string
  teams_count: number | null
  winner: string | null
  runner_up: string | null
  third_place: string | null
  fourth_place: string | null
}

/** Shape of `worldcup.stadiums.json`. */
type RawStadiums = {
  stadiums: Array<{
    name: string
    city: string
    cc?: string
    capacity?: number
    timezone?: string
    coords?: string
  }>
}

/** One team's entry in `worldcup.squads.json`. */
type RawSquad = {
  name: string
  players: Array<{
    name: string
    number?: number
    pos?: string
    date_of_birth?: string
  }>
}
/**
 * Normalises a raw score value into an object of optional per-phase arrays.
 *
 * - A falsy input yields an empty object.
 * - A bare array is taken to be the `ft` score.
 * - Otherwise the `ft`, `ht`, `et` and `p` properties are copied over as-is
 *   (absent ones come through as `undefined`).
 */
function parseScore(score: RawScore | undefined) {
  if (score) {
    if (Array.isArray(score)) {
      return { ft: score as number[] }
    }
    const { ft, ht, et, p } = score
    return { ft, ht, et, p }
  }
  return {}
}
/**
 * Seeds the database with historical tournament data read from the per-year
 * JSON files below WC_DIR.
 *
 * Steps, in order:
 *  1. Create all tables and unique indexes if they do not exist yet.
 *  2. With `--force` / `-f`: delete every historical row (year < 2026).
 *     Without it: stop early when a tournament row with year < 2026 exists.
 *  3. For each entry of YEARS that has a `worldcup.json`: upsert the tournament
 *     row (only when `worldcup.meta.json` is present), insert stadiums, upsert
 *     matches, replace each match's goals, and upsert squads.
 *  4. Rebuild `group_standings` from the stored full-time scores.
 *  5. Refresh the per-tournament aggregates (match count, goal count, average).
 *
 * The database client is closed on every exit path, including failures.
 *
 * @returns A promise that resolves once seeding has finished or was skipped.
 */
async function run() {
  const client = postgres(DATABASE_URL, { max: 5 })
  const db = drizzle(client)

  try {
    // ── 1. Schema ────────────────────────────────────────────────────────────
    await db.execute(sql`
      CREATE TABLE IF NOT EXISTS tournaments (
        year INTEGER PRIMARY KEY,
        host TEXT NOT NULL,
        winner TEXT,
        runner_up TEXT,
        third_place TEXT,
        fourth_place TEXT,
        teams_count INTEGER,
        matches_count INTEGER,
        total_goals INTEGER,
        avg_goals_per_game NUMERIC(4,2)
      );
      CREATE TABLE IF NOT EXISTS teams (
        id SERIAL PRIMARY KEY,
        name TEXT UNIQUE NOT NULL,
        iso2 TEXT,
        fifa_code TEXT,
        continent TEXT,
        confederation TEXT
      );
      CREATE TABLE IF NOT EXISTS stadiums (
        id SERIAL PRIMARY KEY,
        tournament_year INTEGER,
        name TEXT NOT NULL,
        city TEXT,
        country_code TEXT,
        capacity INTEGER,
        timezone TEXT,
        coordinates TEXT
      );
      CREATE TABLE IF NOT EXISTS matches (
        id SERIAL PRIMARY KEY,
        tournament_year INTEGER NOT NULL,
        round TEXT NOT NULL,
        group_name TEXT,
        date DATE,
        time_local TEXT,
        stadium_id INTEGER,
        team1_id INTEGER NOT NULL,
        team2_id INTEGER NOT NULL,
        score_ft_home INTEGER,
        score_ft_away INTEGER,
        score_ht_home INTEGER,
        score_ht_away INTEGER,
        score_et_home INTEGER,
        score_et_away INTEGER,
        score_p_home INTEGER,
        score_p_away INTEGER,
        is_quali_playoff BOOLEAN DEFAULT false
      );
      CREATE UNIQUE INDEX IF NOT EXISTS matches_unique
        ON matches (tournament_year, team1_id, team2_id, date, is_quali_playoff);
      CREATE TABLE IF NOT EXISTS goals (
        id SERIAL PRIMARY KEY,
        match_id INTEGER NOT NULL,
        team_id INTEGER NOT NULL,
        player_name TEXT NOT NULL,
        minute INTEGER,
        minute_offset INTEGER DEFAULT 0,
        is_penalty BOOLEAN DEFAULT false,
        is_own_goal BOOLEAN DEFAULT false
      );
      CREATE TABLE IF NOT EXISTS group_standings (
        tournament_year INTEGER NOT NULL,
        group_name TEXT NOT NULL,
        team_id INTEGER NOT NULL,
        pos INTEGER,
        played INTEGER DEFAULT 0,
        won INTEGER DEFAULT 0,
        drawn INTEGER DEFAULT 0,
        lost INTEGER DEFAULT 0,
        goals_for INTEGER DEFAULT 0,
        goals_against INTEGER DEFAULT 0,
        goal_diff INTEGER DEFAULT 0,
        pts INTEGER DEFAULT 0,
        PRIMARY KEY (tournament_year, group_name, team_id)
      );
      CREATE TABLE IF NOT EXISTS squads (
        id SERIAL PRIMARY KEY,
        tournament_year INTEGER NOT NULL,
        team_id INTEGER NOT NULL,
        player_name TEXT NOT NULL,
        shirt_number INTEGER,
        position TEXT,
        date_of_birth DATE
      );
      CREATE UNIQUE INDEX IF NOT EXISTS squads_unique
        ON squads (tournament_year, team_id, shirt_number);
    `)

    // ── 2. Skip or clear ─────────────────────────────────────────────────────
    const force = process.argv.includes('--force') || process.argv.includes('-f')
    if (force) {
      console.log('--force: clearing historical data...')
      // Goals are removed first because they are selected through their match rows.
      await db.execute(sql`DELETE FROM goals WHERE match_id IN (SELECT id FROM matches WHERE tournament_year < 2026)`)
      await db.execute(sql`DELETE FROM squads WHERE tournament_year < 2026`)
      await db.execute(sql`DELETE FROM group_standings WHERE tournament_year < 2026`)
      await db.execute(sql`DELETE FROM stadiums WHERE tournament_year < 2026`)
      await db.execute(sql`DELETE FROM matches WHERE tournament_year < 2026`)
      await db.execute(sql`DELETE FROM tournaments WHERE year < 2026`)
    } else {
      const existing = await db.execute(sql`SELECT COUNT(*)::int AS cnt FROM tournaments WHERE year < 2026`)
      if ((existing[0] as { cnt: number }).cnt > 0) {
        console.log('✓ Already seeded (historical data present), skipping. Use --force to re-import.')
        return
      }
    }

    console.log('Seeding historical data (1930–2022)...')

    // Normalised team name -> teams.id, so each team is upserted once per run.
    const teamCache = new Map<string, number>()

    /** Returns the id of the team with the given raw name, inserting it if needed. */
    async function upsertTeam(rawName: string): Promise<number> {
      const name = normalizeTeam(rawName)
      const cached = teamCache.get(name)
      if (cached !== undefined) return cached
      const iso2 = getIso(name)
      // The no-op DO UPDATE makes RETURNING yield the id of an existing row too.
      const [row] = await db.execute(sql`
        INSERT INTO teams (name, iso2)
        VALUES (${name}, ${iso2 ?? null})
        ON CONFLICT (name) DO UPDATE SET name = EXCLUDED.name
        RETURNING id
      `)
      const id = (row as { id: number }).id
      teamCache.set(name, id)
      return id
    }

    // ── 3. Per-year data from the JSON files ─────────────────────────────────
    let totalMatches = 0
    let totalGoals = 0
    for (const year of YEARS) {
      const yearDir = path.join(WC_DIR, String(year))
      const mainData = readJson<RawData>(path.join(yearDir, 'worldcup.json'))
      if (!mainData?.matches) {
        console.log(` ${year}: no data file, skipping`)
        continue
      }

      // Tournament row from meta.json
      // NOTE(review): `host` is NOT NULL in the schema, so an empty host in
      // meta.json makes this insert fail — confirm the scraper always sets it.
      const meta = readJson<RawMeta>(path.join(yearDir, 'worldcup.meta.json'))
      if (meta) {
        await db.execute(sql`
          INSERT INTO tournaments (year, host, winner, runner_up, third_place, fourth_place, teams_count)
          VALUES (
            ${year}, ${meta.host || null},
            ${normalizeTeam(meta.winner ?? '') || null}, ${normalizeTeam(meta.runner_up ?? '') || null},
            ${normalizeTeam(meta.third_place ?? '') || null}, ${normalizeTeam(meta.fourth_place ?? '') || null},
            ${meta.teams_count ?? null}
          )
          ON CONFLICT (year) DO UPDATE SET
            host = EXCLUDED.host,
            winner = EXCLUDED.winner,
            runner_up = EXCLUDED.runner_up,
            third_place = EXCLUDED.third_place,
            fourth_place = EXCLUDED.fourth_place,
            teams_count = EXCLUDED.teams_count
        `)
      }

      let matchCount = 0
      let goalCount = 0

      // Stadiums
      // NOTE(review): only name and city are stored; cc, capacity, timezone and
      // coords from the JSON are not persisted. The stadiums table has no unique
      // constraint besides its serial id, so ON CONFLICT DO NOTHING never applies.
      const stadiumsData = readJson<RawStadiums>(path.join(yearDir, 'worldcup.stadiums.json'))
      if (stadiumsData?.stadiums) {
        for (const s of stadiumsData.stadiums) {
          await db.execute(sql`
            INSERT INTO stadiums (tournament_year, name, city)
            VALUES (${year}, ${s.name}, ${s.city ?? null})
            ON CONFLICT DO NOTHING
          `)
        }
      }

      // Matches and goals
      for (const m of mainData.matches) {
        const t1Id = await upsertTeam(m.team1)
        const t2Id = await upsertTeam(m.team2)
        const score = parseScore(m.score)
        // On conflict, COALESCE keeps the stored value wherever the new data is null.
        const [matchRow] = await db.execute(sql`
          INSERT INTO matches (
            tournament_year, round, group_name, date, time_local,
            team1_id, team2_id,
            score_ft_home, score_ft_away,
            score_ht_home, score_ht_away,
            score_et_home, score_et_away,
            score_p_home, score_p_away,
            is_quali_playoff
          ) VALUES (
            ${year}, ${m.round ?? 'Unknown'}, ${m.group ?? null},
            ${m.date ?? null}, ${m.time ?? null},
            ${t1Id}, ${t2Id},
            ${score.ft?.[0] ?? null}, ${score.ft?.[1] ?? null},
            ${score.ht?.[0] ?? null}, ${score.ht?.[1] ?? null},
            ${score.et?.[0] ?? null}, ${score.et?.[1] ?? null},
            ${score.p?.[0] ?? null}, ${score.p?.[1] ?? null},
            false
          )
          ON CONFLICT (tournament_year, team1_id, team2_id, date, is_quali_playoff) DO UPDATE SET
            round = EXCLUDED.round,
            group_name = COALESCE(EXCLUDED.group_name, matches.group_name),
            time_local = COALESCE(EXCLUDED.time_local, matches.time_local),
            score_ft_home = COALESCE(EXCLUDED.score_ft_home, matches.score_ft_home),
            score_ft_away = COALESCE(EXCLUDED.score_ft_away, matches.score_ft_away),
            score_ht_home = COALESCE(EXCLUDED.score_ht_home, matches.score_ht_home),
            score_ht_away = COALESCE(EXCLUDED.score_ht_away, matches.score_ht_away),
            score_et_home = COALESCE(EXCLUDED.score_et_home, matches.score_et_home),
            score_et_away = COALESCE(EXCLUDED.score_et_away, matches.score_et_away),
            score_p_home = COALESCE(EXCLUDED.score_p_home, matches.score_p_home),
            score_p_away = COALESCE(EXCLUDED.score_p_away, matches.score_p_away)
          RETURNING id
        `)
        const matchId = (matchRow as { id: number }).id

        // Goals (delete + re-insert)
        await db.execute(sql`DELETE FROM goals WHERE match_id = ${matchId}`)
        // Each entry: the goal list of one side, that side's team id, and the
        // opposing team id that is stored instead when the goal is an own goal.
        const sides: [RawGoal[], number, number][] = [
          [m.goals1 ?? [], t1Id, t2Id],
          [m.goals2 ?? [], t2Id, t1Id],
        ]
        for (const [rawGoals, teamId, ogTeamId] of sides) {
          for (const g of rawGoals) {
            if (!g.name) continue
            const parsedMinute = g.minute != null ? parseInt(String(g.minute), 10) : null
            // Missing, unparsable and zero minutes are all stored as NULL.
            const minute =
              parsedMinute === null || Number.isNaN(parsedMinute) || parsedMinute === 0
                ? null
                : parsedMinute
            const actualTeamId = g.owngoal ? ogTeamId : teamId
            await db.execute(sql`
              INSERT INTO goals (match_id, team_id, player_name, minute, minute_offset, is_penalty, is_own_goal)
              VALUES (${matchId}, ${actualTeamId}, ${g.name}, ${minute},
                ${g.offset ?? 0}, ${g.penalty ?? false}, ${g.owngoal ?? false})
            `)
            goalCount++
          }
        }
        matchCount++
      }

      // Squads
      const squadsData = readJson<RawSquad[]>(path.join(yearDir, 'worldcup.squads.json'))
      if (Array.isArray(squadsData)) {
        for (const sq of squadsData) {
          const teamId = await upsertTeam(sq.name)
          for (const p of sq.players) {
            if (!p.name) continue
            // All whitespace is stripped from the date string before it is stored.
            const dob = p.date_of_birth ? p.date_of_birth.replace(/\s/g, '') : null
            await db.execute(sql`
              INSERT INTO squads (tournament_year, team_id, player_name, shirt_number, position, date_of_birth)
              VALUES (${year}, ${teamId}, ${p.name}, ${p.number ?? null},
                ${p.pos ?? null}, ${dob})
              ON CONFLICT (tournament_year, team_id, shirt_number) DO UPDATE SET
                player_name = EXCLUDED.player_name,
                position = EXCLUDED.position,
                date_of_birth = EXCLUDED.date_of_birth
            `)
          }
        }
      }

      console.log(` ${year}: ${matchCount} matches, ${goalCount} goals`)
      totalMatches += matchCount
      totalGoals += goalCount
    }

    // ── 4. Group standings (computed from match results) ─────────────────────
    // NOTE(review): a win is worth 3 points for every year here; tournaments
    // before 1994 awarded 2 — confirm whether uniform scoring is intended.
    // The `pos` column is not populated by this query.
    console.log('Computing group standings...')
    await db.execute(sql`
      DELETE FROM group_standings WHERE tournament_year < 2026
    `)
    await db.execute(sql`
      INSERT INTO group_standings (tournament_year, group_name, team_id, played, won, drawn, lost,
        goals_for, goals_against, goal_diff, pts)
      WITH match_results AS (
        SELECT tournament_year, group_name, team1_id AS team_id, score_ft_home AS gf, score_ft_away AS ga
        FROM matches WHERE tournament_year < 2026 AND group_name IS NOT NULL
          AND is_quali_playoff = false AND score_ft_home IS NOT NULL
        UNION ALL
        SELECT tournament_year, group_name, team2_id, score_ft_away, score_ft_home
        FROM matches WHERE tournament_year < 2026 AND group_name IS NOT NULL
          AND is_quali_playoff = false AND score_ft_home IS NOT NULL
      )
      SELECT tournament_year, group_name, team_id,
        COUNT(*)::int,
        SUM(CASE WHEN gf > ga THEN 1 ELSE 0 END)::int,
        SUM(CASE WHEN gf = ga THEN 1 ELSE 0 END)::int,
        SUM(CASE WHEN gf < ga THEN 1 ELSE 0 END)::int,
        SUM(gf)::int, SUM(ga)::int, SUM(gf - ga)::int,
        SUM(CASE WHEN gf > ga THEN 3 WHEN gf = ga THEN 1 ELSE 0 END)::int
      FROM match_results
      GROUP BY tournament_year, group_name, team_id
      ON CONFLICT (tournament_year, group_name, team_id) DO UPDATE SET
        played = EXCLUDED.played, won = EXCLUDED.won, drawn = EXCLUDED.drawn,
        lost = EXCLUDED.lost, goals_for = EXCLUDED.goals_for,
        goals_against = EXCLUDED.goals_against, goal_diff = EXCLUDED.goal_diff,
        pts = EXCLUDED.pts
    `)

    // ── 5. Tournament aggregates ─────────────────────────────────────────────
    // The average is driven from matches with a LEFT JOIN so that matches
    // without any goal rows still count in the denominator.
    await db.execute(sql`
      UPDATE tournaments t SET
        matches_count = (
          SELECT COUNT(*)::int FROM matches WHERE tournament_year = t.year AND is_quali_playoff = false
        ),
        total_goals = (
          SELECT COUNT(g.id)::int
          FROM goals g JOIN matches m ON g.match_id = m.id
          WHERE m.tournament_year = t.year AND m.is_quali_playoff = false
        ),
        avg_goals_per_game = (
          SELECT ROUND(COUNT(g.id)::numeric / NULLIF(COUNT(DISTINCT m.id), 0), 2)
          FROM matches m LEFT JOIN goals g ON g.match_id = m.id
          WHERE m.tournament_year = t.year AND m.is_quali_playoff = false
            AND m.score_ft_home IS NOT NULL
        )
      WHERE t.year < 2026
    `)

    console.log(`\n✅ Seed complete: ${totalMatches} matches, ${totalGoals} goals (1930–2022)`)
  } finally {
    // Close the connection on success, early return and failure alike.
    await client.end()
  }
}

// Entry point: report any failure and exit with a non-zero status.
run().catch(e => { console.error('Seed failed:', e); process.exit(1) })