Files
worldcup/scripts/seed.ts
T
valknar 3955c7492b feat: replace historical sync with Kaggle seed for complete 1930-2022 goal data
- scripts/seed.ts: one-time import of Kaggle FIFA dataset (matches_1930_2022.csv,
  world_cup.csv) covering all 964 matches and 2720 goals from 1930-2022 with full
  scorer names, minutes, penalties, and own goals for every tournament
- scripts/sync.ts: stripped to 2026 only (openfootball live data); historical years
  removed since Kaggle is now authoritative for 1930-2022
- Dockerfile: copy app/data into runner image; CMD runs seed.ts before server.js so
  a fresh deployment auto-seeds on first start (skips if already seeded)
- package.json: add 'seed' script; use --force to re-import from updated CSV files
- app/data/kaggle/: bundle Kaggle CSV files in repo

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-14 18:43:43 +02:00

383 lines
14 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import postgres from 'postgres'
import { drizzle } from 'drizzle-orm/postgres-js'
import { sql } from 'drizzle-orm'
import { readFileSync } from 'fs'
import path from 'path'
import { fileURLToPath } from 'url'
import { getIso } from '../lib/iso-codes'
// Postgres connection string: taken from the environment, falling back to a local default.
const DATABASE_URL = process.env.DATABASE_URL ?? 'postgres://wc:wc@localhost:5432/worldcup'
// ES modules have no built-in __dirname; derive it from this module's URL.
const __dirname = path.dirname(fileURLToPath(import.meta.url))
// Directory holding the bundled Kaggle CSV files, resolved relative to this script.
const DATA_DIR = path.join(__dirname, '../app/data/kaggle')
// Third/fourth place not present in Kaggle world_cup.csv.
// Keyed by tournament year; every tournament from 1930 to 2022 must have an entry,
// otherwise third_place / fourth_place are stored as NULL for that year.
const PLACEMENTS: Record<number, { third?: string; fourth?: string }> = {
  1930: { third: 'USA', fourth: 'Yugoslavia' },
  1934: { third: 'Germany', fourth: 'Austria' },
  1938: { third: 'Brazil', fourth: 'Sweden' },
  // 1950 had no third-place match; positions come from the final round-robin standings.
  1950: { third: 'Sweden', fourth: 'Spain' },
  1954: { third: 'Austria', fourth: 'Uruguay' },
  1958: { third: 'France', fourth: 'Germany' },
  1962: { third: 'Chile', fourth: 'Yugoslavia' },
  1966: { third: 'Portugal', fourth: 'Soviet Union' },
  1970: { third: 'Germany', fourth: 'Uruguay' },
  1974: { third: 'Poland', fourth: 'Brazil' },
  1978: { third: 'Brazil', fourth: 'Italy' },
  1982: { third: 'Poland', fourth: 'France' },
  1986: { third: 'France', fourth: 'Belgium' },
  1990: { third: 'Italy', fourth: 'England' },
  1994: { third: 'Sweden', fourth: 'Bulgaria' },
  1998: { third: 'Croatia', fourth: 'Netherlands' },
  2002: { third: 'Turkey', fourth: 'South Korea' },
  2006: { third: 'Germany', fourth: 'Portugal' },
  2010: { third: 'Germany', fourth: 'Uruguay' },
  2014: { third: 'Netherlands', fourth: 'Brazil' },
  2018: { third: 'Belgium', fourth: 'England' },
  2022: { third: 'Croatia', fourth: 'Morocco' },
}
// Kaggle spellings that differ from the canonical team names used by
// openfootball and the rest of this project, mapped to the canonical form.
const TEAM_ALIASES: Record<string, string> = {
  'West Germany': 'Germany',
  'Korea Republic': 'South Korea',
  'IR Iran': 'Iran',
}

/**
 * Maps a team name from the Kaggle data to its canonical name.
 *
 * @param name Team name as it appears in the CSV.
 * @returns The alias target when one is registered, otherwise `name` unchanged.
 */
function normTeam(name: string): string {
  const canonical = TEAM_ALIASES[name]
  return canonical ?? name
}
/**
 * Minimal RFC-4180-style CSV parser, so no external dependency is needed.
 *
 * Handles quoted fields (including embedded commas, newlines and doubled
 * quotes), ignores carriage returns outside quotes, and treats the first row
 * as the header.
 *
 * @param content Raw CSV text.
 * @returns One object per non-blank data row, keyed by trimmed header name.
 *          Values are trimmed; cells missing from a short row become ''.
 */
function parseCsv(content: string): Record<string, string>[] {
  const table: string[][] = []
  let cells: string[] = []
  let buf = ''
  let quoted = false

  let pos = 0
  while (pos < content.length) {
    const c = content[pos]
    if (quoted) {
      if (c !== '"') {
        buf += c
      } else if (content[pos + 1] === '"') {
        // A doubled quote inside a quoted field is a literal quote character.
        buf += '"'
        pos += 1
      } else {
        quoted = false
      }
    } else {
      switch (c) {
        case '"':
          quoted = true
          break
        case ',':
          cells.push(buf)
          buf = ''
          break
        case '\n':
          cells.push(buf)
          table.push(cells)
          cells = []
          buf = ''
          break
        case '\r':
          // Dropped so CRLF line endings behave like LF.
          break
        default:
          buf += c
      }
    }
    pos += 1
  }

  // Flush the last record when the input does not end with a newline.
  if (buf !== '' || cells.length > 0) {
    cells.push(buf)
    table.push(cells)
  }

  const header = table[0]
  const records: Record<string, string>[] = []
  for (const line of table.slice(1)) {
    // Skip rows whose cells are all empty or whitespace.
    if (!line.some(cell => cell.trim())) continue
    records.push(
      Object.fromEntries(header.map((key, col) => [key.trim(), (line[col] ?? '').trim()])),
    )
  }
  return records
}
type GoalEntry = { name: string; minute: number | null; offset: number; isPenalty: boolean; isOwnGoal: boolean }

/**
 * Parses one goal entry such as "Player Name · 57" or "Player (OG) · 90+3".
 *
 * The text before the last '·' is the scorer; a trailing "(P)" or "(OG)"
 * marker is stripped from the name. The text after it is the minute,
 * optionally followed by "+<stoppage minutes>".
 *
 * @param entry     Raw entry text.
 * @param isPenalty Flag copied onto the result (not derived from the text).
 * @param isOwnGoal Flag copied onto the result (not derived from the text).
 * @returns The parsed goal, or null when there is no '·' separator or no name.
 *          `minute` is null whenever the base minute is not a number, in both
 *          the plain and the "+offset" form; `offset` defaults to 0.
 */
function parseGoalStr(entry: string, isPenalty = false, isOwnGoal = false): GoalEntry | null {
  const dot = entry.lastIndexOf('·')
  if (dot === -1) return null

  const name = entry
    .slice(0, dot)
    .trim()
    .replace(/\s*\(P\)\s*$/, '')
    .replace(/\s*\(OG\)\s*$/, '')
    .trim()
  if (!name) return null

  const minRaw = entry.slice(dot + 1).trim()
  const plusIdx = minRaw.indexOf('+')
  const hasOffset = plusIdx !== -1

  // Never let NaN through: it violates the declared type and cannot be stored in an INTEGER column.
  const base = parseInt(hasOffset ? minRaw.slice(0, plusIdx) : minRaw, 10)
  const minute = Number.isNaN(base) ? null : base
  const offset = hasOffset ? parseInt(minRaw.slice(plusIdx + 1), 10) || 0 : 0

  return { name, minute, offset, isPenalty, isOwnGoal }
}
/**
 * Parses a goal column holding zero or more '|'-separated goal entries.
 *
 * @param col       Raw column text; empty or whitespace-only yields [].
 * @param isPenalty Flag forwarded to every parsed entry.
 * @param isOwnGoal Flag forwarded to every parsed entry.
 * @returns The entries that parsed successfully, in column order.
 */
function parseGoalCol(col: string, isPenalty = false, isOwnGoal = false): GoalEntry[] {
  if (!col?.trim()) return []
  const parsed: GoalEntry[] = []
  for (const piece of col.split('|')) {
    const goal = parseGoalStr(piece.trim(), isPenalty, isOwnGoal)
    if (goal) parsed.push(goal)
  }
  return parsed
}
/**
 * Imports the bundled Kaggle World Cup CSVs (tournaments, matches, goals) into Postgres.
 *
 * Flow:
 *  1. Create all tables/indexes if they do not exist yet.
 *  2. Unless `--force` / `-f` is passed, exit early when any tournament with year < 2026 exists.
 *  3. Read both CSV files (before touching any data, so a missing file cannot leave a half-cleared DB).
 *  4. In ONE transaction: optionally clear pre-2026 rows, upsert tournaments, teams, matches and
 *     goals, then recompute per-tournament aggregates. A failure rolls everything back, so the
 *     "already seeded" check in step 2 can never be satisfied by a partial import.
 *
 * The database client is always closed, on success and on failure. Errors propagate to the caller.
 */
async function run(): Promise<void> {
  const client = postgres(DATABASE_URL, { max: 5 })
  const db = drizzle(client)

  try {
    // Create tables (mirrors sync.ts DDL — runs first on a fresh DB)
    await db.execute(sql`
      CREATE TABLE IF NOT EXISTS tournaments (
        year INTEGER PRIMARY KEY,
        host TEXT NOT NULL,
        winner TEXT,
        runner_up TEXT,
        third_place TEXT,
        fourth_place TEXT,
        teams_count INTEGER,
        matches_count INTEGER,
        total_goals INTEGER,
        avg_goals_per_game NUMERIC(4,2)
      );
      CREATE TABLE IF NOT EXISTS teams (
        id SERIAL PRIMARY KEY,
        name TEXT UNIQUE NOT NULL,
        iso2 TEXT,
        fifa_code TEXT,
        continent TEXT,
        confederation TEXT
      );
      CREATE TABLE IF NOT EXISTS stadiums (
        id SERIAL PRIMARY KEY,
        tournament_year INTEGER,
        name TEXT NOT NULL,
        city TEXT,
        country_code TEXT,
        capacity INTEGER,
        timezone TEXT,
        coordinates TEXT
      );
      CREATE TABLE IF NOT EXISTS matches (
        id SERIAL PRIMARY KEY,
        tournament_year INTEGER NOT NULL,
        round TEXT NOT NULL,
        group_name TEXT,
        date DATE,
        time_local TEXT,
        stadium_id INTEGER,
        team1_id INTEGER NOT NULL,
        team2_id INTEGER NOT NULL,
        score_ft_home INTEGER,
        score_ft_away INTEGER,
        score_ht_home INTEGER,
        score_ht_away INTEGER,
        score_et_home INTEGER,
        score_et_away INTEGER,
        score_p_home INTEGER,
        score_p_away INTEGER,
        is_quali_playoff BOOLEAN DEFAULT false
      );
      CREATE UNIQUE INDEX IF NOT EXISTS matches_unique
        ON matches (tournament_year, team1_id, team2_id, date, is_quali_playoff);
      CREATE TABLE IF NOT EXISTS goals (
        id SERIAL PRIMARY KEY,
        match_id INTEGER NOT NULL,
        team_id INTEGER NOT NULL,
        player_name TEXT NOT NULL,
        minute INTEGER,
        minute_offset INTEGER DEFAULT 0,
        is_penalty BOOLEAN DEFAULT false,
        is_own_goal BOOLEAN DEFAULT false
      );
      CREATE TABLE IF NOT EXISTS group_standings (
        tournament_year INTEGER NOT NULL,
        group_name TEXT NOT NULL,
        team_id INTEGER NOT NULL,
        pos INTEGER,
        played INTEGER DEFAULT 0,
        won INTEGER DEFAULT 0,
        drawn INTEGER DEFAULT 0,
        lost INTEGER DEFAULT 0,
        goals_for INTEGER DEFAULT 0,
        goals_against INTEGER DEFAULT 0,
        goal_diff INTEGER DEFAULT 0,
        pts INTEGER DEFAULT 0,
        PRIMARY KEY (tournament_year, group_name, team_id)
      );
      CREATE TABLE IF NOT EXISTS squads (
        id SERIAL PRIMARY KEY,
        tournament_year INTEGER NOT NULL,
        team_id INTEGER NOT NULL,
        player_name TEXT NOT NULL,
        shirt_number INTEGER,
        position TEXT,
        date_of_birth DATE
      );
      CREATE UNIQUE INDEX IF NOT EXISTS squads_unique
        ON squads (tournament_year, team_id, shirt_number);
    `)

    const force = process.argv.includes('--force') || process.argv.includes('-f')

    // Skip if already seeded (idempotency check)
    if (!force) {
      const existing = await db.execute(sql`SELECT COUNT(*)::int AS cnt FROM tournaments WHERE year < 2026`)
      if ((existing[0] as { cnt: number }).cnt > 0) {
        console.log('✓ Already seeded (historical data present), skipping. Use --force to re-import.')
        return
      }
    }

    // Load and parse both CSVs up front: a missing/unreadable file must fail before any row is changed.
    const wcRows = parseCsv(readFileSync(path.join(DATA_DIR, 'world_cup.csv'), 'utf-8'))
    const matchRows = parseCsv(readFileSync(path.join(DATA_DIR, 'matches_1930_2022.csv'), 'utf-8'))

    // '' means "no value" in the CSV; anything else is parsed as a base-10 integer.
    const intOrNull = (raw: string): number | null => (raw !== '' ? parseInt(raw, 10) : null)

    let totalMatches = 0
    let totalGoals = 0

    await db.transaction(async tx => {
      if (force) {
        console.log('--force: clearing historical data...')
        await tx.execute(sql`DELETE FROM goals WHERE match_id IN (SELECT id FROM matches WHERE tournament_year < 2026)`)
        await tx.execute(sql`DELETE FROM matches WHERE tournament_year < 2026`)
        await tx.execute(sql`DELETE FROM tournaments WHERE year < 2026`)
      }

      console.log('Seeding from Kaggle data (1930-2022)...')

      // Canonical team name -> teams.id, so each team is upserted at most once per run.
      const teamCache = new Map<string, number>()
      const upsertTeam = async (rawName: string): Promise<number> => {
        const name = normTeam(rawName)
        const cached = teamCache.get(name)
        if (cached !== undefined) return cached
        // The no-op DO UPDATE makes RETURNING yield the id for existing rows as well.
        const [row] = await tx.execute(sql`
          INSERT INTO teams (name, iso2)
          VALUES (${name}, ${getIso(name) ?? null})
          ON CONFLICT (name) DO UPDATE SET name = EXCLUDED.name
          RETURNING id
        `)
        const id = (row as { id: number }).id
        teamCache.set(name, id)
        return id
      }

      // 1. Tournaments from world_cup.csv
      for (const r of wcRows) {
        const year = parseInt(r['Year'], 10)
        if (isNaN(year)) continue
        const winner = normTeam(r['Champion'] || '')
        const runnerUp = normTeam(r['Runner-Up'] || '')
        const p = PLACEMENTS[year] ?? {}
        await tx.execute(sql`
          INSERT INTO tournaments (year, host, winner, runner_up, third_place, fourth_place, teams_count, matches_count)
          VALUES (
            ${year}, ${r['Host']},
            ${winner || null}, ${runnerUp || null},
            ${p.third ?? null}, ${p.fourth ?? null},
            ${parseInt(r['Teams'], 10) || null}, ${parseInt(r['Matches'], 10) || null}
          )
          ON CONFLICT (year) DO UPDATE SET
            host = EXCLUDED.host,
            winner = EXCLUDED.winner,
            runner_up = EXCLUDED.runner_up,
            third_place = EXCLUDED.third_place,
            fourth_place = EXCLUDED.fourth_place,
            teams_count = EXCLUDED.teams_count,
            matches_count = EXCLUDED.matches_count
        `)
      }

      // 2. Matches + goals from matches_1930_2022.csv
      for (const r of matchRows) {
        const year = parseInt(r['Year'], 10)
        if (isNaN(year)) continue

        const t1Id = await upsertTeam(r['home_team'])
        const t2Id = await upsertTeam(r['away_team'])
        const homeScore = intOrNull(r['home_score'])
        const awayScore = intOrNull(r['away_score'])
        const homePen = intOrNull(r['home_penalty'])
        const awayPen = intOrNull(r['away_penalty'])
        const dateStr = r['Date'] || null

        // Parse all goal columns
        const homeGoals = parseGoalCol(r['home_goal'])
        const awayGoals = parseGoalCol(r['away_goal'])
        const homePenGoals = parseGoalCol(r['home_penalty_goal'], true)
        const awayPenGoals = parseGoalCol(r['away_penalty_goal'], true)
        // home_own_goal = home player scored OG → goal credited to AWAY team
        const homeOgGoals = parseGoalCol(r['home_own_goal'], false, true)
        // away_own_goal = away player scored OG → goal credited to HOME team
        const awayOgGoals = parseGoalCol(r['away_own_goal'], false, true)

        // Goals credited to each side, own goals included.
        const homeSideGoals = [...homeGoals, ...homePenGoals, ...awayOgGoals]
        const awaySideGoals = [...awayGoals, ...awayPenGoals, ...homeOgGoals]

        // Determine FT vs ET score split from goal minutes. Own goals count too: an own goal
        // scored after minute 90 is extra time just like any other goal.
        const hasEt = [...homeSideGoals, ...awaySideGoals].some(g => g.minute !== null && g.minute > 90)

        let scoreFtHome: number | null
        let scoreFtAway: number | null
        let scoreEtHome: number | null = null
        let scoreEtAway: number | null = null
        if (hasEt) {
          // Compute FT from goals in minutes 1-90 (goals without a minute are counted as FT)
          const ftGoalCount = (goals: GoalEntry[]) =>
            goals.filter(g => g.minute === null || g.minute <= 90).length
          scoreFtHome = ftGoalCount(homeSideGoals)
          scoreFtAway = ftGoalCount(awaySideGoals)
          scoreEtHome = homeScore
          scoreEtAway = awayScore
        } else {
          scoreFtHome = homeScore
          scoreFtAway = awayScore
        }

        const [matchRow] = await tx.execute(sql`
          INSERT INTO matches (
            tournament_year, round, date, team1_id, team2_id,
            score_ft_home, score_ft_away, score_et_home, score_et_away,
            score_p_home, score_p_away, is_quali_playoff
          ) VALUES (
            ${year}, ${r['Round'] || 'Unknown'}, ${dateStr},
            ${t1Id}, ${t2Id},
            ${scoreFtHome}, ${scoreFtAway}, ${scoreEtHome}, ${scoreEtAway},
            ${homePen}, ${awayPen}, false
          )
          ON CONFLICT (tournament_year, team1_id, team2_id, date, is_quali_playoff) DO UPDATE SET
            round = EXCLUDED.round,
            score_ft_home = EXCLUDED.score_ft_home,
            score_ft_away = EXCLUDED.score_ft_away,
            score_et_home = EXCLUDED.score_et_home,
            score_et_away = EXCLUDED.score_et_away,
            score_p_home = EXCLUDED.score_p_home,
            score_p_away = EXCLUDED.score_p_away
          RETURNING id
        `)
        const matchId = (matchRow as { id: number }).id

        // Replace any goals already stored for this match so a re-import does not duplicate them.
        await tx.execute(sql`DELETE FROM goals WHERE match_id = ${matchId}`)

        // home team goals (+ away player own goals that benefit home)
        for (const g of homeSideGoals) {
          await tx.execute(sql`
            INSERT INTO goals (match_id, team_id, player_name, minute, minute_offset, is_penalty, is_own_goal)
            VALUES (${matchId}, ${t1Id}, ${g.name}, ${g.minute}, ${g.offset}, ${g.isPenalty}, ${g.isOwnGoal})
          `)
          totalGoals++
        }
        // away team goals (+ home player own goals that benefit away)
        for (const g of awaySideGoals) {
          await tx.execute(sql`
            INSERT INTO goals (match_id, team_id, player_name, minute, minute_offset, is_penalty, is_own_goal)
            VALUES (${matchId}, ${t2Id}, ${g.name}, ${g.minute}, ${g.offset}, ${g.isPenalty}, ${g.isOwnGoal})
          `)
          totalGoals++
        }
        totalMatches++
      }

      // 3. Update tournament aggregates.
      // avg_goals_per_game starts from matches and LEFT JOINs goals so that goalless matches
      // still count in the denominator; an inner join would drop them and inflate the average.
      await tx.execute(sql`
        UPDATE tournaments t SET
          total_goals = (
            SELECT COUNT(g.id)::int
            FROM goals g JOIN matches m ON g.match_id = m.id
            WHERE m.tournament_year = t.year AND m.is_quali_playoff = false
          ),
          matches_count = (
            SELECT COUNT(*)::int FROM matches WHERE tournament_year = t.year AND is_quali_playoff = false
          ),
          avg_goals_per_game = (
            SELECT ROUND(COUNT(g.id)::numeric / NULLIF(COUNT(DISTINCT m.id), 0), 2)
            FROM matches m LEFT JOIN goals g ON g.match_id = m.id
            WHERE m.tournament_year = t.year AND m.is_quali_playoff = false
          )
        WHERE t.year < 2026
      `)
    })

    console.log(`✅ Seed complete: ${totalMatches} matches, ${totalGoals} goals (1930-2022)`)
  } finally {
    // Release the connection pool on every path (success, early skip, or error).
    await client.end()
  }
}
// Script entry point: report any failure and exit with a non-zero status.
run().catch((err: unknown) => {
  console.error('Seed failed:', err)
  process.exit(1)
})