import postgres from 'postgres'
import { drizzle } from 'drizzle-orm/postgres-js'
import { sql } from 'drizzle-orm'
import { readFileSync } from 'fs'
import path from 'path'
import { fileURLToPath } from 'url'
import { getIso } from '../lib/iso-codes'

const DATABASE_URL = process.env.DATABASE_URL ?? 'postgres://wc:wc@localhost:5432/worldcup'
const __dirname = path.dirname(fileURLToPath(import.meta.url))
const DATA_DIR = path.join(__dirname, '../app/data/kaggle')

// Third/fourth place finishers, keyed by tournament year.
// These are not present in Kaggle world_cup.csv, so they are maintained by hand.
const PLACEMENTS: Record<number, { third: string; fourth: string }> = {
  1930: { third: 'USA', fourth: 'Yugoslavia' },
  1934: { third: 'Germany', fourth: 'Austria' },
  1938: { third: 'Brazil', fourth: 'Sweden' },
  1954: { third: 'Austria', fourth: 'Uruguay' },
  1958: { third: 'France', fourth: 'Germany' },
  1962: { third: 'Chile', fourth: 'Yugoslavia' },
  1966: { third: 'Portugal', fourth: 'Soviet Union' },
  1970: { third: 'Germany', fourth: 'Uruguay' },
  1974: { third: 'Poland', fourth: 'Brazil' },
  1978: { third: 'Brazil', fourth: 'Italy' },
  1982: { third: 'Poland', fourth: 'France' },
  1986: { third: 'France', fourth: 'Belgium' },
  1990: { third: 'Italy', fourth: 'England' },
  1994: { third: 'Sweden', fourth: 'Bulgaria' },
  1998: { third: 'Croatia', fourth: 'Netherlands' },
  2002: { third: 'Turkey', fourth: 'South Korea' },
  2006: { third: 'Germany', fourth: 'Portugal' },
  2010: { third: 'Germany', fourth: 'Uruguay' },
  2014: { third: 'Netherlands', fourth: 'Brazil' },
  2018: { third: 'Belgium', fourth: 'England' },
  2022: { third: 'Croatia', fourth: 'Morocco' },
}

// Normalize Kaggle team names to match openfootball / our canonical names
const TEAM_ALIASES: Record<string, string> = {
  'West Germany': 'Germany',
  'Korea Republic': 'South Korea',
  'IR Iran': 'Iran',
}

/** Maps a raw team name to its canonical alias; unknown names pass through unchanged. */
function normTeam(name: string): string {
  return TEAM_ALIASES[name] ?? name
}

/**
 * Parses a base-10 integer from a CSV cell.
 * Returns null (never NaN) when the cell is missing, empty or not numeric,
 * so the value can be bound directly as a nullable INTEGER parameter.
 */
function toIntOrNull(raw: string | undefined): number | null {
  if (raw === undefined) return null
  const n = parseInt(raw, 10)
  return Number.isNaN(n) ? null : n
}

/**
 * Minimal CSV parser (quoted fields, doubled-quote escapes, LF/CRLF line ends) —
 * no external dependency needed.
 *
 * The first row is treated as the header. Rows whose fields are all blank are
 * dropped. Header names and values are trimmed; cells missing from a short row
 * become ''.
 *
 * @param content Full text of the CSV file.
 * @returns One object per data row, keyed by trimmed header name.
 */
function parseCsv(content: string): Record<string, string>[] {
  const rows: string[][] = []
  let row: string[] = []
  let field = ''
  let inQ = false

  for (let i = 0; i < content.length; i++) {
    const ch = content[i]
    if (inQ) {
      if (ch === '"') {
        if (content[i + 1] === '"') {
          // A doubled quote inside a quoted field is a literal quote character.
          field += '"'
          i++
        } else {
          inQ = false
        }
      } else {
        field += ch
      }
    } else if (ch === '"') {
      inQ = true
    } else if (ch === ',') {
      row.push(field)
      field = ''
    } else if (ch === '\n') {
      row.push(field)
      rows.push(row)
      row = []
      field = ''
    } else if (ch !== '\r') {
      // Bare CRs are discarded so CRLF files parse the same as LF files.
      field += ch
    }
  }
  // Flush the last record when the file does not end with a newline.
  if (field || row.length) {
    row.push(field)
    rows.push(row)
  }

  const headers = rows[0]
  if (!headers) return []

  return rows.slice(1)
    .filter(r => r.some(f => f.trim()))
    .map(r => Object.fromEntries(headers.map((h, i) => [h.trim(), (r[i] ?? '').trim()])))
}

/** One scored goal as parsed from a Kaggle goal column. */
type GoalEntry = { name: string; minute: number | null; offset: number; isPenalty: boolean; isOwnGoal: boolean }

/**
 * Parses a single goal entry such as "Player Name · 57" or "Player (OG) · 90+3".
 *
 * The text before the last '·' is the player name (a trailing "(P)" / "(OG)"
 * marker is stripped); the text after it is the minute, optionally followed by
 * "+<stoppage offset>".
 *
 * @returns The parsed entry, or null when there is no '·' separator or no name.
 *          An unparseable minute yields `minute: null`; an unparseable offset yields 0.
 */
function parseGoalStr(entry: string, isPenalty = false, isOwnGoal = false): GoalEntry | null {
  const dot = entry.lastIndexOf('·')
  if (dot === -1) return null

  const name = entry.slice(0, dot).trim()
    .replace(/\s*\(P\)\s*$/, '')
    .replace(/\s*\(OG\)\s*$/, '')
    .trim()
  if (!name) return null

  const minRaw = entry.slice(dot + 1).trim()
  const plusIdx = minRaw.indexOf('+')
  const hasOffset = plusIdx !== -1

  // toIntOrNull keeps NaN out of the result in both the "57" and the "90+3" form.
  const minute = toIntOrNull(hasOffset ? minRaw.slice(0, plusIdx) : minRaw)
  const offset = hasOffset ? toIntOrNull(minRaw.slice(plusIdx + 1)) ?? 0 : 0

  return { name, minute, offset, isPenalty, isOwnGoal }
}

/**
 * Parses a '|'-separated goal column into goal entries, dropping entries that
 * cannot be parsed. A missing or blank column yields an empty list.
 */
function parseGoalCol(col: string | undefined, isPenalty = false, isOwnGoal = false): GoalEntry[] {
  if (!col?.trim()) return []
  return col.split('|')
    .map(e => parseGoalStr(e.trim(), isPenalty, isOwnGoal))
    .filter((g): g is GoalEntry => g !== null)
}

/**
 * Seeds historical tournaments, matches and goals from the Kaggle CSV files.
 *
 * Steps: create tables if absent → skip if tournaments before 2026 already exist
 * (unless --force / -f, which first clears them) → import tournaments →
 * import matches and goals → recompute per-tournament aggregates.
 *
 * The database connection is always closed, including when a step throws.
 */
async function run() {
  const client = postgres(DATABASE_URL, { max: 5 })
  const db = drizzle(client)

  try {
    // Create tables (mirrors sync.ts DDL — runs first on a fresh DB)
    await db.execute(sql`
      CREATE TABLE IF NOT EXISTS tournaments (
        year INTEGER PRIMARY KEY,
        host TEXT NOT NULL,
        winner TEXT,
        runner_up TEXT,
        third_place TEXT,
        fourth_place TEXT,
        teams_count INTEGER,
        matches_count INTEGER,
        total_goals INTEGER,
        avg_goals_per_game NUMERIC(4,2)
      );
      CREATE TABLE IF NOT EXISTS teams (
        id SERIAL PRIMARY KEY,
        name TEXT UNIQUE NOT NULL,
        iso2 TEXT,
        fifa_code TEXT,
        continent TEXT,
        confederation TEXT
      );
      CREATE TABLE IF NOT EXISTS stadiums (
        id SERIAL PRIMARY KEY,
        tournament_year INTEGER,
        name TEXT NOT NULL,
        city TEXT,
        country_code TEXT,
        capacity INTEGER,
        timezone TEXT,
        coordinates TEXT
      );
      CREATE TABLE IF NOT EXISTS matches (
        id SERIAL PRIMARY KEY,
        tournament_year INTEGER NOT NULL,
        round TEXT NOT NULL,
        group_name TEXT,
        date DATE,
        time_local TEXT,
        stadium_id INTEGER,
        team1_id INTEGER NOT NULL,
        team2_id INTEGER NOT NULL,
        score_ft_home INTEGER,
        score_ft_away INTEGER,
        score_ht_home INTEGER,
        score_ht_away INTEGER,
        score_et_home INTEGER,
        score_et_away INTEGER,
        score_p_home INTEGER,
        score_p_away INTEGER,
        is_quali_playoff BOOLEAN DEFAULT false
      );
      CREATE UNIQUE INDEX IF NOT EXISTS matches_unique
        ON matches (tournament_year, team1_id, team2_id, date, is_quali_playoff);
      CREATE TABLE IF NOT EXISTS goals (
        id SERIAL PRIMARY KEY,
        match_id INTEGER NOT NULL,
        team_id INTEGER NOT NULL,
        player_name TEXT NOT NULL,
        minute INTEGER,
        minute_offset INTEGER DEFAULT 0,
        is_penalty BOOLEAN DEFAULT false,
        is_own_goal BOOLEAN DEFAULT false
      );
      CREATE TABLE IF NOT EXISTS group_standings (
        tournament_year INTEGER NOT NULL,
        group_name TEXT NOT NULL,
        team_id INTEGER NOT NULL,
        pos INTEGER,
        played INTEGER DEFAULT 0,
        won INTEGER DEFAULT 0,
        drawn INTEGER DEFAULT 0,
        lost INTEGER DEFAULT 0,
        goals_for INTEGER DEFAULT 0,
        goals_against INTEGER DEFAULT 0,
        goal_diff INTEGER DEFAULT 0,
        pts INTEGER DEFAULT 0,
        PRIMARY KEY (tournament_year, group_name, team_id)
      );
      CREATE TABLE IF NOT EXISTS squads (
        id SERIAL PRIMARY KEY,
        tournament_year INTEGER NOT NULL,
        team_id INTEGER NOT NULL,
        player_name TEXT NOT NULL,
        shirt_number INTEGER,
        position TEXT,
        date_of_birth DATE
      );
      CREATE UNIQUE INDEX IF NOT EXISTS squads_unique
        ON squads (tournament_year, team_id, shirt_number);
    `)

    const force = process.argv.includes('--force') || process.argv.includes('-f')

    // Skip if already seeded (idempotency check)
    if (!force) {
      const existing = await db.execute(sql`SELECT COUNT(*)::int AS cnt FROM tournaments WHERE year < 2026`)
      if ((existing[0] as { cnt: number }).cnt > 0) {
        console.log('✓ Already seeded (historical data present), skipping. Use --force to re-import.')
        return
      }
    }

    if (force) {
      console.log('--force: clearing historical data...')
      // Goals are removed first because they are looked up through their match rows.
      await db.execute(sql`DELETE FROM goals WHERE match_id IN (SELECT id FROM matches WHERE tournament_year < 2026)`)
      await db.execute(sql`DELETE FROM matches WHERE tournament_year < 2026`)
      await db.execute(sql`DELETE FROM tournaments WHERE year < 2026`)
    }

    console.log('Seeding from Kaggle data (1930–2022)...')

    // Canonical team name → teams.id, so each team is upserted at most once per run.
    const teamCache = new Map<string, number>()

    /** Returns the id of the team with the given (raw) name, inserting it if needed. */
    async function upsertTeam(rawName: string): Promise<number> {
      const name = normTeam(rawName)
      const cached = teamCache.get(name)
      if (cached !== undefined) return cached

      // The no-op DO UPDATE makes RETURNING yield the id for pre-existing rows too.
      const [row] = await db.execute(sql`
        INSERT INTO teams (name, iso2)
        VALUES (${name}, ${getIso(name) ?? null})
        ON CONFLICT (name) DO UPDATE SET name = EXCLUDED.name
        RETURNING id
      `)
      const id = (row as { id: number }).id
      teamCache.set(name, id)
      return id
    }

    // 1. Tournaments from world_cup.csv
    const wcRows = parseCsv(readFileSync(path.join(DATA_DIR, 'world_cup.csv'), 'utf-8'))
    for (const r of wcRows) {
      const year = toIntOrNull(r['Year'])
      if (year === null) continue

      const winner = normTeam(r['Champion'] || '')
      const runnerUp = normTeam(r['Runner-Up'] || '')
      const placement = PLACEMENTS[year]

      await db.execute(sql`
        INSERT INTO tournaments (year, host, winner, runner_up, third_place, fourth_place, teams_count, matches_count)
        VALUES (
          ${year}, ${r['Host']}, ${winner || null}, ${runnerUp || null},
          ${placement?.third ?? null}, ${placement?.fourth ?? null},
          ${toIntOrNull(r['Teams'])}, ${toIntOrNull(r['Matches'])}
        )
        ON CONFLICT (year) DO UPDATE SET
          host = EXCLUDED.host,
          winner = EXCLUDED.winner,
          runner_up = EXCLUDED.runner_up,
          third_place = EXCLUDED.third_place,
          fourth_place = EXCLUDED.fourth_place,
          teams_count = EXCLUDED.teams_count,
          matches_count = EXCLUDED.matches_count
      `)
    }

    // 2. Matches + goals from matches_1930_2022.csv
    const matchRows = parseCsv(readFileSync(path.join(DATA_DIR, 'matches_1930_2022.csv'), 'utf-8'))
    let totalMatches = 0
    let totalGoals = 0

    // A goal counts toward the 90-minute score unless its minute is known to be > 90.
    const ftGoalCount = (goals: GoalEntry[]) =>
      goals.filter(g => g.minute === null || g.minute <= 90).length

    for (const r of matchRows) {
      const year = toIntOrNull(r['Year'])
      if (year === null) continue

      const homeName = r['home_team']
      const awayName = r['away_team']
      if (!homeName || !awayName) {
        console.warn(`Skipping ${year} match row with a missing team name`)
        continue
      }

      const t1Id = await upsertTeam(homeName)
      const t2Id = await upsertTeam(awayName)

      const homeScore = toIntOrNull(r['home_score'])
      const awayScore = toIntOrNull(r['away_score'])
      const homePen = toIntOrNull(r['home_penalty'])
      const awayPen = toIntOrNull(r['away_penalty'])
      const dateStr = r['Date'] || null

      // Parse all goal columns
      const homeGoals = parseGoalCol(r['home_goal'])
      const awayGoals = parseGoalCol(r['away_goal'])
      const homePenGoals = parseGoalCol(r['home_penalty_goal'], true)
      const awayPenGoals = parseGoalCol(r['away_penalty_goal'], true)
      // home_own_goal = home player scored OG → goal credited to AWAY team
      const homeOgGoals = parseGoalCol(r['home_own_goal'], false, true)
      // away_own_goal = away player scored OG → goal credited to HOME team
      const awayOgGoals = parseGoalCol(r['away_own_goal'], false, true)

      // Goals grouped by the team they are credited to.
      const creditedHome = [...homeGoals, ...homePenGoals, ...awayOgGoals]
      const creditedAway = [...awayGoals, ...awayPenGoals, ...homeOgGoals]

      // Determine FT vs ET score split from goal minutes. Own goals are included so
      // that an extra-time own goal is not mistaken for a 90-minute goal.
      // NOTE(review): extra time in which nobody scored is not detected by this
      // heuristic; such matches keep score_et_* NULL.
      const hasEt = [...creditedHome, ...creditedAway].some(g => g.minute !== null && g.minute > 90)

      let scoreFtHome: number | null
      let scoreFtAway: number | null
      let scoreEtHome: number | null = null
      let scoreEtAway: number | null = null
      if (hasEt) {
        // FT is rebuilt from goals in minutes 1–90; the CSV score is stored as the ET score.
        scoreFtHome = ftGoalCount(creditedHome)
        scoreFtAway = ftGoalCount(creditedAway)
        scoreEtHome = homeScore
        scoreEtAway = awayScore
      } else {
        scoreFtHome = homeScore
        scoreFtAway = awayScore
      }

      const [matchRow] = await db.execute(sql`
        INSERT INTO matches (
          tournament_year, round, date, team1_id, team2_id,
          score_ft_home, score_ft_away, score_et_home, score_et_away,
          score_p_home, score_p_away, is_quali_playoff
        ) VALUES (
          ${year}, ${r['Round'] || 'Unknown'}, ${dateStr}, ${t1Id}, ${t2Id},
          ${scoreFtHome}, ${scoreFtAway}, ${scoreEtHome}, ${scoreEtAway},
          ${homePen}, ${awayPen}, false
        )
        ON CONFLICT (tournament_year, team1_id, team2_id, date, is_quali_playoff) DO UPDATE SET
          round = EXCLUDED.round,
          score_ft_home = EXCLUDED.score_ft_home,
          score_ft_away = EXCLUDED.score_ft_away,
          score_et_home = EXCLUDED.score_et_home,
          score_et_away = EXCLUDED.score_et_away,
          score_p_home = EXCLUDED.score_p_home,
          score_p_away = EXCLUDED.score_p_away
        RETURNING id
      `)
      const matchId = (matchRow as { id: number }).id

      // Replace any goals left from a previous import of the same match.
      await db.execute(sql`DELETE FROM goals WHERE match_id = ${matchId}`)

      // home team goals (+ away player own goals that benefit home)
      for (const g of creditedHome) {
        await db.execute(sql`
          INSERT INTO goals (match_id, team_id, player_name, minute, minute_offset, is_penalty, is_own_goal)
          VALUES (${matchId}, ${t1Id}, ${g.name}, ${g.minute}, ${g.offset}, ${g.isPenalty}, ${g.isOwnGoal})
        `)
        totalGoals++
      }

      // away team goals (+ home player own goals that benefit away)
      for (const g of creditedAway) {
        await db.execute(sql`
          INSERT INTO goals (match_id, team_id, player_name, minute, minute_offset, is_penalty, is_own_goal)
          VALUES (${matchId}, ${t2Id}, ${g.name}, ${g.minute}, ${g.offset}, ${g.isPenalty}, ${g.isOwnGoal})
        `)
        totalGoals++
      }

      totalMatches++
    }

    // 3. Update tournament aggregates.
    // avg_goals_per_game drives from matches with a LEFT JOIN so that goalless
    // matches still count in the denominator.
    await db.execute(sql`
      UPDATE tournaments t SET
        total_goals = (
          SELECT COUNT(g.id)::int
          FROM goals g
          JOIN matches m ON g.match_id = m.id
          WHERE m.tournament_year = t.year AND m.is_quali_playoff = false
        ),
        matches_count = (
          SELECT COUNT(*)::int
          FROM matches
          WHERE tournament_year = t.year AND is_quali_playoff = false
        ),
        avg_goals_per_game = (
          SELECT ROUND(COUNT(g.id)::numeric / NULLIF(COUNT(DISTINCT m.id), 0), 2)
          FROM matches m
          LEFT JOIN goals g ON g.match_id = m.id
          WHERE m.tournament_year = t.year AND m.is_quali_playoff = false
        )
      WHERE t.year < 2026
    `)

    console.log(`✅ Seed complete: ${totalMatches} matches, ${totalGoals} goals (1930–2022)`)
  } finally {
    // Close the pool on every path so a failed seed does not leave connections open.
    await client.end()
  }
}

run().catch(e => {
  console.error('Seed failed:', e)
  process.exit(1)
})