383 lines
14 KiB
TypeScript
383 lines
14 KiB
TypeScript
|
|
import postgres from 'postgres'
|
|||
|
|
import { drizzle } from 'drizzle-orm/postgres-js'
|
|||
|
|
import { sql } from 'drizzle-orm'
|
|||
|
|
import { readFileSync } from 'fs'
|
|||
|
|
import path from 'path'
|
|||
|
|
import { fileURLToPath } from 'url'
|
|||
|
|
import { getIso } from '../lib/iso-codes'
|
|||
|
|
|
|||
|
|
// Postgres connection string: taken from the DATABASE_URL environment variable,
// falling back to a localhost URL when the variable is not set.
const DATABASE_URL = process.env.DATABASE_URL ?? 'postgres://wc:wc@localhost:5432/worldcup'
|
|||
|
|
// Directory containing this module, derived from the module URL.
const __dirname = path.dirname(fileURLToPath(import.meta.url))
|
|||
|
|
// Directory the CSV input files are read from, resolved relative to this file.
const DATA_DIR = path.join(__dirname, '../app/data/kaggle')
|
|||
|
|
|
|||
|
|
// Third/fourth place are not present in Kaggle world_cup.csv, so they are
// maintained by hand here, keyed by tournament year. Team names use the
// canonical (post-alias) spelling, e.g. 'Germany' rather than 'West Germany'.
// 1930 and 1950 had no third-place match; the entries follow the official
// final rankings for those editions.
const PLACEMENTS: Record<number, { third?: string; fourth?: string }> = {
  1930: { third: 'USA', fourth: 'Yugoslavia' },
  1934: { third: 'Germany', fourth: 'Austria' },
  1938: { third: 'Brazil', fourth: 'Sweden' },
  // 1950 was decided by a final round-robin group (previously missing here,
  // which left third_place/fourth_place NULL for that tournament).
  1950: { third: 'Sweden', fourth: 'Spain' },
  1954: { third: 'Austria', fourth: 'Uruguay' },
  1958: { third: 'France', fourth: 'Germany' },
  1962: { third: 'Chile', fourth: 'Yugoslavia' },
  1966: { third: 'Portugal', fourth: 'Soviet Union' },
  1970: { third: 'Germany', fourth: 'Uruguay' },
  1974: { third: 'Poland', fourth: 'Brazil' },
  1978: { third: 'Brazil', fourth: 'Italy' },
  1982: { third: 'Poland', fourth: 'France' },
  1986: { third: 'France', fourth: 'Belgium' },
  1990: { third: 'Italy', fourth: 'England' },
  1994: { third: 'Sweden', fourth: 'Bulgaria' },
  1998: { third: 'Croatia', fourth: 'Netherlands' },
  2002: { third: 'Turkey', fourth: 'South Korea' },
  2006: { third: 'Germany', fourth: 'Portugal' },
  2010: { third: 'Germany', fourth: 'Uruguay' },
  2014: { third: 'Netherlands', fourth: 'Brazil' },
  2018: { third: 'Belgium', fourth: 'England' },
  2022: { third: 'Croatia', fourth: 'Morocco' },
}
|
|||
|
|
|
|||
|
|
// Normalize Kaggle team names to match openfootball / our canonical names
const TEAM_ALIASES: Record<string, string> = {
  'West Germany': 'Germany',
  'Korea Republic': 'South Korea',
  'IR Iran': 'Iran',
}

/**
 * Maps a raw team name to its canonical spelling.
 *
 * @param name Team name as it appears in the source data.
 * @returns The alias target when `name` is a key of TEAM_ALIASES, otherwise
 *          `name` unchanged.
 */
function normTeam(name: string): string {
  // Own-property check: a bare index lookup would also resolve inherited keys
  // such as 'constructor' or 'toString' and return a non-string value.
  if (!Object.prototype.hasOwnProperty.call(TEAM_ALIASES, name)) return name
  return TEAM_ALIASES[name] ?? name
}
|
|||
|
|
|
|||
|
|
/**
 * Minimal RFC-4180-style CSV reader with no external dependency.
 *
 * Supports double-quoted fields (with `""` as an escaped quote, and commas or
 * newlines inside quotes). Outside quotes, `\n` ends a record and `\r` is
 * dropped. The first record supplies the column names; every later record
 * that has at least one non-blank cell becomes an object keyed by the trimmed
 * header, with trimmed cell values. Cells missing from a short record are ''.
 *
 * @param content Full CSV text.
 * @returns One object per data record, in file order.
 */
function parseCsv(content: string): Record<string, string>[] {
  const records: string[][] = []
  let cells: string[] = []
  let current = ''
  let quoted = false

  // Close the cell being accumulated and start a new one.
  const endCell = (): void => {
    cells.push(current)
    current = ''
  }
  // Close the current cell and the record it belongs to.
  const endRecord = (): void => {
    endCell()
    records.push(cells)
    cells = []
  }

  let pos = 0
  while (pos < content.length) {
    const c = content[pos]
    pos += 1

    if (quoted) {
      if (c !== '"') {
        current += c
      } else if (content[pos] === '"') {
        // Doubled quote inside a quoted field is a literal quote character.
        current += '"'
        pos += 1
      } else {
        quoted = false
      }
      continue
    }

    switch (c) {
      case '"':
        quoted = true
        break
      case ',':
        endCell()
        break
      case '\n':
        endRecord()
        break
      case '\r':
        // Carriage returns outside quotes are ignored (CRLF line endings).
        break
      default:
        current += c
    }
  }

  // Flush a final record that was not terminated by a newline.
  if (current !== '' || cells.length > 0) endRecord()

  const [header, ...body] = records
  return body
    .filter(cols => cols.some(value => value.trim() !== ''))
    .map(cols =>
      Object.fromEntries(
        header.map((column, idx) => [column.trim(), (cols[idx] ?? '').trim()]),
      ),
    )
}
|
|||
|
|
|
|||
|
|
// One goal parsed from a goal column: scorer name, minute (null when it could
// not be parsed), stoppage-time offset (the part after '+'), and flags.
type GoalEntry = { name: string; minute: number | null; offset: number; isPenalty: boolean; isOwnGoal: boolean }

/**
 * Parses a single goal entry such as "Player Name · 57" or
 * "Player (OG) · 90+3" into a GoalEntry.
 *
 * The text before the last '·' is the scorer; a trailing "(P)" or "(OG)"
 * marker is stripped from the name. The text after it is the minute, with an
 * optional "+offset" for stoppage time.
 *
 * @param entry     Raw entry text.
 * @param isPenalty Flag copied onto the result.
 * @param isOwnGoal Flag copied onto the result.
 * @returns The parsed goal, or null when there is no '·' separator or the
 *          scorer name is empty. `minute` is null (never NaN) when the minute
 *          text is not numeric; `offset` is 0 when absent or not numeric.
 */
function parseGoalStr(entry: string, isPenalty = false, isOwnGoal = false): GoalEntry | null {
  const dot = entry.lastIndexOf('·')
  if (dot === -1) return null

  const name = entry.slice(0, dot).trim()
    .replace(/\s*\(P\)\s*$/, '').replace(/\s*\(OG\)\s*$/, '').trim()
  if (!name) return null

  const minRaw = entry.slice(dot + 1).trim()
  const plusIdx = minRaw.indexOf('+')

  // With a '+', the minute is the part before it and the offset the part after.
  const minuteText = plusIdx === -1 ? minRaw : minRaw.slice(0, plusIdx)
  const parsedMinute = parseInt(minuteText, 10)
  // Previously the '+' branch let NaN through (e.g. "X · +3"); normalise both
  // branches to null so callers never see NaN.
  const minute = Number.isNaN(parsedMinute) ? null : parsedMinute
  const offset = plusIdx === -1 ? 0 : parseInt(minRaw.slice(plusIdx + 1), 10) || 0

  return { name, minute, offset, isPenalty, isOwnGoal }
}
|
|||
|
|
|
|||
|
|
/**
 * Parses a goal column: '|'-separated entries, each handed to parseGoalStr.
 *
 * @param col       Raw column text; missing or blank yields an empty list.
 * @param isPenalty Flag forwarded to every parsed entry.
 * @param isOwnGoal Flag forwarded to every parsed entry.
 * @returns The successfully parsed entries, in column order; entries that
 *          parseGoalStr rejects are dropped.
 */
function parseGoalCol(col: string, isPenalty = false, isOwnGoal = false): GoalEntry[] {
  if (!col?.trim()) return []

  const parsed: GoalEntry[] = []
  for (const piece of col.split('|')) {
    const goal = parseGoalStr(piece.trim(), isPenalty, isOwnGoal)
    if (goal) parsed.push(goal)
  }
  return parsed
}
|
|||
|
|
|
|||
|
|
/** Parses a CSV cell as a base-10 integer; missing, empty or non-numeric cells become null. */
function toIntOrNull(raw: string | undefined): number | null {
  if (raw === undefined || raw === '') return null
  const n = parseInt(raw, 10)
  return Number.isNaN(n) ? null : n
}

/**
 * Seeds the database from the CSV files in DATA_DIR.
 *
 * Steps:
 *  1. Create all tables/indexes if they do not exist.
 *  2. Unless `--force` / `-f` is on the command line, return early when any
 *     tournament with year < 2026 already exists. With the flag, delete goals,
 *     matches and tournaments with year < 2026 first.
 *  3. Upsert tournaments from world_cup.csv (third/fourth place from PLACEMENTS).
 *  4. Upsert matches from matches_1930_2022.csv and re-insert their goals.
 *  5. Recompute per-tournament goal/match aggregates.
 *
 * The database client is always closed, including when a step throws.
 *
 * NOTE(review): the steps are not wrapped in a transaction. Tournaments are
 * written before matches, so a failure part-way through leaves rows that make
 * the next non-forced run report "Already seeded" — consider a transaction.
 */
async function run() {
  const client = postgres(DATABASE_URL, { max: 5 })
  const db = drizzle(client)

  try {
    // Create tables (mirrors sync.ts DDL — runs first on a fresh DB)
    await db.execute(sql`
      CREATE TABLE IF NOT EXISTS tournaments (
        year INTEGER PRIMARY KEY,
        host TEXT NOT NULL,
        winner TEXT,
        runner_up TEXT,
        third_place TEXT,
        fourth_place TEXT,
        teams_count INTEGER,
        matches_count INTEGER,
        total_goals INTEGER,
        avg_goals_per_game NUMERIC(4,2)
      );
      CREATE TABLE IF NOT EXISTS teams (
        id SERIAL PRIMARY KEY,
        name TEXT UNIQUE NOT NULL,
        iso2 TEXT,
        fifa_code TEXT,
        continent TEXT,
        confederation TEXT
      );
      CREATE TABLE IF NOT EXISTS stadiums (
        id SERIAL PRIMARY KEY,
        tournament_year INTEGER,
        name TEXT NOT NULL,
        city TEXT,
        country_code TEXT,
        capacity INTEGER,
        timezone TEXT,
        coordinates TEXT
      );
      CREATE TABLE IF NOT EXISTS matches (
        id SERIAL PRIMARY KEY,
        tournament_year INTEGER NOT NULL,
        round TEXT NOT NULL,
        group_name TEXT,
        date DATE,
        time_local TEXT,
        stadium_id INTEGER,
        team1_id INTEGER NOT NULL,
        team2_id INTEGER NOT NULL,
        score_ft_home INTEGER,
        score_ft_away INTEGER,
        score_ht_home INTEGER,
        score_ht_away INTEGER,
        score_et_home INTEGER,
        score_et_away INTEGER,
        score_p_home INTEGER,
        score_p_away INTEGER,
        is_quali_playoff BOOLEAN DEFAULT false
      );
      CREATE UNIQUE INDEX IF NOT EXISTS matches_unique
        ON matches (tournament_year, team1_id, team2_id, date, is_quali_playoff);
      CREATE TABLE IF NOT EXISTS goals (
        id SERIAL PRIMARY KEY,
        match_id INTEGER NOT NULL,
        team_id INTEGER NOT NULL,
        player_name TEXT NOT NULL,
        minute INTEGER,
        minute_offset INTEGER DEFAULT 0,
        is_penalty BOOLEAN DEFAULT false,
        is_own_goal BOOLEAN DEFAULT false
      );
      CREATE TABLE IF NOT EXISTS group_standings (
        tournament_year INTEGER NOT NULL,
        group_name TEXT NOT NULL,
        team_id INTEGER NOT NULL,
        pos INTEGER,
        played INTEGER DEFAULT 0,
        won INTEGER DEFAULT 0,
        drawn INTEGER DEFAULT 0,
        lost INTEGER DEFAULT 0,
        goals_for INTEGER DEFAULT 0,
        goals_against INTEGER DEFAULT 0,
        goal_diff INTEGER DEFAULT 0,
        pts INTEGER DEFAULT 0,
        PRIMARY KEY (tournament_year, group_name, team_id)
      );
      CREATE TABLE IF NOT EXISTS squads (
        id SERIAL PRIMARY KEY,
        tournament_year INTEGER NOT NULL,
        team_id INTEGER NOT NULL,
        player_name TEXT NOT NULL,
        shirt_number INTEGER,
        position TEXT,
        date_of_birth DATE
      );
      CREATE UNIQUE INDEX IF NOT EXISTS squads_unique
        ON squads (tournament_year, team_id, shirt_number);
    `)

    const force = process.argv.includes('--force') || process.argv.includes('-f')

    // Skip if already seeded (idempotency check)
    if (!force) {
      const existing = await db.execute(sql`SELECT COUNT(*)::int AS cnt FROM tournaments WHERE year < 2026`)
      if ((existing[0] as { cnt: number }).cnt > 0) {
        console.log('✓ Already seeded (historical data present), skipping. Use --force to re-import.')
        return
      }
    }

    if (force) {
      console.log('--force: clearing historical data...')
      // Goals first: they reference matches by id and there is no cascade.
      await db.execute(sql`DELETE FROM goals WHERE match_id IN (SELECT id FROM matches WHERE tournament_year < 2026)`)
      await db.execute(sql`DELETE FROM matches WHERE tournament_year < 2026`)
      await db.execute(sql`DELETE FROM tournaments WHERE year < 2026`)
    }

    console.log('Seeding from Kaggle data (1930–2022)...')

    // Canonical team name → teams.id, so each team is upserted once per run.
    const teamCache = new Map<string, number>()

    async function upsertTeam(rawName: string): Promise<number> {
      const name = normTeam(rawName)
      const cached = teamCache.get(name)
      if (cached !== undefined) return cached
      // The no-op DO UPDATE makes RETURNING yield the id for existing rows too.
      const [row] = await db.execute(sql`
        INSERT INTO teams (name, iso2)
        VALUES (${name}, ${getIso(name) ?? null})
        ON CONFLICT (name) DO UPDATE SET name = EXCLUDED.name
        RETURNING id
      `)
      const id = (row as { id: number }).id
      teamCache.set(name, id)
      return id
    }

    // 1. Tournaments from world_cup.csv
    const wcRows = parseCsv(readFileSync(path.join(DATA_DIR, 'world_cup.csv'), 'utf-8'))
    for (const r of wcRows) {
      const year = parseInt(r['Year'], 10)
      if (Number.isNaN(year)) continue
      const winner = normTeam(r['Champion'] || '')
      const runnerUp = normTeam(r['Runner-Up'] || '')
      const p = PLACEMENTS[year] ?? {}
      await db.execute(sql`
        INSERT INTO tournaments (year, host, winner, runner_up, third_place, fourth_place, teams_count, matches_count)
        VALUES (
          ${year}, ${r['Host']},
          ${winner || null}, ${runnerUp || null},
          ${p.third ?? null}, ${p.fourth ?? null},
          ${parseInt(r['Teams'], 10) || null}, ${parseInt(r['Matches'], 10) || null}
        )
        ON CONFLICT (year) DO UPDATE SET
          host = EXCLUDED.host,
          winner = EXCLUDED.winner,
          runner_up = EXCLUDED.runner_up,
          third_place = EXCLUDED.third_place,
          fourth_place = EXCLUDED.fourth_place,
          teams_count = EXCLUDED.teams_count,
          matches_count = EXCLUDED.matches_count
      `)
    }

    // 2. Matches + goals from matches_1930_2022.csv
    const matchRows = parseCsv(readFileSync(path.join(DATA_DIR, 'matches_1930_2022.csv'), 'utf-8'))

    let totalMatches = 0, totalGoals = 0
    for (const r of matchRows) {
      const year = parseInt(r['Year'], 10)
      if (Number.isNaN(year)) continue

      const t1Id = await upsertTeam(r['home_team'])
      const t2Id = await upsertTeam(r['away_team'])

      // Empty or non-numeric cells become NULL rather than NaN.
      const homeScore = toIntOrNull(r['home_score'])
      const awayScore = toIntOrNull(r['away_score'])
      const homePen = toIntOrNull(r['home_penalty'])
      const awayPen = toIntOrNull(r['away_penalty'])
      const dateStr = r['Date'] || null

      // Parse all goal columns
      const homeGoals = parseGoalCol(r['home_goal'])
      const awayGoals = parseGoalCol(r['away_goal'])
      const homePenGoals = parseGoalCol(r['home_penalty_goal'], true)
      const awayPenGoals = parseGoalCol(r['away_penalty_goal'], true)
      // home_own_goal = home player scored OG → goal credited to AWAY team
      const homeOgGoals = parseGoalCol(r['home_own_goal'], false, true)
      // away_own_goal = away player scored OG → goal credited to HOME team
      const awayOgGoals = parseGoalCol(r['away_own_goal'], false, true)

      // Determine FT vs ET score split from goal minutes. Own goals are
      // included: they count towards the score, so an own goal after minute 90
      // must also mark the match as having gone to extra time.
      const allGoals = [
        ...homeGoals, ...awayGoals,
        ...homePenGoals, ...awayPenGoals,
        ...homeOgGoals, ...awayOgGoals,
      ]
      const hasEt = allGoals.some(g => g.minute !== null && g.minute > 90)

      let scoreFtHome: number | null, scoreFtAway: number | null
      let scoreEtHome: number | null = null, scoreEtAway: number | null = null

      if (hasEt) {
        // Compute FT from goals in minutes 1–90 (goals with an unknown minute
        // are counted as full-time goals); the CSV score is the after-ET score.
        const ftGoalCount = (goals: GoalEntry[]) =>
          goals.filter(g => g.minute === null || g.minute <= 90).length
        scoreFtHome = ftGoalCount(homeGoals) + ftGoalCount(homePenGoals) + ftGoalCount(awayOgGoals)
        scoreFtAway = ftGoalCount(awayGoals) + ftGoalCount(awayPenGoals) + ftGoalCount(homeOgGoals)
        scoreEtHome = homeScore
        scoreEtAway = awayScore
      } else {
        scoreFtHome = homeScore
        scoreFtAway = awayScore
      }

      const [matchRow] = await db.execute(sql`
        INSERT INTO matches (
          tournament_year, round, date, team1_id, team2_id,
          score_ft_home, score_ft_away, score_et_home, score_et_away,
          score_p_home, score_p_away, is_quali_playoff
        ) VALUES (
          ${year}, ${r['Round'] || 'Unknown'}, ${dateStr},
          ${t1Id}, ${t2Id},
          ${scoreFtHome}, ${scoreFtAway}, ${scoreEtHome}, ${scoreEtAway},
          ${homePen}, ${awayPen}, false
        )
        ON CONFLICT (tournament_year, team1_id, team2_id, date, is_quali_playoff) DO UPDATE SET
          round = EXCLUDED.round,
          score_ft_home = EXCLUDED.score_ft_home,
          score_ft_away = EXCLUDED.score_ft_away,
          score_et_home = EXCLUDED.score_et_home,
          score_et_away = EXCLUDED.score_et_away,
          score_p_home = EXCLUDED.score_p_home,
          score_p_away = EXCLUDED.score_p_away
        RETURNING id
      `)
      const matchId = (matchRow as { id: number }).id

      // Goals have no natural key, so replace the match's goals wholesale.
      await db.execute(sql`DELETE FROM goals WHERE match_id = ${matchId}`)

      // home team goals (+ away player own goals that benefit home)
      for (const g of [...homeGoals, ...homePenGoals, ...awayOgGoals]) {
        await db.execute(sql`
          INSERT INTO goals (match_id, team_id, player_name, minute, minute_offset, is_penalty, is_own_goal)
          VALUES (${matchId}, ${t1Id}, ${g.name}, ${g.minute}, ${g.offset}, ${g.isPenalty}, ${g.isOwnGoal})
        `)
        totalGoals++
      }
      // away team goals (+ home player own goals that benefit away)
      for (const g of [...awayGoals, ...awayPenGoals, ...homeOgGoals]) {
        await db.execute(sql`
          INSERT INTO goals (match_id, team_id, player_name, minute, minute_offset, is_penalty, is_own_goal)
          VALUES (${matchId}, ${t2Id}, ${g.name}, ${g.minute}, ${g.offset}, ${g.isPenalty}, ${g.isOwnGoal})
        `)
        totalGoals++
      }
      totalMatches++
    }

    // 3. Update tournament aggregates
    await db.execute(sql`
      UPDATE tournaments t SET
        total_goals = (
          SELECT COUNT(g.id)::int
          FROM goals g JOIN matches m ON g.match_id = m.id
          WHERE m.tournament_year = t.year AND m.is_quali_playoff = false
        ),
        matches_count = (
          SELECT COUNT(*)::int FROM matches WHERE tournament_year = t.year AND is_quali_playoff = false
        ),
        avg_goals_per_game = (
          SELECT ROUND(COUNT(g.id)::numeric / NULLIF(COUNT(DISTINCT m.id), 0), 2)
          FROM goals g JOIN matches m ON g.match_id = m.id
          WHERE m.tournament_year = t.year AND m.is_quali_playoff = false
        )
      WHERE t.year < 2026
    `)

    console.log(`✅ Seed complete: ${totalMatches} matches, ${totalGoals} goals (1930–2022)`)
  } finally {
    // Close the client on every path (success, early return, or error).
    await client.end()
  }
}
|
|||
|
|
|
|||
|
|
// Script entry point: run the seed; on rejection log the error and exit with status 1.
run().catch(e => { console.error('Seed failed:', e); process.exit(1) })
|