feat: replace historical sync with Kaggle seed for complete 1930-2022 goal data

- scripts/seed.ts: one-time import of Kaggle FIFA dataset (matches_1930_2022.csv,
  world_cup.csv) covering all 964 matches and 2720 goals from 1930-2022 with full
  scorer names, minutes, penalties, and own goals for every tournament
- scripts/sync.ts: stripped to 2026 only (openfootball live data); historical years
  removed since Kaggle is now authoritative for 1930-2022
- Dockerfile: copy app/data into runner image; CMD runs seed.ts before server.js so
  a fresh deployment auto-seeds on first start (skips if already seeded)
- package.json: add 'seed' script; pass --force to it to re-import from updated CSV files
- app/data/kaggle/: bundle Kaggle CSV files in repo

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-06-14 18:43:43 +02:00
parent 191888225f
commit 3955c7492b
9 changed files with 1969 additions and 206 deletions
+99 -205
View File
@@ -6,46 +6,6 @@ import { TEAM_ISO, getIso } from '../lib/iso-codes'
const DATABASE_URL = process.env.DATABASE_URL ?? 'postgres://wc:wc@localhost:5432/worldcup'
const BASE = 'https://raw.githubusercontent.com/openfootball/worldcup.json/master'
const YEARS = [
1930, 1934, 1938, 1950, 1954, 1958, 1962, 1966, 1970,
1974, 1978, 1982, 1986, 1990, 1994, 1998, 2002, 2006,
2010, 2014, 2018, 2022, 2026,
]
const HOSTS: Record<number, string> = {
1930: 'Uruguay', 1934: 'Italy', 1938: 'France', 1950: 'Brazil',
1954: 'Switzerland', 1958: 'Sweden', 1962: 'Chile', 1966: 'England',
1970: 'Mexico', 1974: 'Germany', 1978: 'Argentina', 1982: 'Spain',
1986: 'Mexico', 1990: 'Italy', 1994: 'USA', 1998: 'France',
2002: 'South Korea / Japan', 2006: 'Germany', 2010: 'South Africa',
2014: 'Brazil', 2018: 'Russia', 2022: 'Qatar', 2026: 'USA / Canada / Mexico',
}
const WINNERS: Record<number, { winner: string; runnerUp: string; third?: string; fourth?: string }> = {
1930: { winner: 'Uruguay', runnerUp: 'Argentina', third: 'USA', fourth: 'Yugoslavia' },
1934: { winner: 'Italy', runnerUp: 'Czechoslovakia', third: 'Germany', fourth: 'Austria' },
1938: { winner: 'Italy', runnerUp: 'Hungary', third: 'Brazil', fourth: 'Sweden' },
1950: { winner: 'Uruguay', runnerUp: 'Brazil', third: 'Sweden', fourth: 'Spain' },
1954: { winner: 'Germany', runnerUp: 'Hungary', third: 'Austria', fourth: 'Uruguay' },
1958: { winner: 'Brazil', runnerUp: 'Sweden', third: 'France', fourth: 'Germany' },
1962: { winner: 'Brazil', runnerUp: 'Czechoslovakia', third: 'Chile', fourth: 'Yugoslavia' },
1966: { winner: 'England', runnerUp: 'Germany', third: 'Portugal', fourth: 'Soviet Union' },
1970: { winner: 'Brazil', runnerUp: 'Italy', third: 'Germany', fourth: 'Uruguay' },
1974: { winner: 'Germany', runnerUp: 'Netherlands', third: 'Poland', fourth: 'Brazil' },
1978: { winner: 'Argentina', runnerUp: 'Netherlands', third: 'Brazil', fourth: 'Italy' },
1982: { winner: 'Italy', runnerUp: 'Germany', third: 'Poland', fourth: 'France' },
1986: { winner: 'Argentina', runnerUp: 'Germany', third: 'France', fourth: 'Belgium' },
1990: { winner: 'Germany', runnerUp: 'Argentina', third: 'Italy', fourth: 'England' },
1994: { winner: 'Brazil', runnerUp: 'Italy', third: 'Sweden', fourth: 'Bulgaria' },
1998: { winner: 'France', runnerUp: 'Brazil', third: 'Croatia', fourth: 'Netherlands' },
2002: { winner: 'Brazil', runnerUp: 'Germany', third: 'Turkey', fourth: 'South Korea' },
2006: { winner: 'Italy', runnerUp: 'France', third: 'Germany', fourth: 'Portugal' },
2010: { winner: 'Spain', runnerUp: 'Netherlands', third: 'Germany', fourth: 'Uruguay' },
2014: { winner: 'Germany', runnerUp: 'Argentina', third: 'Netherlands', fourth: 'Brazil' },
2018: { winner: 'France', runnerUp: 'Croatia', third: 'Belgium', fourth: 'England' },
2022: { winner: 'Argentina', runnerUp: 'France', third: 'Croatia', fourth: 'Morocco' },
}
async function fetchJson(url: string): Promise<unknown> {
try {
const res = await fetch(url)
@@ -76,7 +36,7 @@ async function run() {
const client = postgres(DATABASE_URL, { max: 5 })
const db = drizzle(client)
console.log('Creating tables...')
// Safety net — seed.ts should have created these already
await db.execute(sql`
CREATE TABLE IF NOT EXISTS tournaments (
year INTEGER PRIMARY KEY,
@@ -128,7 +88,8 @@ async function run() {
score_p_away INTEGER,
is_quali_playoff BOOLEAN DEFAULT false
);
CREATE UNIQUE INDEX IF NOT EXISTS matches_unique ON matches (tournament_year, team1_id, team2_id, date, is_quali_playoff);
CREATE UNIQUE INDEX IF NOT EXISTS matches_unique
ON matches (tournament_year, team1_id, team2_id, date, is_quali_playoff);
CREATE TABLE IF NOT EXISTS goals (
id SERIAL PRIMARY KEY,
match_id INTEGER NOT NULL,
@@ -163,31 +124,27 @@ async function run() {
position TEXT,
date_of_birth DATE
);
CREATE UNIQUE INDEX IF NOT EXISTS squads_unique ON squads (tournament_year, team_id, shirt_number);
CREATE UNIQUE INDEX IF NOT EXISTS squads_unique
ON squads (tournament_year, team_id, shirt_number);
`)
const TEAM_ALIASES: Record<string, string> = {
'West Germany': 'Germany',
}
const teamCache = new Map<string, number>()
async function upsertTeam(rawName: string, extra?: { iso2?: string; fifaCode?: string; continent?: string; confederation?: string }) {
const name = TEAM_ALIASES[rawName] ?? rawName
if (teamCache.has(name)) return teamCache.get(name)!
const iso2 = extra?.iso2 ?? getIso(name)
if (teamCache.has(rawName)) return teamCache.get(rawName)!
const iso2 = extra?.iso2 ?? getIso(rawName)
const [row] = await db.execute(sql`
INSERT INTO teams (name, iso2, fifa_code, continent, confederation)
VALUES (${name}, ${iso2 ?? null}, ${extra?.fifaCode ?? null}, ${extra?.continent ?? null}, ${extra?.confederation ?? null})
VALUES (${rawName}, ${iso2 ?? null}, ${extra?.fifaCode ?? null}, ${extra?.continent ?? null}, ${extra?.confederation ?? null})
ON CONFLICT (name) DO UPDATE SET
iso2 = COALESCE(EXCLUDED.iso2, teams.iso2),
fifa_code = COALESCE(EXCLUDED.fifa_code, teams.fifa_code),
continent = COALESCE(EXCLUDED.continent, teams.continent),
iso2 = COALESCE(EXCLUDED.iso2, teams.iso2),
fifa_code = COALESCE(EXCLUDED.fifa_code, teams.fifa_code),
continent = COALESCE(EXCLUDED.continent, teams.continent),
confederation = COALESCE(EXCLUDED.confederation, teams.confederation)
RETURNING id
`)
const id = (row as { id: number }).id
teamCache.set(name, id)
teamCache.set(rawName, id)
return id
}
@@ -210,16 +167,16 @@ async function run() {
${isQuali}
)
ON CONFLICT (tournament_year, team1_id, team2_id, date, is_quali_playoff) DO UPDATE SET
round = EXCLUDED.round,
round = EXCLUDED.round,
time_local = COALESCE(EXCLUDED.time_local, matches.time_local),
score_ft_home = COALESCE(EXCLUDED.score_ft_home, matches.score_ft_home),
score_ft_away = COALESCE(EXCLUDED.score_ft_away, matches.score_ft_away),
score_ht_home = COALESCE(EXCLUDED.score_ht_home, matches.score_ht_home),
score_ht_away = COALESCE(EXCLUDED.score_ht_away, matches.score_ht_away),
score_et_home = COALESCE(EXCLUDED.score_et_home, matches.score_et_home),
score_et_away = COALESCE(EXCLUDED.score_et_away, matches.score_et_away),
score_p_home = COALESCE(EXCLUDED.score_p_home, matches.score_p_home),
score_p_away = COALESCE(EXCLUDED.score_p_away, matches.score_p_away),
time_local = COALESCE(EXCLUDED.time_local, matches.time_local)
score_p_home = COALESCE(EXCLUDED.score_p_home, matches.score_p_home),
score_p_away = COALESCE(EXCLUDED.score_p_away, matches.score_p_away)
RETURNING id
`)
return (rows[0] as { id: number }).id
@@ -238,61 +195,51 @@ async function run() {
}
}
for (const year of YEARS) {
console.log(`\nSyncing ${year}...`)
console.log('\nSyncing 2026...')
// 1. Upsert tournament
const winData = WINNERS[year]
await db.execute(sql`
INSERT INTO tournaments (year, host, winner, runner_up, third_place, fourth_place)
VALUES (${year}, ${HOSTS[year]}, ${winData?.winner ?? null}, ${winData?.runnerUp ?? null},
${winData?.third ?? null}, ${winData?.fourth ?? null})
ON CONFLICT (year) DO UPDATE SET
winner = COALESCE(EXCLUDED.winner, tournaments.winner),
runner_up = COALESCE(EXCLUDED.runner_up, tournaments.runner_up)
`)
// Upsert 2026 tournament row (no winner yet)
await db.execute(sql`
INSERT INTO tournaments (year, host)
VALUES (2026, 'USA / Canada / Mexico')
ON CONFLICT (year) DO NOTHING
`)
// 2. Teams enrichment
const teamsData = await fetchJson(`${BASE}/${year}/worldcup.teams.json`) as Record<string, unknown>[] | null
if (teamsData && Array.isArray(teamsData)) {
for (const t of teamsData) {
const name = (t.name ?? t.name_normalised) as string
const iso2 = (t.flag_icon as string)?.match(/[\uD83C][\uDDE6-\uDDFF][\uD83C][\uDDE6-\uDDFF]/)?.[0]
? TEAM_ISO[name as string] ?? getIso(name)
: TEAM_ISO[name as string] ?? getIso(name)
await upsertTeam(name, {
iso2: iso2,
fifaCode: t.fifa_code as string,
continent: t.continent as string,
confederation: t.confed as string,
})
}
// Teams enrichment
const teamsData = await fetchJson(`${BASE}/2026/worldcup.teams.json`) as Record<string, unknown>[] | null
if (teamsData && Array.isArray(teamsData)) {
for (const t of teamsData) {
const name = (t.name ?? t.name_normalised) as string
await upsertTeam(name, {
iso2: TEAM_ISO[name] ?? getIso(name),
fifaCode: t.fifa_code as string,
continent: t.continent as string,
confederation: t.confed as string,
})
}
}
// 3. Stadiums
const stadiumsData = await fetchJson(`${BASE}/${year}/worldcup.stadiums.json`) as { stadiums?: Record<string, unknown>[] } | null
if (stadiumsData?.stadiums) {
for (const s of stadiumsData.stadiums) {
await db.execute(sql`
INSERT INTO stadiums (tournament_year, name, city, country_code, capacity, timezone, coordinates)
VALUES (${year}, ${s.name as string}, ${s.city as string}, ${(s.cc as string | undefined) ?? null},
${(s.capacity as number | undefined) ?? null}, ${(s.timezone as string | undefined) ?? null}, ${(s.coords as string | undefined) ?? null})
ON CONFLICT DO NOTHING
`)
}
// Stadiums
const stadiumsData = await fetchJson(`${BASE}/2026/worldcup.stadiums.json`) as { stadiums?: Record<string, unknown>[] } | null
if (stadiumsData?.stadiums) {
for (const s of stadiumsData.stadiums) {
await db.execute(sql`
INSERT INTO stadiums (tournament_year, name, city, country_code, capacity, timezone, coordinates)
VALUES (2026, ${s.name as string}, ${s.city as string}, ${(s.cc as string | undefined) ?? null},
${(s.capacity as number | undefined) ?? null}, ${(s.timezone as string | undefined) ?? null}, ${(s.coords as string | undefined) ?? null})
ON CONFLICT DO NOTHING
`)
}
}
// 4. Main matches
const mainData = await fetchJson(`${BASE}/${year}/worldcup.json`) as RawData | null
if (!mainData?.matches) { console.log(` No match data`); continue }
let matchCount = 0, goalCount = 0
// Main matches
const mainData = await fetchJson(`${BASE}/2026/worldcup.json`) as RawData | null
let matchCount = 0, goalCount = 0
if (mainData?.matches) {
for (const m of mainData.matches) {
const t1Id = await upsertTeam(m.team1)
const t2Id = await upsertTeam(m.team2)
const score = parseScore(m.score)
const group = m.group ?? null
const matchId = await upsertMatch(year, m.round ?? 'Unknown', group, m.date ?? null, m.time ?? null, t1Id, t2Id, score, false)
const matchId = await upsertMatch(2026, m.round ?? 'Unknown', m.group ?? null, m.date ?? null, m.time ?? null, t1Id, t2Id, score, false)
if (m.goals1?.length || m.goals2?.length) {
await db.execute(sql`DELETE FROM goals WHERE match_id = ${matchId}`)
if (m.goals1?.length) await syncGoals(matchId, t1Id, m.goals1, t2Id)
@@ -301,108 +248,41 @@ async function run() {
matchCount++
goalCount += (m.goals1?.length ?? 0) + (m.goals2?.length ?? 0)
}
// 5. Standings (2014, 2018)
const standingsData = await fetchJson(`${BASE}/${year}/worldcup.standings.json`) as { groups?: Record<string, unknown>[] } | null
if (standingsData?.groups) {
for (const grp of standingsData.groups) {
const standings = grp.standings as Record<string, unknown>[]
for (const s of standings) {
const t = s.team as { name: string; code: string }
const teamId = await upsertTeam(t.name, { fifaCode: t.code })
await db.execute(sql`
INSERT INTO group_standings (tournament_year, group_name, team_id, pos, played, won, drawn, lost, goals_for, goals_against, goal_diff, pts)
VALUES (${year}, ${grp.name as string}, ${teamId}, ${s.pos as number ?? null},
${s.played as number ?? 0}, ${s.won as number ?? 0}, ${s.drawn as number ?? 0}, ${s.lost as number ?? 0},
${s.goals_for as number ?? 0}, ${s.goals_against as number ?? 0},
${((s.goals_for as number ?? 0) - (s.goals_against as number ?? 0))},
${s.pts as number ?? 0})
ON CONFLICT (tournament_year, group_name, team_id) DO UPDATE SET
pos = EXCLUDED.pos, played = EXCLUDED.played, won = EXCLUDED.won,
drawn = EXCLUDED.drawn, lost = EXCLUDED.lost, goals_for = EXCLUDED.goals_for,
goals_against = EXCLUDED.goals_against, goal_diff = EXCLUDED.goal_diff, pts = EXCLUDED.pts
`)
}
}
} else if (year !== 2026) {
// Compute standings from match results for years without standings.json
await db.execute(sql`
WITH match_results AS (
SELECT tournament_year, group_name,
team1_id AS team_id,
score_ft_home AS gf, score_ft_away AS ga
FROM matches WHERE tournament_year = ${year} AND group_name IS NOT NULL AND is_quali_playoff = false AND score_ft_home IS NOT NULL
UNION ALL
SELECT tournament_year, group_name,
team2_id, score_ft_away, score_ft_home
FROM matches WHERE tournament_year = ${year} AND group_name IS NOT NULL AND is_quali_playoff = false AND score_ft_home IS NOT NULL
)
INSERT INTO group_standings (tournament_year, group_name, team_id, played, won, drawn, lost, goals_for, goals_against, goal_diff, pts)
SELECT
tournament_year, group_name, team_id,
COUNT(*)::int, SUM(CASE WHEN gf > ga THEN 1 ELSE 0 END)::int,
SUM(CASE WHEN gf = ga THEN 1 ELSE 0 END)::int,
SUM(CASE WHEN gf < ga THEN 1 ELSE 0 END)::int,
SUM(gf)::int, SUM(ga)::int, SUM(gf - ga)::int,
(SUM(CASE WHEN gf > ga THEN 3 WHEN gf = ga THEN 1 ELSE 0 END))::int
FROM match_results
GROUP BY tournament_year, group_name, team_id
ON CONFLICT (tournament_year, group_name, team_id) DO UPDATE SET
played = EXCLUDED.played, won = EXCLUDED.won, drawn = EXCLUDED.drawn,
lost = EXCLUDED.lost, goals_for = EXCLUDED.goals_for, goals_against = EXCLUDED.goals_against,
goal_diff = EXCLUDED.goal_diff, pts = EXCLUDED.pts
`)
}
// 6. Squads (2026)
const squadsData = await fetchJson(`${BASE}/${year}/worldcup.squads.json`) as Record<string, unknown>[] | null
if (squadsData && Array.isArray(squadsData)) {
for (const sq of squadsData) {
const teamId = await upsertTeam(sq.name as string)
for (const p of (sq.players as Record<string, unknown>[])) {
await db.execute(sql`
INSERT INTO squads (tournament_year, team_id, player_name, shirt_number, position, date_of_birth)
VALUES (${year}, ${teamId}, ${p.name as string}, ${p.number as number ?? null},
${p.pos as string ?? null}, ${p.date_of_birth as string ?? null})
ON CONFLICT (tournament_year, team_id, shirt_number) DO UPDATE SET
player_name = EXCLUDED.player_name, position = EXCLUDED.position, date_of_birth = EXCLUDED.date_of_birth
`)
}
}
console.log(` Squads loaded for ${year}`)
}
// 7. Quali playoffs (2026)
const qualiData = await fetchJson(`${BASE}/${year}/worldcup.quali_playoffs.json`) as RawData | null
if (qualiData?.matches) {
for (const m of qualiData.matches) {
const t1Id = await upsertTeam(m.team1)
const t2Id = await upsertTeam(m.team2)
const score = parseScore(m.score)
const matchId = await upsertMatch(year, m.round ?? 'Qualifier', null, m.date ?? null, m.time ?? null, t1Id, t2Id, score, true)
if (m.goals1?.length) await syncGoals(matchId, t1Id, m.goals1, t2Id)
if (m.goals2?.length) await syncGoals(matchId, t2Id, m.goals2, t1Id)
}
console.log(` Quali playoffs: ${qualiData.matches.length} matches`)
}
// 8. Recompute tournament aggregates
await db.execute(sql`
UPDATE tournaments SET
matches_count = (SELECT COUNT(*)::int FROM matches WHERE tournament_year = ${year} AND is_quali_playoff = false),
total_goals = (SELECT COALESCE(SUM(score_ft_home + score_ft_away), 0)::int FROM matches WHERE tournament_year = ${year} AND is_quali_playoff = false AND score_ft_home IS NOT NULL),
avg_goals_per_game = (
SELECT ROUND(COALESCE(SUM(score_ft_home + score_ft_away), 0)::numeric / NULLIF(COUNT(*), 0), 2)
FROM matches
WHERE tournament_year = ${year} AND is_quali_playoff = false AND score_ft_home IS NOT NULL
)
WHERE year = ${year}
`)
console.log(`${matchCount} matches, ${goalCount} goals`)
}
// Compute 2026 group standings from match results
// Squads
const squadsData = await fetchJson(`${BASE}/2026/worldcup.squads.json`) as Record<string, unknown>[] | null
if (squadsData && Array.isArray(squadsData)) {
for (const sq of squadsData) {
const teamId = await upsertTeam(sq.name as string)
for (const p of (sq.players as Record<string, unknown>[])) {
await db.execute(sql`
INSERT INTO squads (tournament_year, team_id, player_name, shirt_number, position, date_of_birth)
VALUES (2026, ${teamId}, ${p.name as string}, ${p.number as number ?? null},
${p.pos as string ?? null}, ${p.date_of_birth as string ?? null})
ON CONFLICT (tournament_year, team_id, shirt_number) DO UPDATE SET
player_name = EXCLUDED.player_name, position = EXCLUDED.position, date_of_birth = EXCLUDED.date_of_birth
`)
}
}
console.log(' Squads loaded for 2026')
}
// Quali playoffs
const qualiData = await fetchJson(`${BASE}/2026/worldcup.quali_playoffs.json`) as RawData | null
if (qualiData?.matches) {
for (const m of qualiData.matches) {
const t1Id = await upsertTeam(m.team1)
const t2Id = await upsertTeam(m.team2)
const score = parseScore(m.score)
const matchId = await upsertMatch(2026, m.round ?? 'Qualifier', null, m.date ?? null, m.time ?? null, t1Id, t2Id, score, true)
if (m.goals1?.length) await syncGoals(matchId, t1Id, m.goals1, t2Id)
if (m.goals2?.length) await syncGoals(matchId, t2Id, m.goals2, t1Id)
}
console.log(` Quali playoffs: ${qualiData.matches.length} matches`)
}
// Group standings from match results
await db.execute(sql`
WITH match_results AS (
SELECT tournament_year, group_name, team1_id AS team_id, score_ft_home AS gf, score_ft_away AS ga
@@ -413,7 +293,8 @@ async function run() {
)
INSERT INTO group_standings (tournament_year, group_name, team_id, played, won, drawn, lost, goals_for, goals_against, goal_diff, pts)
SELECT tournament_year, group_name, team_id,
COUNT(*)::int, SUM(CASE WHEN gf > ga THEN 1 ELSE 0 END)::int,
COUNT(*)::int,
SUM(CASE WHEN gf > ga THEN 1 ELSE 0 END)::int,
SUM(CASE WHEN gf = ga THEN 1 ELSE 0 END)::int,
SUM(CASE WHEN gf < ga THEN 1 ELSE 0 END)::int,
SUM(gf)::int, SUM(ga)::int, SUM(gf - ga)::int,
@@ -426,6 +307,19 @@ async function run() {
goal_diff = EXCLUDED.goal_diff, pts = EXCLUDED.pts
`)
// Tournament aggregates
await db.execute(sql`
UPDATE tournaments SET
matches_count = (SELECT COUNT(*)::int FROM matches WHERE tournament_year = 2026 AND is_quali_playoff = false),
total_goals = (SELECT COALESCE(SUM(score_ft_home + score_ft_away), 0)::int FROM matches WHERE tournament_year = 2026 AND is_quali_playoff = false AND score_ft_home IS NOT NULL),
avg_goals_per_game = (
SELECT ROUND(COALESCE(SUM(score_ft_home + score_ft_away), 0)::numeric / NULLIF(COUNT(*), 0), 2)
FROM matches WHERE tournament_year = 2026 AND is_quali_playoff = false AND score_ft_home IS NOT NULL
)
WHERE year = 2026
`)
console.log(`${matchCount} matches, ${goalCount} goals`)
console.log('\n✅ Sync complete!')
await client.end()
}