feat: scrape tournament meta from Wikipedia, drop world_cup.csv
Add worldcup.meta.json per year with host, teams_count, winner, runner_up, third_place, fourth_place — derived from match results (Final/Third-place match) with infobox as fallback for edge cases like 1950's round-robin final. Fix infobox host extraction to handle <br>-separated multi-host entries (2002: Japan / South Korea). Fix squad scraper to filter out zero-player phantom sections that Wikipedia appends (References, Captains, etc.). Drop app/data/world_cup.csv and the PLACEMENTS/parseCsv code in seed.ts — all tournament metadata now comes from the scraped JSON files. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
+24
-86
@@ -8,39 +8,13 @@ import { getIso } from '../lib/iso-codes'
|
||||
|
||||
const DATABASE_URL = process.env.DATABASE_URL ?? 'postgres://wc:wc@localhost:5432/worldcup'
|
||||
const __dirname = path.dirname(fileURLToPath(import.meta.url))
|
||||
const DATA_DIR = path.join(__dirname, '../app/data')
|
||||
const WC_DIR = path.join(DATA_DIR, 'wikipedia')
|
||||
const WC_DIR = path.join(__dirname, '../app/data/wikipedia')
|
||||
|
||||
// Tournament years covered by the seed data, oldest first, grouped by decade.
// Note that the list goes straight from 1938 to 1950.
const YEARS = [
  1930, 1934, 1938,
  1950, 1954, 1958,
  1962, 1966,
  1970, 1974, 1978,
  1982, 1986,
  1990, 1994, 1998,
  2002, 2006,
  2010, 2014, 2018,
  2022,
]
|
||||
|
||||
// Third- and fourth-place finishers per tournament year. These are maintained
// by hand here rather than read from the CSV source data.
// Values are written to the DB as-is (they are not passed through normTeam),
// so they must already use the canonical DB spelling, e.g. 'Germany' rather
// than 'West Germany'.
const PLACEMENTS: Record<number, { third?: string; fourth?: string }> = {
  1930: { third: 'USA', fourth: 'Yugoslavia' },
  1934: { third: 'Germany', fourth: 'Austria' },
  1938: { third: 'Brazil', fourth: 'Sweden' },
  // 1950 had no third-place match: placings come from the final round-robin
  // group standings (Uruguay, Brazil, Sweden, Spain).
  1950: { third: 'Sweden', fourth: 'Spain' },
  1954: { third: 'Austria', fourth: 'Uruguay' },
  1958: { third: 'France', fourth: 'Germany' },
  1962: { third: 'Chile', fourth: 'Yugoslavia' },
  1966: { third: 'Portugal', fourth: 'Soviet Union' },
  1970: { third: 'Germany', fourth: 'Uruguay' },
  1974: { third: 'Poland', fourth: 'Brazil' },
  1978: { third: 'Brazil', fourth: 'Italy' },
  1982: { third: 'Poland', fourth: 'France' },
  1986: { third: 'France', fourth: 'Belgium' },
  1990: { third: 'Italy', fourth: 'England' },
  1994: { third: 'Sweden', fourth: 'Bulgaria' },
  1998: { third: 'Croatia', fourth: 'Netherlands' },
  2002: { third: 'Turkey', fourth: 'South Korea' },
  2006: { third: 'Germany', fourth: 'Portugal' },
  2010: { third: 'Germany', fourth: 'Uruguay' },
  2014: { third: 'Netherlands', fourth: 'Brazil' },
  2018: { third: 'Belgium', fourth: 'England' },
  2022: { third: 'Croatia', fourth: 'Morocco' },
}
|
||||
|
||||
// Normalize team names from Wikipedia to canonical DB names
|
||||
const TEAM_ALIASES: Record<string, string> = {
|
||||
'West Germany': 'Germany',
|
||||
@@ -52,38 +26,6 @@ function normTeam(name: string): string {
|
||||
return TEAM_ALIASES[name] ?? name
|
||||
}
|
||||
|
||||
/**
 * Minimal RFC-4180-style CSV parser.
 *
 * Supports double-quoted fields (which may contain commas, line breaks and
 * `""`-escaped quotes) and both LF and CRLF line endings. The first row is
 * used as the header; every following row that has at least one non-blank
 * field becomes an object keyed by the trimmed header names.
 *
 * @param content Raw CSV text.
 * @returns One record per data row. Field values are trimmed, and a column
 *   missing from a short row is returned as `''`. Returns `[]` when the input
 *   contains no header row at all.
 */
function parseCsv(content: string): Record<string, string>[] {
  const rows: string[][] = []
  let row: string[] = []
  let field = ''
  let inQ = false

  for (let i = 0; i < content.length; i++) {
    const ch = content.charAt(i)
    if (inQ) {
      if (ch !== '"') {
        field += ch
      } else if (content.charAt(i + 1) === '"') {
        // A doubled quote inside a quoted field is one literal quote.
        field += '"'
        i++
      } else {
        inQ = false
      }
    } else if (ch === '"') {
      inQ = true
    } else if (ch === ',') {
      row.push(field)
      field = ''
    } else if (ch === '\n') {
      row.push(field)
      rows.push(row)
      row = []
      field = ''
    } else if (ch !== '\r') {
      // A bare CR outside quotes is dropped so CRLF behaves like LF.
      field += ch
    }
  }

  // Flush the last record when the input does not end with a newline.
  if (field || row.length) {
    row.push(field)
    rows.push(row)
  }

  const [headerRow, ...dataRows] = rows
  // Empty input yields no header row; return early instead of relying on the
  // mapping below never touching an undefined header.
  if (!headerRow) return []

  // Trim header names once rather than once per data row.
  const headers = headerRow.map(h => h.trim())
  return dataRows
    .filter(r => r.some(f => f.trim()))
    .map(r => Object.fromEntries(headers.map((h, i) => [h, (r[i] ?? '').trim()])))
}
|
||||
|
||||
function readJson<T>(filePath: string): T | null {
|
||||
if (!existsSync(filePath)) return null
|
||||
try { return JSON.parse(readFileSync(filePath, 'utf-8')) as T } catch { return null }
|
||||
@@ -100,6 +42,7 @@ type RawMatch = {
|
||||
group?: string; ground?: string;
|
||||
}
|
||||
// Wrapper object holding a list of RawMatch entries under `matches`.
type RawData = { matches: RawMatch[] }
|
||||
// Shape of a per-year `worldcup.meta.json` file. Every field except `host`
// is nullable.
type RawMeta = {
  host: string
  teams_count: number | null
  winner: string | null
  runner_up: string | null
  third_place: string | null
  fourth_place: string | null
}

// Stadium list: each entry needs a name and a city; the remaining fields
// are optional.
type RawStadiums = {
  stadiums: Array<{
    name: string
    city: string
    cc?: string
    capacity?: number
    timezone?: string
    coords?: string
  }>
}

// A named squad and its players; only the player's name is mandatory.
type RawSquad = {
  name: string
  players: Array<{
    name: string
    number?: number
    pos?: string
    date_of_birth?: string
  }>
}
|
||||
|
||||
@@ -245,33 +188,7 @@ async function run() {
|
||||
return id
|
||||
}
|
||||
|
||||
// 1. Tournaments from world_cup.csv (host, winner, runner_up)
|
||||
const wcRows = parseCsv(readFileSync(path.join(DATA_DIR, 'world_cup.csv'), 'utf-8'))
|
||||
for (const r of wcRows) {
|
||||
const year = parseInt(r['Year'])
|
||||
if (isNaN(year)) continue
|
||||
const winner = normTeam(r['Champion'] || '')
|
||||
const runnerUp = normTeam(r['Runner-Up'] || '')
|
||||
const p = PLACEMENTS[year] ?? {}
|
||||
await db.execute(sql`
|
||||
INSERT INTO tournaments (year, host, winner, runner_up, third_place, fourth_place, teams_count)
|
||||
VALUES (
|
||||
${year}, ${r['Host']},
|
||||
${winner || null}, ${runnerUp || null},
|
||||
${p.third ?? null}, ${p.fourth ?? null},
|
||||
${parseInt(r['Teams']) || null}
|
||||
)
|
||||
ON CONFLICT (year) DO UPDATE SET
|
||||
host = EXCLUDED.host,
|
||||
winner = EXCLUDED.winner,
|
||||
runner_up = EXCLUDED.runner_up,
|
||||
third_place = EXCLUDED.third_place,
|
||||
fourth_place = EXCLUDED.fourth_place,
|
||||
teams_count = EXCLUDED.teams_count
|
||||
`)
|
||||
}
|
||||
|
||||
// 2. Per-year match/stadium/squad data from openfootball JSON files
|
||||
// Per-year data from Wikipedia JSON files
|
||||
let totalMatches = 0
|
||||
let totalGoals = 0
|
||||
|
||||
@@ -283,6 +200,27 @@ async function run() {
|
||||
continue
|
||||
}
|
||||
|
||||
// Tournament row from meta.json
|
||||
const meta = readJson<RawMeta>(path.join(yearDir, 'worldcup.meta.json'))
|
||||
if (meta) {
|
||||
await db.execute(sql`
|
||||
INSERT INTO tournaments (year, host, winner, runner_up, third_place, fourth_place, teams_count)
|
||||
VALUES (
|
||||
${year}, ${meta.host || null},
|
||||
${normTeam(meta.winner ?? '') || null}, ${normTeam(meta.runner_up ?? '') || null},
|
||||
${normTeam(meta.third_place ?? '') || null}, ${normTeam(meta.fourth_place ?? '') || null},
|
||||
${meta.teams_count ?? null}
|
||||
)
|
||||
ON CONFLICT (year) DO UPDATE SET
|
||||
host = EXCLUDED.host,
|
||||
winner = EXCLUDED.winner,
|
||||
runner_up = EXCLUDED.runner_up,
|
||||
third_place = EXCLUDED.third_place,
|
||||
fourth_place = EXCLUDED.fourth_place,
|
||||
teams_count = EXCLUDED.teams_count
|
||||
`)
|
||||
}
|
||||
|
||||
let matchCount = 0, goalCount = 0
|
||||
|
||||
// Stadiums
|
||||
|
||||
Reference in New Issue
Block a user