Files
worldcup/scripts/scrape-wikipedia.ts
T
valknar b141356247 refactor: replace hardcoded hex colors with theme tokens, move data/ to root
- Add --color-green-mid token (#4a7a55) to @theme for dimmer stat values
- Replace all text-[#hex]/bg-[#hex] arbitrary values with named tokens:
  text-green, text-green-light, text-green-sec, text-green-muted,
  text-green-dark, text-green-mid, text-text, bg-card, bg-bg, border-border
- Replace rgba(34,197,94,X) inline styles with bg-green/X opacity modifiers
- Convert single-prop style={{ borderColor/background }} to className
- Fix SVG stroke="#dff5e8" → stroke="currentColor"
- Use CSS variables in globals.css base styles (background-color, color)
- Move app/data/wikipedia/ → data/ (project root, not inside Next.js app dir)
- Update Dockerfile, seed.ts, scrape-wikipedia.ts paths accordingly
- Remove unused app/data/world_cup.csv

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-15 18:08:23 +02:00

102 lines
3.9 KiB
TypeScript

/**
* Scrape English Wikipedia for World Cup data and write JSON files to
* data/wikipedia/{year}/.
*
* Usage:
* pnpm scrape # all years, matches + squads
* pnpm scrape 2022 # single year, matches + squads
* pnpm scrape 2022 --matches # matches + meta + stadiums only
* pnpm scrape 2022 --squads # squads only
*/
import { mkdirSync, writeFileSync } from 'fs'
import path from 'path'
import { fileURLToPath } from 'url'
import {
fetchWikiHtml, scrapeYear, scrapeSquads,
type Match, type Stadium, type Group, type Meta, type Squad,
} from '../lib/wiki-scraper'
// ES modules have no built-in __dirname, so rebuild it from this module's URL.
const scriptFile = fileURLToPath(import.meta.url)
const __dirname = path.dirname(scriptFile)

// Root directory that receives one sub-directory per tournament year.
const DATA_DIR = path.join(__dirname, '../data/wikipedia')
// Tournament years scraped when no year is given on the command line.
// Note the jump from 1938 to 1950: the list has no 1942 or 1946 entry.
const YEARS = [
  1930, 1934, 1938,
  1950, 1954, 1958, 1962, 1966,
  1970, 1974, 1978, 1982, 1986,
  1990, 1994, 1998, 2002, 2006,
  2010, 2014, 2018, 2022,
]
/**
 * Return a promise that resolves (with no value) after `ms` milliseconds.
 *
 * @param ms Time to wait, in milliseconds.
 */
function delay(ms: number): Promise<unknown> {
  return new Promise(resolve => {
    setTimeout(resolve, ms)
  })
}
// ── File output ────────────────────────────────────────────────────────────
/**
 * Write the match-related JSON files for one tournament into
 * DATA_DIR/{year}/, creating the directory if it does not exist yet.
 *
 * `worldcup.meta.json` and `worldcup.json` are always written;
 * `worldcup.stadiums.json` and `worldcup.groups.json` are written only when
 * there is at least one stadium / group to record.
 *
 * @param year     Tournament year, used as the directory name.
 * @param matches  Matches, stored under the `matches` key of worldcup.json.
 * @param stadiums Stadium map; only its values are written.
 * @param groups   Map from group name to the set of teams in that group.
 * @param meta     Tournament metadata, serialized as-is.
 */
function writeMatches(
  year: number,
  matches: Match[],
  stadiums: Map<string, Stadium>,
  groups: Map<string, Set<string>>,
  meta: Meta,
): void {
  const yearDir = path.join(DATA_DIR, String(year))
  mkdirSync(yearDir, { recursive: true })

  // Every output shares the same pretty-printed (2-space) UTF-8 JSON format.
  const emit = (fileName: string, payload: unknown): void => {
    writeFileSync(path.join(yearDir, fileName), JSON.stringify(payload, null, 2), 'utf-8')
  }

  emit('worldcup.meta.json', meta)
  emit('worldcup.json', { matches })

  if (stadiums.size > 0) {
    emit('worldcup.stadiums.json', { stadiums: [...stadiums.values()] })
  }

  // Flatten the Map<name, Set<team>> into a JSON-friendly array of records.
  const groupList: Group[] = Array.from(groups, ([name, teams]) => ({ name, teams: [...teams] }))
  if (groupList.length > 0) {
    emit('worldcup.groups.json', { groups: groupList })
  }
}
/**
 * Write `worldcup.squads.json` for one tournament into DATA_DIR/{year}/.
 *
 * Nothing is created or written when `squads` is empty.
 *
 * @param year   Tournament year, used as the directory name.
 * @param squads Squads to serialize; written as a top-level JSON array.
 */
function writeSquads(year: number, squads: Squad[]): void {
  if (squads.length > 0) {
    const yearDir = path.join(DATA_DIR, String(year))
    const target = path.join(yearDir, 'worldcup.squads.json')
    const json = JSON.stringify(squads, null, 2)

    mkdirSync(yearDir, { recursive: true })
    writeFileSync(target, json, 'utf-8')
  }
}
// ── Entry point ────────────────────────────────────────────────────────────
/**
 * CLI entry point: scrape the selected World Cup year(s) and write the
 * resulting JSON files.
 *
 * Arguments (read from `process.argv`):
 * - a four-digit year selects that single tournament; without one, every
 *   entry of `YEARS` is scraped;
 * - `--matches` / `--squads` restrict the run to that mode; with neither flag
 *   (or both) both modes run.
 *
 * For each year one progress line is printed. A main page that cannot be
 * fetched is reported and the rest of that year is skipped; a squads page that
 * cannot be fetched is reported as a failure rather than as "0 squads". The
 * loop pauses 600 ms after each page it processes.
 */
async function main(): Promise<void> {
  const args = process.argv.slice(2)
  const yearArg = args.find(a => /^\d{4}$/.test(a))

  // A mode is enabled when it is requested explicitly or when the other mode
  // was not requested, so "no flags" and "both flags" both mean "everything".
  const doMatches = args.includes('--matches') || !args.includes('--squads')
  const doSquads = args.includes('--squads') || !args.includes('--matches')

  // Explicit radix: the argument is always meant as a decimal year.
  const yearsToScrape = yearArg ? [Number.parseInt(yearArg, 10)] : YEARS
  const modeLabel = [doMatches && 'matches', doSquads && 'squads'].filter(Boolean).join(' + ')

  console.log(`Scraping ${yearsToScrape.length} World Cup(s) from Wikipedia [${modeLabel}]...`)

  for (const year of yearsToScrape) {
    process.stdout.write(` ${year}... `)

    if (doMatches) {
      const mainHtml = await fetchWikiHtml(`${year}_FIFA_World_Cup`)
      // Without the main page the year is abandoned, squads included.
      if (!mainHtml) { console.log('FAILED (main page)'); continue }
      const { matches, stadiums, groups, meta } = await scrapeYear(year, mainHtml)
      writeMatches(year, matches, stadiums, groups, meta)
      process.stdout.write(`${matches.length} matches`)
      await delay(600)
    }

    if (doSquads) {
      const squadHtml = await fetchWikiHtml(`${year}_FIFA_World_Cup_squads`)
      // Only separate from the matches count when one was printed on this line.
      const separator = doMatches ? ', ' : ''
      if (!squadHtml) {
        // Report the failed fetch explicitly, mirroring the main-page handling,
        // instead of presenting it as a successful scrape that found nothing.
        process.stdout.write(`${separator}FAILED (squads page)`)
      } else {
        const squads = scrapeSquads(squadHtml)
        writeSquads(year, squads)
        process.stdout.write(`${separator}${squads.length} squads`)
      }
      await delay(600)
    }

    // Terminate this year's progress line.
    console.log()
  }

  console.log('\nDone! Files written to data/wikipedia/{year}/')
}
// Run the CLI; any rejection is printed and turned into exit code 1.
main().catch((err: unknown) => {
  console.error(err)
  process.exit(1)
})