/**
 * Scrape English Wikipedia for World Cup data and write JSON files to
 * app/data/wikipedia/{year}/.
 *
 * Usage:
 *   pnpm scrape                  # all years, matches + squads
 *   pnpm scrape 2022             # single year, matches + squads
 *   pnpm scrape 2022 --matches   # matches + meta + stadiums + groups only
 *   pnpm scrape 2022 --squads    # squads only
 */

import { mkdirSync, writeFileSync } from 'fs'
import path from 'path'
import { fileURLToPath } from 'url'
import {
  fetchWikiHtml,
  scrapeYear,
  scrapeSquads,
  type Match,
  type Stadium,
  type Group,
  type Meta,
  type Squad,
} from '../lib/wiki-scraper'

const __dirname = path.dirname(fileURLToPath(import.meta.url))
const DATA_DIR = path.join(__dirname, '../app/data/wikipedia')

/** Tournament years scraped when no year is given on the command line. */
const YEARS = [
  1930, 1934, 1938, 1950, 1954, 1958, 1962, 1966, 1970, 1974,
  1978, 1982, 1986, 1990, 1994, 1998, 2002, 2006, 2010, 2014, 2018, 2022,
]

/** Pause inserted after every Wikipedia request, in milliseconds. */
const REQUEST_DELAY_MS = 600

/** Resolves after `ms` milliseconds. */
const delay = (ms: number): Promise<void> =>
  new Promise<void>(resolve => setTimeout(resolve, ms))

// ── File output ────────────────────────────────────────────────────────────

/** Creates (if needed) and returns the output directory for `year`. */
function ensureYearDir(year: number): string {
  const dir = path.join(DATA_DIR, String(year))
  mkdirSync(dir, { recursive: true })
  return dir
}

/** Writes `data` as 2-space-indented JSON to `dir/fileName`. */
function writeJson(dir: string, fileName: string, data: unknown): void {
  writeFileSync(path.join(dir, fileName), JSON.stringify(data, null, 2), 'utf-8')
}

/**
 * Writes the match-related files for one tournament year.
 *
 * `worldcup.meta.json` and `worldcup.json` are always written;
 * `worldcup.stadiums.json` and `worldcup.groups.json` are written only when
 * there is at least one stadium / group.
 *
 * NOTE(review): the key types of the two maps are assumed to be `string`
 * (the group key is written out as `Group.name`) — confirm against the
 * return type of `scrapeYear`.
 *
 * @param year     Tournament year; selects the output sub-directory.
 * @param matches  Matches, stored as `{ matches }`.
 * @param stadiums Stadium map; only its values are written.
 * @param groups   Map from group name to the teams in that group.
 * @param meta     Tournament metadata, written as-is.
 */
function writeMatches(
  year: number,
  matches: Match[],
  stadiums: Map<string, Stadium>,
  groups: Map<string, Set<string>>,
  meta: Meta,
): void {
  const dir = ensureYearDir(year)

  writeJson(dir, 'worldcup.meta.json', meta)
  writeJson(dir, 'worldcup.json', { matches })

  if (stadiums.size > 0) {
    writeJson(dir, 'worldcup.stadiums.json', { stadiums: Array.from(stadiums.values()) })
  }

  // Flatten the name -> team-set map into a JSON-serialisable list.
  const groupList: Group[] = Array.from(groups, ([name, teams]) => ({
    name,
    teams: Array.from(teams),
  }))
  if (groupList.length > 0) {
    writeJson(dir, 'worldcup.groups.json', { groups: groupList })
  }
}

/**
 * Writes `worldcup.squads.json` for one tournament year.
 * Does nothing (and creates no directory) when `squads` is empty.
 *
 * @param year   Tournament year; selects the output sub-directory.
 * @param squads Squads, written as a top-level JSON array.
 */
function writeSquads(year: number, squads: Squad[]): void {
  if (squads.length === 0) return
  writeJson(ensureYearDir(year), 'worldcup.squads.json', squads)
}

// ── Entry point ────────────────────────────────────────────────────────────

/**
 * Parses the command line and scrapes the selected years one at a time.
 *
 * The first four-digit argument selects a single year; otherwise every year
 * in `YEARS` is scraped. `--matches` / `--squads` restrict the run to that
 * mode; passing neither (or both) runs both modes.
 *
 * A failed main-page fetch is reported on the progress line but does not
 * skip the squads scrape for the same year, and the inter-request pause is
 * applied after every fetch, including failed ones.
 */
async function main(): Promise<void> {
  const args = process.argv.slice(2)
  const yearArg = args.find(a => /^\d{4}$/.test(a))
  const doMatches = args.includes('--matches') || !args.includes('--squads')
  const doSquads = args.includes('--squads') || !args.includes('--matches')

  const yearsToScrape = yearArg ? [Number.parseInt(yearArg, 10)] : YEARS
  const modeLabel = [doMatches && 'matches', doSquads && 'squads'].filter(Boolean).join(' + ')

  console.log(`Scraping ${yearsToScrape.length} World Cup(s) from Wikipedia [${modeLabel}]...`)

  for (const year of yearsToScrape) {
    process.stdout.write(` ${year}... `)

    if (doMatches) {
      const mainHtml = await fetchWikiHtml(`${year}_FIFA_World_Cup`)
      if (mainHtml) {
        const { matches, stadiums, groups, meta } = await scrapeYear(year, mainHtml)
        writeMatches(year, matches, stadiums, groups, meta)
        process.stdout.write(`${matches.length} matches`)
      } else {
        // Report the failure inline and fall through: the squads page is a
        // separate request and may still be available.
        process.stdout.write('FAILED (main page)')
      }
      await delay(REQUEST_DELAY_MS)
    }

    if (doSquads) {
      const squadHtml = await fetchWikiHtml(`${year}_FIFA_World_Cup_squads`)
      // A failed fetch is treated as "no squads"; writeSquads then writes nothing.
      const squads = squadHtml ? scrapeSquads(squadHtml) : []
      writeSquads(year, squads)
      process.stdout.write(`${doMatches ? ', ' : ''}${squads.length} squads`)
      await delay(REQUEST_DELAY_MS)
    }

    console.log() // terminate this year's progress line
  }

  console.log('\nDone! Files written to app/data/wikipedia/{year}/')
}

main().catch((err: unknown) => {
  console.error(err)
  process.exit(1)
})