Files
worldcup/scripts/scrape-wikipedia.ts
T

102 lines
3.9 KiB
TypeScript
Raw Normal View History

/**
* Scrape English Wikipedia for World Cup data and write JSON files to
* app/data/wikipedia/{year}/.
*
* Usage:
* pnpm scrape # all years, matches + squads
* pnpm scrape 2022 # single year, matches + squads
* pnpm scrape 2022 --matches # matches + meta + stadiums + groups only
* pnpm scrape 2022 --squads # squads only
*/
import { mkdirSync, writeFileSync } from 'fs'
import path from 'path'
import { fileURLToPath } from 'url'
import {
fetchWikiHtml, scrapeYear, scrapeSquads,
type Match, type Stadium, type Group, type Meta, type Squad,
} from '../lib/wiki-scraper'
// ES modules do not provide __dirname, so derive it from this file's URL.
const scriptFile = fileURLToPath(import.meta.url)
const __dirname = path.dirname(scriptFile)
// Root directory for the generated JSON, located relative to this script.
const DATA_DIR = path.join(__dirname, '../app/data/wikipedia')
// Tournament years scraped by default: every fourth year from 1930 through
// 2022, with 1942 and 1946 left out of the list.
const YEARS = Array.from({ length: 24 }, (_, i) => 1930 + 4 * i)
  .filter(year => year !== 1942 && year !== 1946)
/** Returns a promise that resolves (with no value) after `ms` milliseconds. */
function delay(ms: number): Promise<unknown> {
  return new Promise(resolve => setTimeout(resolve, ms))
}
// ── File output ────────────────────────────────────────────────────────────
/**
 * Write the match-side output for one tournament into DATA_DIR/{year}/,
 * creating the directory if needed.
 *
 * Files written (pretty-printed JSON, 2-space indent, UTF-8):
 *  - worldcup.meta.json      the `meta` object
 *  - worldcup.json           `{ matches }`
 *  - worldcup.stadiums.json  `{ stadiums }`, only when `stadiums` is non-empty
 *  - worldcup.groups.json    `{ groups }`, only when `groups` is non-empty
 *
 * @param year     Tournament year; used as the output sub-directory name.
 * @param matches  Matches to serialise.
 * @param stadiums Stadiums keyed by string; only the values are written.
 * @param groups   Map of group name to the set of team names in that group.
 * @param meta     Metadata object written as-is.
 */
function writeMatches(
  year: number,
  matches: Match[],
  stadiums: Map<string, Stadium>,
  groups: Map<string, Set<string>>,
  meta: Meta,
): void {
  const outDir = path.join(DATA_DIR, String(year))
  mkdirSync(outDir, { recursive: true })

  // All outputs share the same location, formatting and encoding.
  const dump = (fileName: string, payload: unknown): void => {
    writeFileSync(path.join(outDir, fileName), JSON.stringify(payload, null, 2), 'utf-8')
  }

  dump('worldcup.meta.json', meta)
  dump('worldcup.json', { matches })

  if (stadiums.size > 0) {
    dump('worldcup.stadiums.json', { stadiums: [...stadiums.values()] })
  }

  // Flatten the name -> Set<team> map into a JSON-friendly array of groups.
  const groupList: Group[] = [...groups].map(([name, teams]) => ({ name, teams: [...teams] }))
  if (groupList.length > 0) {
    dump('worldcup.groups.json', { groups: groupList })
  }
}
/**
 * Write the squads for `year` to DATA_DIR/{year}/worldcup.squads.json as
 * pretty-printed JSON. When `squads` is empty nothing is written and the
 * directory is not created.
 */
function writeSquads(year: number, squads: Squad[]): void {
  if (squads.length > 0) {
    const yearDir = path.join(DATA_DIR, String(year))
    mkdirSync(yearDir, { recursive: true })
    const target = path.join(yearDir, 'worldcup.squads.json')
    const serialized = JSON.stringify(squads, null, 2)
    writeFileSync(target, serialized, 'utf-8')
  }
}
// ── Entry point ────────────────────────────────────────────────────────────
/**
 * CLI entry point: parse the command-line arguments, then scrape and write
 * each requested tournament in turn.
 *
 * Arguments (all optional):
 *  - The first four-digit argument selects a single year; without one,
 *    every entry in YEARS is scraped.
 *  - `--matches` / `--squads` restrict the run to that half. Passing
 *    neither, or both, runs both halves.
 *
 * When the main tournament page cannot be fetched, the year is reported as
 * FAILED and skipped entirely (its squads are not attempted). A missing
 * squads page is not an error: it is reported as "0 squads".
 *
 * Every fetch attempt, successful or not, is followed by a short pause so
 * that requests are never issued back-to-back.
 */
async function main(): Promise<void> {
  // Pause after each request, in milliseconds.
  const pauseMs = 600

  const args = process.argv.slice(2)
  const yearArg = args.find(a => /^\d{4}$/.test(a))

  // A flag only excludes the other half when it is given on its own.
  const wantsMatches = args.includes('--matches')
  const wantsSquads = args.includes('--squads')
  const doMatches = wantsMatches || !wantsSquads
  const doSquads = wantsSquads || !wantsMatches

  const yearsToScrape = yearArg ? [Number.parseInt(yearArg, 10)] : YEARS
  const modeLabel = [doMatches && 'matches', doSquads && 'squads'].filter(Boolean).join(' + ')
  console.log(`Scraping ${yearsToScrape.length} World Cup(s) from Wikipedia [${modeLabel}]...`)

  for (const year of yearsToScrape) {
    // Progress line is built up piecewise and terminated by console.log().
    process.stdout.write(` ${year}... `)

    if (doMatches) {
      const mainHtml = await fetchWikiHtml(`${year}_FIFA_World_Cup`)
      if (!mainHtml) {
        console.log('FAILED (main page)')
        // Pause on the failure path too; otherwise a run of failed fetches
        // would hit the server with no gap between requests.
        await delay(pauseMs)
        continue
      }
      const { matches, stadiums, groups, meta } = await scrapeYear(year, mainHtml)
      writeMatches(year, matches, stadiums, groups, meta)
      process.stdout.write(`${matches.length} matches`)
      await delay(pauseMs)
    }

    if (doSquads) {
      const squadHtml = await fetchWikiHtml(`${year}_FIFA_World_Cup_squads`)
      const squads = squadHtml ? scrapeSquads(squadHtml) : []
      writeSquads(year, squads)
      process.stdout.write(`${doMatches ? ', ' : ''}${squads.length} squads`)
      await delay(pauseMs)
    }

    console.log()
  }

  console.log('\nDone! Files written to app/data/wikipedia/{year}/')
}
// Run the CLI; on any unhandled failure, log it and exit with status 1.
main().catch((err: unknown) => {
  console.error(err)
  process.exit(1)
})