7e4bf2d07c
- Detect Wikipedia plain-text rate-limit response ("You are making too many
requests") and wait 30s before retrying, rather than silently failing
- Increase inter-attempt delay from 3s to 15s per attempt
- Increase group subpage delay from 1.2s to 3s, year delay from 0.6s to 2s
- Re-scrape 1982, 1998, 2002, 2006 which had failed groups; all groups now
complete — e.g. 2002 now has 64 matches including Group E (Germany/Klose)
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
102 lines
3.8 KiB
TypeScript
102 lines
3.8 KiB
TypeScript
/**
 * Scrape English Wikipedia for World Cup data and write JSON files to
 * data/{year}/.
 *
 * Usage:
 *   pnpm scrape                  # all years, matches + squads
 *   pnpm scrape 2022             # single year, matches + squads
 *   pnpm scrape 2022 --matches   # matches, meta, stadiums + groups only
 *   pnpm scrape 2022 --squads    # squads only
 */
|
|
import { mkdirSync, writeFileSync } from 'fs'
|
|
import path from 'path'
|
|
import { fileURLToPath } from 'url'
|
|
import {
|
|
fetchWikiHtml, scrapeYear, scrapeSquads,
|
|
type Match, type Stadium, type Group, type Meta, type Squad,
|
|
} from '../lib/wiki-scraper'
|
|
|
|
// ES modules have no built-in __dirname, so derive it from this module's URL.
const scriptPath = fileURLToPath(import.meta.url)
const __dirname = path.dirname(scriptPath)

// Root folder for all generated JSON: the `data` directory that sits beside
// this script's own directory.
const DATA_DIR = path.join(__dirname, '..', 'data')
|
|
|
|
// Every tournament year scraped when no single year is given on the command
// line, in chronological order.
const YEARS = [
  // pre-war
  1930, 1934, 1938,
  // 1950s–1970s
  1950, 1954, 1958, 1962, 1966, 1970, 1974, 1978,
  // 1980s–1990s
  1982, 1986, 1990, 1994, 1998,
  // 2000s onwards
  2002, 2006, 2010, 2014, 2018, 2022,
]
|
|
|
|
/**
 * Return a promise that resolves (with no value) after `ms` milliseconds.
 *
 * @param ms - How long to wait, in milliseconds.
 */
const delay = (ms: number) =>
  new Promise(resolve => {
    setTimeout(resolve, ms)
  })
|
|
|
|
// ── File output ────────────────────────────────────────────────────────────
|
|
|
|
/**
 * Write the match-related JSON files for one tournament into
 * `DATA_DIR/{year}/`, creating the directory if needed.
 *
 * Always writes `worldcup.meta.json` and `worldcup.json`. Writes
 * `worldcup.stadiums.json` and `worldcup.groups.json` only when there is at
 * least one stadium / group to record.
 *
 * @param year - Tournament year; used as the output sub-directory name.
 * @param matches - Matches, stored under the `matches` key of `worldcup.json`.
 * @param stadiums - Stadium map; only its values are written.
 * @param groups - Map from group name to the set of team names in that group.
 * @param meta - Tournament metadata, written as-is.
 */
function writeMatches(
  year: number,
  matches: Match[],
  stadiums: Map<string, Stadium>,
  groups: Map<string, Set<string>>,
  meta: Meta,
): void {
  const yearDir = path.join(DATA_DIR, String(year))
  mkdirSync(yearDir, { recursive: true })

  // Serialise `payload` as 2-space-indented JSON into a file in the year directory.
  const dumpJson = (fileName: string, payload: unknown): void => {
    writeFileSync(path.join(yearDir, fileName), JSON.stringify(payload, null, 2), 'utf-8')
  }

  dumpJson('worldcup.meta.json', meta)
  dumpJson('worldcup.json', { matches })

  if (stadiums.size > 0) {
    dumpJson('worldcup.stadiums.json', { stadiums: Array.from(stadiums.values()) })
  }

  // Flatten the Map<name, Set<team>> into a plain array so it survives JSON encoding.
  const groupList: Group[] = Array.from(groups, ([name, teams]) => ({
    name,
    teams: Array.from(teams),
  }))
  if (groupList.length > 0) {
    dumpJson('worldcup.groups.json', { groups: groupList })
  }
}
|
|
|
|
/**
 * Write `worldcup.squads.json` for one tournament into `DATA_DIR/{year}/`,
 * creating the directory if needed.
 *
 * An empty `squads` array is a no-op: nothing is created or overwritten.
 *
 * @param year - Tournament year; used as the output sub-directory name.
 * @param squads - Squads to serialise as a top-level JSON array.
 */
function writeSquads(year: number, squads: Squad[]): void {
  if (squads.length > 0) {
    const yearDir = path.join(DATA_DIR, String(year))
    mkdirSync(yearDir, { recursive: true })

    const target = path.join(yearDir, 'worldcup.squads.json')
    writeFileSync(target, JSON.stringify(squads, null, 2), 'utf-8')
  }
}
|
|
|
|
// ── Entry point ────────────────────────────────────────────────────────────
|
|
|
|
/**
 * CLI entry point: scrape one or all World Cup years and write the results
 * under `data/{year}/`.
 *
 * Arguments (read from `process.argv`):
 * - a four-digit year: scrape only that year; otherwise every entry in `YEARS`.
 * - `--matches`: scrape match data only.
 * - `--squads`: scrape squad data only.
 * With neither flag (or both), matches and squads are scraped.
 *
 * A falsy result from `fetchWikiHtml` is treated as a failed fetch. A failed
 * main page skips the rest of that year; a failed squads page is reported on
 * the progress line instead of being shown as "0 squads".
 */
async function main(): Promise<void> {
  // Pause after every page request. The squads branch used to wait only
  // 600 ms and the main-page failure path did not wait at all, so those
  // paths could still fire requests in quick succession.
  const REQUEST_DELAY_MS = 2000

  const args = process.argv.slice(2)
  const yearArg = args.find(a => /^\d{4}$/.test(a))
  // Each flag restricts the run to its own mode; the absence of the *other*
  // flag enables a mode by default.
  const doMatches = args.includes('--matches') || !args.includes('--squads')
  const doSquads = args.includes('--squads') || !args.includes('--matches')

  const yearsToScrape = yearArg ? [parseInt(yearArg, 10)] : YEARS
  const modeLabel = [doMatches && 'matches', doSquads && 'squads'].filter(Boolean).join(' + ')

  console.log(`Scraping ${yearsToScrape.length} World Cup(s) from Wikipedia [${modeLabel}]...`)

  for (const year of yearsToScrape) {
    // Progress is built up on one line per year and terminated by console.log() below.
    process.stdout.write(` ${year}... `)

    if (doMatches) {
      const mainHtml = await fetchWikiHtml(`${year}_FIFA_World_Cup`)
      if (!mainHtml) {
        console.log('FAILED (main page)')
        // Still pause before the next year's request.
        await delay(REQUEST_DELAY_MS)
        continue
      }
      const { matches, stadiums, groups, meta } = await scrapeYear(year, mainHtml)
      writeMatches(year, matches, stadiums, groups, meta)
      process.stdout.write(`${matches.length} matches`)
      await delay(REQUEST_DELAY_MS)
    }

    if (doSquads) {
      const squadHtml = await fetchWikiHtml(`${year}_FIFA_World_Cup_squads`)
      const squads = squadHtml ? scrapeSquads(squadHtml) : []
      writeSquads(year, squads)
      // Distinguish "page could not be fetched" from "page had no squads".
      const squadLabel = squadHtml ? `${squads.length} squads` : 'squads FAILED (squads page)'
      process.stdout.write(`${doMatches ? ', ' : ''}${squadLabel}`)
      await delay(REQUEST_DELAY_MS)
    }

    console.log()
  }

  console.log('\nDone! Files written to data/{year}/')
}
|
|
|
|
// Run the scraper; on any unhandled rejection, log the error and exit with status 1.
main().catch(err => {
  console.error(err)
  process.exit(1)
})
|