fix: retry failed group subpages, add rate-limit detection in scraper
- Detect Wikipedia plain-text rate-limit response ("You are making too many
requests") and wait 30s before retrying, rather than silently failing
- Increase the retry backoff step from 3s to 15s (wait before a retry = 15s × attempt number)
- Increase group subpage delay from 1.2s to 3s, year delay from 0.6s to 2s
- Re-scrape 1982, 1998, 2002, 2006 which had failed groups; all groups now
complete — e.g. 2002 now has 64 matches including Group E (Germany/Klose)
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
+8
-5
@@ -58,14 +58,17 @@ type State = { active: boolean; round: string; group: string | null }
|
||||
|
||||
/**
 * Returns a promise that settles (with no value) once a timer of `ms`
 * milliseconds fires. Callers `await` it to pause before the next request.
 */
const delay = (ms: number) =>
  new Promise(wake => {
    setTimeout(wake, ms)
  })
|
||||
|
||||
export async function fetchWikiHtml(page: string, retries = 5): Promise<string | null> {
|
||||
export async function fetchWikiHtml(page: string, retries = 6): Promise<string | null> {
|
||||
const url = `https://en.wikipedia.org/w/api.php?action=parse&page=${encodeURIComponent(page)}&format=json&prop=text&disabletoc=1`
|
||||
for (let attempt = 0; attempt < retries; attempt++) {
|
||||
try {
|
||||
if (attempt > 0) await delay(3000 * attempt)
|
||||
const res = await fetch(url, { headers: { 'User-Agent': 'WorldCupScraper/1.0' } })
|
||||
if (attempt > 0) await delay(15000 * attempt)
|
||||
const res = await fetch(url, { headers: { 'User-Agent': 'WorldCupScraper/1.0 (worldcup-stats)' } })
|
||||
if (res.status === 429) { await delay(30000); continue }
|
||||
if (!res.ok) continue
|
||||
const data = await res.json() as { parse?: { text?: { '*': string } } }
|
||||
const text = await res.text()
|
||||
if (text.toLowerCase().startsWith('you are making')) { await delay(30000); continue }
|
||||
const data = JSON.parse(text) as { parse?: { text?: { '*': string } } }
|
||||
const html = data?.parse?.text?.['*']
|
||||
if (html) return html
|
||||
} catch {
|
||||
@@ -414,7 +417,7 @@ export async function scrapeYear(
|
||||
process.stdout.write(`[skip ${group}] `)
|
||||
continue
|
||||
}
|
||||
await delay(1200)
|
||||
await delay(3000)
|
||||
const subHtml = await fetchWikiHtml(page)
|
||||
if (!subHtml) { process.stdout.write(`(failed: ${page}) `); continue }
|
||||
|
||||
|
||||
Reference in New Issue
Block a user