From 9ce2a4e27c2cf75cd771d51eac96cafcec94fb46 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sebastian=20Kr=C3=BCger?= Date: Mon, 15 Jun 2026 18:14:53 +0200 Subject: [PATCH] fix: use full player names from title attr, preserve UTC offset in match times MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wikipedia abbreviates goal scorer display text (e.g. "Müller") but the title attribute always has the full name. Switch parseGoals() to prefer title attr and strip disambiguation suffixes like "(soccer, born 1993)". This ensures Gerd Müller and Thomas Müller get separate player pages. Also preserve the UTC offset from Wikipedia's ftime (e.g. "12:00 UTC-4") so that isLive() can accurately compute UTC kickoff time instead of treating local time as UTC. upcomingMatches sorts by SPLIT_PART on the HH:MM part to ignore the timezone suffix. Co-Authored-By: Claude Sonnet 4.6 --- lib/graphql/resolvers/index.ts | 2 +- lib/wiki-scraper.ts | 19 ++++++++++++++----- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/lib/graphql/resolvers/index.ts b/lib/graphql/resolvers/index.ts index 7f593ad..05ae28b 100644 --- a/lib/graphql/resolvers/index.ts +++ b/lib/graphql/resolvers/index.ts @@ -134,7 +134,7 @@ export const resolvers = { sql`${matches.scoreFtHome} IS NULL`, eq(matches.isQualiPlayoff, false), )) - .orderBy(asc(matches.date), sql`${matches.timeLocal} ASC NULLS LAST`, asc(matches.id)) + .orderBy(asc(matches.date), sql`SPLIT_PART(${matches.timeLocal}, ' ', 1) ASC NULLS LAST`, asc(matches.id)) .limit(limit) return Promise.all(rows.map(hydrateMatch)) } catch (e) { if (isMissingTable(e)) return []; throw e } diff --git a/lib/wiki-scraper.ts b/lib/wiki-scraper.ts index 852fb8f..d3ffc90 100644 --- a/lib/wiki-scraper.ts +++ b/lib/wiki-scraper.ts @@ -116,8 +116,12 @@ function parseGoals($: CheerioAPI, $td: Cheerio): Goal[] { let playerName = '' $li.find('a').each((_, a) => { if (!$(a).closest('.fb-goal').length) { - const t = $(a).text().trim() - 
if (t) { playerName = t; return false } + const display = $(a).text().trim() + if (!display) return + // title attr has the full unabbreviated name; strip disambiguation suffix + const titleAttr = ($(a).attr('title') ?? '').replace(/\s*\([^)]*\)\s*$/, '').trim() + playerName = titleAttr || display + return false } }) if (!playerName) return @@ -155,14 +159,19 @@ function parseGroundParts(ground: string): { name: string; city: string } { } function parseTime12h(text: string): string | undefined { - const m = text.match(/(\d{1,2}):(\d{2})\s*([ap]\.?m\.?)/i) - if (!m) return text.match(/(\d{2}:\d{2})/)?.[1] + // Normalise Unicode minus (U+2212) used by Wikipedia to ASCII hyphen + const t = text.replace(/−/g, '-') + const m = t.match(/(\d{1,2}):(\d{2})\s*([ap]\.?m\.?)/i) + if (!m) return t.match(/(\d{2}:\d{2})/)?.[1] let h = parseInt(m[1]) const min = m[2] const isPm = m[3].toLowerCase().replace(/\./g, '').startsWith('p') if (isPm && h !== 12) h += 12 else if (!isPm && h === 12) h = 0 - return `${String(h).padStart(2, '0')}:${min}` + const time24 = `${String(h).padStart(2, '0')}:${min}` + // Preserve UTC offset so isLive() can compute correct UTC kickoff time + const tz = t.match(/UTC([+-]\d+(?:\.\d+)?)/i) + return tz ? `${time24} UTC${tz[1]}` : time24 } function parseBox($: CheerioAPI, $box: Cheerio, round: string, group: string | null): Match | null {