From 98ddac97e854abc08150233ffee23ee8428b9950 Mon Sep 17 00:00:00 2001 From: valknarness Date: Tue, 28 Oct 2025 09:57:02 +0100 Subject: [PATCH] feat: implement incremental indexing and remove proactive rate limit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Major performance improvements for CI builds: 1. **Removed proactive rate limit threshold** - No longer pauses in CI when fewer than 500 requests remain - Uses full 5000 request quota before forced wait - Maximizes work per rate limit cycle 2. **Implemented incremental indexing** - Checks if repository already exists in database - Compares the stored last_commit with the repository's current pushedAt to detect changes - Only fetches README for new or updated repositories - Skips README fetch for unchanged repos (major time savings); repository metadata is still fetched for every repo 3. **Increased timeout to GitHub maximum** - Job timeout: 180m → 360m (6 hours, GitHub free tier max) - Script timeout: 170m → 350m - Allows full first-run indexing to complete Impact on performance: **First run (empty database):** - Same as before: ~25,000 repos need full indexing - May use most of the 360-minute budget, but is expected to complete **Subsequent runs (incremental):** - Only fetches READMEs for changed repos (~5-10% typically) - Dramatically faster: estimated 30-60 minutes instead of 360 - Makes daily automated builds sustainable Files changed: - lib/github-api.js: Removed proactive rate limit check - lib/indexer.js: Added incremental indexing logic - .github/workflows/build-database.yml: Increased timeout to 360m 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .github/workflows/build-database.yml | 8 ++++---- lib/github-api.js | 23 ----------------------- lib/indexer.js | 17 ++++++++++++++++- 3 files changed, 20 insertions(+), 28 deletions(-) diff --git a/.github/workflows/build-database.yml b/.github/workflows/build-database.yml index dbfc42d..cc22e90 100644 --- a/.github/workflows/build-database.yml +++ b/.github/workflows/build-database.yml @@ -22,7 +22,7 
@@ permissions: jobs: build-database: runs-on: ubuntu-latest - timeout-minutes: 180 # 3 hours max + timeout-minutes: 360 # 6 hours (GitHub Actions maximum for free tier) steps: - name: Checkout repository @@ -60,8 +60,8 @@ jobs: INDEX_MODE="${{ github.event.inputs.index_mode || 'full' }}" echo "Index mode: $INDEX_MODE" - # Build the index in non-interactive mode (170m timeout, job timeout is 180m) - timeout 170m node -e " + # Build the index in non-interactive mode (350m timeout, job timeout is 360m) + timeout 350m node -e " const db = require('./lib/database'); const dbOps = require('./lib/db-operations'); const indexer = require('./lib/indexer'); @@ -96,7 +96,7 @@ jobs: " || { EXIT_CODE=$? if [ $EXIT_CODE -eq 124 ]; then - echo "❌ Index building timed out after 170 minutes" + echo "❌ Index building timed out after 350 minutes" echo "This may indicate rate limiting issues or too many lists to index" fi exit $EXIT_CODE diff --git a/lib/github-api.js b/lib/github-api.js index 13a1ccd..7ce825f 100644 --- a/lib/github-api.js +++ b/lib/github-api.js @@ -140,29 +140,6 @@ async function rateLimitedRequest(url, options = {}) { await new Promise(resolve => setTimeout(resolve, RATE_LIMIT_DELAY - timeSinceLastRequest)); } - // Check rate limit proactively - // In CI: check frequently with no grace period to maximize efficiency - // Locally: use grace period to reduce API overhead after recovery - const isCI = process.env.CI === 'true'; - const timeSinceRecovery = Date.now() - lastRateLimitRecoveryTime; - const RECOVERY_GRACE_PERIOD = isCI ? 0 : (10 * 60 * 1000); // No grace period in CI, 10min locally - const CHECK_FREQUENCY = isCI ? 0.10 : 0.01; // 10% in CI, 1% locally - const LOW_THRESHOLD = isCI ? 
500 : 200; // Wait at 500 in CI for full batches, 200 locally - - if (timeSinceRecovery > RECOVERY_GRACE_PERIOD && Math.random() < CHECK_FREQUENCY) { - const rateLimitStatus = await checkRateLimit(); - if (rateLimitStatus && rateLimitStatus.remaining < LOW_THRESHOLD) { - console.log(); - console.log(chalk.yellow(`⚠️ Rate limit getting low: ${rateLimitStatus.remaining}/${rateLimitStatus.limit} remaining`)); - console.log(chalk.yellow(` Proactively waiting for rate limit to reset...`)); - - if (isCI) { - console.log(chalk.cyan('🤖 CI mode: waiting for full reset to maximize batch efficiency...')); - await waitForRateLimitReset(rateLimitStatus.reset); - } - } - } - lastRequestTime = Date.now(); const token = getGitHubToken(); diff --git a/lib/indexer.js b/lib/indexer.js index e400ad2..e62d31f 100644 --- a/lib/indexer.js +++ b/lib/indexer.js @@ -261,6 +261,9 @@ async function buildIndex(force = false, mode = null) { // Index repositories for (const repo of repos) { try { + // Check if repo already exists (incremental indexing) + const existingRepo = db.getRepositoryByUrl(repo.url); + // Get repo info from GitHub const repoInfo = await github.getRepoInfo(repo.url); @@ -268,8 +271,20 @@ async function buildIndex(force = false, mode = null) { const repoId = db.addRepository(listId, repoInfo.name, repo.url, repo.description || repoInfo.description, repoInfo); indexedRepos++; - // Index README if in full mode + // Determine if we need to fetch README + let shouldFetchReadme = false; if (indexChoice === 'full' || indexChoice === 'sample') { + if (!existingRepo) { + // New repo - fetch README + shouldFetchReadme = true; + } else if (existingRepo.last_commit !== repoInfo.pushedAt) { + // Repo updated since last index - fetch README + shouldFetchReadme = true; + } + // else: repo unchanged, skip README fetch + } + + if (shouldFetchReadme) { const repoReadme = await github.getReadme(repo.url); if (repoReadme) { const textContent = extractTextContent(repoReadme.content);