feat: implement incremental indexing and remove proactive rate limit

Major performance improvements for CI builds:

1. **Removed proactive rate limit threshold**
   - No longer waits at 500 remaining requests
   - Uses full 5000 request quota before forced wait
   - Maximizes work per rate limit cycle

2. **Implemented incremental indexing**
   - Checks if repository already exists in database
   - Compares last_commit (pushedAt) to detect changes
   - Only fetches README for new or updated repositories
   - Skips README fetch for unchanged repos (the major time savings)

3. **Increased timeout to GitHub maximum**
   - Job timeout: 180m → 360m (6 hours, GitHub free tier max)
   - Script timeout: 170m → 350m
   - Allows full first-run indexing to complete

Impact on performance:

**First run (empty database):**
- Same as before: ~25,000 repos need full indexing
- May use the full 360 minutes, but should complete

**Subsequent runs (incremental):**
- Only fetches READMEs for changed repos (~5-10% typically)
- Dramatically faster: estimated 30-60 minutes instead of 360
- Makes daily automated builds sustainable

Files changed:
- lib/github-api.js: Removed proactive rate limit check
- lib/indexer.js: Added incremental indexing logic
- .github/workflows/build-database.yml: Increased timeout to 360m

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-28 09:57:02 +01:00
parent 279cc2fa25
commit 98ddac97e8
3 changed files with 20 additions and 28 deletions
--- a/lib/github-api.js
+++ b/lib/github-api.js
@@ -140,29 +140,6 @@ async function rateLimitedRequest(url, options = {}) {
    await new Promise(resolve => setTimeout(resolve, RATE_LIMIT_DELAY - timeSinceLastRequest));
  }

-  // Check rate limit proactively
-  // In CI: check frequently with no grace period to maximize efficiency
-  // Locally: use grace period to reduce API overhead after recovery
-  const isCI = process.env.CI === 'true';
-  const timeSinceRecovery = Date.now() - lastRateLimitRecoveryTime;
-  const RECOVERY_GRACE_PERIOD = isCI ? 0 : (10 * 60 * 1000); // No grace period in CI, 10min locally
-  const CHECK_FREQUENCY = isCI ? 0.10 : 0.01; // 10% in CI, 1% locally
-  const LOW_THRESHOLD = isCI ? 500 : 200; // Wait at 500 in CI for full batches, 200 locally
-
-  if (timeSinceRecovery > RECOVERY_GRACE_PERIOD && Math.random() < CHECK_FREQUENCY) {
-    const rateLimitStatus = await checkRateLimit();
-    if (rateLimitStatus && rateLimitStatus.remaining < LOW_THRESHOLD) {
-      console.log();
-      console.log(chalk.yellow(`⚠️  Rate limit getting low: ${rateLimitStatus.remaining}/${rateLimitStatus.limit} remaining`));
-      console.log(chalk.yellow(`   Proactively waiting for rate limit to reset...`));
-
-      if (isCI) {
-        console.log(chalk.cyan('🤖 CI mode: waiting for full reset to maximize batch efficiency...'));
-        await waitForRateLimitReset(rateLimitStatus.reset);
-      }
-    }
-  }
-
  lastRequestTime = Date.now();

  const token = getGitHubToken();
--- a/lib/indexer.js
+++ b/lib/indexer.js
@@ -261,6 +261,9 @@ async function buildIndex(force = false, mode = null) {
      // Index repositories
      for (const repo of repos) {
        try {
+          // Check if repo already exists (incremental indexing)
+          const existingRepo = db.getRepositoryByUrl(repo.url);
+
          // Get repo info from GitHub
          const repoInfo = await github.getRepoInfo(repo.url);

@@ -268,8 +271,20 @@ async function buildIndex(force = false, mode = null) {
            const repoId = db.addRepository(listId, repoInfo.name, repo.url, repo.description || repoInfo.description, repoInfo);
            indexedRepos++;

-            // Index README if in full mode
+            // Determine if we need to fetch README
+            let shouldFetchReadme = false;
            if (indexChoice === 'full' || indexChoice === 'sample') {
+              if (!existingRepo) {
+                // New repo - fetch README
+                shouldFetchReadme = true;
+              } else if (existingRepo.last_commit !== repoInfo.pushedAt) {
+                // Repo updated since last index - fetch README
+                shouldFetchReadme = true;
+              }
+              // else: repo unchanged, skip README fetch
+            }
+
+            if (shouldFetchReadme) {
              const repoReadme = await github.getReadme(repo.url);
              if (repoReadme) {
                const textContent = extractTextContent(repoReadme.content);