feat: implement incremental indexing and remove proactive rate limit check

Major performance improvements for CI builds:

1. **Removed proactive rate limit threshold**
   - No longer waits at 500 remaining requests
   - Uses full 5000 request quota before forced wait
   - Maximizes work per rate limit cycle

2. **Implemented incremental indexing**
   - Checks if repository already exists in database
   - Compares the stored last_commit with GitHub's current pushedAt to detect changes
   - Only fetches README for new or updated repositories
   - Skips README fetch for unchanged repos (major time savings)

3. **Increased timeout to GitHub maximum**
   - Job timeout: 180m → 360m (6 hours, the maximum job run time on GitHub-hosted runners)
   - Script timeout: 170m → 350m
   - Allows full first-run indexing to complete

Impact on performance:

**First run (empty database):**
- Same as before: ~25,000 repos need full indexing
- Expected to use most of the 350-minute script timeout, but should complete within it

**Subsequent runs (incremental):**
- Only fetches READMEs for new or changed repos (~5-10% typically)
- Dramatically faster: estimated 30-60 minutes instead of up to 360 minutes
- Makes daily automated builds sustainable

Files changed:
- lib/github-api.js: Removed proactive rate limit check
- lib/indexer.js: Added incremental indexing logic
- .github/workflows/build-database.yml: Increased timeout to 360m

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
valknarness
2025-10-28 09:57:02 +01:00
parent 279cc2fa25
commit 98ddac97e8
3 changed files with 20 additions and 28 deletions

View File

@@ -22,7 +22,7 @@ permissions:
jobs: jobs:
build-database: build-database:
runs-on: ubuntu-latest runs-on: ubuntu-latest
timeout-minutes: 180 # 3 hours max timeout-minutes: 360 # 6 hours (GitHub Actions maximum for free tier)
steps: steps:
- name: Checkout repository - name: Checkout repository
@@ -60,8 +60,8 @@ jobs:
INDEX_MODE="${{ github.event.inputs.index_mode || 'full' }}" INDEX_MODE="${{ github.event.inputs.index_mode || 'full' }}"
echo "Index mode: $INDEX_MODE" echo "Index mode: $INDEX_MODE"
# Build the index in non-interactive mode (170m timeout, job timeout is 180m) # Build the index in non-interactive mode (350m timeout, job timeout is 360m)
timeout 170m node -e " timeout 350m node -e "
const db = require('./lib/database'); const db = require('./lib/database');
const dbOps = require('./lib/db-operations'); const dbOps = require('./lib/db-operations');
const indexer = require('./lib/indexer'); const indexer = require('./lib/indexer');
@@ -96,7 +96,7 @@ jobs:
" || { " || {
EXIT_CODE=$? EXIT_CODE=$?
if [ $EXIT_CODE -eq 124 ]; then if [ $EXIT_CODE -eq 124 ]; then
echo "❌ Index building timed out after 170 minutes" echo "❌ Index building timed out after 350 minutes"
echo "This may indicate rate limiting issues or too many lists to index" echo "This may indicate rate limiting issues or too many lists to index"
fi fi
exit $EXIT_CODE exit $EXIT_CODE

View File

@@ -140,29 +140,6 @@ async function rateLimitedRequest(url, options = {}) {
await new Promise(resolve => setTimeout(resolve, RATE_LIMIT_DELAY - timeSinceLastRequest)); await new Promise(resolve => setTimeout(resolve, RATE_LIMIT_DELAY - timeSinceLastRequest));
} }
// Check rate limit proactively
// In CI: check frequently with no grace period to maximize efficiency
// Locally: use grace period to reduce API overhead after recovery
const isCI = process.env.CI === 'true';
const timeSinceRecovery = Date.now() - lastRateLimitRecoveryTime;
const RECOVERY_GRACE_PERIOD = isCI ? 0 : (10 * 60 * 1000); // No grace period in CI, 10min locally
const CHECK_FREQUENCY = isCI ? 0.10 : 0.01; // 10% in CI, 1% locally
const LOW_THRESHOLD = isCI ? 500 : 200; // Wait at 500 in CI for full batches, 200 locally
if (timeSinceRecovery > RECOVERY_GRACE_PERIOD && Math.random() < CHECK_FREQUENCY) {
const rateLimitStatus = await checkRateLimit();
if (rateLimitStatus && rateLimitStatus.remaining < LOW_THRESHOLD) {
console.log();
console.log(chalk.yellow(`⚠️ Rate limit getting low: ${rateLimitStatus.remaining}/${rateLimitStatus.limit} remaining`));
console.log(chalk.yellow(` Proactively waiting for rate limit to reset...`));
if (isCI) {
console.log(chalk.cyan('🤖 CI mode: waiting for full reset to maximize batch efficiency...'));
await waitForRateLimitReset(rateLimitStatus.reset);
}
}
}
lastRequestTime = Date.now(); lastRequestTime = Date.now();
const token = getGitHubToken(); const token = getGitHubToken();

View File

@@ -261,6 +261,9 @@ async function buildIndex(force = false, mode = null) {
// Index repositories // Index repositories
for (const repo of repos) { for (const repo of repos) {
try { try {
// Check if repo already exists (incremental indexing)
const existingRepo = db.getRepositoryByUrl(repo.url);
// Get repo info from GitHub // Get repo info from GitHub
const repoInfo = await github.getRepoInfo(repo.url); const repoInfo = await github.getRepoInfo(repo.url);
@@ -268,8 +271,20 @@ async function buildIndex(force = false, mode = null) {
const repoId = db.addRepository(listId, repoInfo.name, repo.url, repo.description || repoInfo.description, repoInfo); const repoId = db.addRepository(listId, repoInfo.name, repo.url, repo.description || repoInfo.description, repoInfo);
indexedRepos++; indexedRepos++;
// Index README if in full mode // Determine if we need to fetch README
let shouldFetchReadme = false;
if (indexChoice === 'full' || indexChoice === 'sample') { if (indexChoice === 'full' || indexChoice === 'sample') {
if (!existingRepo) {
// New repo - fetch README
shouldFetchReadme = true;
} else if (existingRepo.last_commit !== repoInfo.pushedAt) {
// Repo updated since last index - fetch README
shouldFetchReadme = true;
}
// else: repo unchanged, skip README fetch
}
if (shouldFetchReadme) {
const repoReadme = await github.getReadme(repo.url); const repoReadme = await github.getReadme(repo.url);
if (repoReadme) { if (repoReadme) {
const textContent = extractTextContent(repoReadme.content); const textContent = extractTextContent(repoReadme.content);