feat: implement incremental indexing and remove proactive rate limit
Major performance improvements for CI builds:

1. **Removed proactive rate limit threshold**
   - No longer waits at 500 remaining requests
   - Uses the full 5000-request quota before a forced wait
   - Maximizes work per rate limit cycle

2. **Implemented incremental indexing**
   - Checks if the repository already exists in the database
   - Compares last_commit (pushedAt) to detect changes
   - Only fetches the README for new or updated repositories
   - Skips the README fetch for unchanged repos (major time savings)

3. **Increased timeout to GitHub maximum**
   - Job timeout: 180m → 360m (6 hours, the job execution limit on GitHub-hosted runners)
   - Script timeout: 170m → 350m
   - Allows full first-run indexing to complete

Impact on performance:

**First run (empty database):**
- Same as before: ~25,000 repos need full indexing
- Will use up to the full 360 minutes but should complete

**Subsequent runs (incremental):**
- Only fetches READMEs for changed repos (~5-10% typically)
- Dramatically faster: estimated 30-60 minutes instead of 360
- Makes daily automated builds sustainable

Files changed:
- lib/github-api.js: Removed proactive rate limit check
- lib/indexer.js: Added incremental indexing logic
- .github/workflows/build-database.yml: Increased timeout to 360m

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
8
.github/workflows/build-database.yml
vendored
8
.github/workflows/build-database.yml
vendored
@@ -22,7 +22,7 @@ permissions:
|
|||||||
jobs:
|
jobs:
|
||||||
build-database:
|
build-database:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
timeout-minutes: 180 # 3 hours max
|
timeout-minutes: 360 # 6 hours (GitHub Actions maximum for free tier)
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout repository
|
- name: Checkout repository
|
||||||
@@ -60,8 +60,8 @@ jobs:
|
|||||||
INDEX_MODE="${{ github.event.inputs.index_mode || 'full' }}"
|
INDEX_MODE="${{ github.event.inputs.index_mode || 'full' }}"
|
||||||
echo "Index mode: $INDEX_MODE"
|
echo "Index mode: $INDEX_MODE"
|
||||||
|
|
||||||
# Build the index in non-interactive mode (170m timeout, job timeout is 180m)
|
# Build the index in non-interactive mode (350m timeout, job timeout is 360m)
|
||||||
timeout 170m node -e "
|
timeout 350m node -e "
|
||||||
const db = require('./lib/database');
|
const db = require('./lib/database');
|
||||||
const dbOps = require('./lib/db-operations');
|
const dbOps = require('./lib/db-operations');
|
||||||
const indexer = require('./lib/indexer');
|
const indexer = require('./lib/indexer');
|
||||||
@@ -96,7 +96,7 @@ jobs:
|
|||||||
" || {
|
" || {
|
||||||
EXIT_CODE=$?
|
EXIT_CODE=$?
|
||||||
if [ $EXIT_CODE -eq 124 ]; then
|
if [ $EXIT_CODE -eq 124 ]; then
|
||||||
echo "❌ Index building timed out after 170 minutes"
|
echo "❌ Index building timed out after 350 minutes"
|
||||||
echo "This may indicate rate limiting issues or too many lists to index"
|
echo "This may indicate rate limiting issues or too many lists to index"
|
||||||
fi
|
fi
|
||||||
exit $EXIT_CODE
|
exit $EXIT_CODE
|
||||||
|
|||||||
@@ -140,29 +140,6 @@ async function rateLimitedRequest(url, options = {}) {
|
|||||||
await new Promise(resolve => setTimeout(resolve, RATE_LIMIT_DELAY - timeSinceLastRequest));
|
await new Promise(resolve => setTimeout(resolve, RATE_LIMIT_DELAY - timeSinceLastRequest));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Check rate limit proactively
|
|
||||||
// In CI: check frequently with no grace period to maximize efficiency
|
|
||||||
// Locally: use grace period to reduce API overhead after recovery
|
|
||||||
const isCI = process.env.CI === 'true';
|
|
||||||
const timeSinceRecovery = Date.now() - lastRateLimitRecoveryTime;
|
|
||||||
const RECOVERY_GRACE_PERIOD = isCI ? 0 : (10 * 60 * 1000); // No grace period in CI, 10min locally
|
|
||||||
const CHECK_FREQUENCY = isCI ? 0.10 : 0.01; // 10% in CI, 1% locally
|
|
||||||
const LOW_THRESHOLD = isCI ? 500 : 200; // Wait at 500 in CI for full batches, 200 locally
|
|
||||||
|
|
||||||
if (timeSinceRecovery > RECOVERY_GRACE_PERIOD && Math.random() < CHECK_FREQUENCY) {
|
|
||||||
const rateLimitStatus = await checkRateLimit();
|
|
||||||
if (rateLimitStatus && rateLimitStatus.remaining < LOW_THRESHOLD) {
|
|
||||||
console.log();
|
|
||||||
console.log(chalk.yellow(`⚠️ Rate limit getting low: ${rateLimitStatus.remaining}/${rateLimitStatus.limit} remaining`));
|
|
||||||
console.log(chalk.yellow(` Proactively waiting for rate limit to reset...`));
|
|
||||||
|
|
||||||
if (isCI) {
|
|
||||||
console.log(chalk.cyan('🤖 CI mode: waiting for full reset to maximize batch efficiency...'));
|
|
||||||
await waitForRateLimitReset(rateLimitStatus.reset);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
lastRequestTime = Date.now();
|
lastRequestTime = Date.now();
|
||||||
|
|
||||||
const token = getGitHubToken();
|
const token = getGitHubToken();
|
||||||
|
|||||||
@@ -261,6 +261,9 @@ async function buildIndex(force = false, mode = null) {
|
|||||||
// Index repositories
|
// Index repositories
|
||||||
for (const repo of repos) {
|
for (const repo of repos) {
|
||||||
try {
|
try {
|
||||||
|
// Check if repo already exists (incremental indexing)
|
||||||
|
const existingRepo = db.getRepositoryByUrl(repo.url);
|
||||||
|
|
||||||
// Get repo info from GitHub
|
// Get repo info from GitHub
|
||||||
const repoInfo = await github.getRepoInfo(repo.url);
|
const repoInfo = await github.getRepoInfo(repo.url);
|
||||||
|
|
||||||
@@ -268,8 +271,20 @@ async function buildIndex(force = false, mode = null) {
|
|||||||
const repoId = db.addRepository(listId, repoInfo.name, repo.url, repo.description || repoInfo.description, repoInfo);
|
const repoId = db.addRepository(listId, repoInfo.name, repo.url, repo.description || repoInfo.description, repoInfo);
|
||||||
indexedRepos++;
|
indexedRepos++;
|
||||||
|
|
||||||
// Index README if in full mode
|
// Determine if we need to fetch README
|
||||||
|
let shouldFetchReadme = false;
|
||||||
if (indexChoice === 'full' || indexChoice === 'sample') {
|
if (indexChoice === 'full' || indexChoice === 'sample') {
|
||||||
|
if (!existingRepo) {
|
||||||
|
// New repo - fetch README
|
||||||
|
shouldFetchReadme = true;
|
||||||
|
} else if (existingRepo.last_commit !== repoInfo.pushedAt) {
|
||||||
|
// Repo updated since last index - fetch README
|
||||||
|
shouldFetchReadme = true;
|
||||||
|
}
|
||||||
|
// else: repo unchanged, skip README fetch
|
||||||
|
}
|
||||||
|
|
||||||
|
if (shouldFetchReadme) {
|
||||||
const repoReadme = await github.getReadme(repo.url);
|
const repoReadme = await github.getReadme(repo.url);
|
||||||
if (repoReadme) {
|
if (repoReadme) {
|
||||||
const textContent = extractTextContent(repoReadme.content);
|
const textContent = extractTextContent(repoReadme.content);
|
||||||
|
|||||||
Reference in New Issue
Block a user