Files

130 lines
3.6 KiB
Python
Raw Permalink Normal View History

2026-02-16 19:56:25 +01:00
"""Health check and system information endpoints."""
import logging
import os
import time
from typing import Optional
import psutil
from fastapi import APIRouter, HTTPException
from app.config import settings
from app.schemas.health import HealthResponse, RequestStats, SystemInfo
from app.services import file_manager, worker
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)

# Every route declared in this module is mounted under /api/v1 and tagged
# 'system'.
router = APIRouter(
    tags=['system'],
    prefix='/api/v1',
)
# Wall-clock timestamp (seconds since the epoch) captured when this module is
# first imported; the uptime figures reported below are measured against it.
_start_time = time.time()

# In-memory request counters served by the /stats endpoint. Nothing in this
# part of the file increments them -- presumably other modules do; verify
# against callers.
_stats = dict(
    total_requests=0,
    successful_requests=0,
    failed_requests=0,
    total_processing_time=0.0,
    total_images_processed=0,
)
@router.get('/health')
async def health_check() -> HealthResponse:
    """Report that the API process is up, together with its uptime.

    Returns:
        HealthResponse carrying a fixed ``'ok'`` status, the hard-coded
        version string, a static message, and the number of seconds elapsed
        since this module was imported.
    """
    elapsed_seconds = time.time() - _start_time
    payload = {
        'status': 'ok',
        'version': '1.0.0',
        'uptime_seconds': elapsed_seconds,
        'message': 'Real-ESRGAN API is running',
    }
    return HealthResponse(**payload)
@router.get('/health/ready')
async def readiness_check():
    """Kubernetes readiness probe.

    Returns:
        ``{'ready': True}`` when the Real-ESRGAN bridge reports that it is
        initialized.

    Raises:
        HTTPException: 503 while the bridge is not yet initialized.
    """
    # Imported inside the handler rather than at module level
    # (presumably to avoid a heavy or circular import -- confirm).
    from app.services import realesrgan_bridge

    if realesrgan_bridge.get_bridge().initialized:
        return {'ready': True}
    raise HTTPException(status_code=503, detail='Not ready')
@router.get('/health/live')
async def liveness_check():
    """Kubernetes liveness probe.

    Returns:
        A constant ``{'alive': True}`` payload; no dependencies are checked.
    """
    liveness_payload = dict(alive=True)
    return liveness_payload
@router.get('/system')
async def get_system_info() -> SystemInfo:
    """Collect a snapshot of host and service metrics.

    Gathers the process uptime, CPU / memory / root-disk utilisation from
    ``psutil``, the size of the models directory, and the current length of
    the worker queue.

    Returns:
        SystemInfo populated with the metrics listed above.

    Raises:
        HTTPException: 500, with the error text as detail, if any metric
            cannot be collected.
    """
    # Local imports keep this handler self-contained.
    import asyncio
    import functools

    try:
        uptime = time.time() - _start_time

        # cpu_percent(interval=1) blocks for a full second while it samples.
        # Calling it directly inside a coroutine would freeze the event loop
        # (and with it every other request, including the health probes), so
        # the sampling runs on the default thread-pool executor instead.
        loop = asyncio.get_running_loop()
        cpu_percent = await loop.run_in_executor(
            None, functools.partial(psutil.cpu_percent, interval=1)
        )

        memory_percent = psutil.virtual_memory().percent
        disk_percent = psutil.disk_usage('/').percent

        models_size = file_manager.get_directory_size_mb(settings.models_dir)
        queue_length = worker.get_worker_queue().queue.qsize()

        return SystemInfo(
            status='ok',
            version='1.0.0',
            uptime_seconds=uptime,
            cpu_usage_percent=cpu_percent,
            memory_usage_percent=memory_percent,
            disk_usage_percent=disk_percent,
            execution_providers=settings.get_execution_providers(),
            models_dir_size_mb=models_size,
            jobs_queue_length=queue_length,
        )
    except Exception as e:
        # Top-level boundary for this endpoint: log with traceback, then
        # surface the failure to the client as a 500.
        logger.error(f'Failed to get system info: {e}', exc_info=True)
        raise HTTPException(status_code=500, detail=str(e)) from e
@router.get('/stats')
async def get_stats() -> RequestStats:
    """Return the accumulated request counters and the mean processing time.

    Returns:
        RequestStats built from the module-level ``_stats`` counters. The
        average is ``total_processing_time / successful_requests`` and is
        reported as ``0.0`` while no request has succeeded.
    """
    succeeded = _stats['successful_requests']
    # Only divide once at least one request has succeeded.
    mean_seconds = (
        _stats['total_processing_time'] / succeeded if succeeded > 0 else 0.0
    )
    return RequestStats(
        total_requests=_stats['total_requests'],
        successful_requests=succeeded,
        failed_requests=_stats['failed_requests'],
        average_processing_time_seconds=mean_seconds,
        total_images_processed=_stats['total_images_processed'],
    )
@router.post('/cleanup')
async def cleanup_old_jobs(hours: int = 24):
    """Clean up old job directories.

    Args:
        hours: Age threshold in hours, passed to
            ``file_manager.cleanup_old_jobs``. Must not be negative.

    Returns:
        A dict with ``success``, the ``cleaned_jobs`` value returned by the
        file manager, and a human-readable ``message``.

    Raises:
        HTTPException: 422 if ``hours`` is negative; 500, with the error text
            as detail, if the cleanup itself fails.
    """
    # A negative age threshold is meaningless and, depending on how the file
    # manager computes its cutoff, could match every job directory. Reject it
    # up front -- outside the try block, so it is not re-wrapped as a 500.
    if hours < 0:
        raise HTTPException(
            status_code=422,
            detail='hours must be a non-negative integer',
        )

    try:
        cleaned = file_manager.cleanup_old_jobs(hours)
    except Exception as e:
        logger.error(f'Cleanup failed: {e}', exc_info=True)
        raise HTTPException(status_code=500, detail=str(e)) from e

    return {
        'success': True,
        'cleaned_jobs': cleaned,
        'message': f'Cleaned up {cleaned} job directories older than {hours} hours',
    }