Initial Real-ESRGAN API project setup
This commit is contained in:
146
app/routers/health.py
Normal file
146
app/routers/health.py
Normal file
@@ -0,0 +1,146 @@
|
||||
"""Health check and system information endpoints."""
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
from typing import Optional
|
||||
|
||||
import psutil
|
||||
from fastapi import APIRouter, HTTPException
|
||||
|
||||
from app.config import settings
|
||||
from app.schemas.health import HealthResponse, RequestStats, SystemInfo
|
||||
from app.services import file_manager, worker
|
||||
|
||||
logger = logging.getLogger(__name__)

router = APIRouter(prefix='/api/v1', tags=['system'])

# Wall-clock timestamp taken when this module is imported; handlers
# subtract it from time.time() to report uptime.
_start_time = time.time()

# In-process request counters served by the /stats endpoint.
# NOTE(review): nothing visible in this section increments these values;
# presumably they are updated from elsewhere -- verify.
_stats = dict(
    total_requests=0,
    successful_requests=0,
    failed_requests=0,
    total_processing_time=0.0,
    total_images_processed=0,
)
|
||||
|
||||
|
||||
@router.get('/health')
async def health_check() -> HealthResponse:
    """Report that the API process is up, together with its uptime.

    Returns:
        HealthResponse carrying a fixed ``'ok'`` status, the version
        string ``'1.0.0'``, a static message, and the number of seconds
        elapsed since this module was imported.
    """
    seconds_up = time.time() - _start_time
    payload = {
        'status': 'ok',
        'version': '1.0.0',
        'uptime_seconds': seconds_up,
        'message': 'Real-ESRGAN API is running',
    }
    return HealthResponse(**payload)
|
||||
|
||||
|
||||
@router.get('/health/ready')
async def readiness_check():
    """Kubernetes readiness probe.

    Returns:
        ``{'ready': True}`` when the Real-ESRGAN bridge reports that it
        is initialized.

    Raises:
        HTTPException: 503 ``'Not ready'`` while the bridge is not
            initialized.
    """
    # Imported at call time rather than at module import
    # (presumably to avoid an import cycle or start-up cost -- TODO confirm).
    from app.services import realesrgan_bridge

    if realesrgan_bridge.get_bridge().initialized:
        return {'ready': True}

    raise HTTPException(status_code=503, detail='Not ready')
|
||||
|
||||
|
||||
@router.get('/health/live')
async def liveness_check():
    """Kubernetes liveness probe.

    Always answers ``{'alive': True}``; the handler performs no checks of
    its own, so a response only shows that the server can handle a request.
    """
    response = {'alive': True}
    return response
|
||||
|
||||
|
||||
def _probe_gpu():
    """Best-effort CUDA probe returning ``(available, total_mb, used_mb)``.

    Never raises: a missing or broken torch installation is logged and
    whatever was gathered before the failure is returned, with ``False`` /
    ``None`` for the rest.
    """
    available = False
    total_mb = None
    used_mb = None
    bytes_per_mb = 1024 * 1024

    try:
        import torch

        available = torch.cuda.is_available()
        if available:
            total_mb = int(torch.cuda.get_device_properties(0).total_memory / bytes_per_mb)
            # memory_allocated() reports memory held by tensors in this
            # process, not device-wide usage.
            used_mb = int(torch.cuda.memory_allocated(0) / bytes_per_mb)
    except ImportError:
        logger.debug('torch is not importable; reporting GPU as unavailable')
    except Exception:
        # Still best-effort, but leave a trace instead of failing silently.
        logger.warning('GPU probe failed; reporting partial GPU info', exc_info=True)

    return available, total_mb, used_mb


@router.get('/system')
async def get_system_info() -> SystemInfo:
    """Get comprehensive system information.

    Collects process uptime, CPU / memory / disk utilisation via psutil,
    optional CUDA memory figures via torch, the size of the models
    directory, and the current length of the worker queue.

    Returns:
        SystemInfo populated with the collected values.

    Raises:
        HTTPException: 500 carrying the error text if any required metric
            cannot be collected. GPU probing is best-effort and does not
            cause a failure.
    """
    # Function-scope import so this block does not depend on a change to
    # the module's import section.
    import asyncio

    try:
        uptime = time.time() - _start_time

        # cpu_percent(interval=1) blocks for a full second while sampling.
        # Run it on the default executor so the event loop can keep
        # serving other requests in the meantime.
        loop = asyncio.get_running_loop()
        cpu_percent = await loop.run_in_executor(
            None, lambda: psutil.cpu_percent(interval=1)
        )

        memory_percent = psutil.virtual_memory().percent
        disk_percent = psutil.disk_usage('/').percent

        gpu_available, gpu_memory_mb, gpu_memory_used_mb = _probe_gpu()

        models_size = file_manager.get_directory_size_mb(settings.models_dir)

        wq = worker.get_worker_queue()
        queue_length = wq.queue.qsize()

        return SystemInfo(
            status='ok',
            version='1.0.0',
            uptime_seconds=uptime,
            cpu_usage_percent=cpu_percent,
            memory_usage_percent=memory_percent,
            disk_usage_percent=disk_percent,
            gpu_available=gpu_available,
            gpu_memory_mb=gpu_memory_mb,
            gpu_memory_used_mb=gpu_memory_used_mb,
            execution_providers=settings.get_execution_providers(),
            models_dir_size_mb=models_size,
            jobs_queue_length=queue_length,
        )
    except Exception as e:
        logger.error(f'Failed to get system info: {e}', exc_info=True)
        raise HTTPException(status_code=500, detail=str(e)) from e
|
||||
|
||||
|
||||
@router.get('/stats')
async def get_stats() -> RequestStats:
    """Get request statistics.

    Reads the module-level ``_stats`` counters and derives the mean
    processing time per successful request, which is ``0.0`` when no
    request has succeeded yet.

    Returns:
        RequestStats built from the current counter values.
    """
    succeeded = _stats['successful_requests']
    if succeeded > 0:
        mean_seconds = _stats['total_processing_time'] / succeeded
    else:
        mean_seconds = 0.0

    return RequestStats(
        total_requests=_stats['total_requests'],
        successful_requests=succeeded,
        failed_requests=_stats['failed_requests'],
        average_processing_time_seconds=mean_seconds,
        total_images_processed=_stats['total_images_processed'],
    )
|
||||
|
||||
|
||||
@router.post('/cleanup')
async def cleanup_old_jobs(hours: int = 24):
    """Clean up old job directories.

    Args:
        hours: Age threshold in hours, forwarded to
            ``file_manager.cleanup_old_jobs``. Defaults to 24 and must
            not be negative.

    Returns:
        Dict with ``success``, ``cleaned_jobs`` (the value returned by the
        file manager) and a human-readable ``message``.

    Raises:
        HTTPException: 400 if ``hours`` is negative; 500 carrying the
            error text if the cleanup itself fails.
    """
    # Validate before the try block: the blanket handler below would
    # otherwise convert this client error into a 500.
    if hours < 0:
        raise HTTPException(status_code=400, detail='hours must be non-negative')

    try:
        cleaned = file_manager.cleanup_old_jobs(hours)
        return {
            'success': True,
            'cleaned_jobs': cleaned,
            'message': f'Cleaned up {cleaned} job directories older than {hours} hours',
        }
    except Exception as e:
        logger.error(f'Cleanup failed: {e}', exc_info=True)
        raise HTTPException(status_code=500, detail=str(e)) from e
|
||||
Reference in New Issue
Block a user