147 lines
4.2 KiB
Python
147 lines
4.2 KiB
Python
|
|
"""Health check and system information endpoints."""
|
||
|
|
import logging
|
||
|
|
import os
|
||
|
|
import time
|
||
|
|
from typing import Optional
|
||
|
|
|
||
|
|
import psutil
|
||
|
|
from fastapi import APIRouter, HTTPException
|
||
|
|
|
||
|
|
from app.config import settings
|
||
|
|
from app.schemas.health import HealthResponse, RequestStats, SystemInfo
|
||
|
|
from app.services import file_manager, worker
|
||
|
|
|
||
|
|
logger = logging.getLogger(__name__)

router = APIRouter(prefix='/api/v1', tags=['system'])

# Track uptime: captured once at import time; the health/system endpoints
# report time.time() - _start_time as uptime_seconds.
_start_time = time.time()

# Request statistics, accumulated in-process (reset on every restart).
# NOTE(review): presumably mutated by the request-processing endpoints via
# direct module import — confirm the writers. No locking is used, so counts
# are best-effort under concurrent updates.
_stats = {
    'total_requests': 0,
    'successful_requests': 0,
    'failed_requests': 0,
    'total_processing_time': 0.0,  # seconds, summed over successful requests
    'total_images_processed': 0,
}
|
||
|
|
|
||
|
|
|
||
|
|
@router.get('/health')
async def health_check() -> HealthResponse:
    """Report basic service health together with process uptime."""
    return HealthResponse(
        status='ok',
        version='1.0.0',
        uptime_seconds=time.time() - _start_time,
        message='Real-ESRGAN API is running',
    )
|
||
|
|
|
||
|
|
|
||
|
|
@router.get('/health/ready')
async def readiness_check():
    """Kubernetes readiness probe: 503 until the upscaler bridge is initialized."""
    # Imported lazily so the bridge is not pulled in at module import time.
    from app.services import realesrgan_bridge

    if realesrgan_bridge.get_bridge().initialized:
        return {'ready': True}
    raise HTTPException(status_code=503, detail='Not ready')
|
||
|
|
|
||
|
|
|
||
|
|
@router.get('/health/live')
async def liveness_check():
    """Kubernetes liveness probe: always succeeds while the process is up."""
    return {'alive': True}
|
||
|
|
|
||
|
|
|
||
|
|
@router.get('/system')
async def get_system_info() -> SystemInfo:
    """Get comprehensive system information.

    Reports uptime, host CPU/memory/disk usage, GPU availability and
    memory (when torch with CUDA is present), the models-directory size,
    and the current job-queue length.

    Raises:
        HTTPException: 500 with the underlying error message on any failure.
    """
    try:
        # Uptime
        uptime = time.time() - _start_time

        # CPU and memory. interval=None is non-blocking: it reports usage
        # since the previous call instead of sleeping for a sampling window.
        # The original interval=1 blocked the asyncio event loop for a full
        # second on every request; the first non-blocking call may report
        # 0.0, an acceptable trade-off for an async endpoint.
        cpu_percent = psutil.cpu_percent(interval=None)
        memory = psutil.virtual_memory()
        memory_percent = memory.percent

        # Disk usage of the root filesystem
        disk = psutil.disk_usage('/')
        disk_percent = disk.percent

        # GPU info is optional: torch may be absent or CUDA unavailable.
        gpu_available = False
        gpu_memory_mb = None
        gpu_memory_used_mb = None

        try:
            import torch

            gpu_available = torch.cuda.is_available()
            if gpu_available:
                gpu_memory_mb = int(torch.cuda.get_device_properties(0).total_memory / (1024 * 1024))
                gpu_memory_used_mb = int(torch.cuda.memory_allocated(0) / (1024 * 1024))
        except Exception:
            # Best-effort: on any error the GPU is simply reported unavailable.
            pass

        # Models directory size
        models_size = file_manager.get_directory_size_mb(settings.models_dir)

        # Jobs queue length
        wq = worker.get_worker_queue()
        queue_length = wq.queue.qsize()

        return SystemInfo(
            status='ok',
            version='1.0.0',
            uptime_seconds=uptime,
            cpu_usage_percent=cpu_percent,
            memory_usage_percent=memory_percent,
            disk_usage_percent=disk_percent,
            gpu_available=gpu_available,
            gpu_memory_mb=gpu_memory_mb,
            gpu_memory_used_mb=gpu_memory_used_mb,
            execution_providers=settings.get_execution_providers(),
            models_dir_size_mb=models_size,
            jobs_queue_length=queue_length,
        )
    except Exception as e:
        logger.error(f'Failed to get system info: {e}', exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))
|
||
|
|
|
||
|
|
|
||
|
|
@router.get('/stats')
async def get_stats() -> RequestStats:
    """Return the in-process request counters and the derived average time."""
    ok_count = _stats['successful_requests']
    # Guard against division by zero before any request has succeeded.
    avg_time = _stats['total_processing_time'] / ok_count if ok_count > 0 else 0.0

    return RequestStats(
        total_requests=_stats['total_requests'],
        successful_requests=ok_count,
        failed_requests=_stats['failed_requests'],
        average_processing_time_seconds=avg_time,
        total_images_processed=_stats['total_images_processed'],
    )
|
||
|
|
|
||
|
|
|
||
|
|
@router.post('/cleanup')
async def cleanup_old_jobs(hours: int = 24):
    """Remove job directories older than *hours* and report how many were deleted."""
    try:
        removed = file_manager.cleanup_old_jobs(hours)
    except Exception as e:
        # Surface the failure as a 500 while keeping full details in the log.
        logger.error(f'Cleanup failed: {e}', exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))
    return {
        'success': True,
        'cleaned_jobs': removed,
        'message': f'Cleaned up {removed} job directories older than {hours} hours',
    }
|