fix: properly proxy streaming requests without buffering
The orchestrator was calling response.json(), which buffered the entire streaming response before returning it. This caused LiteLLM to receive only one chunk with empty content instead of token-by-token streaming.

Changes:
- Detect streaming requests by parsing the request body for 'stream': true
- Use client.stream() with aiter_bytes() for streaming requests
- Return StreamingResponse with proper SSE headers
- Keep the original JSONResponse behavior for non-streaming requests

This fixes streaming through the vLLM → orchestrator → LiteLLM chain.
This commit is contained in:
@@ -197,14 +197,38 @@ async def proxy_request(model_name: str, request: Request):
         # Build target URL
         target_url = f"http://localhost:{port}{path}"
 
-        logger.info(f"Proxying {method} request to {target_url}")
-
-        async with httpx.AsyncClient(timeout=300.0) as client:
-            # Handle different request types
-            if method == "GET":
-                response = await client.get(target_url, headers=headers)
-            elif method == "POST":
-                body = await request.body()
-                response = await client.post(target_url, content=body, headers=headers)
-            else:
-                raise HTTPException(status_code=405, detail=f"Method {method} not supported")
+        # Check if this is a streaming request
+        body = await request.body()
+        is_streaming = False
+        if method == "POST" and body:
+            try:
+                import json
+                body_json = json.loads(body)
+                is_streaming = body_json.get('stream', False)
+            except:
+                pass
+
+        logger.info(f"Proxying {method} request to {target_url} (streaming: {is_streaming})")
+
+        if is_streaming:
+            # For streaming requests, use httpx streaming and yield chunks
+            async def stream_response():
+                async with httpx.AsyncClient(timeout=300.0) as client:
+                    async with client.stream(method, target_url, content=body, headers=headers) as response:
+                        async for chunk in response.aiter_bytes():
+                            yield chunk
+
+            return StreamingResponse(
+                stream_response(),
+                media_type="text/event-stream",
+                headers={"Cache-Control": "no-cache", "Connection": "keep-alive"}
+            )
+        else:
+            # For non-streaming requests, use the original behavior
+            async with httpx.AsyncClient(timeout=300.0) as client:
+                # Handle different request types
+                if method == "GET":
+                    response = await client.get(target_url, headers=headers)
+                elif method == "POST":
+                    body = await request.body()
+                    response = await client.post(target_url, content=body, headers=headers)
+                else:
+                    raise HTTPException(status_code=405, detail=f"Method {method} not supported")
|
|||||||
Reference in New Issue
Block a user