Files
llmx/codex-rs/responses-api-proxy/src/lib.rs

206 lines
6.7 KiB
Rust
Raw Normal View History

feat: introduce responses-api-proxy (#4246) Details are in `responses-api-proxy/README.md`, but the key contribution of this PR is a new subcommand, `codex responses-api-proxy`, which reads the auth token for use with the OpenAI Responses API from `stdin` at startup and then proxies `POST` requests to `/v1/responses` over to `https://api.openai.com/v1/responses`, injecting the auth token as part of the `Authorization` header. The expectation is that `codex responses-api-proxy` is launched by a privileged user who has access to the auth token so that it can be used by unprivileged users of the Codex CLI on the same host. If the client only has one user account with `sudo`, one option is to: - run `sudo codex responses-api-proxy --http-shutdown --server-info /tmp/server-info.json` to start the server - record the port written to `/tmp/server-info.json` - relinquish their `sudo` privileges (which is irreversible!) like so: ``` sudo deluser $USER sudo || sudo gpasswd -d $USER sudo || true ``` - use `codex` with the proxy (see `README.md`) - when done, make a `GET` request to the server using the `PORT` from `server-info.json` to shut it down: ```shell curl --fail --silent --show-error "http://127.0.0.1:$PORT/shutdown" ``` To protect the auth token, we: - allocate a 1024-byte buffer on the stack and write `"Bearer "` into it to start - we then read from `stdin`, copying the contents into the buffer after the prefix - after verifying the input looks good, we create a `String` from that buffer (so the data is now on the heap) - we zero out the stack-allocated buffer using https://crates.io/crates/zeroize so the zeroing is not optimized away by the compiler - we invoke `.leak()` on the `String` so we can treat its contents as a `&'static str`, as it will live for the rest of the process - on UNIX, we `mlock(2)` the memory backing the `&'static str` - when using the `&'static str` to build an HTTP request, we use `HeaderValue::from_static()` to avoid copying the `&str` - we
also invoke `.set_sensitive(true)` on the `HeaderValue`, which in theory indicates to other parts of the HTTP stack that the header should be treated with "special care" to avoid leakage: https://github.com/hyperium/http/blob/439d1c50d71e3be3204b6c4a1bf2255ed78e1f93/src/header/value.rs#L346-L376
2025-09-26 08:19:00 -07:00
use std::fs::File;
use std::fs::{self};
use std::io::Write;
use std::net::SocketAddr;
use std::net::TcpListener;
use std::path::Path;
use std::path::PathBuf;
use std::sync::Arc;
use std::time::Duration;
feat: introduce responses-api-proxy (#4246) Details are in `responses-api-proxy/README.md`, but the key contribution of this PR is a new subcommand, `codex responses-api-proxy`, which reads the auth token for use with the OpenAI Responses API from `stdin` at startup and then proxies `POST` requests to `/v1/responses` over to `https://api.openai.com/v1/responses`, injecting the auth token as part of the `Authorization` header. The expectation is that `codex responses-api-proxy` is launched by a privileged user who has access to the auth token so that it can be used by unprivileged users of the Codex CLI on the same host. If the client only has one user account with `sudo`, one option is to: - run `sudo codex responses-api-proxy --http-shutdown --server-info /tmp/server-info.json` to start the server - record the port written to `/tmp/server-info.json` - relinquish their `sudo` privileges (which is irreversible!) like so: ``` sudo deluser $USER sudo || sudo gpasswd -d $USER sudo || true ``` - use `codex` with the proxy (see `README.md`) - when done, make a `GET` request to the server using the `PORT` from `server-info.json` to shut it down: ```shell curl --fail --silent --show-error "http://127.0.0.1:$PORT/shutdown" ``` To protect the auth token, we: - allocate a 1024-byte buffer on the stack and write `"Bearer "` into it to start - we then read from `stdin`, copying the contents into the buffer after the prefix - after verifying the input looks good, we create a `String` from that buffer (so the data is now on the heap) - we zero out the stack-allocated buffer using https://crates.io/crates/zeroize so the zeroing is not optimized away by the compiler - we invoke `.leak()` on the `String` so we can treat its contents as a `&'static str`, as it will live for the rest of the process - on UNIX, we `mlock(2)` the memory backing the `&'static str` - when using the `&'static str` to build an HTTP request, we use `HeaderValue::from_static()` to avoid copying the `&str` - we
also invoke `.set_sensitive(true)` on the `HeaderValue`, which in theory indicates to other parts of the HTTP stack that the header should be treated with "special care" to avoid leakage: https://github.com/hyperium/http/blob/439d1c50d71e3be3204b6c4a1bf2255ed78e1f93/src/header/value.rs#L346-L376
2025-09-26 08:19:00 -07:00
use anyhow::Context;
use anyhow::Result;
use anyhow::anyhow;
use clap::Parser;
use reqwest::blocking::Client;
use reqwest::header::AUTHORIZATION;
use reqwest::header::HOST;
use reqwest::header::HeaderMap;
use reqwest::header::HeaderName;
use reqwest::header::HeaderValue;
use serde::Serialize;
use tiny_http::Header;
use tiny_http::Method;
use tiny_http::Request;
use tiny_http::Response;
use tiny_http::Server;
use tiny_http::StatusCode;
mod read_api_key;
use read_api_key::read_auth_header_from_stdin;
// NOTE: clap's derive macro turns the `///` comments on the fields below into
// the `--help` text, so they are user-facing strings; edit them with that in mind.
/// CLI arguments for the proxy.
#[derive(Debug, Clone, Parser)]
#[command(name = "responses-api-proxy", about = "Minimal OpenAI responses proxy")]
pub struct Args {
// `None` is turned into port 0 by `bind_listener`, which always binds on 127.0.0.1.
/// Port to listen on. If not set, an ephemeral port is used.
#[arg(long)]
pub port: Option<u16>,
// When set, `run_main` calls `write_server_info` with the bound port before
// it starts serving requests.
/// Path to a JSON file to write startup info (single line). Includes {"port": <u16>}.
#[arg(long, value_name = "FILE")]
pub server_info: Option<PathBuf>,
// When set, `run_main` answers `GET /shutdown` with 200 and exits the process.
/// Enable HTTP shutdown endpoint at GET /shutdown
#[arg(long)]
pub http_shutdown: bool,
}
/// Startup information written to the `--server-info` file.
///
/// `write_server_info` serializes this with `serde_json`, so the field name
/// below is the key that appears in the JSON object.
#[derive(Serialize)]
struct ServerInfo {
/// Port of the listener's bound address, as passed in by `run_main`.
port: u16,
}
/// Entry point for the library main, for parity with other crates.
pub fn run_main(args: Args) -> Result<()> {
let auth_header = read_auth_header_from_stdin()?;
let (listener, bound_addr) = bind_listener(args.port)?;
if let Some(path) = args.server_info.as_ref() {
write_server_info(path, bound_addr.port())?;
}
let server = Server::from_listener(listener, None)
.map_err(|err| anyhow!("creating HTTP server: {err}"))?;
let client = Arc::new(
Client::builder()
// Disable reqwest's 30s default so long-lived response streams keep flowing.
.timeout(None::<Duration>)
feat: introduce responses-api-proxy (#4246) Details are in `responses-api-proxy/README.md`, but the key contribution of this PR is a new subcommand, `codex responses-api-proxy`, which reads the auth token for use with the OpenAI Responses API from `stdin` at startup and then proxies `POST` requests to `/v1/responses` over to `https://api.openai.com/v1/responses`, injecting the auth token as part of the `Authorization` header. The expectation is that `codex responses-api-proxy` is launched by a privileged user who has access to the auth token so that it can be used by unprivileged users of the Codex CLI on the same host. If the client only has one user account with `sudo`, one option is to: - run `sudo codex responses-api-proxy --http-shutdown --server-info /tmp/server-info.json` to start the server - record the port written to `/tmp/server-info.json` - relinquish their `sudo` privileges (which is irreversible!) like so: ``` sudo deluser $USER sudo || sudo gpasswd -d $USER sudo || true ``` - use `codex` with the proxy (see `README.md`) - when done, make a `GET` request to the server using the `PORT` from `server-info.json` to shut it down: ```shell curl --fail --silent --show-error "http://127.0.0.1:$PORT/shutdown" ``` To protect the auth token, we: - allocate a 1024 byte buffer on the stack and write `"Bearer "` into it to start - we then read from `stdin`, copying to the contents into the buffer after the prefix - after verifying the input looks good, we create a `String` from that buffer (so the data is now on the heap) - we zero out the stack-allocated buffer using https://crates.io/crates/zeroize so it is not optimized away by the compiler - we invoke `.leak()` on the `String` so we can treat its contents as a `&'static str`, as it will live for the rest of the processs - on UNIX, we `mlock(2)` the memory backing the `&'static str` - when using the `&'static str` when building an HTTP request, we use `HeaderValue::from_static()` to avoid copying the `&str` - we 
also invoke `.set_sensitive(true)` on the `HeaderValue`, which in theory indicates to other parts of the HTTP stack that the header should be treated with "special care" to avoid leakage: https://github.com/hyperium/http/blob/439d1c50d71e3be3204b6c4a1bf2255ed78e1f93/src/header/value.rs#L346-L376
2025-09-26 08:19:00 -07:00
.build()
.context("building reqwest client")?,
);
eprintln!("responses-api-proxy listening on {bound_addr}");
let http_shutdown = args.http_shutdown;
for request in server.incoming_requests() {
let client = client.clone();
std::thread::spawn(move || {
if http_shutdown && request.method() == &Method::Get && request.url() == "/shutdown" {
let _ = request.respond(Response::new_empty(StatusCode(200)));
std::process::exit(0);
}
if let Err(e) = forward_request(&client, auth_header, request) {
eprintln!("forwarding error: {e}");
}
});
}
Err(anyhow!("server stopped unexpectedly"))
}
fn bind_listener(port: Option<u16>) -> Result<(TcpListener, SocketAddr)> {
let addr = SocketAddr::from(([127, 0, 0, 1], port.unwrap_or(0)));
let listener = TcpListener::bind(addr).with_context(|| format!("failed to bind {addr}"))?;
let bound = listener.local_addr().context("failed to read local_addr")?;
Ok((listener, bound))
}
/// Writes the startup-info JSON (`{"port": ...}`) followed by a newline to `path`.
///
/// Missing parent directories are created first; a bare file name (empty
/// parent) skips that step. An existing file at `path` is truncated.
///
/// # Errors
///
/// Fails if the directories cannot be created, the info cannot be serialized,
/// or the file cannot be created or written.
fn write_server_info(path: &Path, port: u16) -> Result<()> {
    let parent_dir = path.parent().filter(|dir| !dir.as_os_str().is_empty());
    if let Some(dir) = parent_dir {
        let dir_display = dir.display();
        fs::create_dir_all(dir).with_context(|| format!("create_dir_all {dir_display}"))?;
    }

    let payload = serde_json::to_vec(&ServerInfo { port }).context("serialize startup info")?;

    let shown = path.display();
    let mut out = File::create(path).with_context(|| format!("create {shown}"))?;
    out.write_all(&payload)
        .with_context(|| format!("write {shown}"))?;
    out.write_all(b"\n")
        .with_context(|| format!("newline {shown}"))?;
    Ok(())
}
fn forward_request(client: &Client, auth_header: &'static str, mut req: Request) -> Result<()> {
// Only allow POST /v1/responses exactly, no query string.
let method = req.method().clone();
let url_path = req.url().to_string();
let allow = method == Method::Post && url_path == "/v1/responses";
if !allow {
let resp = Response::new_empty(StatusCode(403));
let _ = req.respond(resp);
return Ok(());
}
// Read request body
let mut body = Vec::new();
let mut reader = req.as_reader();
std::io::Read::read_to_end(&mut reader, &mut body)?;
// Build headers for upstream, forwarding everything from the incoming
// request except Authorization (we replace it below).
let mut headers = HeaderMap::new();
for header in req.headers() {
let name_ascii = header.field.as_str();
let lower = name_ascii.to_ascii_lowercase();
if lower.as_str() == "authorization" || lower.as_str() == "host" {
continue;
}
let header_name = match HeaderName::from_bytes(lower.as_bytes()) {
Ok(name) => name,
Err(_) => continue,
};
if let Ok(value) = HeaderValue::from_bytes(header.value.as_bytes()) {
headers.append(header_name, value);
}
}
// As part of our effort to to keep `auth_header` secret, we use a
// combination of `from_static()` and `set_sensitive(true)`.
let mut auth_header_value = HeaderValue::from_static(auth_header);
auth_header_value.set_sensitive(true);
headers.insert(AUTHORIZATION, auth_header_value);
headers.insert(HOST, HeaderValue::from_static("api.openai.com"));
let upstream = "https://api.openai.com/v1/responses";
let upstream_resp = client
.post(upstream)
.headers(headers)
.body(body)
.send()
.context("forwarding request to upstream")?;
// We have to create an adapter between a `reqwest::blocking::Response`
// and a `tiny_http::Response`. Fortunately, `reqwest::blocking::Response`
// implements `Read`, so we can use it directly as the body of the
// `tiny_http::Response`.
let status = upstream_resp.status();
let mut response_headers = Vec::new();
for (name, value) in upstream_resp.headers().iter() {
// Skip headers that tiny_http manages itself.
if matches!(
name.as_str(),
"content-length" | "transfer-encoding" | "connection" | "trailer" | "upgrade"
) {
continue;
}
if let Ok(header) = Header::from_bytes(name.as_str().as_bytes(), value.as_bytes()) {
response_headers.push(header);
}
}
let content_length = upstream_resp.content_length().and_then(|len| {
if len <= usize::MAX as u64 {
Some(len as usize)
} else {
None
}
});
let response = Response::new(
StatusCode(status.as_u16()),
response_headers,
upstream_resp,
content_length,
None,
);
let _ = req.respond(response);
Ok(())
}