feat: add images support to the Codex Typescript SDK (#5281)

Extend `run` and `runStreamed` input to be either a `string` or
structured input. A structured input is an array of text parts and/or
image paths, which will then be fed to the CLI through the `--image`
argument. Text parts are combined with double newlines. For instance:

```ts
const turn = await thread.run([
  { type: "text", text: "Describe these screenshots" },
  { type: "local_image", path: "./ui.png" },
  { type: "local_image", path: "./diagram.jpg" },
  { type: "text", text: "Thanks!" },
]);
```

Ends up launching the CLI with:

```
codex exec --image ./ui.png --image ./diagram.jpg "Describe these screenshots\n\nThanks!"
```

The complete `Input` type for both functions is now:

```ts
export type UserInput =
  | {
      type: "text";
      text: string;
    }
  | {
      type: "local_image";
      path: string;
    };

export type Input = string | UserInput[];
```

This brings the Codex SDK closer to feature parity with the CLI.
Addresses #5280.
This commit is contained in:
needs
2025-10-20 18:54:59 +02:00
committed by GitHub
parent 540abfa05e
commit 3282e86a60
5 changed files with 130 additions and 7 deletions

View File

@@ -11,6 +11,7 @@ export type CodexExecArgs = {
baseUrl?: string;
apiKey?: string;
threadId?: string | null;
images?: string[];
// --model
model?: string;
// --sandbox
@@ -55,6 +56,12 @@ export class CodexExec {
commandArgs.push("--output-schema", args.outputSchemaFile);
}
if (args.images?.length) {
for (const image of args.images) {
commandArgs.push("--image", image);
}
}
if (args.threadId) {
commandArgs.push("resume", args.threadId);
}

View File

@@ -24,7 +24,7 @@ export type {
} from "./items";
export { Thread } from "./thread";
export type { RunResult, RunStreamedResult, Input } from "./thread";
export type { RunResult, RunStreamedResult, Input, UserInput } from "./thread";
export { Codex } from "./codex";

View File

@@ -25,7 +25,17 @@ export type StreamedTurn = {
export type RunStreamedResult = StreamedTurn;
/** An input to send to the agent. */
export type Input = string;
/**
 * One part of a structured input.
 *
 * - `text`: a piece of prompt text. All text parts of an input are joined, in order,
 *   with a blank line (`"\n\n"`) between them to form the prompt.
 * - `local_image`: a path to an image, forwarded to the CLI as an `--image` argument.
 */
export type UserInput =
| {
type: "text";
// Prompt text contributed by this part.
text: string;
}
| {
type: "local_image";
// Image path, passed through unchanged as the value of `--image`.
path: string;
};
/** Either a plain prompt string or an ordered list of structured `UserInput` parts. */
export type Input = string | UserInput[];
/** Represents a thread of conversation with the agent. One thread can have multiple consecutive turns. */
export class Thread {
@@ -53,21 +63,23 @@ export class Thread {
}
/** Provides the input to the agent and streams events as they are produced during the turn. */
async runStreamed(input: string, turnOptions: TurnOptions = {}): Promise<StreamedTurn> {
async runStreamed(input: Input, turnOptions: TurnOptions = {}): Promise<StreamedTurn> {
return { events: this.runStreamedInternal(input, turnOptions) };
}
private async *runStreamedInternal(
input: string,
input: Input,
turnOptions: TurnOptions = {},
): AsyncGenerator<ThreadEvent> {
const { schemaPath, cleanup } = await createOutputSchemaFile(turnOptions.outputSchema);
const options = this._threadOptions;
const { prompt, images } = normalizeInput(input);
const generator = this._exec.run({
input,
input: prompt,
baseUrl: this._options.baseUrl,
apiKey: this._options.apiKey,
threadId: this._id,
images,
model: options?.model,
sandboxMode: options?.sandboxMode,
workingDirectory: options?.workingDirectory,
@@ -93,7 +105,7 @@ export class Thread {
}
/** Provides the input to the agent and returns the completed turn. */
async run(input: string, turnOptions: TurnOptions = {}): Promise<Turn> {
async run(input: Input, turnOptions: TurnOptions = {}): Promise<Turn> {
const generator = this.runStreamedInternal(input, turnOptions);
const items: ThreadItem[] = [];
let finalResponse: string = "";
@@ -118,3 +130,19 @@ export class Thread {
return { items, finalResponse, usage };
}
}
/**
 * Splits an `Input` into the prompt string and the list of image paths.
 *
 * A plain string is returned unchanged as the prompt, with no images. For a
 * structured input, the `text` parts are joined in order with a blank line
 * (`"\n\n"`) between them, and the `local_image` paths are collected in order.
 * Parts of any other type are ignored.
 *
 * @param input - The prompt string or the array of structured parts.
 * @returns The combined prompt and the image paths, both in input order.
 */
function normalizeInput(input: Input): { prompt: string; images: string[] } {
  if (typeof input === "string") {
    return { prompt: input, images: [] };
  }

  const textSegments: string[] = [];
  const imagePaths: string[] = [];
  for (const part of input) {
    switch (part.type) {
      case "text":
        textSegments.push(part.text);
        break;
      case "local_image":
        imagePaths.push(part.path);
        break;
    }
  }

  // Text parts are separated by a blank line; images keep their relative order.
  const prompt = textSegments.join("\n\n");
  return { prompt, images: imagePaths };
}