feat: add images support to the Codex Typescript SDK (#5281)

Extend `run` and `runStreamed` input to be either a `string` or
structured input. A structured input is an array of text parts and/or
image paths, which will then be fed to the CLI through the `--image`
argument. Text parts are combined with double newlines. For instance:

```ts
const turn = await thread.run([
  { type: "text", text: "Describe these screenshots" },
  { type: "local_image", path: "./ui.png" },
  { type: "local_image", path: "./diagram.jpg" },
  { type: "text", text: "Thanks!" },
]);
```

Ends up launching the CLI with:

```
codex exec --image ./ui.png --image ./diagram.jpg "Describe these screenshots\n\nThanks!"
```

The complete `Input` type accepted by both functions is now:

```ts
export type UserInput =
  | {
      type: "text";
      text: string;
    }
  | {
      type: "local_image";
      path: string;
    };

export type Input = string | UserInput[];
```

This brings the Codex SDK closer to feature parity with the CLI.
Addresses #5280.
This commit is contained in:
needs
2025-10-20 18:54:59 +02:00
committed by GitHub
parent 540abfa05e
commit 3282e86a60
5 changed files with 130 additions and 7 deletions

View File

@@ -83,6 +83,18 @@ const turn = await thread.run("Summarize repository status", {
console.log(turn.finalResponse);
```
### Attaching images
Provide structured input entries when you need to include images alongside text. Text entries are joined with a blank line (two newline characters) to form the final prompt, while image entries are passed to the Codex CLI via `--image`.
```typescript
const turn = await thread.run([
{ type: "text", text: "Describe these screenshots" },
{ type: "local_image", path: "./ui.png" },
{ type: "local_image", path: "./diagram.jpg" },
]);
```
### Resuming an existing thread
Threads are persisted in `~/.codex/sessions`. If you lose the in-memory `Thread` object, reconstruct it with `resumeThread()` and keep going.
@@ -95,7 +107,7 @@ await thread.run("Implement the fix");
### Working directory controls
Codex runs in the current working directory by default. To avoid unrecoverable errors, Codex requires the working directory to be a Git repository. You can skip the Git repository check by passing the `skipGitRepoCheck` option when creating a thread.
Codex runs in the current working directory by default. To avoid unrecoverable errors, Codex requires the working directory to be a Git repository. You can skip the Git repository check by passing the `skipGitRepoCheck` option when creating a thread.
```typescript
const thread = codex.startThread({

View File

@@ -11,6 +11,7 @@ export type CodexExecArgs = {
baseUrl?: string;
apiKey?: string;
threadId?: string | null;
images?: string[];
// --model
model?: string;
// --sandbox
@@ -55,6 +56,12 @@ export class CodexExec {
commandArgs.push("--output-schema", args.outputSchemaFile);
}
if (args.images?.length) {
for (const image of args.images) {
commandArgs.push("--image", image);
}
}
if (args.threadId) {
commandArgs.push("resume", args.threadId);
}

View File

@@ -24,7 +24,7 @@ export type {
} from "./items";
export { Thread } from "./thread";
export type { RunResult, RunStreamedResult, Input } from "./thread";
export type { RunResult, RunStreamedResult, Input, UserInput } from "./thread";
export { Codex } from "./codex";

View File

@@ -25,7 +25,17 @@ export type StreamedTurn = {
export type RunStreamedResult = StreamedTurn;
/** An input to send to the agent. */
export type Input = string;
/**
 * One entry of a structured input: either a piece of prompt text or a
 * reference to an image file on the local filesystem.
 */
export type UserInput =
| {
// Prompt text. All `text` entries of an input are joined with "\n\n", in order, to form the prompt.
type: "text";
text: string;
}
| {
// Path to a local image file. Each path is forwarded to the Codex CLI as an `--image` argument.
type: "local_image";
path: string;
};
/** Input for a turn: either a plain prompt string or a list of structured `UserInput` entries. */
export type Input = string | UserInput[];
/** Represents a thread of conversation with the agent. One thread can have multiple consecutive turns. */
export class Thread {
@@ -53,21 +63,23 @@ export class Thread {
}
/** Provides the input to the agent and streams events as they are produced during the turn. */
async runStreamed(input: string, turnOptions: TurnOptions = {}): Promise<StreamedTurn> {
async runStreamed(input: Input, turnOptions: TurnOptions = {}): Promise<StreamedTurn> {
return { events: this.runStreamedInternal(input, turnOptions) };
}
private async *runStreamedInternal(
input: string,
input: Input,
turnOptions: TurnOptions = {},
): AsyncGenerator<ThreadEvent> {
const { schemaPath, cleanup } = await createOutputSchemaFile(turnOptions.outputSchema);
const options = this._threadOptions;
const { prompt, images } = normalizeInput(input);
const generator = this._exec.run({
input,
input: prompt,
baseUrl: this._options.baseUrl,
apiKey: this._options.apiKey,
threadId: this._id,
images,
model: options?.model,
sandboxMode: options?.sandboxMode,
workingDirectory: options?.workingDirectory,
@@ -93,7 +105,7 @@ export class Thread {
}
/** Provides the input to the agent and returns the completed turn. */
async run(input: string, turnOptions: TurnOptions = {}): Promise<Turn> {
async run(input: Input, turnOptions: TurnOptions = {}): Promise<Turn> {
const generator = this.runStreamedInternal(input, turnOptions);
const items: ThreadItem[] = [];
let finalResponse: string = "";
@@ -118,3 +130,19 @@ export class Thread {
return { items, finalResponse, usage };
}
}
/**
 * Splits an `Input` into the prompt string and the list of image paths.
 *
 * A plain string is used as the prompt unchanged, with no images. For
 * structured input, the `text` entries are joined with a blank line ("\n\n")
 * in their original order, and the `path` of every `local_image` entry is
 * collected in order. Entries of any other type are ignored.
 *
 * @param input - The raw input handed to `run` / `runStreamed`.
 * @returns The combined prompt and the image paths, in input order.
 */
function normalizeInput(input: Input): { prompt: string; images: string[] } {
  if (typeof input === "string") {
    return { prompt: input, images: [] };
  }

  const textSegments: string[] = [];
  const imagePaths: string[] = [];
  for (const entry of input) {
    switch (entry.type) {
      case "text":
        textSegments.push(entry.text);
        break;
      case "local_image":
        imagePaths.push(entry.path);
        break;
    }
  }

  return { prompt: textSegments.join("\n\n"), images: imagePaths };
}

View File

@@ -279,6 +279,82 @@ describe("Codex", () => {
await close();
}
});
// Checks that several `text` entries of a structured input reach the model as a
// single user message whose text is the entries joined by a blank line.
it("combines structured text input segments", async () => {
  const {
    url: proxyUrl,
    close: shutdownProxy,
    requests: capturedRequests,
  } = await startResponsesTestProxy({
    statusCode: 200,
    responseBodies: [
      sse(
        responseStarted("response_1"),
        assistantMessage("Combined input applied", "item_1"),
        responseCompleted("response_1"),
      ),
    ],
  });

  try {
    const codex = new Codex({
      codexPathOverride: codexExecPath,
      baseUrl: proxyUrl,
      apiKey: "test",
    });
    const conversation = codex.startThread();

    await conversation.run([
      { type: "text", text: "Describe file changes" },
      { type: "text", text: "Focus on impacted tests" },
    ]);

    // The first request captured by the proxy carries the prompt for this turn.
    const firstRequest = capturedRequests[0];
    expect(firstRequest).toBeDefined();

    // The last input item is the user message that was just sent.
    const inputItems = firstRequest!.json.input;
    const finalUserMessage = inputItems.at(-1);
    expect(finalUserMessage?.content?.[0]?.text).toBe(
      "Describe file changes\n\nFocus on impacted tests",
    );
  } finally {
    await shutdownProxy();
  }
});
// Checks that every `local_image` entry of a structured input ends up as an
// `--image <path>` pair, in order, in the arguments used to launch the CLI.
it("forwards images to exec", async () => {
  const { url: proxyUrl, close: shutdownProxy } = await startResponsesTestProxy({
    statusCode: 200,
    responseBodies: [
      sse(
        responseStarted("response_1"),
        assistantMessage("Images applied", "item_1"),
        responseCompleted("response_1"),
      ),
    ],
  });
  // NOTE(review): presumably records the argument list of each spawned CLI process — confirm against the helper.
  const { args: recordedSpawnArgs, restore: restoreSpawn } = codexExecSpy();

  // Create two throwaway image files in a fresh temporary directory.
  const scratchDir = fs.mkdtempSync(path.join(os.tmpdir(), "codex-images-"));
  const imagePaths: [string, string] = [
    path.join(scratchDir, "first.png"),
    path.join(scratchDir, "second.jpg"),
  ];
  for (const [position, imagePath] of imagePaths.entries()) {
    fs.writeFileSync(imagePath, `image-${position}`);
  }

  try {
    const codex = new Codex({
      codexPathOverride: codexExecPath,
      baseUrl: proxyUrl,
      apiKey: "test",
    });
    const conversation = codex.startThread();

    await conversation.run([
      { type: "text", text: "describe the images" },
      { type: "local_image", path: imagePaths[0] },
      { type: "local_image", path: imagePaths[1] },
    ]);

    const launchArgs = recordedSpawnArgs[0];
    expect(launchArgs).toBeDefined();

    // Collect the value following each "--image" flag; a missing value becomes "".
    const seenImages: string[] = [];
    let flagIndex = launchArgs!.indexOf("--image");
    while (flagIndex !== -1) {
      seenImages.push(launchArgs![flagIndex + 1] ?? "");
      flagIndex = launchArgs!.indexOf("--image", flagIndex + 1);
    }

    expect(seenImages).toEqual(imagePaths);
  } finally {
    fs.rmSync(scratchDir, { recursive: true, force: true });
    restoreSpawn();
    await shutdownProxy();
  }
});
it("runs in provided working directory", async () => {
const { url, close } = await startResponsesTestProxy({
statusCode: 200,