feat: add images support to the Codex Typescript SDK (#5281)
Extend `run` and `runStreamed` input to be either a `string` or
structured input. A structured input is an array of text parts and/or
image paths, which will then be fed to the CLI through the `--image`
argument. Text parts are combined with double newlines. For instance:
```ts
const turn = await thread.run([
{ type: "text", text: "Describe these screenshots" },
{ type: "local_image", path: "./ui.png" },
{ type: "local_image", path: "./diagram.jpg" },
{ type: "text", text: "Thanks!" },
]);
```
Ends up launching the CLI with:
```
codex exec --image ./ui.png --image ./diagram.jpg "Describe these screenshots\n\nThanks!"
```
The complete `Input` type for both functions is now:
```ts
export type UserInput =
| {
type: "text";
text: string;
}
| {
type: "local_image";
path: string;
};
export type Input = string | UserInput[];
```
This brings the Codex SDK closer to feature parity with the CLI.
Addresses #5280.
This commit is contained in:
@@ -11,6 +11,7 @@ export type CodexExecArgs = {
|
||||
baseUrl?: string;
|
||||
apiKey?: string;
|
||||
threadId?: string | null;
|
||||
images?: string[];
|
||||
// --model
|
||||
model?: string;
|
||||
// --sandbox
|
||||
@@ -55,6 +56,12 @@ export class CodexExec {
|
||||
commandArgs.push("--output-schema", args.outputSchemaFile);
|
||||
}
|
||||
|
||||
if (args.images?.length) {
|
||||
for (const image of args.images) {
|
||||
commandArgs.push("--image", image);
|
||||
}
|
||||
}
|
||||
|
||||
if (args.threadId) {
|
||||
commandArgs.push("resume", args.threadId);
|
||||
}
|
||||
|
||||
@@ -24,7 +24,7 @@ export type {
|
||||
} from "./items";
|
||||
|
||||
export { Thread } from "./thread";
|
||||
export type { RunResult, RunStreamedResult, Input } from "./thread";
|
||||
export type { RunResult, RunStreamedResult, Input, UserInput } from "./thread";
|
||||
|
||||
export { Codex } from "./codex";
|
||||
|
||||
|
||||
@@ -25,7 +25,17 @@ export type StreamedTurn = {
|
||||
export type RunStreamedResult = StreamedTurn;
|
||||
|
||||
/** An input to send to the agent. */
|
||||
export type Input = string;
|
||||
/**
 * One element of a structured input.
 *
 * - `"text"`: a piece of prompt text.
 * - `"local_image"`: a path to an image file.
 */
export type UserInput =
  | { type: "text"; text: string }
  | { type: "local_image"; path: string };
|
||||
|
||||
export type Input = string | UserInput[];
|
||||
|
||||
/** Represents a thread of conversation with the agent. One thread can have multiple consecutive turns. */
|
||||
export class Thread {
|
||||
@@ -53,21 +63,23 @@ export class Thread {
|
||||
}
|
||||
|
||||
/** Provides the input to the agent and streams events as they are produced during the turn. */
|
||||
async runStreamed(input: string, turnOptions: TurnOptions = {}): Promise<StreamedTurn> {
|
||||
async runStreamed(input: Input, turnOptions: TurnOptions = {}): Promise<StreamedTurn> {
|
||||
return { events: this.runStreamedInternal(input, turnOptions) };
|
||||
}
|
||||
|
||||
private async *runStreamedInternal(
|
||||
input: string,
|
||||
input: Input,
|
||||
turnOptions: TurnOptions = {},
|
||||
): AsyncGenerator<ThreadEvent> {
|
||||
const { schemaPath, cleanup } = await createOutputSchemaFile(turnOptions.outputSchema);
|
||||
const options = this._threadOptions;
|
||||
const { prompt, images } = normalizeInput(input);
|
||||
const generator = this._exec.run({
|
||||
input,
|
||||
input: prompt,
|
||||
baseUrl: this._options.baseUrl,
|
||||
apiKey: this._options.apiKey,
|
||||
threadId: this._id,
|
||||
images,
|
||||
model: options?.model,
|
||||
sandboxMode: options?.sandboxMode,
|
||||
workingDirectory: options?.workingDirectory,
|
||||
@@ -93,7 +105,7 @@ export class Thread {
|
||||
}
|
||||
|
||||
/** Provides the input to the agent and returns the completed turn. */
|
||||
async run(input: string, turnOptions: TurnOptions = {}): Promise<Turn> {
|
||||
async run(input: Input, turnOptions: TurnOptions = {}): Promise<Turn> {
|
||||
const generator = this.runStreamedInternal(input, turnOptions);
|
||||
const items: ThreadItem[] = [];
|
||||
let finalResponse: string = "";
|
||||
@@ -118,3 +130,19 @@ export class Thread {
|
||||
return { items, finalResponse, usage };
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Splits an `Input` into a single prompt string and a list of image paths.
 *
 * A plain string is returned unchanged as the prompt, with no images. For a
 * structured input, the text parts are joined with a blank line (`"\n\n"`) in
 * their original order, and image paths are collected in their original order.
 *
 * @param input - Either a prompt string or an array of text / local-image parts.
 * @returns The combined prompt and the image paths found in `input`.
 */
function normalizeInput(input: Input): { prompt: string; images: string[] } {
  // A bare string is already the full prompt and carries no images.
  if (typeof input === "string") {
    return { prompt: input, images: [] };
  }

  const textSegments: string[] = [];
  const imagePaths: string[] = [];

  for (const part of input) {
    switch (part.type) {
      case "text":
        textSegments.push(part.text);
        break;
      case "local_image":
        imagePaths.push(part.path);
        break;
      // Parts with any other `type` are ignored.
    }
  }

  return { prompt: textSegments.join("\n\n"), images: imagePaths };
}
|
||||
|
||||
Reference in New Issue
Block a user