Initial commit

Signed-off-by: Ilan Bigio <ilan@openai.com>
2025-04-16 12:56:08 -04:00
commit 59a180ddec
163 changed files with 30587 additions and 0 deletions
--- a/codex-cli/tests/fixtures/a.txt
+++ b/codex-cli/tests/fixtures/a.txt
@@ -0,0 +1 @@
+hello a
--- a/codex-cli/tests/fixtures/b.txt
+++ b/codex-cli/tests/fixtures/b.txt
@@ -0,0 +1 @@
+hello b
--- a/codex-cli/tests/agent-cancel-early.test.ts
+++ b/codex-cli/tests/agent-cancel-early.test.ts
@@ -0,0 +1,127 @@
+import { describe, it, expect, vi } from "vitest";
+
+// Fake stream that sleeps 30 ms before yielding the function_call, giving the
+// test (which cancels after ~5 ms) time to abort before any item arrives.
+class SlowFunctionCallStream {
+  public controller = { abort: vi.fn() };
+
+  async *[Symbol.asyncIterator]() {
+    await new Promise((r) => setTimeout(r, 30));
+    yield {
+      type: "response.output_item.done",
+      item: {
+        type: "function_call",
+        id: "slow_call",
+        name: "shell",
+        arguments: JSON.stringify({ cmd: ["echo", "hi"] }),
+      },
+    } as any;
+
+    yield {
+      type: "response.completed",
+      response: {
+        id: "resp_slow",
+        status: "completed",
+        output: [
+          {
+            type: "function_call",
+            id: "slow_call",
+            name: "shell",
+            arguments: JSON.stringify({ cmd: ["echo", "hi"] }),
+          },
+        ],
+      },
+    } as any;
+  }
+}
+
+vi.mock("openai", () => {
+  const bodies: Array<any> = [];
+  let callCount = 0;
+  class FakeOpenAI {
+    public responses = {
+      create: async (body: any) => {
+        bodies.push(body);
+        callCount += 1;
+        if (callCount === 1) {
+          return new SlowFunctionCallStream();
+        }
+        return new (class {
+          public controller = { abort: vi.fn() };
+          async *[Symbol.asyncIterator]() {}
+        })();
+      },
+    };
+  }
+
+  class APIConnectionTimeoutError extends Error {}
+
+  return {
+    __esModule: true,
+    default: FakeOpenAI,
+    APIConnectionTimeoutError,
+    _test: { getBodies: () => bodies },
+  };
+});
+
+vi.mock("@lib/approvals.js", () => ({
+  __esModule: true,
+  alwaysApprovedCommands: new Set<string>(),
+  canAutoApprove: () => ({ type: "auto-approve", runInSandbox: false } as any),
+}));
+
+vi.mock("@lib/format-command.js", () => ({
+  __esModule: true,
+  formatCommandForDisplay: (c: Array<string>) => c.join(" "),
+}));
+
+vi.mock("../src/utils/agent/log.js", () => ({
+  __esModule: true,
+  log: () => {},
+  isLoggingEnabled: () => false,
+}));
+
+import { AgentLoop } from "../src/utils/agent/agent-loop.js";
+
+describe("cancel before first function_call", () => {
+  it("clears previous_response_id if no call ids captured", async () => {
+    const { _test } = (await import("openai")) as any;
+
+    const agent = new AgentLoop({
+      model: "any",
+      instructions: "",
+      approvalPolicy: { mode: "auto" } as any,
+      onItem: () => {},
+      onLoading: () => {},
+      getCommandConfirmation: async () => ({ review: "yes" } as any),
+      onLastResponseId: () => {},
+      config: { model: "any", instructions: "" },
+    });
+
+    // Start first run.
+    agent.run([
+      {
+        type: "message",
+        role: "user",
+        content: [{ type: "input_text", text: "do" }],
+      },
+    ] as any);
+
+    // Cancel quickly before any stream item.
+    await new Promise((r) => setTimeout(r, 5));
+    agent.cancel();
+
+    // Second run.
+    await agent.run([
+      {
+        type: "message",
+        role: "user",
+        content: [{ type: "input_text", text: "new" }],
+      },
+    ] as any);
+
+    const bodies = _test.getBodies();
+    const last = bodies[bodies.length - 1];
+    expect(last.previous_response_id).toBeUndefined();
+  });
+});
--- a/codex-cli/tests/agent-cancel-prev-response.test.ts
+++ b/codex-cli/tests/agent-cancel-prev-response.test.ts
@@ -0,0 +1,150 @@
+import { describe, it, expect, vi } from "vitest";
+
+// Stream that emits a function_call so the agent records a `lastResponseId`.
+class StreamWithFunctionCall {
+  public controller = { abort: vi.fn() };
+
+  async *[Symbol.asyncIterator]() {
+    // First, deliver the function call.
+    yield {
+      type: "response.output_item.done",
+      item: {
+        type: "function_call",
+        id: "call123",
+        name: "shell",
+        arguments: JSON.stringify({ cmd: ["echo", "hi"] }),
+      },
+    } as any;
+
+    // Then conclude the turn.
+    yield {
+      type: "response.completed",
+      response: {
+        id: "resp_func_call", // lastResponseId that would normally be stored
+        status: "completed",
+        output: [
+          {
+            type: "function_call",
+            id: "call123",
+            name: "shell",
+            arguments: JSON.stringify({ cmd: ["echo", "hi"] }),
+          },
+        ],
+      },
+    } as any;
+  }
+}
+
+vi.mock("openai", () => {
+  const invocationBodies: Array<any> = [];
+  let callNum = 0;
+  class FakeOpenAI {
+    public responses = {
+      create: async (body: any) => {
+        invocationBodies.push(body);
+        callNum += 1;
+        // First call streams a function_call, second call returns empty stream.
+        if (callNum === 1) {
+          return new StreamWithFunctionCall();
+        }
+        // Subsequent calls: empty stream.
+        return new (class {
+          public controller = { abort: vi.fn() };
+          async *[Symbol.asyncIterator]() {
+            /* no events */
+          }
+        })();
+      },
+    };
+  }
+
+  class APIConnectionTimeoutError extends Error {}
+
+  return {
+    __esModule: true,
+    default: FakeOpenAI,
+    APIConnectionTimeoutError,
+    _test: {
+      getBodies: () => invocationBodies,
+    },
+  };
+});
+
+// Stub helpers not relevant for this test.
+vi.mock("@lib/approvals.js", () => ({
+  __esModule: true,
+  alwaysApprovedCommands: new Set<string>(),
+  canAutoApprove: () => ({ type: "auto-approve", runInSandbox: false } as any),
+}));
+
+vi.mock("@lib/format-command.js", () => ({
+  __esModule: true,
+  formatCommandForDisplay: (c: Array<string>) => c.join(" "),
+}));
+
+vi.mock("../src/utils/agent/log.js", () => ({
+  __esModule: true,
+  log: () => {},
+  isLoggingEnabled: () => false,
+}));
+
+// Now import the agent.
+import { AgentLoop } from "../src/utils/agent/agent-loop.js";
+
+describe("cancel clears previous_response_id", () => {
+  it("second run after cancel should NOT include previous_response_id", async () => {
+    const { _test } = (await import("openai")) as any;
+
+    const agent = new AgentLoop({
+      model: "any",
+      instructions: "",
+      approvalPolicy: { mode: "auto" } as any,
+      onItem: () => {},
+      onLoading: () => {},
+      getCommandConfirmation: async () => ({ review: "yes" } as any),
+      onLastResponseId: () => {},
+      config: { model: "any", instructions: "" },
+    });
+
+    // First run that triggers a function_call, but we will cancel *before* the
+    // turn completes so the tool result is never returned.
+    agent.run([
+      {
+        type: "message",
+        role: "user",
+        content: [{ type: "input_text", text: "do something" }],
+      },
+    ] as any);
+    // Give it a moment to receive the function_call.
+    await new Promise((r) => setTimeout(r, 40));
+
+    // Cancel (simulate ESC ESC).
+    agent.cancel();
+
+    // Second user input.
+    await agent.run([
+      {
+        type: "message",
+        role: "user",
+        content: [{ type: "input_text", text: "new command" }],
+      },
+    ] as any);
+
+    const bodies = _test.getBodies();
+    // NOTE(review): despite the test title, nothing below inspects
+    // `previous_response_id`; only the tool output for "call123" is checked.
+    expect(bodies.length).toBeGreaterThanOrEqual(2);
+
+    // Some request (from either run) must carry the output for that call.
+    const found = bodies.some(
+      (b: any) =>
+        Array.isArray(b.input) &&
+        b.input.some(
+          (i: any) =>
+            i.type === "function_call_output" && i.call_id === "call123",
+        ),
+    );
+
+    expect(found).toBe(true);
+  });
+});
--- a/codex-cli/tests/agent-cancel-race.test.ts
+++ b/codex-cli/tests/agent-cancel-race.test.ts
@@ -0,0 +1,138 @@
+import { describe, it, expect, vi } from "vitest";
+// This test reproduces the real‑world issue where the user cancels the current
+// task (Esc Esc) but the model’s response has already started to stream — the
+// partial answer still shows up in the UI.
+
+// --- Mocks -----------------------------------------------------------------
+
+class FakeStream {
+  public controller = { abort: vi.fn() };
+
+  async *[Symbol.asyncIterator]() {
+    // Immediately start streaming an assistant message so that it is possible
+    // for a user‑triggered cancellation that happens milliseconds later to
+    // arrive *after* the first token has already been emitted. This mirrors
+    // the real‑world race where the UI shows nothing yet (network / rendering
+    // latency) even though the model has technically started responding.
+    // Mimic an assistant message containing the word "hello".
+    yield {
+      type: "response.output_item.done",
+      item: {
+        type: "message",
+        role: "assistant",
+        id: "m1",
+        content: [{ type: "text", text: "hello" }],
+      },
+    } as any;
+
+    yield {
+      type: "response.completed",
+      response: {
+        id: "resp1",
+        status: "completed",
+        output: [
+          {
+            type: "message",
+            role: "assistant",
+            id: "m1",
+            content: [{ type: "text", text: "hello" }],
+          },
+        ],
+      },
+    } as any;
+  }
+}
+
+vi.mock("openai", () => {
+  let callCount = 0;
+  class FakeOpenAI {
+    public responses = {
+      create: async () => {
+        callCount += 1;
+        // Only the *first* stream yields "hello" so that any later answer
+        // clearly comes from the canceled run.
+        return callCount === 1
+          ? new FakeStream()
+          : new (class {
+              public controller = { abort: vi.fn() };
+              async *[Symbol.asyncIterator]() {
+                // empty stream
+              }
+            })();
+      },
+    };
+  }
+  class APIConnectionTimeoutError extends Error {}
+  return { __esModule: true, default: FakeOpenAI, APIConnectionTimeoutError };
+});
+
+// Stubs for external helpers referenced indirectly.
+vi.mock("@lib/approvals.js", () => ({
+  __esModule: true,
+  isSafeCommand: () => null,
+}));
+vi.mock("@lib/format-command.js", () => ({
+  __esModule: true,
+  formatCommandForDisplay: (c: Array<string>) => c.join(" "),
+}));
+
+// Stub the logger to avoid file‑system side effects during tests.
+vi.mock("../src/utils/agent/log.js", () => ({
+  __esModule: true,
+  log: () => {},
+  isLoggingEnabled: () => false,
+}));
+
+import { AgentLoop } from "../src/utils/agent/agent-loop.js";
+
+describe("Agent cancellation race", () => {
+  // Guards against the cancel/stream race in `AgentLoop`: the test goes red
+  // whenever the assistant message of the canceled run is still delivered.
+  it("does not emit the model answer after cancel() was called", async () => {
+    const items: Array<any> = [];
+
+    const agent = new AgentLoop({
+      model: "any",
+      instructions: "",
+      config: { model: "any", instructions: "" },
+      approvalPolicy: { mode: "auto" } as any,
+      onItem: (i) => items.push(i),
+      onLoading: () => {},
+      getCommandConfirmation: async () => ({ review: "yes" } as any),
+      onLastResponseId: () => {},
+    });
+
+    const input = [
+      {
+        type: "message",
+        role: "user",
+        content: [{ type: "input_text", text: "say hello" }],
+      },
+    ];
+
+    agent.run(input as any);
+
+    // Cancel after the stream has started.
+    await new Promise((r) => setTimeout(r, 5));
+    agent.cancel();
+
+    // Immediately issue a new (empty) command to mimic the UI letting the user
+    // type something else – this resets the agent state.
+    agent.run([
+      {
+        type: "message",
+        role: "user",
+        content: [{ type: "input_text", text: "noop" }],
+      },
+    ] as any);
+
+    // Give everything time to flush.
+    await new Promise((r) => setTimeout(r, 40));
+
+    const assistantMsg = items.find((i) => i.role === "assistant");
+    // The bug manifests if the assistant message is still present even though
+    // it belongs to the canceled run. We assert that it *should not* be
+    // delivered – this test will fail until the bug is fixed.
+    expect(assistantMsg).toBeUndefined();
+  });
+});
--- a/codex-cli/tests/agent-cancel.test.ts
+++ b/codex-cli/tests/agent-cancel.test.ts
@@ -0,0 +1,169 @@
+import { describe, it, expect, vi } from "vitest";
+// Mock the OpenAI SDK used inside AgentLoop so we can control streaming events.
+class FakeStream {
+  public controller = { abort: vi.fn() };
+
+  async *[Symbol.asyncIterator]() {
+    // Immediately yield a function_call item.
+    yield {
+      type: "response.output_item.done",
+      item: {
+        type: "function_call",
+        id: "call1",
+        name: "shell",
+        arguments: JSON.stringify({ cmd: ["node", "-e", "console.log('hi')"] }),
+      },
+    } as any;
+
+    // Indicate turn completion with the same function_call.
+    yield {
+      type: "response.completed",
+      response: {
+        id: "resp1",
+        status: "completed",
+        output: [
+          {
+            type: "function_call",
+            id: "call1",
+            name: "shell",
+            arguments: JSON.stringify({
+              cmd: ["node", "-e", "console.log('hi')"],
+            }),
+          },
+        ],
+      },
+    } as any;
+  }
+}
+
+vi.mock("openai", () => {
+  class FakeOpenAI {
+    public responses = {
+      create: async () => new FakeStream(),
+    };
+  }
+  class APIConnectionTimeoutError extends Error {}
+  return { __esModule: true, default: FakeOpenAI, APIConnectionTimeoutError };
+});
+
+// Mock the approvals and formatCommand helpers referenced by handle‑exec‑command.
+vi.mock("@lib/approvals.js", () => {
+  return {
+    __esModule: true,
+    alwaysApprovedCommands: new Set<string>(),
+    canAutoApprove: () =>
+      ({ type: "auto-approve", runInSandbox: false } as any),
+    isSafeCommand: () => null,
+  };
+});
+
+vi.mock("@lib/format-command.js", () => {
+  return {
+    __esModule: true,
+    formatCommandForDisplay: (cmd: Array<string>) => cmd.join(" "),
+  };
+});
+
+// Stub the logger to avoid file‑system side effects during tests.
+vi.mock("../src/utils/agent/log.js", () => ({
+  __esModule: true,
+  log: () => {},
+  isLoggingEnabled: () => false,
+}));
+
+// After mocking dependencies we can import the modules under test.
+import { AgentLoop } from "../src/utils/agent/agent-loop.js";
+import * as handleExec from "../src/utils/agent/handle-exec-command.js";
+
+describe("Agent cancellation", () => {
+  it("does not emit function_call_output after cancel", async () => {
+    // Mock handleExecCommand to simulate a slow shell command that would write
+    // "hello" if allowed to finish.
+    vi.spyOn(handleExec, "handleExecCommand").mockImplementation(async () => {
+      await new Promise((r) => setTimeout(r, 50));
+      return { outputText: "hello", metadata: {} } as any;
+    });
+
+    const received: Array<any> = [];
+
+    const agent = new AgentLoop({
+      model: "any",
+      instructions: "",
+      config: { model: "any", instructions: "" },
+      approvalPolicy: { mode: "auto" } as any,
+      onItem: (item) => {
+        received.push(item);
+      },
+      onLoading: () => {},
+      getCommandConfirmation: async () => ({ review: "yes" } as any),
+      onLastResponseId: () => {},
+    });
+
+    const userMsg = [
+      {
+        type: "message",
+        role: "user",
+        content: [{ type: "input_text", text: "say hi" }],
+      },
+    ];
+
+    // Start the agent loop but don't await it – we'll cancel while it's running.
+    agent.run(userMsg as any);
+
+    // Give the agent a moment to start processing.
+    await new Promise((r) => setTimeout(r, 10));
+
+    // Cancel the task.
+    agent.cancel();
+
+    // Wait a little longer to allow any pending promises to settle.
+    await new Promise((r) => setTimeout(r, 100));
+
+    // Ensure no function_call_output items were emitted after cancellation.
+    const hasOutput = received.some((i) => i.type === "function_call_output");
+    expect(hasOutput).toBe(false);
+  });
+
+  it("still suppresses output when cancellation happens after a fast exec", async () => {
+    vi.restoreAllMocks();
+
+    // Quick exec mock (returns immediately).
+    vi.spyOn(handleExec, "handleExecCommand").mockResolvedValue({
+      outputText: "hello-fast",
+      metadata: {},
+    } as any);
+
+    const received: Array<any> = [];
+
+    const agent = new AgentLoop({
+      model: "any",
+      instructions: "",
+      config: { model: "any", instructions: "" },
+      approvalPolicy: { mode: "auto" } as any,
+      onItem: (item) => received.push(item),
+      onLoading: () => {},
+      getCommandConfirmation: async () => ({ review: "yes" } as any),
+      onLastResponseId: () => {},
+    });
+
+    const userMsg = [
+      {
+        type: "message",
+        role: "user",
+        content: [{ type: "input_text", text: "say hi" }],
+      },
+    ];
+
+    agent.run(userMsg as any);
+
+    // Wait a bit so the exec has certainly finished and output is ready.
+    await new Promise((r) => setTimeout(r, 20));
+
+    agent.cancel();
+
+    await new Promise((r) => setTimeout(r, 50));
+
+    const hasOutput = received.some((i) => i.type === "function_call_output");
+    expect(hasOutput).toBe(false);
+  });
+});
--- a/codex-cli/tests/agent-function-call-id.test.ts
+++ b/codex-cli/tests/agent-function-call-id.test.ts
@@ -0,0 +1,149 @@
+import { describe, it, expect, vi } from "vitest";
+// ---------------------------------------------------------------------------
+// This regression test ensures that the AgentLoop correctly copies the ID of a
+// function tool‑call (be it `call_id` from the /responses endpoint *or* `id`
+// from the /chat endpoint) into the subsequent `function_call_output` item. A
+// missing or mismatched ID leads to the dreaded
+//   400 | No tool output found for function call …
+// error from the OpenAI API.
+// ---------------------------------------------------------------------------
+
+// Fake OpenAI stream that immediately yields a *chat‑style* function_call item.
+class FakeStream {
+  public controller = { abort: vi.fn() };
+
+  async *[Symbol.asyncIterator]() {
+    yield {
+      type: "response.output_item.done",
+      item: {
+        // Chat endpoint style (id + nested function descriptor)
+        type: "function_call",
+        id: "call_test_123",
+        function: {
+          name: "shell",
+          arguments: JSON.stringify({ cmd: ["echo", "hi"] }),
+        },
+      },
+    } as any;
+
+    yield {
+      type: "response.completed",
+      response: {
+        id: "resp1",
+        status: "completed",
+        output: [
+          {
+            type: "function_call",
+            id: "call_test_123",
+            function: {
+              name: "shell",
+              arguments: JSON.stringify({ cmd: ["echo", "hi"] }),
+            },
+          },
+        ],
+      },
+    } as any;
+  }
+}
+
+// We intercept the OpenAI SDK so we can inspect the body of the second call –
+// the one that is expected to contain our `function_call_output` item.
+vi.mock("openai", () => {
+  let invocation = 0;
+  let capturedSecondBody: any;
+
+  class FakeOpenAI {
+    public responses = {
+      create: async (body: any) => {
+        invocation += 1;
+        if (invocation === 1) {
+          return new FakeStream();
+        }
+        if (invocation === 2) {
+          capturedSecondBody = body;
+          // empty stream
+          return new (class {
+            public controller = { abort: vi.fn() };
+            async *[Symbol.asyncIterator]() {
+              /* no items */
+            }
+          })();
+        }
+        throw new Error("Unexpected additional invocation in test");
+      },
+    };
+  }
+
+  class APIConnectionTimeoutError extends Error {}
+
+  return {
+    __esModule: true,
+    default: FakeOpenAI,
+    APIConnectionTimeoutError,
+    // Re‑export so the test can access the captured body.
+    _test: {
+      getCapturedSecondBody: () => capturedSecondBody,
+    },
+  };
+});
+
+// Stub approvals & command formatting – not relevant for this test.
+vi.mock("@lib/approvals.js", () => ({
+  __esModule: true,
+  alwaysApprovedCommands: new Set<string>(),
+  canAutoApprove: () => ({ type: "auto-approve", runInSandbox: false } as any),
+  isSafeCommand: () => null,
+}));
+
+vi.mock("@lib/format-command.js", () => ({
+  __esModule: true,
+  formatCommandForDisplay: (c: Array<string>) => c.join(" "),
+}));
+
+// Stub logger to keep the test output clean.
+vi.mock("../src/utils/agent/log.js", () => ({
+  __esModule: true,
+  log: () => {},
+  isLoggingEnabled: () => false,
+}));
+
+// Finally, import the module under test.
+import { AgentLoop } from "../src/utils/agent/agent-loop.js";
+
+describe("function_call_output includes original call ID", () => {
+  it("copies id → call_id so the API accepts the tool result", async () => {
+    const { _test } = (await import("openai")) as any;
+
+    const agent = new AgentLoop({
+      model: "any",
+      instructions: "",
+      approvalPolicy: { mode: "auto" } as any,
+      onItem: () => {},
+      onLoading: () => {},
+      getCommandConfirmation: async () => ({ review: "yes" } as any),
+      onLastResponseId: () => {},
+    });
+
+    const userMsg = [
+      {
+        type: "message",
+        role: "user",
+        content: [{ type: "input_text", text: "run" }],
+      },
+    ];
+
+    await agent.run(userMsg as any);
+
+    // Give the agent a tick to finish the second round‑trip.
+    await new Promise((r) => setTimeout(r, 20));
+
+    const body = _test.getCapturedSecondBody();
+    expect(body).toBeTruthy();
+
+    const outputItem = body.input?.find(
+      (i: any) => i.type === "function_call_output",
+    );
+    expect(outputItem).toBeTruthy();
+    expect(outputItem.call_id).toBe("call_test_123");
+  });
+});
--- a/codex-cli/tests/agent-generic-network-error.test.ts
+++ b/codex-cli/tests/agent-generic-network-error.test.ts
@@ -0,0 +1,132 @@
+import { describe, it, expect, vi } from "vitest";
+
+// ---------------------------------------------------------------------------
+//  Utility helpers & OpenAI mock (lightweight – focuses on network failures)
+// ---------------------------------------------------------------------------
+
+const openAiState: { createSpy?: ReturnType<typeof vi.fn> } = {};
+
+vi.mock("openai", () => {
+  class FakeOpenAI {
+    public responses = {
+      create: (...args: Array<any>) => openAiState.createSpy!(...args),
+    };
+  }
+
+  class APIConnectionTimeoutError extends Error {}
+
+  return {
+    __esModule: true,
+    default: FakeOpenAI,
+    APIConnectionTimeoutError,
+  };
+});
+
+// Stub approvals / formatting helpers – unrelated to network handling.
+vi.mock("@lib/approvals.js", () => ({
+  __esModule: true,
+  alwaysApprovedCommands: new Set<string>(),
+  canAutoApprove: () => ({ type: "auto-approve", runInSandbox: false } as any),
+  isSafeCommand: () => null,
+}));
+
+vi.mock("@lib/format-command.js", () => ({
+  __esModule: true,
+  formatCommandForDisplay: (c: Array<string>) => c.join(" "),
+}));
+
+// Silence debug logs so test output stays clean.
+vi.mock("../src/utils/agent/log.js", () => ({
+  __esModule: true,
+  log: () => {},
+  isLoggingEnabled: () => false,
+}));
+
+import { AgentLoop } from "../src/utils/agent/agent-loop.js";
+
+describe("AgentLoop – generic network/server errors", () => {
+  it("emits friendly system message instead of throwing on ECONNRESET", async () => {
+    const netErr: any = new Error("socket hang up");
+    netErr.code = "ECONNRESET";
+
+    openAiState.createSpy = vi.fn(async () => {
+      throw netErr;
+    });
+
+    const received: Array<any> = [];
+
+    const agent = new AgentLoop({
+      model: "any",
+      instructions: "",
+      approvalPolicy: { mode: "auto" } as any,
+      onItem: (i) => received.push(i),
+      onLoading: () => {},
+      getCommandConfirmation: async () => ({ review: "yes" } as any),
+      onLastResponseId: () => {},
+    });
+
+    const userMsg = [
+      {
+        type: "message",
+        role: "user",
+        content: [{ type: "input_text", text: "ping" }],
+      },
+    ];
+
+    await expect(agent.run(userMsg as any)).resolves.not.toThrow();
+
+    // give flush timers a chance
+    await new Promise((r) => setTimeout(r, 20));
+
+    const sysMsg = received.find(
+      (i) =>
+        i.role === "system" &&
+        typeof i.content?.[0]?.text === "string" &&
+        i.content[0].text.includes("Network error"),
+    );
+
+    expect(sysMsg).toBeTruthy();
+  });
+
+  it("emits user friendly message on HTTP 500 from OpenAI", async () => {
+    const serverErr: any = new Error("Internal Server Error");
+    serverErr.status = 500;
+
+    openAiState.createSpy = vi.fn(async () => {
+      throw serverErr;
+    });
+
+    const received: Array<any> = [];
+
+    const agent = new AgentLoop({
+      model: "any",
+      instructions: "",
+      approvalPolicy: { mode: "auto" } as any,
+      onItem: (i) => received.push(i),
+      onLoading: () => {},
+      getCommandConfirmation: async () => ({ review: "yes" } as any),
+      onLastResponseId: () => {},
+    });
+
+    const userMsg = [
+      {
+        type: "message",
+        role: "user",
+        content: [{ type: "input_text", text: "ping" }],
+      },
+    ];
+
+    await expect(agent.run(userMsg as any)).resolves.not.toThrow();
+
+    await new Promise((r) => setTimeout(r, 20));
+
+    const sysMsg = received.find(
+      (i) =>
+        i.role === "system" &&
+        typeof i.content?.[0]?.text === "string" &&
+        i.content[0].text.includes("error"),
+    );
+
+    expect(sysMsg).toBeTruthy();
+  });
+});
--- a/codex-cli/tests/agent-invalid-request-error.test.ts
+++ b/codex-cli/tests/agent-invalid-request-error.test.ts
@@ -0,0 +1,88 @@
+import { describe, it, expect, vi } from "vitest";
+
+// ---------------------------------------------------------------------------
+// Mock helpers
+// ---------------------------------------------------------------------------
+
+const openAiState: { createSpy?: ReturnType<typeof vi.fn> } = {};
+
+vi.mock("openai", () => {
+  class FakeOpenAI {
+    public responses = {
+      create: (...args: Array<any>) => openAiState.createSpy!(...args),
+    };
+  }
+
+  class APIConnectionTimeoutError extends Error {}
+
+  return {
+    __esModule: true,
+    default: FakeOpenAI,
+    APIConnectionTimeoutError,
+  };
+});
+
+vi.mock("@lib/approvals.js", () => ({
+  __esModule: true,
+  alwaysApprovedCommands: new Set<string>(),
+  canAutoApprove: () => ({ type: "auto-approve", runInSandbox: false } as any),
+  isSafeCommand: () => null,
+}));
+
+vi.mock("@lib/format-command.js", () => ({
+  __esModule: true,
+  formatCommandForDisplay: (c: Array<string>) => c.join(" "),
+}));
+
+vi.mock("../src/utils/agent/log.js", () => ({
+  __esModule: true,
+  log: () => {},
+  isLoggingEnabled: () => false,
+}));
+
+import { AgentLoop } from "../src/utils/agent/agent-loop.js";
+
+describe("AgentLoop – invalid request / 4xx errors", () => {
+  it("shows system message and resolves on invalid_request_error", async () => {
+    const err: any = new Error("Invalid request: model not found");
+    err.code = "invalid_request_error";
+    err.status = 400;
+
+    openAiState.createSpy = vi.fn(async () => {
+      throw err;
+    });
+
+    const received: Array<any> = [];
+
+    const agent = new AgentLoop({
+      model: "any",
+      instructions: "",
+      approvalPolicy: { mode: "auto" } as any,
+      onItem: (i) => received.push(i),
+      onLoading: () => {},
+      getCommandConfirmation: async () => ({ review: "yes" } as any),
+      onLastResponseId: () => {},
+    });
+
+    const userMsg = [
+      {
+        type: "message",
+        role: "user",
+        content: [{ type: "input_text", text: "hello" }],
+      },
+    ];
+
+    await expect(agent.run(userMsg as any)).resolves.not.toThrow();
+
+    await new Promise((r) => setTimeout(r, 20));
+
+    const sysMsg = received.find(
+      (i) =>
+        i.role === "system" &&
+        typeof i.content?.[0]?.text === "string" &&
+        i.content[0].text.includes("OpenAI rejected"),
+    );
+
+    expect(sysMsg).toBeTruthy();
+  });
+});
--- a/codex-cli/tests/agent-max-tokens-error.test.ts
+++ b/codex-cli/tests/agent-max-tokens-error.test.ts
@@ -0,0 +1,92 @@
+import { describe, it, expect, vi } from "vitest";
+
+// ---------------------------------------------------------------------------
+// Mock helpers
+// ---------------------------------------------------------------------------
+
+const openAiState: { createSpy?: ReturnType<typeof vi.fn> } = {};
+
+vi.mock("openai", () => {
+  class FakeOpenAI {
+    public responses = {
+      create: (...args: Array<any>) => openAiState.createSpy!(...args),
+    };
+  }
+
+  class APIConnectionTimeoutError extends Error {}
+
+  return {
+    __esModule: true,
+    default: FakeOpenAI,
+    APIConnectionTimeoutError,
+  };
+});
+
+vi.mock("@lib/approvals.js", () => ({
+  __esModule: true,
+  alwaysApprovedCommands: new Set<string>(),
+  canAutoApprove: () => ({ type: "auto-approve", runInSandbox: false } as any),
+  isSafeCommand: () => null,
+}));
+
+vi.mock("@lib/format-command.js", () => ({
+  __esModule: true,
+  formatCommandForDisplay: (c: Array<string>) => c.join(" "),
+}));
+
+vi.mock("../src/utils/agent/log.js", () => ({
+  __esModule: true,
+  log: () => {},
+  isLoggingEnabled: () => false,
+}));
+
+import { AgentLoop } from "../src/utils/agent/agent-loop.js";
+
+describe("AgentLoop – max_tokens too large error", () => {
+  it("shows context‑length system message and resolves", async () => {
+    const err: any = new Error(
+      "max_tokens is too large: 167888. This model supports at most 100000 completion tokens, whereas you provided 167888.",
+    );
+    err.type = "invalid_request_error";
+    err.param = "max_tokens";
+    err.status = 400;
+
+    openAiState.createSpy = vi.fn(async () => {
+      throw err;
+    });
+
+    const received: Array<any> = [];
+
+    const agent = new AgentLoop({
+      model: "any",
+      instructions: "",
+      approvalPolicy: { mode: "auto" } as any,
+      onItem: (i) => received.push(i),
+      onLoading: () => {},
+      getCommandConfirmation: async () => ({ review: "yes" } as any),
+      onLastResponseId: () => {},
+    });
+
+    const userMsg = [
+      {
+        type: "message",
+        role: "user",
+        content: [{ type: "input_text", text: "hello" }],
+      },
+    ];
+
+    await expect(agent.run(userMsg as any)).resolves.not.toThrow();
+
+    // allow asynchronous onItem calls to flush
+    await new Promise((r) => setTimeout(r, 20));
+
+    const sysMsg = received.find(
+      (i) =>
+        i.role === "system" &&
+        typeof i.content?.[0]?.text === "string" &&
+        i.content[0].text.includes("exceeds the maximum context length"),
+    );
+
+    expect(sysMsg).toBeTruthy();
+  });
+});
--- a/codex-cli/tests/agent-network-errors.test.ts
+++ b/codex-cli/tests/agent-network-errors.test.ts
@@ -0,0 +1,179 @@
+import { describe, it, expect, vi } from "vitest";
+// ---------------------------------------------------------------------------
+//  Utility: fake OpenAI SDK with programmable behaviour per test case.
+// ---------------------------------------------------------------------------
+
+// A minimal helper to build predetermined streams.
+function createStream(events: Array<any>, opts: { throwAfter?: Error } = {}) {
+  // Stand-in for the SDK stream: abort spy + iterator replaying `events`.
+  return {
+    controller: { abort: vi.fn() },
+    async *[Symbol.asyncIterator]() {
+      yield* events;
+      // Optionally fail once every scripted event has been delivered.
+      const failure = opts.throwAfter;
+      if (failure) {
+        throw failure;
+      }
+    },
+  };
+}
+
+// Holders so tests can access spies/state injected by the mock.
+const openAiState: {
+  createSpy?: ReturnType<typeof vi.fn>;
+} = {};
+
+vi.mock("openai", () => {
+  class APIConnectionTimeoutError extends Error {}
+
+  class FakeOpenAI {
+    public responses = {
+      // `createSpy` will be swapped out per test.
+      create: (...args: Array<any>) => openAiState.createSpy!(...args),
+    };
+  }
+
+  return {
+    __esModule: true,
+    default: FakeOpenAI,
+    APIConnectionTimeoutError,
+  };
+});
+
+// Stub approvals / formatting helpers – not relevant here.
+vi.mock("@lib/approvals.js", () => ({
+  __esModule: true,
+  alwaysApprovedCommands: new Set<string>(),
+  canAutoApprove: () => ({ type: "auto-approve", runInSandbox: false } as any),
+  isSafeCommand: () => null,
+}));
+
+vi.mock("@lib/format-command.js", () => ({
+  __esModule: true,
+  formatCommandForDisplay: (c: Array<string>) => c.join(" "),
+}));
+
+// Silence debug logging from agent‑loop.
+vi.mock("../src/utils/agent/log.js", () => ({
+  __esModule: true,
+  log: () => {},
+  isLoggingEnabled: () => false,
+}));
+
+import { AgentLoop } from "../src/utils/agent/agent-loop.js";
+
+describe("AgentLoop – network resilience", () => {
+  it("retries once on APIConnectionTimeoutError and succeeds", async () => {
+    // Arrange fake OpenAI: first call throws APIConnectionTimeoutError, second returns a short stream.
+    const { APIConnectionTimeoutError } = await import("openai");
+
+    let call = 0;
+    openAiState.createSpy = vi.fn(async () => {
+      call += 1;
+      if (call === 1) {
+        throw new APIConnectionTimeoutError({ message: "timeout" });
+      }
+      // Second attempt – minimal assistant reply.
+      return createStream([
+        {
+          type: "response.output_item.done",
+          item: {
+            type: "message",
+            role: "assistant",
+            id: "m1",
+            content: [{ type: "text", text: "ok" }],
+          },
+        },
+        {
+          type: "response.completed",
+          response: {
+            id: "r1",
+            status: "completed",
+            output: [
+              {
+                type: "message",
+                role: "assistant",
+                id: "m1",
+                content: [{ type: "text", text: "ok" }],
+              },
+            ],
+          },
+        },
+      ]);
+    });
+
+    const received: Array<any> = [];
+
+    const agent = new AgentLoop({
+      model: "any",
+      instructions: "",
+      approvalPolicy: { mode: "auto" } as any,
+      onItem: (i) => received.push(i),
+      onLoading: () => {},
+      getCommandConfirmation: async () => ({ review: "yes" } as any),
+      onLastResponseId: () => {},
+    });
+
+    const userMsg = [
+      {
+        type: "message",
+        role: "user",
+        content: [{ type: "input_text", text: "hi" }],
+      },
+    ];
+
+    await agent.run(userMsg as any);
+
+    // Wait a tick for flush.
+    await new Promise((r) => setTimeout(r, 20));
+
+    expect(openAiState.createSpy).toHaveBeenCalledTimes(2);
+
+    const assistant = received.find((i) => i.role === "assistant");
+    expect(assistant).toBeTruthy();
+    expect(assistant.content?.[0]?.text).toBe("ok");
+  });
+
+  it("shows system message when connection closes prematurely", async () => {
+    const prematureError = Object.assign(new Error("Premature close"), {
+      code: "ERR_STREAM_PREMATURE_CLOSE",
+    });
+    // Every request gets an empty stream that throws the error above.
+    openAiState.createSpy = vi.fn(async () => {
+      return createStream([], { throwAfter: prematureError });
+    });
+
+    const received: Array<any> = [];
+
+    const agent = new AgentLoop({
+      model: "any",
+      instructions: "",
+      approvalPolicy: { mode: "auto" } as any,
+      onItem: (i) => received.push(i),
+      onLoading: () => {},
+      getCommandConfirmation: async () => ({ review: "yes" } as any),
+      onLastResponseId: () => {},
+    });
+
+    const userMsg = [
+      {
+        type: "message",
+        role: "user",
+        content: [{ type: "input_text", text: "hi" }],
+      },
+    ];
+
+    await agent.run(userMsg as any);
+
+    // Wait a tick so items emitted asynchronously can reach `received`.
+    await new Promise((r) => setTimeout(r, 20));
+
+    const sysMsg = received.find(
+      (i) =>
+        i.role === "system" &&
+        i.content?.[0]?.text?.includes("Connection closed prematurely"),
+    );
+    expect(sysMsg).toBeTruthy();
+  });
+});
--- a/codex-cli/tests/agent-project-doc.test.ts
+++ b/codex-cli/tests/agent-project-doc.test.ts
@@ -0,0 +1,141 @@
+import { mkdtempSync, rmSync, writeFileSync, mkdirSync } from "fs";
+import { tmpdir } from "os";
+import { join } from "path";
+import { describe, expect, it, vi, beforeEach, afterEach } from "vitest";
+
+// ---------------------------------------------------------------------------
+// Test helpers & mocks
+// ---------------------------------------------------------------------------
+
+// Fake stream returned from the mocked OpenAI SDK. The AgentLoop only cares
+// that the stream is async‑iterable and eventually yields a `response.completed`
+// event so the turn can finish.
+class FakeStream {
+  // Abort spy exposed under the `controller` property.
+  public controller = { abort: vi.fn() };
+
+  // Emits one `response.completed` event with an empty `output`, then ends.
+  async *[Symbol.asyncIterator]() {
+    const response = {
+      id: "r1",
+      status: "completed",
+      output: [],
+    };
+    yield { type: "response.completed", response } as any;
+  }
+}
+
+// Capture the parameters that AgentLoop sends to `openai.responses.create()` so
+// we can assert on the `instructions` value.
+let lastCreateParams: any = null;
+
+vi.mock("openai", () => {
+  class FakeOpenAI {
+    public responses = {
+      create: async (params: any) => {
+        lastCreateParams = params;
+        return new FakeStream();
+      },
+    };
+  }
+
+  class APIConnectionTimeoutError extends Error {}
+
+  return {
+    __esModule: true,
+    default: FakeOpenAI,
+    APIConnectionTimeoutError,
+  };
+});
+
+// The AgentLoop pulls these helpers in order to decide whether a command can
+// be auto‑approved. None of that matters for this test, so we stub the module
+// with minimal no‑op implementations.
+vi.mock("@lib/approvals.js", () => {
+  return {
+    __esModule: true,
+    alwaysApprovedCommands: new Set<string>(),
+    canAutoApprove: () =>
+      ({ type: "auto-approve", runInSandbox: false } as any),
+    isSafeCommand: () => null,
+  };
+});
+
+vi.mock("@lib/format-command.js", () => {
+  return {
+    __esModule: true,
+    formatCommandForDisplay: (cmd: Array<string>) => cmd.join(" "),
+  };
+});
+
+// Stub the file‑based logger to avoid side effects and keep the test output
+// clean.
+vi.mock("../src/utils/agent/log.js", () => ({
+  __esModule: true,
+  log: () => {},
+  isLoggingEnabled: () => false,
+}));
+
+// ---------------------------------------------------------------------------
+// After mocks are in place we can import the modules under test.
+// ---------------------------------------------------------------------------
+
+import { AgentLoop } from "../src/utils/agent/agent-loop.js";
+import { loadConfig } from "../src/utils/config.js";
+
+// ---------------------------------------------------------------------------
+
+let projectDir: string;
+
+beforeEach(() => {
+  // Create a fresh temporary directory to act as an isolated git repo.
+  projectDir = mkdtempSync(join(tmpdir(), "codex-proj-"));
+  mkdirSync(join(projectDir, ".git")); // mark as project root
+
+  // Write a small project doc that we expect to be included in the prompt.
+  writeFileSync(join(projectDir, "codex.md"), "# Test Project\nHello docs!\n");
+
+  lastCreateParams = null; // reset captured SDK params
+});
+
+afterEach(() => {
+  rmSync(projectDir, { recursive: true, force: true });
+});
+
+describe("AgentLoop", () => {
+  it("passes codex.md contents through the instructions parameter", async () => {
+    const config = loadConfig(undefined, undefined, { cwd: projectDir });
+
+    // Sanity‑check that loadConfig picked up the project doc. This is *not* the
+    // main assertion – we just avoid a false‑positive if the fixture setup is
+    // incorrect.
+    expect(config.instructions).toContain("Hello docs!");
+
+    const agent = new AgentLoop({
+      model: "o3-mini", // arbitrary
+      instructions: config.instructions,
+      config,
+      approvalPolicy: { mode: "suggest" } as any,
+      onItem: () => {},
+      onLoading: () => {},
+      getCommandConfirmation: async () => ({ review: "yes" } as any),
+      onLastResponseId: () => {},
+    });
+
+    // Kick off a single run and wait for it to finish. The fake OpenAI client
+    // will resolve immediately.
+    await agent.run([
+      {
+        type: "message",
+        role: "user",
+        content: [{ type: "input_text", text: "ping" }],
+      },
+    ]);
+
+    // Ensure the AgentLoop called the SDK and that the instructions we see at
+    // that point still include the project doc. This validates the full path:
+    // loadConfig → AgentLoop → addInstructionPrefix → OpenAI SDK.
+    expect(lastCreateParams).not.toBeNull();
+    expect(lastCreateParams.instructions).toContain("Hello docs!");
+  });
+});
--- a/codex-cli/tests/agent-rate-limit-error.test.ts
+++ b/codex-cli/tests/agent-rate-limit-error.test.ts
@@ -0,0 +1,127 @@
+import { describe, it, expect, vi } from "vitest";
+
+// ---------------------------------------------------------------------------
+//  Utility: fake OpenAI SDK with programmable behaviour per test case.
+// ---------------------------------------------------------------------------
+
+// Same helper as used in agent-network-errors.test.ts – duplicated here to keep
+// the test file self‑contained.
+// Exported so that the strict TypeScript compiler does not flag it as unused –
+// individual tests may import it for ad‑hoc diagnostics when debugging.
+export function _createStream(events: Array<any>) {
+  // Hand back a bare object exposing the two members the test doubles here
+  // provide: an abort spy under `controller` and an async iterator that
+  // replays `events` in order.
+  return {
+    controller: { abort: vi.fn() },
+    async *[Symbol.asyncIterator]() {
+      yield* events;
+    },
+  };
+}
+
+// Holders so tests can access spies/state injected by the mock.
+const openAiState: { createSpy?: ReturnType<typeof vi.fn> } = {};
+
+vi.mock("openai", () => {
+  class RateLimitError extends Error {
+    public code = "rate_limit_exceeded";
+    constructor(message: string) {
+      super(message);
+      this.name = "RateLimitError";
+    }
+  }
+
+  // Re‑export the timeout error as well so other tests that expect it continue
+  // to work regardless of execution order.
+  class APIConnectionTimeoutError extends Error {}
+
+  class FakeOpenAI {
+    public responses = {
+      // `createSpy` will be swapped out per test.
+      create: (...args: Array<any>) => openAiState.createSpy!(...args),
+    };
+  }
+
+  return {
+    __esModule: true,
+    default: FakeOpenAI,
+    RateLimitError,
+    APIConnectionTimeoutError,
+  };
+});
+
+// Stub approvals / formatting helpers – not relevant to rate‑limit handling.
+vi.mock("@lib/approvals.js", () => ({
+  __esModule: true,
+  alwaysApprovedCommands: new Set<string>(),
+  canAutoApprove: () => ({ type: "auto-approve", runInSandbox: false } as any),
+  isSafeCommand: () => null,
+}));
+
+vi.mock("@lib/format-command.js", () => ({
+  __esModule: true,
+  formatCommandForDisplay: (c: Array<string>) => c.join(" "),
+}));
+
+// Silence debug logging from agent‑loop so test output remains clean.
+vi.mock("../src/utils/agent/log.js", () => ({
+  __esModule: true,
+  log: () => {},
+  isLoggingEnabled: () => false,
+}));
+
+import { AgentLoop } from "../src/utils/agent/agent-loop.js";
+
+describe("AgentLoop – OpenAI rate limit errors", () => {
+  it("surfaces a user‑friendly system message instead of throwing on RateLimitError (TDD – expected to fail)", async () => {
+    // Arrange fake OpenAI: every call fails with a rate‑limit error.
+    const rateLimitErrMsg =
+      "Rate limit reached: Limit 20, Used 20, Requested 1. Please try again.";
+
+    openAiState.createSpy = vi.fn(async () => {
+      // Simulate the SDK throwing before any streaming begins.
+      // In real life this happens when the HTTP response status is 429.
+      const err: any = new Error(rateLimitErrMsg);
+      err.code = "rate_limit_exceeded";
+      throw err;
+    });
+
+    const received: Array<any> = [];
+
+    const agent = new AgentLoop({
+      model: "any",
+      instructions: "",
+      approvalPolicy: { mode: "auto" } as any,
+      onItem: (i) => received.push(i),
+      onLoading: () => {},
+      getCommandConfirmation: async () => ({ review: "yes" } as any),
+      onLastResponseId: () => {},
+    });
+
+    const userMsg = [
+      {
+        type: "message",
+        role: "user",
+        content: [{ type: "input_text", text: "hello" }],
+      },
+    ];
+
+    // The desired behaviour (not yet implemented): AgentLoop should catch the
+    // rate‑limit error, emit a helpful system message and resolve without
+    // throwing so callers can let the user retry.
+    await expect(agent.run(userMsg as any)).resolves.not.toThrow();
+
+    // Let flush timers run.
+    await new Promise((r) => setTimeout(r, 20));
+
+    const sysMsg = received.find(
+      (i) =>
+        i.role === "system" &&
+        typeof i.content?.[0]?.text === "string" &&
+        i.content[0].text.includes("Rate limit"),
+    );
+
+    expect(sysMsg).toBeTruthy();
+  });
+});
--- a/codex-cli/tests/agent-server-retry.test.ts
+++ b/codex-cli/tests/agent-server-retry.test.ts
@@ -0,0 +1,166 @@
+import { describe, it, expect, vi } from "vitest";
+
+// Utility: fake OpenAI SDK that can be instructed to fail with 5xx a set
+// number of times before succeeding.
+
+function createStream(events: Array<any>) {
+  // Async generator that replays the scripted events in their given order.
+  async function* replay() {
+    yield* events;
+  }
+  return {
+    controller: { abort: vi.fn() },
+    [Symbol.asyncIterator]: replay,
+  };
+}
+
+const openAiState: { createSpy?: ReturnType<typeof vi.fn> } = {};
+
+vi.mock("openai", () => {
+  class FakeOpenAI {
+    public responses = {
+      create: (...args: Array<any>) => openAiState.createSpy!(...args),
+    };
+  }
+
+  class APIConnectionTimeoutError extends Error {}
+
+  return {
+    __esModule: true,
+    default: FakeOpenAI,
+    APIConnectionTimeoutError,
+  };
+});
+
+vi.mock("@lib/approvals.js", () => ({
+  __esModule: true,
+  alwaysApprovedCommands: new Set<string>(),
+  canAutoApprove: () => ({ type: "auto-approve", runInSandbox: false } as any),
+  isSafeCommand: () => null,
+}));
+
+vi.mock("@lib/format-command.js", () => ({
+  __esModule: true,
+  formatCommandForDisplay: (c: Array<string>) => c.join(" "),
+}));
+
+vi.mock("../src/utils/agent/log.js", () => ({
+  __esModule: true,
+  log: () => {},
+  isLoggingEnabled: () => false,
+}));
+
+import { AgentLoop } from "../src/utils/agent/agent-loop.js";
+
+describe("AgentLoop – automatic retry on 5xx errors", () => {
+  it("retries up to 3 times then succeeds", async () => {
+    // Fail twice with 500 then succeed.
+    let call = 0;
+    openAiState.createSpy = vi.fn(async () => {
+      call += 1;
+      if (call <= 2) {
+        const err: any = new Error("Internal Server Error");
+        err.status = 500;
+        throw err;
+      }
+      return createStream([
+        {
+          type: "response.output_item.done",
+          item: {
+            type: "message",
+            role: "assistant",
+            id: "m1",
+            content: [{ type: "text", text: "ok" }],
+          },
+        },
+        {
+          type: "response.completed",
+          response: {
+            id: "r1",
+            status: "completed",
+            output: [
+              {
+                type: "message",
+                role: "assistant",
+                id: "m1",
+                content: [{ type: "text", text: "ok" }],
+              },
+            ],
+          },
+        },
+      ]);
+    });
+
+    const received: Array<any> = [];
+
+    const agent = new AgentLoop({
+      model: "any",
+      instructions: "",
+      approvalPolicy: { mode: "auto" } as any,
+      onItem: (i) => received.push(i),
+      onLoading: () => {},
+      getCommandConfirmation: async () => ({ review: "yes" } as any),
+      onLastResponseId: () => {},
+    });
+
+    const userMsg = [
+      {
+        type: "message",
+        role: "user",
+        content: [{ type: "input_text", text: "hi" }],
+      },
+    ];
+
+    await agent.run(userMsg as any);
+
+    await new Promise((r) => setTimeout(r, 20));
+
+    expect(openAiState.createSpy).toHaveBeenCalledTimes(3);
+
+    const assistant = received.find((i) => i.role === "assistant");
+    expect(assistant?.content?.[0]?.text).toBe("ok");
+  });
+
+  it("fails after 3 attempts and surfaces system message", async () => {
+    openAiState.createSpy = vi.fn(async () => {
+      const err: any = new Error("Internal Server Error");
+      err.status = 502; // any 5xx
+      throw err;
+    });
+
+    const received: Array<any> = [];
+
+    const agent = new AgentLoop({
+      model: "any",
+      instructions: "",
+      approvalPolicy: { mode: "auto" } as any,
+      onItem: (i) => received.push(i),
+      onLoading: () => {},
+      getCommandConfirmation: async () => ({ review: "yes" } as any),
+      onLastResponseId: () => {},
+    });
+
+    const userMsg = [
+      {
+        type: "message",
+        role: "user",
+        content: [{ type: "input_text", text: "hello" }],
+      },
+    ];
+
+    await expect(agent.run(userMsg as any)).resolves.not.toThrow();
+
+    await new Promise((r) => setTimeout(r, 20));
+
+    expect(openAiState.createSpy).toHaveBeenCalledTimes(3);
+
+    const sysMsg = received.find(
+      (i) =>
+        i.role === "system" &&
+        typeof i.content?.[0]?.text === "string" &&
+        i.content[0].text.includes("Network error"),
+    );
+
+    expect(sysMsg).toBeTruthy();
+  });
+});
--- a/codex-cli/tests/agent-terminate.test.ts
+++ b/codex-cli/tests/agent-terminate.test.ts
@@ -0,0 +1,171 @@
+import { describe, it, expect, vi } from "vitest";
+
+// --- OpenAI stream mock ----------------------------------------------------
+
+class FakeStream {
+  // Abort spy exposed under `controller`.
+  public controller = { abort: vi.fn() };
+
+  // Yields exactly two events: a finished `shell` function call, then the
+  // `response.completed` event for the same turn.
+  async *[Symbol.asyncIterator]() {
+    // Builds a fresh copy of the call on each use so the two events never
+    // share an object.
+    const shellCall = () => ({
+      type: "function_call",
+      id: "call‑terminate‑1",
+      name: "shell",
+      arguments: JSON.stringify({ cmd: ["sleep", "5"] }),
+    });
+
+    // Ask for the shell call immediately; the test then verifies that no
+    // function_call_output is surfaced after terminate().
+    yield {
+      type: "response.output_item.done",
+      item: shellCall(),
+    } as any;
+
+    // Complete the turn, echoing the same function call in `output`.
+    yield {
+      type: "response.completed",
+      response: {
+        id: "resp‑terminate‑1",
+        status: "completed",
+        output: [shellCall()],
+      },
+    } as any;
+  }
+}
+
+vi.mock("openai", () => {
+  class FakeOpenAI {
+    public responses = {
+      create: async () => new FakeStream(),
+    };
+  }
+  class APIConnectionTimeoutError extends Error {}
+  return { __esModule: true, default: FakeOpenAI, APIConnectionTimeoutError };
+});
+
+// --- Helpers referenced by handle‑exec‑command -----------------------------
+
+vi.mock("@lib/approvals.js", () => {
+  return {
+    __esModule: true,
+    alwaysApprovedCommands: new Set<string>(),
+    canAutoApprove: () =>
+      ({ type: "auto-approve", runInSandbox: false } as any),
+    isSafeCommand: () => null,
+  };
+});
+
+vi.mock("@lib/format-command.js", () => {
+  return {
+    __esModule: true,
+    formatCommandForDisplay: (cmd: Array<string>) => cmd.join(" "),
+  };
+});
+
+// Stub logger to avoid filesystem side‑effects
+vi.mock("../src/utils/agent/log.js", () => ({
+  __esModule: true,
+  log: () => {},
+  isLoggingEnabled: () => false,
+}));
+
+// After dependency mocks we can import the modules under test.
+
+import { AgentLoop } from "../src/utils/agent/agent-loop.js";
+import * as handleExec from "../src/utils/agent/handle-exec-command.js";
+
+describe("Agent terminate (hard cancel)", () => {
+  it("suppresses function_call_output and stops processing once terminate() is invoked", async () => {
+    // Simulate a long‑running exec that would normally resolve with output.
+    vi.spyOn(handleExec, "handleExecCommand").mockImplementation(
+      async (_args, _config, _policy, _getConf, abortSignal) => {
+        // Wait until the abort signal is fired or 2s (whichever comes first).
+        await new Promise<void>((resolve) => {
+          if (abortSignal?.aborted) {
+            return resolve();
+          }
+          const timer = setTimeout(resolve, 2000);
+          abortSignal?.addEventListener("abort", () => {
+            clearTimeout(timer);
+            resolve();
+          });
+        });
+
+        return { outputText: "should‑not‑happen", metadata: {} } as any;
+      },
+    );
+
+    const received: Array<any> = [];
+
+    const agent = new AgentLoop({
+      model: "any",
+      instructions: "",
+      config: { model: "any", instructions: "" },
+      approvalPolicy: { mode: "auto" } as any,
+      onItem: (item) => received.push(item),
+      onLoading: () => {},
+      getCommandConfirmation: async () => ({ review: "yes" } as any),
+      onLastResponseId: () => {},
+    });
+
+    const userMsg = [
+      {
+        type: "message",
+        role: "user",
+        content: [{ type: "input_text", text: "run long cmd" }],
+      },
+    ];
+
+    // Start agent loop but don't wait for completion.
+    agent.run(userMsg as any);
+
+    // Give it a brief moment to start and process the function_call.
+    await new Promise((r) => setTimeout(r, 10));
+
+    agent.terminate();
+
+    // Allow promises to settle.
+    await new Promise((r) => setTimeout(r, 50));
+
+    const hasOutput = received.some((i) => i.type === "function_call_output");
+    expect(hasOutput).toBe(false);
+  });
+
+  it("rejects further run() calls after terminate()", async () => {
+    const agent = new AgentLoop({
+      model: "any",
+      instructions: "",
+      config: { model: "any", instructions: "" },
+      approvalPolicy: { mode: "auto" } as any,
+      onItem: () => {},
+      onLoading: () => {},
+      getCommandConfirmation: async () => ({ review: "yes" } as any),
+      onLastResponseId: () => {},
+    });
+
+    agent.terminate();
+
+    const dummyMsg = [
+      {
+        type: "message",
+        role: "user",
+        content: [{ type: "input_text", text: "noop" }],
+      },
+    ];
+
+    let threw = false;
+    try {
+      // We expect this to fail fast – either by throwing synchronously or by
+      // returning a rejected promise.
+      await agent.run(dummyMsg as any);
+    } catch {
+      threw = true;
+    }
+
+    expect(threw).toBe(true);
+  });
+});
--- a/codex-cli/tests/agent-thinking-time.test.ts
+++ b/codex-cli/tests/agent-thinking-time.test.ts
@@ -0,0 +1,173 @@
+// ---------------------------------------------------------------------------
+// Regression test for the "thinking time" counter. Today the implementation
+// keeps a *single* start‑time across many requests which means that every
+// subsequent command will show an ever‑increasing number such as
+// "thinking for 4409s", "thinking for 4424s", … even though the individual
+// turn only took a couple of milliseconds. Each request should start its own
+// independent timer.
+//
+// We mark the spec with `.fails()` so that the overall suite remains green
+// until the underlying bug is fixed. When the implementation is corrected the
+// expectations below will turn green – Vitest will then error and remind us to
+// remove the `.fails` flag.
+// ---------------------------------------------------------------------------
+
+import { AgentLoop } from "../src/utils/agent/agent-loop.js";
+import { describe, it, expect, vi } from "vitest";
+
+// --- OpenAI mock -----------------------------------------------------------
+
+/**
+ * Fake stream that yields a single `response.completed` after a configurable
+ * delay. This allows us to simulate different thinking times for successive
+ * requests while using Vitest's fake timers.
+ */
+class FakeStream {
+  public controller = { abort: vi.fn() };
+
+  // `delay` is the pause, in milliseconds, before the single event is emitted.
+  constructor(private delay: number) {}
+
+  async *[Symbol.asyncIterator]() {
+    const pauseMs = this.delay;
+    if (pauseMs > 0) {
+      // Sleep for the configured time; the suite's fake timers skip the wait.
+      await new Promise((resolve) => setTimeout(resolve, pauseMs));
+    }
+
+    // The reply carried by the completion event: one assistant message.
+    const reply = {
+      type: "message",
+      role: "assistant",
+      id: "m1",
+      content: [{ type: "text", text: "done" }],
+    };
+
+    yield {
+      type: "response.completed",
+      response: {
+        id: `resp-${Date.now()}`,
+        status: "completed",
+        output: [reply],
+      },
+    } as any;
+  }
+}
+
+/**
+ * Fake OpenAI client that returns a slower stream for the *first* call and a
+ * faster one for the second so we can verify that per‑task timers reset while
+ * the global counter accumulates.
+ */
+vi.mock("openai", () => {
+  let callCount = 0;
+  class FakeOpenAI {
+    public responses = {
+      create: async () => {
+        callCount += 1;
+        return new FakeStream(callCount === 1 ? 10_000 : 500); // 10s vs 0.5s
+      },
+    };
+  }
+  class APIConnectionTimeoutError extends Error {}
+  return { __esModule: true, default: FakeOpenAI, APIConnectionTimeoutError };
+});
+
+// Stub helpers referenced indirectly so we do not pull in real FS/network
+vi.mock("@lib/approvals.js", () => ({
+  __esModule: true,
+  isSafeCommand: () => null,
+}));
+
+vi.mock("@lib/format-command.js", () => ({
+  __esModule: true,
+  formatCommandForDisplay: (c: Array<string>) => c.join(" "),
+}));
+
+// Suppress file‑system logging in tests.
+vi.mock("../src/utils/agent/log.js", () => ({
+  __esModule: true,
+  log: () => {},
+  isLoggingEnabled: () => false,
+}));
+
+describe("thinking time counter", () => {
+  // Use fake timers for *all* tests in this suite
+  vi.useFakeTimers();
+
+  // Re‐use this array to collect all onItem callbacks
+  let items: Array<any>;
+
+  // Helper that runs two agent turns (10s + 0.5s) and populates `items`
+  async function runScenario() {
+    items = [];
+
+    // The same user prompt is submitted on both turns.
+    const prompt = {
+      type: "message",
+      role: "user",
+      content: [{ type: "input_text", text: "do it" }],
+    } as any;
+
+    const agent = new AgentLoop({
+      config: {} as any,
+      model: "any",
+      instructions: "",
+      approvalPolicy: { mode: "auto" } as any,
+      onItem: (i) => items.push(i),
+      onLoading: () => {},
+      getCommandConfirmation: async () => ({ review: "yes" } as any),
+      onLastResponseId: () => {},
+    });
+
+    // Clock advance per turn: the simulated thinking time (10s, then 0.5s)
+    // plus a margin that lets pending callbacks flush.
+    for (const advanceMs of [11_000, 1_000]) {
+      agent.run([prompt]); // not awaited; the fake clock is advanced manually
+      await vi.advanceTimersByTimeAsync(advanceMs);
+    }
+  }
+
+  // TODO: known failure, marked `it.fails`; drop `.fails` once the fix lands.
+  it.fails("reports correct per-task thinking time per command", async () => {
+    await runScenario();
+
+    const perTaskMsgs = items.filter(
+      (i) =>
+        i.role === "system" &&
+        i.content?.[0]?.text?.startsWith("🤔  Thinking time:"),
+    );
+
+    expect(perTaskMsgs.length).toBe(2);
+
+    const perTaskDurations = perTaskMsgs.map((m) => {
+      const match = m.content[0].text.match(/Thinking time: (\d+) s/);
+      return match ? parseInt(match[1]!, 10) : NaN;
+    });
+
+    // First run ~10s, second run ~0.5s
+    expect(perTaskDurations[0]).toBeGreaterThanOrEqual(9);
+    expect(perTaskDurations[1]).toBeLessThan(3);
+  });
+
+  // TODO: known failure, marked `it.fails`; drop `.fails` once the fix lands.
+  it.fails("reports correct global thinking time accumulation", async () => {
+    await runScenario();
+    // Keep only the system messages that report the running total.
+    const globalMsgs = items.filter(
+      (i) =>
+        i.role === "system" &&
+        i.content?.[0]?.text?.startsWith("⏱  Total thinking time:"),
+    );
+
+    expect(globalMsgs.length).toBe(2);
+    // Pull the whole-second figure out of each message (NaN when absent).
+    const globalDurations = globalMsgs.map((m) => {
+      const match = m.content[0].text.match(/Total thinking time: (\d+) s/);
+      return match ? parseInt(match[1]!, 10) : NaN;
+    });
+
+    // Total after second run should exceed total after first
+    expect(globalDurations[1]!).toBeGreaterThan(globalDurations[0]!);
+  });
+});
--- a/codex-cli/tests/api-key.test.ts
+++ b/codex-cli/tests/api-key.test.ts
@@ -0,0 +1,35 @@
+import { describe, it, expect, beforeEach, afterEach } from "vitest";
+
+// We import the module *lazily* inside each test so that we can control the
+// OPENAI_API_KEY env var independently per test case. Node's module cache
+// would otherwise capture the value present during the first import.
+
+const ORIGINAL_ENV_KEY = process.env["OPENAI_API_KEY"];
+
+beforeEach(() => {
+  delete process.env["OPENAI_API_KEY"];
+});
+
+afterEach(() => {
+  if (ORIGINAL_ENV_KEY !== undefined) {
+    process.env["OPENAI_API_KEY"] = ORIGINAL_ENV_KEY;
+  } else {
+    delete process.env["OPENAI_API_KEY"];
+  }
+});
+
+describe("config.setApiKey", () => {
+  it("overrides the exported OPENAI_API_KEY at runtime", async () => {
+    const { setApiKey, OPENAI_API_KEY } = await import(
+      "../src/utils/config.js"
+    );
+
+    expect(OPENAI_API_KEY).toBe("");
+
+    setApiKey("my‑key");
+
+    const { OPENAI_API_KEY: liveRef } = await import("../src/utils/config.js");
+
+    expect(liveRef).toBe("my‑key");
+  });
+});
--- a/codex-cli/tests/apply-patch.test.ts
+++ b/codex-cli/tests/apply-patch.test.ts
@@ -0,0 +1,318 @@
+import {
+  ActionType,
+  apply_commit,
+  assemble_changes,
+  DiffError,
+  identify_files_added,
+  identify_files_needed,
+  load_files,
+  patch_to_commit,
+  process_patch,
+  text_to_patch,
+} from "../src/utils/agent/apply-patch.js";
+import { test, expect } from "vitest";
+
+/**
+ * Builds a tiny in-memory stand-in for the file callbacks that the patch
+ * helpers take.  Besides the three callbacks it exposes the live `files` map
+ * plus a log of everything written (`writes`) and removed (`removals`).
+ */
+function createInMemoryFS(initialFiles: Record<string, string>) {
+  // Copy so the caller's object is never mutated.
+  const files: Record<string, string> = { ...initialFiles };
+  const writes: Record<string, string> = {};
+  const removals: Array<string> = [];
+
+  // Returns the stored content or throws when the path is unknown.
+  function openFn(p: string): string {
+    const contents = files[p];
+    if (typeof contents !== "string") {
+      throw new Error(`File not found: ${p}`);
+    }
+    return contents;
+  }
+
+  // Stores the content and records the write.
+  function writeFn(p: string, content: string): void {
+    writes[p] = content;
+    files[p] = content;
+  }
+
+  // Drops the file and records the removal.
+  function removeFn(p: string): void {
+    removals.push(p);
+    delete files[p];
+  }
+
+  return { openFn, writeFn, removeFn, writes, removals, files };
+}
+
+test("process_patch - update file", () => {
+  // The replacement line carries the "+" add marker.  Without it the line has
+  // no prefix at all and cannot produce the "hello world" expected below.
+  const patch = `*** Begin Patch
+*** Update File: a.txt
+@@
+-hello
++hello world
+*** End Patch`;
+
+  const fs = createInMemoryFS({ "a.txt": "hello" });
+
+  const result = process_patch(patch, fs.openFn, fs.writeFn, fs.removeFn);
+
+  expect(result).toBe("Done!");
+  expect(fs.writes).toEqual({ "a.txt": "hello world" });
+  expect(fs.removals).toEqual([]);
+});
+
+test("process_patch - add file", () => {
+  // Content lines of an added file are prefixed with "+", matching the add
+  // marker used by the inline fixture of the "omitted space" test.
+  const patch = `*** Begin Patch
+*** Add File: b.txt
++new content
+*** End Patch`;
+
+  const fs = createInMemoryFS({});
+
+  process_patch(patch, fs.openFn, fs.writeFn, fs.removeFn);
+
+  expect(fs.writes).toEqual({ "b.txt": "new content" });
+  expect(fs.removals).toEqual([]);
+});
+
+test("process_patch - delete file", () => {
+  const memFs = createInMemoryFS({ "c.txt": "to be removed" });
+  const deletePatch = `*** Begin Patch
+*** Delete File: c.txt
+*** End Patch`;
+
+  process_patch(deletePatch, memFs.openFn, memFs.writeFn, memFs.removeFn);
+
+  // A pure delete writes nothing and removes exactly the named file.
+  const { writes, removals } = memFs;
+  expect(writes).toEqual({});
+  expect(removals).toEqual(["c.txt"]);
+});
+
+test("identify_files_needed & identify_files_added", () => {
+  const patchText = `*** Begin Patch
+*** Update File: a.txt
+*** Delete File: b.txt
+*** Add File: c.txt
+*** End Patch`;
+
+  // Updated and deleted files are "needed"; order is irrelevant, so sort.
+  const needed = identify_files_needed(patchText).sort();
+  const expectedNeeded = ["a.txt", "b.txt"].sort();
+  expect(needed).toEqual(expectedNeeded);
+
+  // Only the newly created file counts as "added".
+  const added = identify_files_added(patchText);
+  expect(added).toEqual(["c.txt"]);
+});
+
+test("process_patch - update file with multiple chunks", () => {
+  const original = "line1\nline2\nline3\nline4";
+  // Context lines start with a space, removals with "-", additions with "+".
+  // The two added lines need their "+" marker to yield `expected` below.
+  const patch = `*** Begin Patch
+*** Update File: multi.txt
+@@
+ line1
+-line2
++line2 updated
+ line3
++inserted line
+ line4
+*** End Patch`;
+
+  const fs = createInMemoryFS({ "multi.txt": original });
+  process_patch(patch, fs.openFn, fs.writeFn, fs.removeFn);
+
+  const expected = "line1\nline2 updated\nline3\ninserted line\nline4";
+  expect(fs.writes).toEqual({ "multi.txt": expected });
+  expect(fs.removals).toEqual([]);
+});
+
+test("process_patch - move file (rename)", () => {
+  // "-old" removes the existing line and "+new" adds its replacement; the
+  // "+" marker is required for the new content to be written.
+  const patch = `*** Begin Patch
+*** Update File: old.txt
+*** Move to: new.txt
+@@
+-old
++new
+*** End Patch`;
+
+  const fs = createInMemoryFS({ "old.txt": "old" });
+  process_patch(patch, fs.openFn, fs.writeFn, fs.removeFn);
+
+  // The updated content lands at the new path and the old path is removed.
+  expect(fs.writes).toEqual({ "new.txt": "new" });
+  expect(fs.removals).toEqual(["old.txt"]);
+});
+
+test("process_patch - combined add, update, delete", () => {
+  // Added-file content and replacement lines both carry the "+" add marker.
+  const patch = `*** Begin Patch
+*** Add File: added.txt
++added contents
+*** Update File: upd.txt
+@@
+-old value
++new value
+*** Delete File: del.txt
+*** End Patch`;
+
+  const fs = createInMemoryFS({
+    "upd.txt": "old value",
+    "del.txt": "delete me",
+  });
+
+  process_patch(patch, fs.openFn, fs.writeFn, fs.removeFn);
+
+  expect(fs.writes).toEqual({
+    "added.txt": "added contents",
+    "upd.txt": "new value",
+  });
+  expect(fs.removals).toEqual(["del.txt"]);
+});
+
+test("process_patch - readme edit", () => {
+  const original = `
+#### Fix an issue
+
+\`\`\`sh
+# First, copy an error
+# Then, start codex with interactive mode
+codex
+
+# Or you can pass in via command line argument
+codex "Fix this issue: $(pbpaste)"
+
+# Or even as a task (it should use your current repo and branch)
+codex -t "Fix this issue: $(pbpaste)"
+\`\`\`
+`;
+  // Two context lines followed by two additions: an empty line ("+" alone)
+  // and "hello".  Both additions need the "+" marker so that `expected` gains
+  // a blank line plus "hello" while keeping its trailing newline.
+  const patch = `*** Begin Patch
+*** Update File: README.md
+@@
+  codex -t "Fix this issue: $(pbpaste)"
+  \`\`\`
++
++hello
+*** End Patch`;
+  const expected = `
+#### Fix an issue
+
+\`\`\`sh
+# First, copy an error
+# Then, start codex with interactive mode
+codex
+
+# Or you can pass in via command line argument
+codex "Fix this issue: $(pbpaste)"
+
+# Or even as a task (it should use your current repo and branch)
+codex -t "Fix this issue: $(pbpaste)"
+\`\`\`
+
+hello
+`;
+
+  const fs = createInMemoryFS({ "README.md": original });
+  process_patch(patch, fs.openFn, fs.writeFn, fs.removeFn);
+
+  expect(fs.writes).toEqual({ "README.md": expected });
+});
+
+test("process_patch - invalid patch throws DiffError", () => {
+  // The patch targets a file that the (empty) in-memory FS does not contain.
+  const memFs = createInMemoryFS({});
+  const patch = `*** Begin Patch
+*** Update File: missing.txt
+@@
+something
+*** End Patch`;
+
+  const applyPatch = () =>
+    process_patch(patch, memFs.openFn, memFs.writeFn, memFs.removeFn);
+
+  expect(applyPatch).toThrow(DiffError);
+});
+
+test("process_patch - tolerates omitted space for keep line", () => {
+  // "line3" has no leading space in the patch, yet must survive as a kept line.
+  const patch = `*** Begin Patch\n*** Update File: foo.txt\n@@\n line1\n-line2\n+some new line2\nline3\n*** End Patch`;
+  const memFs = createInMemoryFS({ "foo.txt": "line1\nline2\nline3" });
+
+  process_patch(patch, memFs.openFn, memFs.writeFn, memFs.removeFn);
+
+  const { files } = memFs;
+  expect(files["foo.txt"]).toBe("line1\nsome new line2\nline3");
+});
+
+test("assemble_changes correctly detects add, update and delete", () => {
+  const before = {
+    "a.txt": "old",
+    "b.txt": "keep",
+    "c.txt": "remove",
+  };
+  const after = {
+    "a.txt": "new", // changed content -> update
+    "b.txt": "keep", // identical content -> must not show up
+    "c.txt": undefined as unknown as string, // no content any more -> delete
+    "d.txt": "created", // not present before -> add
+  };
+
+  const { changes } = assemble_changes(before, after);
+
+  expect(changes["a.txt"]).toEqual({
+    type: ActionType.UPDATE,
+    old_content: "old",
+    new_content: "new",
+  });
+  expect(changes["c.txt"]).toEqual({
+    type: ActionType.DELETE,
+    old_content: "remove",
+  });
+  expect(changes["d.txt"]).toEqual({
+    type: ActionType.ADD,
+    new_content: "created",
+  });
+
+  // The untouched file is absent from the resulting change set.
+  expect(changes).not.toHaveProperty("b.txt");
+});
+
+test("text_to_patch + patch_to_commit handle update and add", () => {
+  const originalFiles = {
+    "a.txt": "old line",
+  };
+
+  // Both the replacement line and the added file's content carry the "+"
+  // add marker; the expectations below depend on them being additions.
+  const patch = `*** Begin Patch
+*** Update File: a.txt
+@@
+-old line
++new line
+*** Add File: b.txt
++content new
+*** End Patch`;
+
+  // text_to_patch returns a tuple; only its first element is used here.
+  const [parsedPatch] = text_to_patch(patch, originalFiles);
+  const commit = patch_to_commit(parsedPatch, originalFiles).changes;
+
+  expect(commit["a.txt"]).toEqual({
+    type: ActionType.UPDATE,
+    old_content: "old line",
+    new_content: "new line",
+  });
+  expect(commit["b.txt"]).toEqual({
+    type: ActionType.ADD,
+    new_content: "content new",
+  });
+});
+
+test("load_files throws DiffError when file is missing", () => {
+  const memFs = createInMemoryFS({ "exists.txt": "hi" });
+
+  // "missing.txt" was never seeded, so opening it fails.
+  const loadBoth = () =>
+    load_files(["exists.txt", "missing.txt"], memFs.openFn);
+
+  expect(loadBoth).toThrow(DiffError);
+});
+
+test("apply_commit correctly performs move / rename operations", () => {
+  const memFs = createInMemoryFS({});
+
+  // A single update that also relocates the file via `move_path`.
+  const renameChange = {
+    type: ActionType.UPDATE,
+    old_content: "old",
+    new_content: "new",
+    move_path: "new.txt",
+  };
+  const commit = { changes: { "old.txt": renameChange } };
+
+  apply_commit(commit, memFs.writeFn, memFs.removeFn);
+
+  // New content is written to the destination; the source path is removed.
+  expect(memFs.writes).toEqual({ "new.txt": "new" });
+  expect(memFs.removals).toEqual(["old.txt"]);
+});
--- a/codex-cli/tests/cancel-exec.test.ts
+++ b/codex-cli/tests/cancel-exec.test.ts
@@ -0,0 +1,46 @@
+import { exec as rawExec } from "../src/utils/agent/sandbox/raw-exec.js";
+import { describe, it, expect } from "vitest";
+
+// Import the low‑level exec implementation so we can verify that AbortSignal
+// correctly terminates a spawned process. We bypass the higher‑level wrappers
+// to keep the test focused and fast.
+
+describe("exec cancellation", () => {
+  it("kills the child process when the abort signal is triggered", async () => {
+    // Child that would print "late" only after five seconds.
+    const slowChild = [
+      "node",
+      "-e",
+      "setTimeout(() => console.log('late'), 5000);",
+    ];
+    const controller = new AbortController();
+
+    const startedAt = Date.now();
+    const pending = rawExec(slowChild, {}, [], controller.signal);
+
+    // Cancel right after the call, long before the child's timer fires.
+    controller.abort();
+
+    const { exitCode, stdout } = await pending;
+    const elapsedMs = Date.now() - startedAt;
+
+    // Generous 2s budget, still far below the 5s the child wanted to run.
+    expect(elapsedMs).toBeLessThan(2000);
+
+    // Any non-zero exit code counts as abnormal termination.
+    expect(exitCode).not.toBe(0);
+
+    // The output must not contain the word the child would have printed.
+    expect(stdout).not.toContain("late");
+  });
+
+  it("allows the process to finish when not aborted", async () => {
+    // The signal is passed along but never triggered.
+    const { signal } = new AbortController();
+    const quickChild = ["node", "-e", "console.log('finished')"];
+
+    const outcome = await rawExec(quickChild, {}, [], signal);
+
+    expect(outcome.exitCode).toBe(0);
+    expect(outcome.stdout.trim()).toBe("finished");
+  });
+});
--- a/codex-cli/tests/config.test.tsx
+++ b/codex-cli/tests/config.test.tsx
@@ -0,0 +1,106 @@
+import type * as fsType from "fs";
+
+import { loadConfig, saveConfig } from "../src/utils/config.js"; // parent import first
+import { tmpdir } from "os";
+import { join } from "path";
+import { test, expect, beforeEach, afterEach, vi } from "vitest";
+
+// In‑memory FS store
+let memfs: Record<string, string> = {};
+
+// Replace the "fs" functions overridden below with versions backed by `memfs`;
+// everything else is passed through from the real module.
+vi.mock("fs", async () => {
+  const actualFs = (await vi.importActual("fs")) as typeof fsType;
+
+  // A path "exists" when the store holds a value for it.
+  const existsSync = (path: string) => memfs[path] !== undefined;
+
+  const readFileSync = (path: string) => {
+    const stored = memfs[path];
+    if (stored === undefined) {
+      throw new Error("ENOENT");
+    }
+    return stored;
+  };
+
+  const writeFileSync = (path: string, data: string) => {
+    memfs[path] = data;
+  };
+
+  // Directories are implicit in the flat store, so there is nothing to do.
+  const mkdirSync = () => {};
+
+  // Remove the path itself and every key stored underneath it.
+  const rmSync = (path: string) => {
+    const prefix = path.endsWith("/") ? path : path + "/";
+    Object.keys(memfs)
+      .filter((key) => key === path || key.startsWith(prefix))
+      .forEach((key) => {
+        delete memfs[key];
+      });
+  };
+
+  return {
+    ...actualFs,
+    existsSync,
+    readFileSync,
+    writeFileSync,
+    mkdirSync,
+    rmSync,
+  };
+});
+
+// Paths handed to loadConfig/saveConfig; recomputed before every test.
+let testDir: string;
+let testConfigPath: string;
+let testInstructionsPath: string;
+
+beforeEach(() => {
+  // Begin with an empty store ...
+  memfs = {};
+  // ... and place both files directly under the OS temp directory.
+  testDir = tmpdir();
+  testInstructionsPath = join(testDir, "instructions.md");
+  testConfigPath = join(testDir, "config.json");
+});
+
+afterEach(() => {
+  // Drop whatever the test stored.
+  memfs = {};
+});
+
+test("loads default config if files don't exist", () => {
+  // Nothing is seeded in the store, so the defaults are expected back.
+  const expectedDefaults = {
+    model: "o4-mini",
+    instructions: "",
+  };
+
+  const loaded = loadConfig(testConfigPath, testInstructionsPath, {
+    disableProjectDoc: true,
+  });
+
+  expect(loaded).toEqual(expectedDefaults);
+});
+
+test("saves and loads config correctly", () => {
+  const saved = {
+    model: "test-model",
+    instructions: "test instructions",
+  };
+  saveConfig(saved, testConfigPath, testInstructionsPath);
+
+  // Saving must have produced one entry per path in the in-memory store.
+  const storedConfig = memfs[testConfigPath];
+  const storedInstructions = memfs[testInstructionsPath];
+  expect(storedConfig).toContain(`"model": "test-model"`);
+  expect(storedInstructions).toBe("test instructions");
+
+  // Reading it back yields the same object that was saved.
+  const reloaded = loadConfig(testConfigPath, testInstructionsPath, {
+    disableProjectDoc: true,
+  });
+  expect(reloaded).toEqual(saved);
+});
+
+test("loads user instructions + project doc when codex.md is present", () => {
+  const userInstr = "here are user instructions";
+  const projectDoc = "# Project Title\n\nSome project‑specific doc";
+  const codexPath = join(testDir, "codex.md");
+
+  // Seed the store: stored config, user instructions and a codex.md in testDir.
+  memfs[testConfigPath] = JSON.stringify({ model: "mymodel" }, null, 2);
+  memfs[testInstructionsPath] = userInstr;
+  memfs[codexPath] = projectDoc;
+
+  // Project docs stay enabled here; testDir is passed as the cwd.
+  const cfg = loadConfig(testConfigPath, testInstructionsPath, {
+    cwd: testDir,
+  });
+
+  // Instructions are the user text, a separator, then the project doc.
+  const expectedInstructions =
+    userInstr + "\n\n--- project-doc ---\n\n" + projectDoc;
+  expect(cfg.model).toBe("mymodel");
+  expect(cfg.instructions).toBe(expectedInstructions);
+});
--- a/codex-cli/tests/dummy.test.ts
+++ b/codex-cli/tests/dummy.test.ts
@@ -0,0 +1,4 @@
+import { test, expect } from "vitest";
+test("dummy", () => {
+  // Trivial always-true assertion.
+  const one = 1;
+  expect(one).toBe(1);
+});
--- a/codex-cli/tests/external-editor.test.ts
+++ b/codex-cli/tests/external-editor.test.ts
@@ -0,0 +1,56 @@
+import TextBuffer from "../src/lib/text-buffer";
+import { describe, it, expect, vi } from "vitest";
+
+/* -------------------------------------------------------------------------
+ *  External $EDITOR integration – behavioural contract
+ * ---------------------------------------------------------------------- */
+
+describe("TextBuffer – open in external $EDITOR", () => {
+  it("replaces the buffer with the contents saved by the editor", async () => {
+    // Text the buffer starts with.
+    const startingLines = [
+      "// TODO: draft release notes",
+      "",
+      "* Fixed memory leak in xyz module.",
+    ];
+    const buf = new TextBuffer(startingLines.join("\n"));
+
+    // Stand-in for spawnSync: instead of launching an editor it appends one
+    // line to the file named by the last argument and reports success.
+    const spawnStub = vi
+      .spyOn(require("node:child_process"), "spawnSync")
+      .mockImplementation((_cmd, args: any) => {
+        const argv = args as Array<string>;
+        const target = argv[argv.length - 1];
+        require("node:fs").appendFileSync(
+          target,
+          "\n* Added unit tests for external editor integration.",
+        );
+        return { status: 0 } as any;
+      });
+
+    try {
+      // The stub never looks at the editor name.
+      await buf.openInExternalEditor({ editor: "nano" });
+    } finally {
+      spawnStub.mockRestore();
+    }
+
+    const expectedText = [
+      "// TODO: draft release notes",
+      "",
+      "* Fixed memory leak in xyz module.",
+      "* Added unit tests for external editor integration.",
+    ].join("\n");
+    expect(buf.getText()).toBe(expectedText);
+
+    // The cursor is expected at the very end of the imported text:
+    // row index 3 (the fourth line) and the column right after its last char.
+    const [row, col] = buf.getCursor();
+    expect(row).toBe(3);
+    expect(col).toBe(
+      "* Added unit tests for external editor integration.".length,
+    );
+  });
+});
--- a/codex-cli/tests/markdown.test.tsx
+++ b/codex-cli/tests/markdown.test.tsx
@@ -0,0 +1,16 @@
+import { renderTui } from "./ui-test-helpers.js";
+import { Markdown } from "../src/components/chat/terminal-chat-response-item.js";
+import React from "react";
+import { it, expect } from "vitest";
+
+/** Simple sanity check that the Markdown component renders bold/italic text.
+ * We strip ANSI codes, so the output should contain the raw words. */
+it("renders basic markdown", () => {
+  const element = <Markdown>**bold** _italic_</Markdown>;
+  const { lastFrameStripped } = renderTui(element);
+
+  // Both words must show up in the stripped frame.
+  const output = lastFrameStripped();
+  for (const word of ["bold", "italic"]) {
+    expect(output).toContain(word);
+  }
+});
--- a/codex-cli/tests/model-utils-network-error.test.ts
+++ b/codex-cli/tests/model-utils-network-error.test.ts
@@ -0,0 +1,70 @@
+import { describe, it, expect, vi, afterEach } from "vitest";
+
+// The model‑utils module reads OPENAI_API_KEY at import time. We therefore
+// need to tweak the env var *before* importing the module in each test and
+// make sure the module cache is cleared.
+
+// Value of the variable when this file was loaded; restored after each test.
+const ORIGINAL_ENV_KEY = process.env["OPENAI_API_KEY"];
+
+// Shared slot through which a test installs the behaviour of `models.list`.
+const openAiState: { listSpy?: ReturnType<typeof vi.fn> } = {};
+
+vi.mock("openai", () => {
+  // Minimal client: `models.list` forwards to whatever spy is installed.
+  class FakeOpenAI {
+    public models = {
+      list: (...args: Array<any>) => openAiState.listSpy!(...args),
+    };
+  }
+
+  return { __esModule: true, default: FakeOpenAI };
+});
+
+describe("model-utils – offline resilience", () => {
+  afterEach(() => {
+    // Undo the env change, forget imported modules and clear the spy slot.
+    if (ORIGINAL_ENV_KEY === undefined) {
+      delete process.env["OPENAI_API_KEY"];
+    } else {
+      process.env["OPENAI_API_KEY"] = ORIGINAL_ENV_KEY;
+    }
+    vi.resetModules();
+    openAiState.listSpy = undefined;
+  });
+
+  it("returns true when API key absent (no network available)", async () => {
+    delete process.env["OPENAI_API_KEY"];
+
+    // Reset first so the import below evaluates the module with the new env.
+    vi.resetModules();
+    const modelUtils = await import("../src/utils/model-utils.js");
+    const { isModelSupportedForResponses } = modelUtils;
+
+    const supported = await isModelSupportedForResponses("o4-mini");
+    expect(supported).toBe(true);
+  });
+
+  it("falls back gracefully when openai.models.list throws a network error", async () => {
+    process.env["OPENAI_API_KEY"] = "dummy";
+
+    // Error object carrying a `code` of "ECONNRESET".
+    const netErr: any = Object.assign(new Error("socket hang up"), {
+      code: "ECONNRESET",
+    });
+
+    // Every call to models.list rejects with that error.
+    openAiState.listSpy = vi.fn(async () => {
+      throw netErr;
+    });
+
+    vi.resetModules();
+    const modelUtils = await import("../src/utils/model-utils.js");
+    const { isModelSupportedForResponses } = modelUtils;
+
+    // The helper is expected to resolve to true even though the call failed.
+    const supported = await isModelSupportedForResponses("some-model");
+    expect(supported).toBe(true);
+  });
+});
--- a/codex-cli/tests/multiline-ctrl-enter-submit.test.tsx
+++ b/codex-cli/tests/multiline-ctrl-enter-submit.test.tsx
@@ -0,0 +1,41 @@
+// Ctrl+Enter (CSI‑u 13;5u) should submit the buffer.
+
+import { renderTui } from "./ui-test-helpers.js";
+import MultilineTextEditor from "../src/components/chat/multiline-editor.js";
+import * as React from "react";
+import { describe, it, expect, vi } from "vitest";
+
+/** Writes `text` to `stdin`, then waits for `flush` to settle. */
+async function type(
+  stdin: NodeJS.WritableStream,
+  text: string,
+  flush: () => Promise<void>,
+): Promise<void> {
+  stdin.write(text);
+  await flush();
+}
+
+describe("MultilineTextEditor – Ctrl+Enter submits", () => {
+  it("calls onSubmit when CSI 13;5u is received", async () => {
+    const onSubmit = vi.fn();
+    const editor = React.createElement(MultilineTextEditor, {
+      height: 5,
+      width: 20,
+      onSubmit,
+    });
+    const { stdin, flush, cleanup } = renderTui(editor);
+
+    await flush();
+
+    // Type some text, then send the CSI-u sequence for Ctrl+Enter
+    // (key code 13, modifier 5).
+    await type(stdin, "hello", flush);
+    await type(stdin, "\u001B[13;5u", flush);
+
+    await flush();
+
+    // Exactly one submit, carrying the typed text as its first argument.
+    expect(onSubmit).toHaveBeenCalledTimes(1);
+    const submitted = onSubmit.mock.calls[0]![0];
+    expect(submitted).toBe("hello");
+
+    cleanup();
+  });
+});
--- a/codex-cli/tests/multiline-dynamic-width.test.tsx
+++ b/codex-cli/tests/multiline-dynamic-width.test.tsx
@@ -0,0 +1,77 @@
+// These tests exercise MultilineTextEditor behaviour when the editor width is
+// *not* provided via props so that it has to derive its width from the current
+// terminal size.  We emulate a terminal‑resize by mutating
+// `process.stdout.columns` and emitting a synthetic `resize` event – the
+// `useTerminalSize` hook listens for that and causes the component to
+// re‑render.  The test then asserts that
+//   1.  The rendered line re‑wraps to the new width, *and*
+//   2.  The caret (highlighted inverse character) is still kept in view after
+//       the horizontal shrink so that editing remains possible.
+
+import { renderTui } from "./ui-test-helpers.js";
+import MultilineTextEditor from "../src/components/chat/multiline-editor.js";
+import * as React from "react";
+import { describe, it, expect } from "vitest";
+
+// Helper to synchronously type text then flush Ink's timers so that the next
+// `lastFrame()` call sees the updated UI.
+/** Sends `text` to `stdin` and awaits `flush` before returning. */
+async function type(
+  stdin: NodeJS.WritableStream,
+  text: string,
+  flush: () => Promise<void>,
+): Promise<void> {
+  stdin.write(text);
+  await flush();
+}
+
+describe("MultilineTextEditor – dynamic width", () => {
+  // NOTE(review): an earlier note described this as an expected-*failing* test,
+  // but it is registered with plain `it`, so a failure here fails the run.
+  it("keeps the caret visible when the terminal width shrinks", async () => {
+    // The test overwrites the global `process.stdout.columns`; remember the
+    // real value so it can be put back even when an assertion throws.
+    const originalColumns = process.stdout.columns;
+    let cleanup: (() => void) | undefined;
+
+    try {
+      // Fake an initial terminal width large enough that no horizontal
+      // scrolling is required while we type the long alphabet sequence.
+      process.stdout.columns = 40; // width seen by useTerminalSize (after padding)
+
+      const tui = renderTui(
+        React.createElement(MultilineTextEditor, {
+          initialText: "",
+          // width *omitted* – component should fall back to terminal columns
+          height: 3,
+        }),
+      );
+      cleanup = tui.cleanup;
+      const { stdin, lastFrame, flush } = tui;
+
+      // Ensure initial render completes.
+      await flush();
+
+      // Type the alphabet – longer than the width we'll shrink to.
+      const alphabet = "abcdefghijklmnopqrstuvwxyz";
+      await type(stdin, alphabet, flush);
+
+      // The cursor (block) now sits on the far right after the 'z'. Verify
+      // that the character 'z' is visible in the current frame.
+      expect(lastFrame()?.includes("z")).toBe(true);
+
+      /* -----------------------  Simulate resize  ----------------------- */
+
+      // Shrink the reported terminal width so that the previously visible
+      // slice would no longer include the cursor *unless* the editor
+      // re-computes scroll offsets on re-render.
+      process.stdout.columns = 20; // shrink significantly (remember: padding‑8)
+      process.stdout.emit("resize"); // notify listeners
+
+      // Allow Ink to schedule the state update and then perform the re-render.
+      await flush();
+      await flush();
+
+      // After the resize the editor should have scrolled horizontally so that
+      // the caret (and thus the 'z' character that is block-highlighted)
+      // remains visible in the rendered slice.
+      const frameAfter = lastFrame() || "";
+      // eslint-disable-next-line no-console
+      console.log("FRAME AFTER RESIZE:\n" + frameAfter);
+      expect(frameAfter.includes("z")).toBe(true);
+    } finally {
+      // Always unmount and restore the global, so a failed assertion does not
+      // leave a fake terminal width behind for later code in this process.
+      cleanup?.();
+      process.stdout.columns = originalColumns;
+    }
+  });
+});
--- a/codex-cli/tests/multiline-enter-submit-cr.test.tsx
+++ b/codex-cli/tests/multiline-enter-submit-cr.test.tsx
@@ -0,0 +1,41 @@
+// Plain Enter (CR) should submit.
+
+import { renderTui } from "./ui-test-helpers.js";
+import MultilineTextEditor from "../src/components/chat/multiline-editor.js";
+import * as React from "react";
+import { describe, it, expect, vi } from "vitest";
+
+/** Feeds `text` into `stdin` and then awaits the supplied `flush`. */
+async function type(
+  stdin: NodeJS.WritableStream,
+  text: string,
+  flush: () => Promise<void>,
+): Promise<void> {
+  stdin.write(text);
+  await flush();
+}
+
+describe("MultilineTextEditor – Enter submits (CR)", () => {
+  // The backslash in the title is escaped on purpose: an unescaped "\r" would
+  // embed a real carriage-return character in the test name.
+  it("calls onSubmit when \\r is received", async () => {
+    const onSubmit = vi.fn();
+
+    const { stdin, flush, cleanup } = renderTui(
+      React.createElement(MultilineTextEditor, {
+        height: 5,
+        width: 20,
+        onSubmit,
+      }),
+    );
+
+    await flush();
+
+    // Type some text, then send a carriage return.
+    await type(stdin, "hello", flush);
+    await type(stdin, "\r", flush);
+
+    await flush();
+
+    // Exactly one submit, carrying the typed text as its first argument.
+    expect(onSubmit).toHaveBeenCalledTimes(1);
+    expect(onSubmit.mock.calls[0]![0]).toBe("hello");
+
+    cleanup();
+  });
+});
--- a/codex-cli/tests/multiline-external-editor-shortcut.test.tsx
+++ b/codex-cli/tests/multiline-external-editor-shortcut.test.tsx
@@ -0,0 +1,64 @@
+import { renderTui } from "./ui-test-helpers.js";
+import MultilineTextEditor from "../src/components/chat/multiline-editor.js";
+import TextBuffer from "../src/lib/text-buffer.js";
+import * as React from "react";
+import { describe, it, expect, vi } from "vitest";
+
+/** Pushes `text` into `stdin`, then waits until `flush` resolves. */
+async function type(
+  stdin: NodeJS.WritableStream,
+  text: string,
+  flush: () => Promise<void>,
+): Promise<void> {
+  stdin.write(text);
+  await flush();
+}
+
+describe("MultilineTextEditor – external editor shortcut", () => {
+  it("fires openInExternalEditor on Ctrl‑E (single key)", async () => {
+    // The spy patches TextBuffer.prototype, which other tests share; the
+    // try/finally makes sure it is restored even when the assertion fails.
+    const spy = vi
+      .spyOn(TextBuffer.prototype as any, "openInExternalEditor")
+      .mockResolvedValue(undefined);
+    let cleanup: (() => void) | undefined;
+
+    try {
+      const tui = renderTui(
+        React.createElement(MultilineTextEditor, {
+          initialText: "hello",
+          width: 20,
+          height: 3,
+        }),
+      );
+      cleanup = tui.cleanup;
+      const { stdin, flush } = tui;
+
+      // Ensure initial render.
+      await flush();
+
+      // Send Ctrl‑E → should fire immediately
+      await type(stdin, "\x05", flush); // Ctrl‑E (ENQ / 0x05)
+      expect(spy).toHaveBeenCalledTimes(1);
+    } finally {
+      spy.mockRestore();
+      cleanup?.();
+    }
+  });
+
+  it("fires openInExternalEditor on Ctrl‑X (single key)", async () => {
+    // Same protection as above: always restore the prototype spy and unmount.
+    const spy = vi
+      .spyOn(TextBuffer.prototype as any, "openInExternalEditor")
+      .mockResolvedValue(undefined);
+    let cleanup: (() => void) | undefined;
+
+    try {
+      const tui = renderTui(
+        React.createElement(MultilineTextEditor, {
+          initialText: "hello",
+          width: 20,
+          height: 3,
+        }),
+      );
+      cleanup = tui.cleanup;
+      const { stdin, flush } = tui;
+
+      // Ensure initial render.
+      await flush();
+
+      // Send Ctrl‑X → should fire immediately
+      await type(stdin, "\x18", flush); // Ctrl‑X (CAN / 0x18)
+      expect(spy).toHaveBeenCalledTimes(1);
+    } finally {
+      spy.mockRestore();
+      cleanup?.();
+    }
+  });
+});
--- a/codex-cli/tests/multiline-history-behavior.test.tsx
+++ b/codex-cli/tests/multiline-history-behavior.test.tsx
@@ -0,0 +1,171 @@
+/* --------------------------------------------------------------------------
+ *  Regression test – chat history navigation (↑/↓) should *only* activate
+ *  once the caret reaches the very first / last line of the multiline input.
+ *
+ *  Current buggy behaviour: TerminalChatInput intercepts the up‑arrow at the
+ *  outer <useInput> handler regardless of the caret row, causing an immediate
+ *  history recall even when the user is still somewhere within a multi‑line
+ *  draft.  The test captures the *expected* behaviour (matching e.g. Bash,
+ *  zsh, Readline, etc.) – the ↑ key must first move the caret vertically to
+ *  the topmost row; only a *subsequent* press should start cycling through
+ *  previous messages.
+ *
+ *  The spec was written *before* the fix.  NOTE: the cases below are registered
+ *  with plain `it` (not it.todo / it.fails), so they run as regular tests.
+ * ----------------------------------------------------------------------- */
+
+import { renderTui } from "./ui-test-helpers.js";
+import * as React from "react";
+import { describe, it, expect, vi } from "vitest";
+
+// ---------------------------------------------------------------------------
+//  Module mocks *must* be registered *before* the module under test is
+//  imported so that Vitest can replace the dependency during evaluation.
+// ---------------------------------------------------------------------------
+
+// The chat‑input component relies on an async helper that performs filesystem
+// work when images are referenced.  Mock it so our unit test remains fast and
+// free of side‑effects.
+vi.mock("../src/utils/input-utils.js", () => {
+  // Resolves to a plain user message that wraps the given text.
+  const createInputItem = vi.fn(
+    async (text: string /*, images: Array<string> */) => ({
+      role: "user",
+      type: "message",
+      content: [{ type: "input_text", text }],
+    }),
+  );
+  return { createInputItem };
+});
+
+// Mock the optional @lib/* dependencies so the dynamic import in parsers.ts
+// does not fail during the test environment where the alias isn't configured.
+vi.mock("@lib/format-command.js", () => {
+  const formatCommandForDisplay = (cmd: Array<string>) => cmd.join(" ");
+  return { formatCommandForDisplay };
+});
+vi.mock("@lib/approvals.js", () => {
+  const isSafeCommand = (_cmd: Array<string>) => null;
+  return { isSafeCommand };
+});
+
+// After mocks are in place we can safely import the component under test.
+import TerminalChatInput from "../src/components/chat/terminal-chat-new-input.js";
+
+// Tiny helper mirroring the one used in other UI tests so we can await Ink's
+// internal promises between keystrokes.
+/** Writes `text` to `stdin` and awaits `flush` afterwards. */
+async function type(
+  stdin: NodeJS.WritableStream,
+  text: string,
+  flush: () => Promise<void>,
+): Promise<void> {
+  stdin.write(text);
+  await flush();
+}
+
+/** Build a set of no‑op callbacks so <TerminalChatInput> renders with minimal
+ *  scaffolding.
+ */
+function stubProps(): any {
+  // Fresh mock callbacks on every call, so tests never share call records.
+  const callbacks = {
+    submitInput: vi.fn(),
+    submitConfirmation: vi.fn(),
+    setLastResponseId: vi.fn(),
+    // Plain no-op, cast to any so the ResponseItem type is not needed here.
+    setItems: (() => {}) as any,
+    openOverlay: vi.fn(),
+    openModelOverlay: vi.fn(),
+    openHelpOverlay: vi.fn(),
+    interruptAgent: vi.fn(),
+  };
+
+  // Fixed, non-callback props.
+  const flags = {
+    isNew: true,
+    loading: false,
+    confirmationPrompt: null,
+    contextLeftPercent: 100,
+    active: true,
+  };
+
+  return { ...flags, ...callbacks };
+}
+
+describe("TerminalChatInput – history navigation with multiline drafts", () => {
+  it("should not recall history until caret is on the first line", async () => {
+    const tui = renderTui(React.createElement(TerminalChatInput, stubProps()));
+    const { stdin, lastFrameStripped, flush, cleanup } = tui;
+
+    // Step 1: submit "prev" (typed one character at a time) so that the
+    // history is not empty.
+    for (const ch of "prev") {
+      await type(stdin, ch, flush);
+    }
+    await type(stdin, "\r", flush); // carriage return submits
+
+    // One more flush so state updates triggered by the submit can propagate.
+    await flush();
+
+    // Step 2: write a two-line draft; "\n" inserts a newline inside the
+    // editor, leaving the caret on the second row.
+    for (const chunk of ["line1", "\n", "line2"]) {
+      await type(stdin, chunk, flush);
+    }
+
+    // Sanity check: both draft lines are rendered.
+    const beforeUp = lastFrameStripped();
+    expect(beforeUp.includes("line1")).toBe(true);
+    expect(beforeUp.includes("line2")).toBe(true);
+
+    // Step 3: a single up-arrow should only move the caret to the first row.
+    await type(stdin, "\x1b[A", flush);
+
+    // No history recall yet: "prev" is absent and the draft is still there.
+    const afterUp = lastFrameStripped();
+    expect(afterUp.includes("prev")).toBe(false);
+    expect(afterUp.includes("line1")).toBe(true);
+
+    cleanup();
+  });
+
+  it("should restore the draft when navigating forward (↓) past the newest history entry", async () => {
+    const tui = renderTui(React.createElement(TerminalChatInput, stubProps()));
+    const { stdin, lastFrameStripped, flush, cleanup } = tui;
+
+    // Submit "prev" so there is one history entry to recall.
+    for (const ch of "prev") {
+      await type(stdin, ch, flush);
+    }
+    await type(stdin, "\r", flush);
+    await flush();
+
+    // Two-line draft that has to come back at the end of the test.
+    for (const chunk of ["draft1", "\n", "draft2"]) {
+      await type(stdin, chunk, flush);
+    }
+
+    const withDraft = lastFrameStripped();
+    expect(withDraft.includes("draft1")).toBe(true);
+    expect(withDraft.includes("draft2")).toBe(true);
+
+    // Up twice: the first press moves the caret to the first row, the second
+    // one enters history mode and shows "prev".
+    await type(stdin, "\x1b[A", flush);
+    await type(stdin, "\x1b[A", flush);
+
+    const inHistory = lastFrameStripped();
+    expect(inHistory.includes("prev")).toBe(true);
+
+    // Down once: history mode is left and the multi-line draft is restored.
+    await type(stdin, "\x1b[B", flush);
+
+    const restored = lastFrameStripped();
+    expect(restored.includes("draft1")).toBe(true);
+    expect(restored.includes("draft2")).toBe(true);
+
+    cleanup();
+  });
+});
--- a/codex-cli/tests/multiline-input-test.ts
+++ b/codex-cli/tests/multiline-input-test.ts
@@ -0,0 +1,164 @@
+import { renderTui } from "./ui-test-helpers.js";
+import MultilineTextEditor from "../src/components/chat/multiline-editor.js";
+import * as React from "react";
+import { describe, it, expect, vi } from "vitest";
+
+// Helper that lets us type and then immediately flush ink's async timers
+async function type(
+  stdin: NodeJS.WritableStream,
+  text: string,
+  flush: () => Promise<void>,
+) {
+  stdin.write(text);
+  await flush();
+}
+
+describe("MultilineTextEditor", () => {
+  it("renders the initial text", async () => {
+    const { lastFrame, cleanup, waitUntilExit } = renderTui(
+      React.createElement(MultilineTextEditor, {
+        initialText: "hello",
+        width: 10,
+        height: 3,
+      }),
+    );
+
+    await waitUntilExit(); // initial render
+    expect(lastFrame()?.includes("hello")).toBe(true);
+    cleanup();
+  });
+
+  it("updates the buffer when typing and shows the change", async () => {
+    const {
+      stdin,
+      lastFrame,
+      cleanup,
+      waitUntilExit: _,
+      flush,
+    } = renderTui(
+      React.createElement(MultilineTextEditor, {
+        initialText: "",
+        width: 10,
+        height: 3,
+      }),
+    );
+
+    // Type "h"
+    await type(stdin, "h", flush);
+    expect(lastFrame()?.includes("h")).toBe(true);
+
+    // Type "i"
+    await type(stdin, "i", flush);
+    expect(lastFrame()?.includes("hi")).toBe(true);
+
+    cleanup();
+  });
+
+  it("calls onSubmit with the current text on <Esc>", async () => {
+    const onSubmit = vi.fn();
+    const { stdin, flush, cleanup } = renderTui(
+      React.createElement(MultilineTextEditor, {
+        initialText: "foo",
+        width: 10,
+        height: 3,
+        onSubmit,
+      }),
+    );
+
+    // Press Escape
+    await type(stdin, "\x1b", flush);
+
+    expect(onSubmit).toHaveBeenCalledTimes(1);
+    expect(onSubmit).toHaveBeenCalledWith("foo");
+
+    cleanup();
+  });
+
+  it("updates text when backspacing", async () => {
+    const { stdin, lastFrameStripped, flush, cleanup, waitUntilExit } =
+      renderTui(
+        React.createElement(MultilineTextEditor, {
+          initialText: "",
+          width: 10,
+          height: 3,
+        }),
+      );
+
+    await waitUntilExit();
+
+    // Type "hello"
+    stdin.write("hello");
+    await flush();
+    expect(lastFrameStripped().includes("hello")).toBe(true);
+
+    // Send 2× backspace (DEL / 0x7f)
+    stdin.write("\x7f\x7f");
+    await flush();
+
+    const frame = lastFrameStripped();
+    expect(frame.includes("hel")).toBe(true);
+    expect(frame.includes("hell")).toBe(false);
+
+    cleanup();
+  });
+
+  it("three consecutive backspaces after typing 'hello' leaves 'he'", async () => {
+    const { stdin, lastFrameStripped, flush, cleanup, waitUntilExit } =
+      renderTui(
+        React.createElement(MultilineTextEditor, {
+          initialText: "",
+          width: 10,
+          height: 3,
+        }),
+      );
+
+    await waitUntilExit();
+
+    stdin.write("hello");
+    await flush();
+    // 3 backspaces
+    stdin.write("\x7f\x7f\x7f");
+    await flush();
+
+    const frame = lastFrameStripped();
+    expect(frame.includes("he")).toBe(true);
+    expect(frame.includes("hel")).toBe(false);
+    expect(frame.includes("hello")).toBe(false);
+
+    cleanup();
+  });
+
+  /* -------------------------------------------------------------- */
+  /*  Caret highlighting semantics                                  */
+  /* -------------------------------------------------------------- */
+
+  it("highlights the character *under* the caret (after arrow moves)", async () => {
+    const { stdin, lastFrame, flush, cleanup, waitUntilExit } = renderTui(
+      React.createElement(MultilineTextEditor, {
+        initialText: "",
+        width: 10,
+        height: 3,
+      }),
+    );
+
+    await waitUntilExit();
+
+    // Type "bar" and move caret left twice
+    stdin.write("bar");
+    stdin.write("\x1b[D");
+    await flush();
+    stdin.write("\x1b[D");
+    await flush(); // ensure each arrow processed
+
+    const frameRaw = lastFrame() || "";
+    // The caret cell is located by its SGR wrapper: ESC[7m (reverse video
+    // on), exactly one character, then ESC[27m (reverse video off).
+    const highlightedMatch = frameRaw.match(/\x1b\[7m(.)\x1b\[27m/);
+    expect(highlightedMatch).not.toBeNull();
+    const highlightedChar = highlightedMatch ? highlightedMatch[1] : null;
+
+    expect(highlightedChar).toBe("a"); // caret should block‑highlight 'a'
+
+    cleanup();
+  });
+});
--- a/codex-cli/tests/multiline-newline.test.tsx
+++ b/codex-cli/tests/multiline-newline.test.tsx
@@ -0,0 +1,56 @@
+import { renderTui } from "./ui-test-helpers.js";
+import MultilineTextEditor from "../src/components/chat/multiline-editor.js";
+import * as React from "react";
+import { describe, it, expect } from "vitest";
+
+// Helper to send keystrokes and wait for Ink's async timing so that the frame
+// reflects the input.
+async function type(
+  stdin: NodeJS.WritableStream,
+  text: string,
+  flush: () => Promise<void>,
+) {
+  stdin.write(text);
+  await flush();
+}
+
+describe("MultilineTextEditor – inserting new lines", () => {
+  // NOTE(review): an earlier note here said this test is expected to fail
+  // because the React wrapper handles <Enter> differently from the Rust
+  // reference, yet it is declared with plain `it` – confirm which is intended.
+  it("splits the line and renders the new row when <Enter> is pressed", async () => {
+    const { stdin, lastFrameStripped, flush, cleanup } = renderTui(
+      React.createElement(MultilineTextEditor, {
+        height: 5,
+        width: 20,
+        initialText: "",
+      }),
+    );
+
+    // Wait for first render
+    await flush();
+
+    // Type "hello", press Enter, then type "world"
+    await type(stdin, "hello", flush);
+    await type(stdin, "\n", flush); // Enter / Return
+    await type(stdin, "world", flush);
+
+    const frame = lastFrameStripped();
+    const lines = frame.split("\n");
+
+    // eslint-disable-next-line no-console
+    console.log(
+      "\n--- RENDERED FRAME ---\n" + frame + "\n---------------------",
+    );
+
+    // We expect at least two rendered lines and the texts to appear on their
+    // own respective rows.
+    expect(lines.length).toBeGreaterThanOrEqual(2);
+    // First rendered (inside border) line should contain 'hello'
+    expect(lines.some((l: string) => l.includes("hello"))).toBe(true);
+    // Another line should contain 'world'
+    expect(lines.some((l: string) => l.includes("world"))).toBe(true);
+
+    cleanup();
+  });
+});
--- a/codex-cli/tests/multiline-shift-enter-crlf.test.tsx
+++ b/codex-cli/tests/multiline-shift-enter-crlf.test.tsx
@@ -0,0 +1,51 @@
+// Regression test: Some terminals emit a carriage‑return ("\r", code 13) for
+// Shift+Enter instead of a bare line‑feed; the test simulates this with the
+// CSI-u sequence `ESC [ 13 ; 2 u`.  It must insert a newline, **not** submit.
+
+import { renderTui } from "./ui-test-helpers.js";
+import MultilineTextEditor from "../src/components/chat/multiline-editor.js";
+import * as React from "react";
+import { describe, it, expect, vi } from "vitest";
+
+async function type(
+  stdin: NodeJS.WritableStream,
+  text: string,
+  flush: () => Promise<void>,
+) {
+  stdin.write(text);
+  await flush();
+}
+
+describe("MultilineTextEditor – Shift+Enter (\\r variant)", () => {
+  it("inserts a newline and does NOT submit when the terminal sends \\r for Shift+Enter", async () => {
+    const onSubmit = vi.fn();
+
+    const { stdin, lastFrameStripped, flush, cleanup } = renderTui(
+      React.createElement(MultilineTextEditor, {
+        height: 5,
+        width: 20,
+        initialText: "",
+        onSubmit,
+      }),
+    );
+
+    await flush();
+
+    // Type some text, then Shift+Enter as a kitty CSI-u sequence (key 13 = CR).
+    await type(stdin, "foo", flush);
+    await type(stdin, "\u001B[13;2u", flush); // ESC [ 13 ; 2 u
+    await type(stdin, "bar", flush);
+
+    const frame = lastFrameStripped();
+    expect(frame).toMatch(/foo/);
+    expect(frame).toMatch(/bar/);
+
+    // Must have inserted a newline (two rendered lines inside the frame)
+    expect(frame.split("\n").length).toBeGreaterThanOrEqual(2);
+
+    // No submission should have occurred
+    expect(onSubmit).not.toHaveBeenCalled();
+
+    cleanup();
+  });
+});
--- a/codex-cli/tests/multiline-shift-enter.test.tsx
+++ b/codex-cli/tests/multiline-shift-enter.test.tsx
@@ -0,0 +1,49 @@
+import { renderTui } from "./ui-test-helpers.js";
+import MultilineTextEditor from "../src/components/chat/multiline-editor.js";
+import * as React from "react";
+import { describe, it, expect, vi } from "vitest";
+
+async function type(
+  stdin: NodeJS.WritableStream,
+  text: string,
+  flush: () => Promise<void>,
+) {
+  stdin.write(text);
+  await flush();
+}
+
+describe("MultilineTextEditor – Shift+Enter", () => {
+  it("inserts a newline instead of submitting", async () => {
+    const onSubmit = vi.fn();
+
+    const { stdin, lastFrameStripped, flush, cleanup } = renderTui(
+      React.createElement(MultilineTextEditor, {
+        height: 5,
+        width: 20,
+        initialText: "",
+        onSubmit,
+      }),
+    );
+
+    await flush();
+
+    // type 'hi'
+    await type(stdin, "hi", flush);
+
+    // send Shift+Enter – simulated by \n without key.return. Ink's test stdin
+    // delivers raw bytes only, so we approximate by writing "\n" directly.
+    await type(stdin, "\n", flush);
+
+    // type 'there'
+    await type(stdin, "there", flush);
+
+    const frame = lastFrameStripped();
+    expect(frame).toMatch(/hi/);
+    expect(frame).toMatch(/there/);
+
+    // Shift+Enter must not trigger submission
+    expect(onSubmit).not.toHaveBeenCalled();
+
+    cleanup();
+  });
+});
--- a/codex-cli/tests/project-doc.test.ts
+++ b/codex-cli/tests/project-doc.test.ts
@@ -0,0 +1,57 @@
+import { loadConfig, PROJECT_DOC_MAX_BYTES } from "../src/utils/config.js";
+import { mkdirSync, rmSync, writeFileSync, mkdtempSync } from "fs";
+import { tmpdir } from "os";
+import { join } from "path";
+import { describe, expect, test, beforeEach, afterEach, vi } from "vitest";
+
+let projectDir: string;
+let configPath: string;
+let instructionsPath: string;
+
+beforeEach(() => {
+  projectDir = mkdtempSync(join(tmpdir(), "codex-proj-"));
+  // Create fake .git dir to mark project root
+  mkdirSync(join(projectDir, ".git"));
+
+  // Config & instructions paths under temp dir so we don't pollute real homedir
+  configPath = join(projectDir, "config.json");
+  instructionsPath = join(projectDir, "instructions.md");
+});
+
+afterEach(() => {
+  rmSync(projectDir, { recursive: true, force: true });
+});
+
+describe("project doc integration", () => {
+  test("happy path: project doc gets merged into instructions", () => {
+    const docContent = "# Project\nThis is my project.";
+    writeFileSync(join(projectDir, "codex.md"), docContent);
+
+    const cfg = loadConfig(configPath, instructionsPath, { cwd: projectDir });
+    expect(cfg.instructions).toContain(docContent);
+  });
+
+  test("opt-out via flag prevents inclusion", () => {
+    const docContent = "will be ignored";
+    writeFileSync(join(projectDir, "codex.md"), docContent);
+
+    const cfg = loadConfig(configPath, instructionsPath, {
+      cwd: projectDir,
+      disableProjectDoc: true,
+    });
+    expect(cfg.instructions).not.toContain(docContent);
+  });
+
+  test("file larger than limit gets truncated and warns", () => {
+    const big = "x".repeat(PROJECT_DOC_MAX_BYTES + 4096);
+    writeFileSync(join(projectDir, "codex.md"), big);
+    const warnSpy = vi.spyOn(console, "warn").mockImplementation(() => {});
+    try {
+      const cfg = loadConfig(configPath, instructionsPath, { cwd: projectDir });
+      expect(cfg.instructions.length).toBe(PROJECT_DOC_MAX_BYTES);
+      expect(warnSpy).toHaveBeenCalledOnce();
+    } finally {
+      warnSpy.mockRestore(); // un-mock console.warn even if an assertion threw
+    }
+  });
+});
--- a/codex-cli/tests/raw-exec-process-group.test.ts
+++ b/codex-cli/tests/raw-exec-process-group.test.ts
@@ -0,0 +1,64 @@
+import { describe, it, expect } from "vitest";
+import { exec as rawExec } from "../src/utils/agent/sandbox/raw-exec.js";
+
+// Regression test: When cancelling an in‑flight `rawExec()` the implementation
+// must terminate *all* processes that belong to the spawned command – not just
+// the direct child.  The original logic only sent `SIGTERM` to the immediate
+// child which meant that grandchildren (for instance when running through a
+// `bash -c` wrapper) were left running as orphaned processes.
+
+// Strategy:
+//   1. Start a Bash shell that spawns a long‑running `sleep`, prints the PID
+//      of that `sleep`, and then waits forever.  This guarantees we can later
+//      check if the grand‑child is still alive.
+//   2. Abort the exec almost immediately.
+//   3. After `rawExec()` resolves we probe the previously printed PID with
+//      `process.kill(pid, 0)`.  If the call throws `ESRCH` the process no
+//      longer exists – the desired outcome.  Otherwise the test fails.
+
+// The negative‑PID process‑group trick employed by the fixed implementation is
+// POSIX‑only.  On Windows we skip the test.
+
+describe("rawExec – abort kills entire process group", () => {
+  it("terminates grandchildren spawned via bash", async () => {
+    if (process.platform === "win32") {
+      return;
+    }
+
+    const abortController = new AbortController();
+
+    // Bash script: spawn `sleep 30` in background, print its PID, then wait.
+    const script = "sleep 30 & pid=$!; echo $pid; wait $pid";
+    const cmd = ["bash", "-c", script];
+
+    // Kick off the command.
+    const execPromise = rawExec(cmd, {}, [], abortController.signal);
+
+    // Give Bash a tiny bit of time to start and print the PID.
+    await new Promise((r) => setTimeout(r, 50));
+
+    // Cancel the task – this should kill *both* bash and the inner sleep.
+    abortController.abort();
+
+    const { exitCode, stdout } = await execPromise;
+
+    // We expect a non‑zero exit code because the process was killed.
+    expect(exitCode).not.toBe(0);
+
+    // Extract the grand‑child PID from stdout.
+    const pidMatch = /^(\d+)/.exec(stdout.trim());
+    expect(pidMatch).not.toBeNull();
+    const sleepPid = Number(pidMatch![1]);
+
+    // Verify that the sleep process is no longer alive.
+    let alive = true;
+    try {
+      process.kill(sleepPid, 0); // throws if the process does not exist
+      alive = true;
+    } catch {
+      alive = false;
+    }
+
+    expect(alive).toBe(false);
+  });
+});
--- a/codex-cli/tests/terminal-chat-response-item.test.tsx
+++ b/codex-cli/tests/terminal-chat-response-item.test.tsx
@@ -0,0 +1,59 @@
+import { renderTui } from "./ui-test-helpers.js";
+import TerminalChatResponseItem from "../src/components/chat/terminal-chat-response-item.js";
+import React from "react";
+import { describe, it, expect } from "vitest";
+
+// Component under test
+
+// The ResponseItem type is complex and imported from the OpenAI SDK. To keep
+// this test lightweight we construct the minimal runtime objects we need and
+// cast them to `any` so that TypeScript is satisfied.
+
+function userMessage(text: string) {
+  return {
+    type: "message",
+    role: "user",
+    content: [
+      {
+        type: "input_text",
+        text,
+      },
+    ],
+  } as any;
+}
+
+function assistantMessage(text: string) {
+  return {
+    type: "message",
+    role: "assistant",
+    content: [
+      {
+        type: "output_text",
+        text,
+      },
+    ],
+  } as any;
+}
+
+describe("TerminalChatResponseItem", () => {
+  it("renders a user message", () => {
+    const { lastFrameStripped } = renderTui(
+      <TerminalChatResponseItem item={userMessage("Hello world")} />,
+    );
+
+    const frame = lastFrameStripped();
+    expect(frame).toContain("user");
+    expect(frame).toContain("Hello world");
+  });
+
+  it("renders an assistant message", () => {
+    const { lastFrameStripped } = renderTui(
+      <TerminalChatResponseItem item={assistantMessage("Sure thing")} />,
+    );
+
+    const frame = lastFrameStripped();
+    // assistant messages are labelled "codex" in the UI
+    expect(frame.toLowerCase()).toContain("codex");
+    expect(frame).toContain("Sure thing");
+  });
+});
--- a/codex-cli/tests/text-buffer-copy-paste.test.ts
+++ b/codex-cli/tests/text-buffer-copy-paste.test.ts
@@ -0,0 +1,50 @@
+import TextBuffer from "../src/lib/text-buffer.js";
+import { describe, it, expect } from "vitest";
+
+// These tests ensure that the TextBuffer copy‑&‑paste logic keeps parity with
+// the Rust reference implementation (`textarea.rs`).  When a multi‑line
+// string *without* a trailing newline is pasted at the beginning of a line,
+// the final pasted line should be merged with the text that originally
+// followed the caret – exactly how most editors behave.
+
+function setupBuffer(): TextBuffer {
+  return new TextBuffer("ab\ncd\nef");
+}
+
+describe("TextBuffer – copy/paste multi‑line", () => {
+  it("copies a multi‑line selection without the trailing newline", () => {
+    const buf = setupBuffer();
+
+    // Select from (0,0) → (1,2)  ["ab", "cd"]
+    buf.startSelection(); // anchor at 0,0
+    buf.move("down"); // 1,0
+    buf.move("right");
+    buf.move("right"); // 1,2
+
+    const copied = buf.copy();
+    expect(copied).toBe("ab\ncd");
+  });
+
+  it("pastes the multi‑line clipboard and merges the final pasted line with the following text", () => {
+    const buf = setupBuffer();
+
+    // Select (0,0) → (1,2) and copy it into the clipboard
+    buf.startSelection();
+    buf.move("down");
+    buf.move("right");
+    buf.move("right");
+    buf.copy();
+
+    // Move caret to the start of the last line and paste
+    buf.move("down");
+    buf.move("home"); // (2,0)
+
+    const ok = buf.paste();
+    expect(ok).toBe(true);
+
+    // Desired final buffer – behaviour should match the Rust reference:
+    // the final pasted line is *merged* with the original text on the
+    // insertion row.
+    expect(buf.getLines()).toEqual(["ab", "cd", "ab", "cdef"]);
+  });
+});
--- a/codex-cli/tests/text-buffer-crlf.test.ts
+++ b/codex-cli/tests/text-buffer-crlf.test.ts
@@ -0,0 +1,14 @@
+import TextBuffer from "../src/lib/text-buffer.js";
+import { describe, it, expect } from "vitest";
+
+describe("TextBuffer – newline normalisation", () => {
+  it("insertStr should split on \\r and \\r\\n sequences", () => {
+    const buf = new TextBuffer("");
+
+    // Windows‑style CRLF input; a lone "\r" is not exercised by this test
+    buf.insertStr("ab\r\ncd\r\nef");
+
+    expect(buf.getLines()).toEqual(["ab", "cd", "ef"]);
+    expect(buf.getCursor()).toEqual([2, 2]); // after 'f'
+  });
+});
--- a/codex-cli/tests/text-buffer-gaps.test.ts
+++ b/codex-cli/tests/text-buffer-gaps.test.ts
@@ -0,0 +1,250 @@
+import TextBuffer from "../src/lib/text-buffer";
+import { describe, it, expect } from "vitest";
+
+// The purpose of this test‑suite is NOT to make the implementation green today
+// – quite the opposite.  We capture behaviours that are already covered by the
+// reference Rust implementation (textarea.rs) but are *still missing* from the
+// current TypeScript port.  Every test is therefore marked with `.fails()` so
+// that the suite passes while the functionality is absent.  When a particular
+// gap is closed the corresponding test will begin to succeed, causing Vitest to
+// raise an error (a *good* error) that reminds us to remove the `.fails` flag.
+
+/* -------------------------------------------------------------------------- */
+/*  Soft‑tab insertion                                                         */
+/* -------------------------------------------------------------------------- */
+
+describe("soft‑tab insertion (↹ => 4 spaces)", () => {
+  it.fails(
+    "inserts 4 spaces at caret position when hard‑tab mode is off",
+    () => {
+      const buf = new TextBuffer("");
+
+      // A literal "\t" character is treated as user pressing the Tab key.  The
+      // Rust version expands it to soft‑tabs by default.
+      buf.insert("\t");
+
+      expect(buf.getText()).toBe("    ");
+      expect(buf.getCursor()).toEqual([0, 4]);
+    },
+  );
+});
+
+/* -------------------------------------------------------------------------- */
+/*  Undo / Redo – grouping & stack clearing                                   */
+/* -------------------------------------------------------------------------- */
+
+describe("undo / redo – advanced behaviour", () => {
+  it.fails(
+    "typing a word character‑by‑character should undo in one step",
+    () => {
+      const buf = new TextBuffer("");
+
+      for (const ch of "hello") {
+        buf.insert(ch);
+      }
+
+      // One single undo should revert the *whole* word, leaving empty buffer.
+      buf.undo();
+
+      expect(buf.getText()).toBe("");
+      expect(buf.getCursor()).toEqual([0, 0]);
+    },
+  );
+});
+
+/* -------------------------------------------------------------------------- */
+/*  Selection – cut / delete selection                                        */
+/* -------------------------------------------------------------------------- */
+
+describe("selection – cut/delete", () => {
+  it.fails(
+    "cut() removes the selected range and yanks it into clipboard",
+    () => {
+      const buf = new TextBuffer("foo bar baz");
+
+      // Select the middle word "bar"
+      buf.move("wordRight"); // after "foo" + space => col 4
+      buf.startSelection();
+      buf.move("wordRight"); // after "bar" (col 8)
+      // @ts-expect-error – method missing in current implementation
+      buf.cut();
+
+      // NOTE(review): cutting only "bar" would leave "foo  baz" (two spaces) – confirm
+      expect(buf.getText()).toBe("foo baz");
+
+      // Cursor should be at the start of the gap where text was removed
+      expect(buf.getCursor()).toEqual([0, 4]);
+
+      // And clipboard/yank buffer should contain the deleted word
+      // @ts-expect-error – clipboard getter not exposed yet
+      expect(buf.getClipboard()).toBe("bar");
+    },
+  );
+});
+
+/* -------------------------------------------------------------------------- */
+/*  Word‑wise forward deletion (Ctrl+Delete)                                  */
+/* -------------------------------------------------------------------------- */
+
+describe("delete_next_word (Ctrl+Delete)", () => {
+  it.fails("removes everything until the next word boundary", () => {
+    const vp = { width: 80, height: 25 };
+    const buf = new TextBuffer("hello world!!  next");
+
+    // Place caret at start of line (0,0).  One Ctrl+Delete should wipe the
+    // word "hello" and the following space.
+    buf.handleInput(undefined, { delete: true, ctrl: true }, vp);
+
+    expect(buf.getText()).toBe("world!!  next");
+    expect(buf.getCursor()).toEqual([0, 0]);
+  });
+});
+
+/* -------------------------------------------------------------------------- */
+/*  Configurable tab length                                                   */
+/* -------------------------------------------------------------------------- */
+
+describe("tab length configuration", () => {
+  it.fails("inserts the configured number of spaces when tabLen=2", () => {
+    // @ts-expect-error – constructor currently has no config object
+    const buf = new TextBuffer("", { tabLen: 2 });
+
+    buf.insert("\t");
+
+    expect(buf.getText()).toBe("  "); // two spaces
+    expect(buf.getCursor()).toEqual([0, 2]);
+  });
+});
+
+/* -------------------------------------------------------------------------- */
+/*  Search subsystem                                                          */
+/* -------------------------------------------------------------------------- */
+
+describe("search / regex navigation", () => {
+  it.fails("search_forward jumps to the next match", () => {
+    const text = [
+      "alpha beta gamma",
+      "beta gamma alpha",
+      "gamma alpha beta",
+    ].join("\n");
+
+    const buf = new TextBuffer(text);
+
+    // @ts-expect-error – method missing
+    buf.setSearchPattern(/beta/);
+
+    // Cursor starts at 0,0.  First search_forward should land on the first
+    // occurrence (row 0, col 6)
+    // @ts-expect-error – method missing
+    buf.searchForward();
+
+    expect(buf.getCursor()).toEqual([0, 6]);
+
+    // Second invocation should wrap within viewport and find next occurrence
+    // (row 1, col 0)
+    // @ts-expect-error – method missing
+    buf.searchForward();
+
+    expect(buf.getCursor()).toEqual([1, 0]);
+  });
+});
+
+/* -------------------------------------------------------------------------- */
+/*  Word‑wise navigation accuracy                                             */
+/* -------------------------------------------------------------------------- */
+
+describe("wordLeft / wordRight – punctuation boundaries", () => {
+  it.fails("wordLeft stops after punctuation like hyphen (-)", () => {
+    const buf = new TextBuffer("hello-world");
+
+    // Place caret at end of line
+    buf.move("end");
+
+    // Perform a single wordLeft – in Rust implementation this lands right
+    // *after* the hyphen, i.e. between '-' and 'w' (column index 6).
+    buf.move("wordLeft");
+
+    expect(buf.getCursor()).toEqual([0, 6]);
+  });
+
+  it.fails(
+    "wordRight stops after punctuation like underscore (_) which is not in JS boundary set",
+    () => {
+      const buf = new TextBuffer("foo_bar");
+
+      // From start, one wordRight should land right after the underscore (col 4)
+      buf.move("wordRight");
+
+      expect(buf.getCursor()).toEqual([0, 4]);
+    },
+  );
+});
+
+/* -------------------------------------------------------------------------- */
+/*  Word‑wise deletion (Ctrl+Backspace)                                        */
+/* -------------------------------------------------------------------------- */
+
+describe("word deletion shortcuts", () => {
+  it.fails("Ctrl+Backspace deletes the previous word", () => {
+    const vp = { width: 80, height: 25 };
+    const buf = new TextBuffer("hello world");
+
+    // Place caret after the last character
+    buf.move("end");
+
+    // Simulate Ctrl+Backspace (terminal usually sends backspace with ctrl flag)
+    buf.handleInput(undefined, { backspace: true, ctrl: true }, vp);
+
+    // The whole word "world" (and the preceding space) should be removed,
+    // leaving just "hello".
+    expect(buf.getText()).toBe("hello");
+    expect(buf.getCursor()).toEqual([0, 5]);
+  });
+});
+
+/* -------------------------------------------------------------------------- */
+/*  Paragraph navigation                                                       */
+/* -------------------------------------------------------------------------- */
+
+describe("paragraph navigation", () => {
+  it.fails("Jumping forward by paragraph stops after a blank line", () => {
+    const text = [
+      "first paragraph line 1",
+      "first paragraph line 2",
+      "", // blank line separates paragraphs
+      "second paragraph line 1",
+    ].join("\n");
+
+    const buf = new TextBuffer(text);
+
+    // Start at very beginning
+    // (No method exposed yet – once implemented we will call move("paragraphForward"))
+    // For now we imitate the call; test will fail until the command exists.
+    // @ts-expect-error – method not implemented yet
+    buf.move("paragraphForward");
+
+    // Expect caret to land at start of the first line _after_ the blank one
+    expect(buf.getCursor()).toEqual([3, 0]);
+  });
+});
+
+/* -------------------------------------------------------------------------- */
+/*  Independent scrolling                                                     */
+/* -------------------------------------------------------------------------- */
+
+describe("viewport scrolling independent of cursor", () => {
+  it.fails("scrolls without moving the caret", () => {
+    const lines = Array.from({ length: 100 }, (_, i) => `line ${i}`);
+    const buf = new TextBuffer(lines.join("\n"));
+    const vp = { width: 10, height: 5 };
+
+    // Cursor stays at 0,0.  We now ask the view to scroll down by one page.
+    // @ts-expect-error – method not implemented yet
+    buf.scroll("pageDown", vp);
+
+    // Cursor must remain at (0,0) even though viewport origin changed.
+    expect(buf.getCursor()).toEqual([0, 0]);
+    // The first visible line should now be "line 5".
+    expect(buf.getVisibleLines(vp)[0]).toBe("line 5");
+  });
+});
--- a/codex-cli/tests/text-buffer-word.test.ts
+++ b/codex-cli/tests/text-buffer-word.test.ts
@@ -0,0 +1,115 @@
+import TextBuffer from "../src/lib/text-buffer.js";
+import { describe, test, expect } from "vitest";
+
+describe("TextBuffer – word‑wise navigation & deletion", () => {
+  test("wordRight moves to end‑of‑line when no further boundary", () => {
+    const tb = new TextBuffer("hello");
+
+    // Move the caret inside the word (index 3)
+    tb.move("right");
+    tb.move("right");
+    tb.move("right");
+
+    tb.move("wordRight");
+
+    const [, col] = tb.getCursor();
+    expect(col).toBe(5); // end of the word / line
+  });
+
+  test("Ctrl+Backspace on raw byte deletes previous word", () => {
+    const tb = new TextBuffer("hello world");
+    const vp = { height: 10, width: 80 } as const;
+
+    // Place caret at end
+    tb.move("end");
+
+    // Simulate terminal sending DEL (0x7f) byte with ctrl modifier – Ink
+    // usually does *not* set `key.backspace` in this path.
+    tb.handleInput("\x7f", { ctrl: true }, vp);
+
+    expect(tb.getText()).toBe("hello ");
+  });
+
+  test("Option/Alt+Backspace deletes previous word", () => {
+    const tb = new TextBuffer("foo bar baz");
+    const vp = { height: 10, width: 80 } as const;
+
+    // caret at end
+    tb.move("end");
+
+    // Simulate Option+Backspace (alt): Ink sets key.backspace = true, key.alt = true (no raw byte)
+    tb.handleInput(undefined, { backspace: true, alt: true }, vp);
+
+    expect(tb.getText()).toBe("foo bar ");
+  });
+
+  test("Option/Alt+Delete deletes next word", () => {
+    const tb = new TextBuffer("foo bar baz");
+    const vp = { height: 10, width: 80 } as const;
+
+    // Move caret between first and second word (after space)
+    tb.move("wordRight"); // after foo
+    tb.move("right"); // skip space -> start of bar
+
+    // Option+Delete
+    tb.handleInput(undefined, { delete: true, alt: true }, vp);
+
+    expect(tb.getText()).toBe("foo  baz"); // only "bar" is removed, so both neighbouring spaces remain
+  });
+
+  test("wordLeft eventually reaches column 0", () => {
+    const tb = new TextBuffer("hello world");
+
+    // Move to end of line first
+    tb.move("end");
+
+    // two wordLefts should land at start of line
+    tb.move("wordLeft");
+    tb.move("wordLeft");
+
+    const [, col] = tb.getCursor();
+    expect(col).toBe(0);
+  });
+
+  test("wordRight jumps over a delimiter into the next word", () => {
+    const tb = new TextBuffer("hello world");
+
+    tb.move("wordRight"); // from start – should land right after "hello" (col 5, before the space)
+    let [, col] = tb.getCursor();
+    expect(col).toBe(5);
+
+    // Next wordRight should move to end of line (after "world")
+    tb.move("wordRight");
+    [, col] = tb.getCursor();
+    expect(col).toBe(11);
+  });
+
+  test("deleteWordLeft removes the previous word and positions the caret correctly", () => {
+    const tb = new TextBuffer("hello world");
+
+    // Place caret at end of line
+    tb.move("end");
+
+    // Act
+    tb.deleteWordLeft();
+
+    expect(tb.getText()).toBe("hello ");
+    const [, col] = tb.getCursor();
+    expect(col).toBe(6); // after the space
+  });
+
+  test("deleteWordRight removes the following word", () => {
+    const tb = new TextBuffer("hello world");
+
+    // Move caret to start of "world"
+    tb.move("wordRight"); // caret after "hello"
+    tb.move("right"); // skip the space, now at index 6 (start of world)
+
+    // Act
+    tb.deleteWordRight();
+
+    expect(tb.getText()).toBe("hello ");
+    const [, col] = tb.getCursor();
+    expect(col).toBe(6);
+  });
+});
--- a/codex-cli/tests/text-buffer.test.ts
+++ b/codex-cli/tests/text-buffer.test.ts
@@ -0,0 +1,264 @@
+import TextBuffer from "../src/lib/text-buffer";
+import { describe, it, expect } from "vitest";
+
+describe("TextBuffer – basic editing parity with Rust suite", () => {
+  /* ------------------------------------------------------------------ */
+  /*  insert_char                                                        */
+  /* ------------------------------------------------------------------ */
+  it("insert_char / printable (single line)", () => {
+    // Each row: [target column, character to insert, resulting line]
+    const table: Array<[number, string, string]> = [
+      [0, "x", "xab"],
+      [1, "x", "axb"],
+      [2, "x", "abx"],
+      [1, "あ", "aあb"],
+    ];
+
+    table.forEach(([targetCol, inserted, expectedLine]) => {
+      const buffer = new TextBuffer("ab");
+      buffer.move("end"); // start past "b", then walk left to the target column
+      while (targetCol < buffer.getCursor()[1]) {
+        buffer.move("left");
+      }
+      buffer.insert(inserted);
+      expect(buffer.getText()).toBe(expectedLine);
+      expect(buffer.getCursor()).toEqual([0, targetCol + 1]);
+    });
+  });
+
+  /* ------------------------------------------------------------------ */
+  /*  insert_char – newline support                                      */
+  /* ------------------------------------------------------------------ */
+  it("insert_char with a newline should split the line", () => {
+    const buffer = new TextBuffer("ab");
+    // Caret goes to the end of the single line, right after "b"
+    buffer.move("end");
+    // A raw "\n" is inserted – the Rust implementation splits the line there
+    buffer.insert("\n");
+
+    // "ab" stays on row 0; the caret lands at the start of a new, empty row 1
+    expect(buffer.getLines()).toEqual(["ab", ""]);
+    expect(buffer.getCursor()).toEqual([1, 0]);
+  });
+
+  /* ------------------------------------------------------------------ */
+  /*  insert_str helpers                                                 */
+  /* ------------------------------------------------------------------ */
+  it("insert_str should insert multi‑line strings", () => {
+    const buffer = new TextBuffer("ab\ncd\nef");
+
+    // A fresh buffer already has its caret at (row 0, col 0), so the text
+    // below is inserted at the very start without any cursor movement.
+    buffer.insertStr("x\ny");
+
+    // "x" becomes its own first row; "y" is prepended to the old first row,
+    // and the caret ends up right after that "y".
+    const rowsAfterInsert = buffer.getLines();
+    expect(rowsAfterInsert).toEqual(["x", "yab", "cd", "ef"]);
+    expect(buffer.getCursor()).toEqual([1, 1]);
+  });
+
+  /* ------------------------------------------------------------------ */
+  /*  Undo / Redo                                                        */
+  /* ------------------------------------------------------------------ */
+  it("undo / redo history should revert edits", () => {
+    const buffer = new TextBuffer("hello");
+    buffer.move("end");
+    buffer.insert("!"); // single edit: "hello" -> "hello!"
+
+    expect(buffer.undo()).toBe(true); // undo() must return true ...
+    expect(buffer.getText()).toBe("hello"); // ... and the "!" is gone
+
+    expect(buffer.redo()).toBe(true); // redo() must return true ...
+    expect(buffer.getText()).toBe("hello!"); // ... and the "!" is back
+  });
+
+  /* ------------------------------------------------------------------ */
+  /*  Selection model                                                    */
+  /* ------------------------------------------------------------------ */
+  it("copy & paste should operate on current selection", () => {
+    const buffer = new TextBuffer("hello world");
+
+    // Select the word "hello": open the selection at the start of the line,
+    // then step right once for each of its five characters.
+    buffer.startSelection();
+    for (let step = 0; step < 5; step += 1) {
+      buffer.move("right");
+    }
+    buffer.endSelection();
+    buffer.copy();
+
+    // Jump to the end of the line, add one separating space, then paste the
+    // copied word after it.
+    buffer.move("end");
+    buffer.insert(" ");
+    buffer.paste();
+
+    expect(buffer.getText()).toBe("hello world hello");
+  });
+
+  /* ------------------------------------------------------------------ */
+  /*  Backspace behaviour                                                */
+  /* ------------------------------------------------------------------ */
+
+  describe("backspace", () => {
+    it("deletes the character to the *left* of the caret within a line", () => {
+      const buffer = new TextBuffer("abc");
+
+      // Two steps right from column 0 put the caret between 'b' and 'c'.
+      buffer.move("right"); // a|bc
+      buffer.move("right"); // ab|c
+
+      buffer.backspace();
+
+      expect(buffer.getLines()).toEqual(["ac"]); // 'b' was removed
+      expect(buffer.getCursor()).toEqual([0, 1]); // caret moved one column left
+    });
+
+    it("merges with the previous line when invoked at column 0", () => {
+      const buffer = new TextBuffer("ab\ncd");
+
+      // One step down from (0, 0) reaches the start of the second line.
+      buffer.move("down");
+
+      buffer.backspace();
+
+      expect(buffer.getLines()).toEqual(["abcd"]); // the line break is gone
+      expect(buffer.getCursor()).toEqual([0, 2]); // at the join, after 'b'
+    });
+
+    it("is a no‑op at the very beginning of the buffer", () => {
+      const buffer = new TextBuffer("ab");
+      buffer.backspace(); // the caret is still at its initial (0, 0)
+
+      expect(buffer.getLines()).toEqual(["ab"]);
+      expect(buffer.getCursor()).toEqual([0, 0]);
+    });
+  });
+
+  /* ------------------------------------------------------------------ */
+  /*  Vertical cursor movement – we should preserve the preferred column  */
+  /* ------------------------------------------------------------------ */
+
+  describe("up / down navigation keeps the preferred column", () => {
+    it("restores horizontal position when moving across shorter lines", () => {
+      // Rows of length 6 / 1 / 7, so the middle row cannot hold column 5.
+      const buffer = new TextBuffer("abcdef\nx\nabcdefg");
+
+      // Park the caret at col 5 on row 0: go to the end (col 6), one step back.
+      buffer.move("end");
+      buffer.move("left");
+
+      // Go down through the short row and onto the long one. The first step
+      // is expected to clamp to (1, 1); the second must return to column 5.
+      buffer.move("down");
+      buffer.move("down");
+
+      expect(buffer.getCursor()).toEqual([2, 5]);
+    });
+  });
+
+  /* ------------------------------------------------------------------ */
+  /*  Left / Right arrow navigation across Unicode surrogate pairs       */
+  /* ------------------------------------------------------------------ */
+
+  describe("left / right navigation", () => {
+    it("should treat multi‑code‑unit emoji as a single character", () => {
+      // "🐶" occupies two UTF-16 code units but is a single character to the user.
+      const buffer = new TextBuffer("🐶a");
+
+      // One step right must clear the whole emoji, not half of the pair.
+      buffer.move("right");
+
+      // Typing here must therefore land between the emoji and "a".
+      buffer.insert("x");
+
+      // The emoji survives unsplit ...
+      expect(buffer.getLines()).toEqual(["🐶xa"]);
+      // ... and the caret is at column 2: the emoji counts as one, "x" as one.
+      expect(buffer.getCursor()).toEqual([0, 2]);
+    });
+  });
+
+  /* ------------------------------------------------------------------ */
+  /*  HandleInput – raw DEL bytes should map to backspace                */
+  /* ------------------------------------------------------------------ */
+
+  it("handleInput should treat \x7f input as backspace", () => {
+    const buffer = new TextBuffer("");
+    const viewport = { width: 80, height: 25 };
+
+    // Feed "hello" one character at a time through the printable input path
+    for (const letter of "hello") {
+      buffer.handleInput(letter, {}, viewport);
+    }
+
+    // Two raw DEL bytes (what a terminal sends for backspace), no key flags set
+    for (let press = 0; press < 2; press += 1) {
+      buffer.handleInput("\x7f", {}, viewport);
+    }
+    expect(buffer.getText()).toBe("hel");
+    expect(buffer.getCursor()).toEqual([0, 3]);
+  });
+
+  /* ------------------------------------------------------------------ */
+  /*  HandleInput – `key.delete` should ALSO behave as backspace          */
+  /* ------------------------------------------------------------------ */
+
+  it("handleInput should treat key.delete as backspace", () => {
+    const buffer = new TextBuffer("");
+    const viewport = { width: 80, height: 25 };
+
+    for (const letter of "hello") {
+      buffer.handleInput(letter, {}, viewport);
+    }
+
+    // Three presses of the Delete (Mac backspace) key, sent with no input string
+    for (let press = 0; press < 3; press += 1) {
+      buffer.handleInput(undefined, { delete: true }, viewport);
+    }
+
+    expect(buffer.getText()).toBe("he");
+    expect(buffer.getCursor()).toEqual([0, 2]);
+  });
+
+  /* ------------------------------------------------------------------ */
+  /*  Cursor positioning semantics                                       */
+  /* ------------------------------------------------------------------ */
+
+  describe("cursor movement & backspace semantics", () => {
+    it("typing should leave cursor after the last inserted character", () => {
+      const viewport = { width: 80, height: 25 };
+      const buffer = new TextBuffer("");
+
+      buffer.handleInput("h", {}, viewport);
+      expect(buffer.getCursor()).toEqual([0, 1]); // right after "h"
+
+      for (const letter of "ello") {
+        buffer.handleInput(letter, {}, viewport);
+      }
+      expect(buffer.getCursor()).toEqual([0, 5]); // right after the final "o"
+    });
+
+    it("arrow‑left moves the caret to *between* characters (highlight next)", () => {
+      const viewport = { width: 80, height: 25 };
+      const buffer = new TextBuffer("");
+      for (const letter of "bar") {
+        buffer.handleInput(letter, {}, viewport);
+      } // typing leaves the caret at col 3
+
+      buffer.move("left"); // ba|r (col 2)
+      buffer.move("left"); // b|ar (col 1)
+
+      expect(buffer.getCursor()).toEqual([0, 1]);
+      // The character immediately to the right of the caret must be "a"
+      const nextChar = Array.from(buffer.getLines()[0]!)[buffer.getCursor()[1]];
+      expect(nextChar).toBe("a");
+
+      // Backspace removes what is on the *left* of the caret ("b"), leaving "ar"
+      buffer.backspace();
+      expect(buffer.getLines()[0]).toBe("ar");
+      expect(buffer.getCursor()).toEqual([0, 0]);
+    });
+  });
+});
--- a/codex-cli/tests/typeahead-scroll.test.tsx
+++ b/codex-cli/tests/typeahead-scroll.test.tsx
@@ -0,0 +1,69 @@
+/*
+ * Regression test – ensure that the TypeaheadOverlay passes the *complete*
+ * list of items down to <SelectInput>.  This guarantees that users can scroll
+ * through the full set instead of being limited to the hard‑coded "limit"
+ * slice that is only meant to control how many rows are visible at once.
+ */
+
+import * as React from "react";
+import { describe, it, expect, vi } from "vitest";
+
+// ---------------------------------------------------------------------------
+//  Mock <ink-select-input> so we can capture the props that TypeaheadOverlay
+//  forwards without rendering the real component (which would require a full
+//  Ink TTY environment).
+// ---------------------------------------------------------------------------
+
+let receivedItems: Array<{ label: string; value: string }> | null = null;
+
+vi.mock("ink-select-input", () => ({
+  // Stand-in for <SelectInput>: remember the `items` prop it was handed and
+  // render nothing – this test only inspects what TypeaheadOverlay forwards.
+  default: (props: any) => {
+    receivedItems = props.items;
+    return null;
+  },
+}));
+
+// Ink's <TextInput> toggles raw‑mode which calls .ref() / .unref() on stdin.
+// The test environment's mock streams don't implement those methods, so we
+// polyfill them to no‑ops on the prototype *before* the component tree mounts.
+import { EventEmitter } from "node:events";
+const emitterProto = EventEmitter.prototype as any;
+if (!emitterProto.ref) {
+  Object.assign(emitterProto, { ref: () => {}, unref: () => {} });
+}
+
+import type { TypeaheadItem } from "../src/components/typeahead-overlay.js";
+import TypeaheadOverlay from "../src/components/typeahead-overlay.js";
+
+import { renderTui } from "./ui-test-helpers.js";
+
+describe("TypeaheadOverlay – scrolling capability", () => {
+  it("passes the full item list to <SelectInput> so users can scroll beyond the visible limit", async () => {
+    const ITEMS: Array<TypeaheadItem> = Array.from({ length: 20 }, (_, i) => ({
+      label: `model-${i + 1}`,
+      value: `model-${i + 1}`,
+    }));
+
+    // Sanity – reset capture before rendering
+    receivedItems = null;
+
+    const { flush, cleanup } = renderTui(
+      React.createElement(TypeaheadOverlay, {
+        title: "Test",
+        initialItems: ITEMS,
+        limit: 5, // visible rows – should *not* limit the underlying list
+        onSelect: () => {},
+        onExit: () => {},
+      }),
+    );
+    try {
+      await flush(); // allow first render to complete
+      expect(receivedItems).not.toBeNull(); // the mocked <SelectInput> rendered
+      expect((receivedItems ?? []).length).toBe(ITEMS.length); // all 20, not 5
+    } finally {
+      cleanup(); // always tear down, even if flush or an assertion above throws
+    }
+  });
+});
--- a/codex-cli/tests/ui-test-helpers.tsx
+++ b/codex-cli/tests/ui-test-helpers.tsx
@@ -0,0 +1,28 @@
+import type React from "react";
+
+import { render } from "ink-testing-library";
+import stripAnsi from "strip-ansi";
+
+/**
+ * Mounts an Ink element with `ink-testing-library` for use in tests.
+ *
+ * The result spreads everything `render()` returns and adds two helpers:
+ * `lastFrameStripped()` (latest frame with ANSI escapes removed, "" when
+ * there is none) and `flush()` (resolves after a zero-delay timer).
+ */
+export function renderTui(ui: React.ReactElement): any {
+  const rendered = render(ui);
+
+  // Colour-agnostic view of the most recent frame.
+  function lastFrameStripped() {
+    return stripAnsi(rendered.lastFrame() || "");
+  }
+
+  // Yield to the timer queue once so pending Ink updates can settle before
+  // the caller looks at `lastFrame()` again.
+  async function flush(): Promise<void> {
+    await new Promise<void>((done) => setTimeout(done, 0));
+  }
+
+  return { ...rendered, lastFrameStripped, flush };
+}