llmx/codex-cli/tests/agent-thinking-time.test.ts
Michael Bolin ae5b1b5cb5 add support for -w,--writable-root to add more writable roots for sandbox (#263)
This adds support for a new flag, `-w,--writable-root`, that can be
specified multiple times to _amend_ the list of folders that should be
configured as "writable roots" by the sandbox used in `full-auto` mode.
Values that are passed as relative paths will be resolved to absolute
paths.
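
As a rough sketch (assuming commander-style option collection; the helper
name here is hypothetical), repeated `-w` values could be accumulated and
resolved like this, ultimately feeding the `additionalWritableRoots` option
that `AgentLoop` accepts:

```ts
import path from "path";

// Hypothetical collector for a repeatable -w,--writable-root flag.
// Relative paths are resolved against the current working directory.
function collectWritableRoot(
  value: string,
  previous: ReadonlyArray<string>,
): Array<string> {
  return [...previous, path.resolve(process.cwd(), value)];
}

// e.g. program.option("-w, --writable-root <path>", "extra writable root", collectWritableRoot, [])
```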

Incidentally, this required updating a number of the `agent*.test.ts`
files: it feels like some of the setup logic across those tests could be
consolidated.

In my testing, it seems that this might be slightly out of distribution
for the model, as I had to explicitly tell it to run `apply_patch` and
that it had the permissions to write those files (initially, it just
showed me a diff and told me to apply it myself). Nevertheless, I think
this is a good starting point.
2025-04-17 15:39:26 -07:00

// ---------------------------------------------------------------------------
// Regression test for the "thinking time" counter. Today the implementation
// keeps a *single* start time across many requests, which means that every
// subsequent command will show an ever-increasing number such as
// "thinking for 4409s", "thinking for 4424s", … even though the individual
// turn only took a couple of milliseconds. Each request should start its own
// independent timer.
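//
// Illustrative sketch of the buggy pattern (not the actual code in
// agent-loop.ts):
//
//   const start = Date.now();      // captured once for the whole loop
//   // ...every later turn reports Date.now() - start, so the number grows.
//
// Each call to `run()` should instead capture a fresh `start`.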
//
// We mark the spec with `.fails()` so that the overall suite remains green
// until the underlying bug is fixed. When the implementation is corrected the
// expectations below will turn green; Vitest will then error and remind us to
// remove the `.fails` flag.
// ---------------------------------------------------------------------------
import { AgentLoop } from "../src/utils/agent/agent-loop.js";
import { describe, it, expect, vi } from "vitest";
// --- OpenAI mock -----------------------------------------------------------
/**
 * Fake stream that yields a single `response.completed` after a configurable
 * delay. This allows us to simulate different thinking times for successive
 * requests while using Vitest's fake timers.
 */
class FakeStream {
  public controller = { abort: vi.fn() };

  private delay: number;

  constructor(delay: number) {
    this.delay = delay; // milliseconds
  }

  async *[Symbol.asyncIterator]() {
    if (this.delay > 0) {
      // Wait the configured delay; fake timers will fast-forward.
      await new Promise((r) => setTimeout(r, this.delay));
    }

    yield {
      type: "response.completed",
      response: {
        id: `resp-${Date.now()}`,
        status: "completed",
        output: [
          {
            type: "message",
            role: "assistant",
            id: "m1",
            content: [{ type: "text", text: "done" }],
          },
        ],
      },
    } as any;
  }
}
/**
 * Fake OpenAI client that returns a slower stream for the *first* call and a
 * faster one for the second so we can verify that per-task timers reset while
 * the global counter accumulates.
 */
vi.mock("openai", () => {
  let callCount = 0;

  class FakeOpenAI {
    public responses = {
      create: async () => {
        callCount += 1;
        return new FakeStream(callCount === 1 ? 10_000 : 500); // 10s vs 0.5s
      },
    };
  }

  class APIConnectionTimeoutError extends Error {}

  return { __esModule: true, default: FakeOpenAI, APIConnectionTimeoutError };
});
// Stub helpers referenced indirectly so we do not pull in real FS/network.
vi.mock("../src/approvals.js", () => ({
  __esModule: true,
  isSafeCommand: () => null,
}));

vi.mock("../src/format-command.js", () => ({
  __esModule: true,
  formatCommandForDisplay: (c: Array<string>) => c.join(" "),
}));

// Suppress filesystem logging in tests.
vi.mock("../src/utils/agent/log.js", () => ({
  __esModule: true,
  log: () => {},
  isLoggingEnabled: () => false,
}));
describe("thinking time counter", () => {
// Use fake timers for *all* tests in this suite
vi.useFakeTimers();
// Reuse this array to collect all onItem callbacks
let items: Array<any>;
// Helper that runs two agent turns (10s + 0.5s) and populates `items`
async function runScenario() {
items = [];
const agent = new AgentLoop({
config: {} as any,
model: "any",
instructions: "",
approvalPolicy: { mode: "auto" } as any,
additionalWritableRoots: [],
onItem: (i) => items.push(i),
onLoading: () => {},
getCommandConfirmation: async () => ({ review: "yes" } as any),
onLastResponseId: () => {},
});
const userMsg = {
type: "message",
role: "user",
content: [{ type: "input_text", text: "do it" }],
} as any;
// 1⃣ First request simulated 10s thinking time
agent.run([userMsg]);
await vi.advanceTimersByTimeAsync(11_000); // 10s + flush margin
// 2⃣ Second request simulated 0.5s thinking time
agent.run([userMsg]);
await vi.advanceTimersByTimeAsync(1_000); // 0.5s + flush margin
}
// TODO: this is disabled
it.fails("reports correct per-task thinking time per command", async () => {
await runScenario();
const perTaskMsgs = items.filter(
(i) =>
i.role === "system" &&
i.content?.[0]?.text?.startsWith("🤔 Thinking time:"),
);
expect(perTaskMsgs.length).toBe(2);
const perTaskDurations = perTaskMsgs.map((m) => {
const match = m.content[0].text.match(/Thinking time: (\d+) s/);
return match ? parseInt(match[1]!, 10) : NaN;
});
// First run ~10s, second run ~0.5s
expect(perTaskDurations[0]).toBeGreaterThanOrEqual(9);
expect(perTaskDurations[1]).toBeLessThan(3);
});
// TODO: this is disabled
it.fails("reports correct global thinking time accumulation", async () => {
await runScenario();
const globalMsgs = items.filter(
(i) =>
i.role === "system" &&
i.content?.[0]?.text?.startsWith("⏱ Total thinking time:"),
);
expect(globalMsgs.length).toBe(2);
const globalDurations = globalMsgs.map((m) => {
const match = m.content[0].text.match(/Total thinking time: (\d+) s/);
return match ? parseInt(match[1]!, 10) : NaN;
});
// Total after second run should exceed total after first
expect(globalDurations[1]! as number).toBeGreaterThan(globalDurations[0]!);
});
});