llmx/codex-cli/tests/agent-thinking-time.test.ts
// ---------------------------------------------------------------------------
// Regression test for the "thinking time" counter. Today the implementation
// keeps a *single* start time across many requests, which means that every
// subsequent command shows an ever-increasing number such as
// "thinking for 4409s", "thinking for 4424s", … even though the individual
// turn only took a couple of milliseconds. Each request should start its own
// independent timer.
//
// We mark the specs with `.fails()` so that the overall suite remains green
// until the underlying bug is fixed. When the implementation is corrected the
// expectations below will turn green; Vitest will then error and remind us to
// remove the `.fails` flag.
// ---------------------------------------------------------------------------
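
// For context, the per-turn behaviour these specs expect looks roughly like
// the sketch below (illustrative only; `thinkingStart` is a hypothetical
// field name, not necessarily what agent-loop.ts uses):
//
//   run(input) {
//     const thinkingStart = Date.now(); // reset on *every* turn, not once
//     // ... stream the response ...
//     const secs = Math.round((Date.now() - thinkingStart) / 1000);
//     emit(`🤔 Thinking time: ${secs} s`);
//   }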
import { AgentLoop } from "../src/utils/agent/agent-loop.js";
import { describe, it, expect, vi } from "vitest";
// --- OpenAI mock -----------------------------------------------------------
/**
 * Fake stream that yields a single `response.completed` event after a
 * configurable delay. This allows us to simulate different thinking times for
 * successive requests while using Vitest's fake timers.
 */
class FakeStream {
  public controller = { abort: vi.fn() };

  private delay: number;

  constructor(delay: number) {
    this.delay = delay; // milliseconds
  }

  async *[Symbol.asyncIterator]() {
    if (this.delay > 0) {
      // Wait the configured delay; fake timers will fast-forward.
      await new Promise((r) => setTimeout(r, this.delay));
    }

    yield {
      type: "response.completed",
      response: {
        id: `resp-${Date.now()}`,
        status: "completed",
        output: [
          {
            type: "message",
            role: "assistant",
            id: "m1",
            content: [{ type: "text", text: "done" }],
          },
        ],
      },
    } as any;
  }
}
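
// Illustration (not executed): how a consumer sees FakeStream under fake
// timers. Variable names here are only for the example.
//
//   const stream = new FakeStream(10_000);
//   const consumed = (async () => {
//     for await (const event of stream) {
//       // `event` is the single `response.completed` payload above.
//     }
//   })();
//   await vi.advanceTimersByTimeAsync(10_000); // fast-forward the fake clock
//   await consumed;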

/**
 * Fake OpenAI client that returns a slower stream for the *first* call and a
 * faster one for the second so we can verify that per-task timers reset while
 * the global counter accumulates.
 */
vi.mock("openai", () => {
  let callCount = 0;

  class FakeOpenAI {
    public responses = {
      create: async () => {
        callCount += 1;
        return new FakeStream(callCount === 1 ? 10_000 : 500); // 10s vs 0.5s
      },
    };
  }

  class APIConnectionTimeoutError extends Error {}

  return { __esModule: true, default: FakeOpenAI, APIConnectionTimeoutError };
});
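
// Note: Vitest hoists `vi.mock` calls above the imports, so this factory is
// registered before `AgentLoop` (and, transitively, "openai") is loaded.
// Referencing `FakeStream` here is still safe because `create` only
// constructs it at call time, after this module has finished evaluating.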

// Stub helpers referenced indirectly so we do not pull in real FS/network.
vi.mock("@lib/approvals.js", () => ({
  __esModule: true,
  isSafeCommand: () => null,
}));

vi.mock("@lib/format-command.js", () => ({
  __esModule: true,
  formatCommandForDisplay: (c: Array<string>) => c.join(" "),
}));

// Suppress filesystem logging in tests.
vi.mock("../src/utils/agent/log.js", () => ({
  __esModule: true,
  log: () => {},
  isLoggingEnabled: () => false,
}));

describe("thinking time counter", () => {
  // Use fake timers for *all* tests in this suite.
  vi.useFakeTimers();

  // Reused across tests to collect every item passed to `onItem`.
  let items: Array<any>;

  // Helper that runs two agent turns (10s + 0.5s) and populates `items`.
  async function runScenario() {
    items = [];

    const agent = new AgentLoop({
      config: {} as any,
      model: "any",
      instructions: "",
      approvalPolicy: { mode: "auto" } as any,
      onItem: (i) => items.push(i),
      onLoading: () => {},
      getCommandConfirmation: async () => ({ review: "yes" } as any),
      onLastResponseId: () => {},
    });

    const userMsg = {
      type: "message",
      role: "user",
      content: [{ type: "input_text", text: "do it" }],
    } as any;

    // 1) First request: simulated 10s thinking time.
    agent.run([userMsg]);
    await vi.advanceTimersByTimeAsync(11_000); // 10s + flush margin

    // 2) Second request: simulated 0.5s thinking time.
    agent.run([userMsg]);
    await vi.advanceTimersByTimeAsync(1_000); // 0.5s + flush margin
  }
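
  // `vi.advanceTimersByTimeAsync` advances the fake clock *and* flushes
  // pending microtasks, which is what lets the un-awaited `agent.run(...)`
  // promises above progress past their `setTimeout` waits.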

  // TODO: remove `.fails` once the per-turn timer bug described in the
  // header comment is fixed.
  it.fails("reports correct per-task thinking time per command", async () => {
    await runScenario();

    const perTaskMsgs = items.filter(
      (i) =>
        i.role === "system" &&
        i.content?.[0]?.text?.startsWith("🤔 Thinking time:"),
    );

    expect(perTaskMsgs.length).toBe(2);

    const perTaskDurations = perTaskMsgs.map((m) => {
      const match = m.content[0].text.match(/Thinking time: (\d+) s/);
      return match ? parseInt(match[1]!, 10) : NaN;
    });

    // First run ~10s, second run ~0.5s.
    expect(perTaskDurations[0]).toBeGreaterThanOrEqual(9);
    expect(perTaskDurations[1]).toBeLessThan(3);
  });
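
  // For reference, the filters above assume per-task messages shaped like
  //   { role: "system", content: [{ text: "🤔 Thinking time: 10 s", ... }] }
  // (only `.text` is inspected, so the content part's `type` is left open).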

  // TODO: likewise disabled via `.fails` until the bug is fixed.
  it.fails("reports correct global thinking time accumulation", async () => {
    await runScenario();

    const globalMsgs = items.filter(
      (i) =>
        i.role === "system" &&
        i.content?.[0]?.text?.startsWith("⏱ Total thinking time:"),
    );

    expect(globalMsgs.length).toBe(2);

    const globalDurations = globalMsgs.map((m) => {
      const match = m.content[0].text.match(/Total thinking time: (\d+) s/);
      return match ? parseInt(match[1]!, 10) : NaN;
    });

    // The total after the second run should exceed the total after the first.
    expect(globalDurations[1]!).toBeGreaterThan(globalDurations[0]!);
  });
});