Files
llmx/codex-cli/tests/agent-thinking-time.test.ts
Luci a9ecb2efce chore: upgrade prettier to v3 (#644)
## Description

This PR addresses the following improvements:

**Unify Prettier Version**: Currently, the Prettier versions used in
`/package.json` and `/codex-cli/package.json` are different. In this PR,
we're updating both to use Prettier v3.

- Prettier v3 introduces improved support for JavaScript and TypeScript.
(e.g. the formatting scenario shown in the image below. This is more
aligned with the TypeScript indentation standard).

<img width="1126" alt="image"
src="https://github.com/user-attachments/assets/6e237eb8-4553-4574-b336-ed9561c55370"
/>

**Add Prettier Auto-Formatting in lint-staged**: We've added a step to
automatically run prettier --write on JavaScript and TypeScript files as
part of the lint-staged process, before the ESLint checks.

- This will help ensure that all committed code is properly formatted
according to the project's Prettier configuration.
2025-04-25 07:21:50 -07:00

175 lines
5.3 KiB
TypeScript
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
// ---------------------------------------------------------------------------
// Regression test for the "thinking time" counter. Today the implementation
// keeps a *single* start time across many requests, which means that every
// subsequent command will show an ever-increasing number such as
// "thinking for 4409s", "thinking for 4424s", … even though the individual
// turn only took a couple of milliseconds. Each request should start its own
// independent timer.
//
// We mark the spec with `.fails()` so that the overall suite remains green
// until the underlying bug is fixed. When the implementation is corrected the
// expectations below will turn green; Vitest will then error and remind us to
// remove the `.fails` flag.
// ---------------------------------------------------------------------------
import { AgentLoop } from "../src/utils/agent/agent-loop.js";
import { describe, it, expect, vi } from "vitest";
// --- OpenAI mock -----------------------------------------------------------
/**
 * Minimal async-iterable stand-in for an OpenAI response stream. It emits a
 * single `response.completed` event after a configurable delay, letting the
 * suite simulate different "thinking" durations for successive requests
 * while running under Vitest's fake timers.
 */
class FakeStream {
  public controller = { abort: vi.fn() };

  // `delayMs`: how long (in milliseconds) to wait before completing.
  constructor(private readonly delayMs: number) {}

  async *[Symbol.asyncIterator]() {
    if (this.delayMs > 0) {
      // Fake timers fast-forward through this wait.
      await new Promise((resolve) => setTimeout(resolve, this.delayMs));
    }
    yield {
      type: "response.completed",
      response: {
        id: `resp-${Date.now()}`,
        status: "completed",
        output: [
          {
            type: "message",
            role: "assistant",
            id: "m1",
            content: [{ type: "text", text: "done" }],
          },
        ],
      },
    } as any;
  }
}
/**
 * Mocked OpenAI client: the first `responses.create` call returns a slow
 * (10 s) stream and every subsequent call a fast (0.5 s) one, so we can
 * verify that per-task timers reset while the global counter accumulates.
 */
vi.mock("openai", () => {
  let requestCount = 0;

  class FakeOpenAI {
    public responses = {
      create: async () => {
        requestCount += 1;
        const delayMs = requestCount === 1 ? 10_000 : 500; // 10s vs 0.5s
        return new FakeStream(delayMs);
      },
    };
  }

  class APIConnectionTimeoutError extends Error {}

  return { __esModule: true, default: FakeOpenAI, APIConnectionTimeoutError };
});
// Stub helpers that are referenced indirectly so the test never touches the
// real filesystem or network.
vi.mock("../src/approvals.js", () => ({
  __esModule: true,
  isSafeCommand: () => null,
}));
vi.mock("../src/format-command.js", () => ({
  __esModule: true,
  formatCommandForDisplay: (cmd: Array<string>) => cmd.join(" "),
}));
// Silence filesystem logging while the suite runs.
vi.mock("../src/utils/agent/log.js", () => ({
  __esModule: true,
  log: () => {},
  isLoggingEnabled: () => false,
}));
describe("thinking time counter", () => {
  // Fake timers apply to every test in this suite.
  vi.useFakeTimers();

  // Collects everything the agent emits via `onItem` across a scenario run.
  let items: Array<any>;

  /**
   * Drives two agent turns — roughly 10 s and then 0.5 s of simulated
   * thinking time — and populates `items` with the emitted messages.
   */
  async function runScenario() {
    items = [];
    const agent = new AgentLoop({
      config: {} as any,
      model: "any",
      instructions: "",
      approvalPolicy: { mode: "auto" } as any,
      additionalWritableRoots: [],
      onItem: (item) => items.push(item),
      onLoading: () => {},
      getCommandConfirmation: async () => ({ review: "yes" }) as any,
      onLastResponseId: () => {},
    });

    const userMsg = {
      type: "message",
      role: "user",
      content: [{ type: "input_text", text: "do it" }],
    } as any;

    // First request — simulated 10 s thinking time.
    agent.run([userMsg]);
    await vi.advanceTimersByTimeAsync(11_000); // 10s + flush margin

    // Second request — simulated 0.5 s thinking time.
    agent.run([userMsg]);
    await vi.advanceTimersByTimeAsync(1_000); // 0.5s + flush margin
  }

  // Marked `.fails` until the per-task timer bug is fixed.
  it.fails("reports correct per-task thinking time per command", async () => {
    await runScenario();

    const perTaskMsgs = items.filter(
      (item) =>
        item.role === "system" &&
        item.content?.[0]?.text?.startsWith("🤔 Thinking time:"),
    );
    expect(perTaskMsgs.length).toBe(2);

    const perTaskDurations = perTaskMsgs.map((msg) => {
      const match = msg.content[0].text.match(/Thinking time: (\d+) s/);
      return match ? parseInt(match[1]!, 10) : NaN;
    });

    // First run ~10 s, second run ~0.5 s.
    expect(perTaskDurations[0]).toBeGreaterThanOrEqual(9);
    expect(perTaskDurations[1]).toBeLessThan(3);
  });

  // Marked `.fails` until the global accumulation bug is fixed.
  it.fails("reports correct global thinking time accumulation", async () => {
    await runScenario();

    const globalMsgs = items.filter(
      (item) =>
        item.role === "system" &&
        item.content?.[0]?.text?.startsWith("⏱ Total thinking time:"),
    );
    expect(globalMsgs.length).toBe(2);

    const globalDurations = globalMsgs.map((msg) => {
      const match = msg.content[0].text.match(/Total thinking time: (\d+) s/);
      return match ? parseInt(match[1]!, 10) : NaN;
    });

    // The total after the second run must exceed the total after the first.
    expect(globalDurations[1]! as number).toBeGreaterThan(globalDurations[0]!);
  });
});