llmx/codex-cli/tests/agent-thinking-time.test.ts
// ---------------------------------------------------------------------------
// Regression test for the "thinking time" counter. Today the implementation
// keeps a *single* start time across many requests, which means that every
// subsequent command shows an ever-increasing number such as
// "thinking for 4409s", "thinking for 4424s", … even though the individual
// turn only took a couple of milliseconds. Each request should start its own
// independent timer.
//
// We mark the specs with `.fails()` so that the overall suite remains green
// until the underlying bug is fixed. When the implementation is corrected the
// expectations below will turn green; Vitest will then error and remind us to
// remove the `.fails` flag.
// ---------------------------------------------------------------------------
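
// For context, the per-turn behaviour these specs expect looks roughly like
// the sketch below (illustrative only; `thinkingStart` is a hypothetical
// field name, not necessarily what agent-loop.ts uses):
//
//   run(input) {
//     const thinkingStart = Date.now(); // reset on *every* turn, not once
//     // ... stream the response ...
//     const secs = Math.round((Date.now() - thinkingStart) / 1000);
//     emit(`🤔 Thinking time: ${secs} s`);
//   }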
import { AgentLoop } from "../src/utils/agent/agent-loop.js";
import { describe, it, expect, vi } from "vitest";
// --- OpenAI mock -----------------------------------------------------------
/**
 * Fake stream that yields a single `response.completed` event after a
 * configurable delay. This allows us to simulate different thinking times for
 * successive requests while using Vitest's fake timers.
 */
class FakeStream {
  public controller = { abort: vi.fn() };

  private delay: number;

  constructor(delay: number) {
    this.delay = delay; // milliseconds
  }

  async *[Symbol.asyncIterator]() {
    if (this.delay > 0) {
      // Wait the configured delay; fake timers will fast-forward.
      await new Promise((r) => setTimeout(r, this.delay));
    }

    yield {
      type: "response.completed",
      response: {
        id: `resp-${Date.now()}`,
        status: "completed",
        output: [
          {
            type: "message",
            role: "assistant",
            id: "m1",
            content: [{ type: "text", text: "done" }],
          },
        ],
      },
    } as any;
  }
}
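
// Illustration (not executed): how a consumer sees FakeStream under fake
// timers. Variable names here are only for the example.
//
//   const stream = new FakeStream(10_000);
//   const consumed = (async () => {
//     for await (const event of stream) {
//       // `event` is the single `response.completed` payload above.
//     }
//   })();
//   await vi.advanceTimersByTimeAsync(10_000); // fast-forward the fake clock
//   await consumed;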

/**
 * Fake OpenAI client that returns a slower stream for the *first* call and a
 * faster one for the second so we can verify that per-task timers reset while
 * the global counter accumulates.
 */
vi.mock("openai", () => {
  let callCount = 0;

  class FakeOpenAI {
    public responses = {
      create: async () => {
        callCount += 1;
        return new FakeStream(callCount === 1 ? 10_000 : 500); // 10s vs 0.5s
      },
    };
  }

  class APIConnectionTimeoutError extends Error {}

  return { __esModule: true, default: FakeOpenAI, APIConnectionTimeoutError };
});
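
// Note: Vitest hoists `vi.mock` calls above the imports, so this factory is
// registered before `AgentLoop` (and, transitively, "openai") is loaded.
// Referencing `FakeStream` here is still safe because `create` only
// constructs it at call time, after this module has finished evaluating.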

// Stub helpers referenced indirectly so we do not pull in real FS/network.
vi.mock("@lib/approvals.js", () => ({
  __esModule: true,
  isSafeCommand: () => null,
}));

vi.mock("@lib/format-command.js", () => ({
  __esModule: true,
  formatCommandForDisplay: (c: Array<string>) => c.join(" "),
}));

// Suppress filesystem logging in tests.
vi.mock("../src/utils/agent/log.js", () => ({
  __esModule: true,
  log: () => {},
  isLoggingEnabled: () => false,
}));

describe("thinking time counter", () => {
  // Use fake timers for *all* tests in this suite.
  vi.useFakeTimers();

  // Reused across tests to collect every item passed to `onItem`.
  let items: Array<any>;

  // Helper that runs two agent turns (10s + 0.5s) and populates `items`.
  async function runScenario() {
    items = [];

    const agent = new AgentLoop({
      config: {} as any,
      model: "any",
      instructions: "",
      approvalPolicy: { mode: "auto" } as any,
      onItem: (i) => items.push(i),
      onLoading: () => {},
      getCommandConfirmation: async () => ({ review: "yes" } as any),
      onLastResponseId: () => {},
    });

    const userMsg = {
      type: "message",
      role: "user",
      content: [{ type: "input_text", text: "do it" }],
    } as any;

    // 1) First request: simulated 10s thinking time.
    agent.run([userMsg]);
    await vi.advanceTimersByTimeAsync(11_000); // 10s + flush margin

    // 2) Second request: simulated 0.5s thinking time.
    agent.run([userMsg]);
    await vi.advanceTimersByTimeAsync(1_000); // 0.5s + flush margin
  }
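
  // `vi.advanceTimersByTimeAsync` advances the fake clock *and* flushes
  // pending microtasks, which is what lets the un-awaited `agent.run(...)`
  // promises above progress past their `setTimeout` waits.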

  // TODO: remove `.fails` once the per-turn timer bug described in the
  // header comment is fixed.
  it.fails("reports correct per-task thinking time per command", async () => {
    await runScenario();

    const perTaskMsgs = items.filter(
      (i) =>
        i.role === "system" &&
        i.content?.[0]?.text?.startsWith("🤔 Thinking time:"),
    );

    expect(perTaskMsgs.length).toBe(2);

    const perTaskDurations = perTaskMsgs.map((m) => {
      const match = m.content[0].text.match(/Thinking time: (\d+) s/);
      return match ? parseInt(match[1]!, 10) : NaN;
    });

    // First run ~10s, second run ~0.5s.
    expect(perTaskDurations[0]).toBeGreaterThanOrEqual(9);
    expect(perTaskDurations[1]).toBeLessThan(3);
  });
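
  // For reference, the filters above assume per-task messages shaped like
  //   { role: "system", content: [{ text: "🤔 Thinking time: 10 s", ... }] }
  // (only `.text` is inspected, so the content part's `type` is left open).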

  // TODO: likewise disabled via `.fails` until the bug is fixed.
  it.fails("reports correct global thinking time accumulation", async () => {
    await runScenario();

    const globalMsgs = items.filter(
      (i) =>
        i.role === "system" &&
        i.content?.[0]?.text?.startsWith("⏱ Total thinking time:"),
    );

    expect(globalMsgs.length).toBe(2);

    const globalDurations = globalMsgs.map((m) => {
      const match = m.content[0].text.match(/Total thinking time: (\d+) s/);
      return match ? parseInt(match[1]!, 10) : NaN;
    });

    // The total after the second run should exceed the total after the first.
    expect(globalDurations[1]!).toBeGreaterThan(globalDurations[0]!);
  });
});