llmx/codex-cli/src/utils/approximate-tokens-used.ts

import type { ResponseItem } from "openai/resources/responses/responses.mjs";

/**
 * Roughly estimate the number of language-model tokens represented by a list
 * of OpenAI `ResponseItem`s.
 *
 * A full tokenizer would be more accurate, but would add a heavyweight
 * dependency for only marginal benefit. Empirically, assuming ~4 characters
 * per token offers a good enough signal for displaying context-window usage
 * to the user.
 *
 * The algorithm counts characters from the different content types we may
 * encounter and then converts that char count to tokens by dividing by four
 * and rounding up.
 */
export function approximateTokensUsed(items: Array<ResponseItem>): number {
  let charCount = 0;
  for (const item of items) {
    switch (item.type) {
      case "message": {
        if (item.role !== "user" && item.role !== "assistant") {
          continue;
        }
        for (const c of item.content) {
          if (c.type === "input_text" || c.type === "output_text") {
            charCount += c.text.length;
          } else if (c.type === "refusal") {
            charCount += c.refusal.length;
          } else if (c.type === "input_file") {
            charCount += c.filename?.length ?? 0;
          }
          // images and other content types are ignored (0 chars)
        }
        break;
      }
      case "function_call": {
        charCount += (item.name?.length || 0) + (item.arguments?.length || 0);
        break;
      }
      case "function_call_output": {
        charCount += item.output.length;
        break;
      }
      default:
        break;
    }
  }
  return Math.ceil(charCount / 4);
}
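
// Example usage (an illustrative sketch, not part of this module): a caller
// such as a status line could turn the estimate into a rough percentage of a
// model's context window. `MODEL_CONTEXT_WINDOW` and `transcriptItems` below
// are hypothetical names assumed for the example, not exports of this file.
//
//   const MODEL_CONTEXT_WINDOW = 128_000; // tokens, assumed for the example
//   const usedTokens = approximateTokensUsed(transcriptItems);
//   const percentLeft = Math.max(
//     0,
//     Math.round(
//       ((MODEL_CONTEXT_WINDOW - usedTokens) / MODEL_CONTEXT_WINDOW) * 100,
//     ),
//   );
//   console.log(`${percentLeft}% of the context window remaining`);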