add check to ensure ToC in README.md matches headings in the file (#541)

This introduces a Python script (written by Codex!) to verify that the table of contents in the root `README.md` matches the headings. Like `scripts/asciicheck.py` in https://github.com/openai/codex/pull/513, it reports differences by default (and exits non-zero if there are any) and also has a `--fix` option to synchronize the ToC with the headings. This will be enforced by CI and the changes to `README.md` in this PR were generated by the script, so you can see that our ToC was missing some entries prior to this PR.
2025-04-22 09:38:12 -07:00
parent dd330646d2
commit 9b06fb48a7
3 changed files with 128 additions and 1 deletions
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -70,3 +70,5 @@ jobs:

      - name: Ensure README.md contains only ASCII and certain Unicode code points
        run: ./scripts/asciicheck.py README.md
+      - name: Check README ToC
+        run: python3 scripts/readme_toc.py README.md
--- a/README.md
+++ b/README.md
@@ -10,6 +10,8 @@
 <details>
 <summary><strong>Table&nbsp;of&nbsp;Contents</strong></summary>

+<!-- Begin ToC -->
+
 - [Experimental Technology Disclaimer](#experimental-technology-disclaimer)
 - [Quickstart](#quickstart)
 - [Why Codex?](#why-codex)
@@ -19,13 +21,16 @@
 - [CLI Reference](#cli-reference)
 - [Memory & Project Docs](#memory--project-docs)
 - [Non-interactive / CI mode](#non-interactive--ci-mode)
+- [Tracing / Verbose Logging](#tracing--verbose-logging)
 - [Recipes](#recipes)
 - [Installation](#installation)
 - [Configuration](#configuration)
 - [FAQ](#faq)
+- [Zero Data Retention (ZDR) Organization Limitation](#zero-data-retention-zdr-organization-limitation)
 - [Funding Opportunity](#funding-opportunity)
 - [Contributing](#contributing)
  - [Development workflow](#development-workflow)
+  - [Git Hooks with Husky](#git-hooks-with-husky)
    - [Nix Flake Development](#nix-flake-development)
  - [Writing high-impact code changes](#writing-high-impact-code-changes)
  - [Opening a pull request](#opening-a-pull-request)
@@ -37,7 +42,8 @@
  - [Releasing `codex`](#releasing-codex)
 - [Security & Responsible AI](#security--responsible-ai)
 - [License](#license)
- [Zero Data Retention (ZDR) Organization Limitation](#zero-data-retention-zdr-organization-limitation)
+
+<!-- End ToC -->

 </details>

--- a/scripts/readme_toc.py
+++ b/scripts/readme_toc.py
@@ -0,0 +1,119 @@
+#!/usr/bin/env python3
+
+"""
+Utility script to verify (and optionally fix) the Table of Contents in a
+Markdown file. By default, it checks that the ToC between `<!-- Begin ToC -->`
+and `<!-- End ToC -->` matches the headings in the file. With --fix, it
+rewrites the file to update the ToC.
+"""
+
+import argparse
+import sys
+import re
+import difflib
+from pathlib import Path
+from typing import List
+
+# Markers for the Table of Contents section. The ToC list items live between
+# these two HTML comments; a line counts as a marker when it equals one of
+# these strings after surrounding whitespace is stripped.
+BEGIN_TOC: str = "<!-- Begin ToC -->"
+END_TOC: str = "<!-- End ToC -->"
+
+
+def main() -> int:
+    """
+    Parse the command line and run the ToC check (or fix) on the given file.
+
+    Returns the exit status produced by `check_or_fix`.
+    """
+    cli = argparse.ArgumentParser(
+        description="Check and optionally fix the README.md Table of Contents."
+    )
+    cli.add_argument(
+        "file",
+        nargs="?",
+        default="README.md",
+        help="Markdown file to process",
+    )
+    cli.add_argument(
+        "--fix",
+        action="store_true",
+        help="Rewrite file with updated ToC",
+    )
+    options = cli.parse_args()
+    return check_or_fix(Path(options.file), options.fix)
+
+
+def generate_toc_lines(content: str) -> List[str]:
+    """
+    Generate markdown list lines for headings (## to ######) in content.
+
+    Headings inside fenced code blocks (backtick or tilde fences) are
+    ignored. Each entry is indented two spaces per level below `##` and links
+    to an anchor derived from the heading text: lowercased, with non-breaking
+    spaces and Unicode dashes normalized, punctuation other than `-` and `_`
+    dropped, and spaces replaced by hyphens. Repeated anchors receive `-1`,
+    `-2`, ... suffixes, mirroring how GitHub disambiguates duplicate
+    headings.
+
+    Returns the list of ToC lines in document order (empty if there are no
+    matching headings).
+    """
+    # Level-1 headings are matched too: they never appear in the ToC, but
+    # they still claim an anchor, which affects duplicate numbering.
+    heading_re = re.compile(r"^(#{1,6})\s+(.*)$")
+    fence_re = re.compile(r"^(`{3,}|~{3,})")
+    # Underscores are kept because GitHub keeps them in anchors.
+    drop_re = re.compile(r"[^0-9a-z_\s-]")
+    normalize = str.maketrans(
+        {"\u00a0": " ", "\u2011": "-", "\u2013": "-", "\u2014": "-"}
+    )
+
+    headings = []
+    open_fence = ""  # fence character of the open code block, "" if none
+    for line in content.splitlines():
+        fence_match = fence_re.match(line.strip())
+        if fence_match:
+            marker = fence_match.group(1)[0]
+            if not open_fence:
+                open_fence = marker
+            elif marker == open_fence:
+                # Only the same fence character closes the block.
+                open_fence = ""
+            continue
+        if open_fence:
+            continue
+        m = heading_re.match(line)
+        if m:
+            headings.append((len(m.group(1)), m.group(2).strip()))
+
+    toc = []
+    seen = {}  # anchor -> number of extra occurrences handed out so far
+    for level, text in headings:
+        base = drop_re.sub("", text.lower().translate(normalize))
+        base = base.strip().replace(" ", "-")
+        slug = base
+        while slug in seen:
+            seen[base] += 1
+            slug = f"{base}-{seen[base]}"
+        seen[slug] = 0
+        if level < 2:
+            continue
+        indent = "  " * (level - 2)
+        toc.append(f"{indent}- [{text}](#{slug})")
+    return toc
+
+
+def check_or_fix(readme_path: Path, fix: bool) -> int:
+    """
+    Compare the ToC between the markers with the one generated from headings.
+
+    readme_path: Markdown file to inspect (read and written as UTF-8).
+    fix: when true, rewrite the file so the region between the markers holds
+        the generated ToC; when false, only report differences.
+
+    Returns 0 when the ToC is already up to date or was rewritten, and 1 when
+    the file is missing, the markers cannot be located, or the ToC is out of
+    date and `fix` is false.
+    """
+    if not readme_path.is_file():
+        print(f"Error: file not found: {readme_path}", file=sys.stderr)
+        return 1
+    content = readme_path.read_text(encoding="utf-8")
+    lines = content.splitlines()
+    # Locate ToC markers. The end marker is only searched for after the begin
+    # marker: an end marker that precedes it would make the rewrite below
+    # splice overlapping slices and duplicate file content.
+    try:
+        begin_idx = next(
+            i for i, line in enumerate(lines) if line.strip() == BEGIN_TOC
+        )
+        end_idx = next(
+            i
+            for i, line in enumerate(lines[begin_idx + 1 :], begin_idx + 1)
+            if line.strip() == END_TOC
+        )
+    except StopIteration:
+        print(
+            f"Error: Could not locate '{BEGIN_TOC}' or '{END_TOC}' in {readme_path}.",
+            file=sys.stderr,
+        )
+        return 1
+    # Extract current ToC list items (blank lines and other text are ignored).
+    current_block = lines[begin_idx + 1 : end_idx]
+    current = [line for line in current_block if line.lstrip().startswith("- [")]
+    # Generate expected ToC from the headings of the whole file.
+    expected = generate_toc_lines(content)
+    if current == expected:
+        return 0
+    if not fix:
+        print(
+            "ERROR: README ToC is out of date. Diff between existing and generated ToC:"
+        )
+        # Show full unified diff of current vs expected
+        diff = difflib.unified_diff(
+            current,
+            expected,
+            fromfile="existing ToC",
+            tofile="generated ToC",
+            lineterm="",
+        )
+        for diff_line in diff:
+            print(diff_line)
+        return 1
+    # Rebuild the file: everything up to and including the begin marker, the
+    # generated ToC padded by blank lines, then the end marker onwards.
+    prefix = lines[: begin_idx + 1]
+    suffix = lines[end_idx:]
+    new_lines = prefix + [""] + expected + [""] + suffix
+    readme_path.write_text("\n".join(new_lines) + "\n", encoding="utf-8")
+    print(f"Updated ToC in {readme_path}.")
+    return 0
+
+
+if __name__ == "__main__":
+    # Propagate main()'s return value as the process exit status.
+    raise SystemExit(main())