Enforce ASCII in README.md (#513)

This all started because I was going to write a script to autogenerate the Table of Contents in the root `README.md`, but I noticed that the `href` for the "Why Codex?" heading was `#whycodex` instead of `#why-codex`. This piqued my curiosity and it turned out that the space in "Why Codex?" was not an ASCII space but **U+00A0**, a non-breaking space, and so GitHub ignored it when generating the `href` for the heading. This also meant that when I did a text search for `why codex` in the `README.md` in VS Code, the "Why Codex" heading did not match because of the presence of **U+00A0**. In short, these types of Unicode characters seem like a hazard, so I decided to introduce this script to flag them, and if desired, to replace them with "good enough" ASCII equivalents. For now, this only applies to the root `README.md` file, but I think we should ultimately apply this across our source code, as well, as we seem to have quite a lot of non-ASCII Unicode and it's probably going to cause `rg` to miss things. Contributions of this PR: * `./scripts/asciicheck.py`, which takes a list of filepaths and returns non-zero if any of them contain non-ASCII characters. (Currently, there is one exception for ✨ aka **U+2728**, though I would like to default to an empty allowlist and then require all exceptions to be specified as flags.) * A `--fix` option that will attempt to rewrite files with violations using a equivalents from a hardcoded substitution list. * An update to `ci.yml` to verify `./scripts/asciicheck.py README.md` succeeds. * A cleanup of `README.md` using the `--fix` option as well as some editorial decisions on my part. * I tried to update the `href`s in the Table of Contents to reflect the changes in the heading titles. (TIL that if a heading has a character like `&` surrounded by spaces, it becomes `--` in the generated `href`.)
2025-04-22 07:07:40 -07:00
parent 2cb8355968
commit c00ae2dcc1
3 changed files with 211 additions and 82 deletions
--- a/scripts/asciicheck.py
+++ b/scripts/asciicheck.py
@@ -0,0 +1,127 @@
+#!/usr/bin/env python3
+
+import argparse
+import sys
+from pathlib import Path
+
+"""
+Utility script that takes a list of files and returns non-zero if any of them
+contain non-ASCII characters other than those in the allowed list.
+
+If --fix is used, it will attempt to replace non-ASCII characters with ASCII
+equivalents.
+
+The motivation behind this script is that characters like U+00A0 (non-breaking
+space) can cause regexes not to match and can result in surprising anchor
+values for headings when GitHub renders Markdown as HTML.
+"""
+
+
+"""
+When --fix is used, perform the following substitutions.
+"""
+substitutions: dict[int, str] = {
+    0x00A0: " ",  # non-breaking space
+    0x2011: "-",  # non-breaking hyphen
+    0x2013: "-",  # en dash
+    0x2014: "-",  # em dash
+    0x2018: "'",  # left single quote
+    0x2019: "'",  # right single quote
+    0x201C: '"',  # left double quote
+    0x201D: '"',  # right double quote
+    0x2026: "...",  # ellipsis
+    0x202F: " ",  # narrow non-breaking space
+}
+
+"""
+Unicode codepoints that are allowed in addition to ASCII.
+Be conservative with this list.
+
+Note that it is always an option to use the hex HTML representation
+instead of the character itself so the source code is ASCII-only.
+For example, U+2728 (sparkles) can be written as `&#x2728;`.
+"""
+allowed_unicode_codepoints = {
+    0x2728,  # sparkles
+}
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(
+        description="Check for non-ASCII characters in files."
+    )
+    parser.add_argument(
+        "--fix",
+        action="store_true",
+        help="Rewrite files, replacing non-ASCII characters with ASCII equivalents, where possible.",
+    )
+    parser.add_argument(
+        "files",
+        nargs="+",
+        help="Files to check for non-ASCII characters.",
+    )
+    args = parser.parse_args()
+
+    has_errors = False
+    for filename in args.files:
+        path = Path(filename)
+        has_errors |= lint_utf8_ascii(path, fix=args.fix)
+    return 1 if has_errors else 0
+
+
+def lint_utf8_ascii(filename: Path, fix: bool) -> bool:
+    """Returns True if an error was printed."""
+    try:
+        with open(filename, "rb") as f:
+            raw = f.read()
+        text = raw.decode("utf-8")
+    except UnicodeDecodeError as e:
+        print("UTF-8 decoding error:")
+        print(f"  byte offset: {e.start}")
+        print(f"  reason: {e.reason}")
+        # Attempt to find line/column
+        partial = raw[: e.start]
+        line = partial.count(b"\n") + 1
+        col = e.start - (partial.rfind(b"\n") if b"\n" in partial else -1)
+        print(f"  location: line {line}, column {col}")
+        return True
+
+    errors = []
+    for lineno, line in enumerate(text.splitlines(keepends=True), 1):
+        for colno, char in enumerate(line, 1):
+            codepoint = ord(char)
+            if char == "\n":
+                continue
+            if (
+                not (0x20 <= codepoint <= 0x7E)
+                and codepoint not in allowed_unicode_codepoints
+            ):
+                errors.append((lineno, colno, char, codepoint))
+
+    if errors:
+        for lineno, colno, char, codepoint in errors:
+            safe_char = repr(char)[1:-1]  # nicely escape things like \u202f
+            print(
+                f"Invalid character at line {lineno}, column {colno}: U+{codepoint:04X} ({safe_char})"
+            )
+
+    if errors and fix:
+        print(f"Attempting to fix {filename}...")
+        num_replacements = 0
+        new_contents = ""
+        for char in text:
+            codepoint = ord(char)
+            if codepoint in substitutions:
+                num_replacements += 1
+                new_contents += substitutions[codepoint]
+            else:
+                new_contents += char
+        with open(filename, "w", encoding="utf-8") as f:
+            f.write(new_contents)
+        print(f"Fixed {num_replacements} of {len(errors)} errors in {filename}.")
+
+    return bool(errors)
+
+
+if __name__ == "__main__":
+    sys.exit(main())