This all started because I was going to write a script to autogenerate
the Table of Contents in the root `README.md`, but I noticed that the
`href` for the "Why Codex?" heading was `#whycodex` instead of
`#why-codex`. This piqued my curiosity, and it turned out that the space
in "Why Codex?" was not an ASCII space but **U+00A0**, a non-breaking
space, which GitHub ignored when generating the `href` for the heading.
This also meant that when I did a text search for `why codex` in the
`README.md` in VS Code, the "Why Codex" heading did not match because of
the presence of **U+00A0**.
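To make the failure concrete, here is a minimal repro (the strings and
variable names below are mine, purely for illustration):

```python
# Two visually identical headings that differ at a single codepoint.
heading_ascii = "Why Codex?"      # U+0020 (ASCII space) between the words
heading_nbsp = "Why\u00a0Codex?"  # U+00A0 (non-breaking space)

assert "Why Codex" in heading_ascii
assert "Why Codex" not in heading_nbsp  # a literal text search finds nothing
```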
In short, these types of Unicode characters seem like a hazard, so I
decided to introduce this script to flag them, and if desired, to
replace them with "good enough" ASCII equivalents. For now, this only
applies to the root `README.md` file, but I think we should ultimately
apply this across our source code as well, since we seem to have quite a
lot of non-ASCII Unicode that is probably going to cause `rg` to miss
things.
Contributions of this PR:
* `./scripts/asciicheck.py`, which takes a list of filepaths and returns
non-zero if any of them contain non-ASCII characters. (Currently, there
is one exception for ✨ aka **U+2728**, though I would like to default to
an empty allowlist and then require all exceptions to be specified as
flags.)
* A `--fix` option that will attempt to rewrite files with violations
using equivalents from a hardcoded substitution list.
* An update to `ci.yml` to verify `./scripts/asciicheck.py README.md`
succeeds.
* A cleanup of `README.md` using the `--fix` option as well as some
editorial decisions on my part.
* I tried to update the `href`s in the Table of Contents to reflect the
changes in the heading titles. (TIL that if a heading has a character
like `&` surrounded by spaces, it becomes `--` in the generated `href`;
see the sketch below.)
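For reference, my mental model of the anchor generation is roughly the
sketch below. To be clear, `github_anchor` is my own approximation for
illustration, not GitHub's actual implementation:

```python
import re

def github_anchor(heading: str) -> str:
    """Approximate GitHub's heading anchors: lowercase, drop anything
    that is not a word character, hyphen, or ASCII space (so '?', '&',
    and U+00A0 all vanish), then turn each ASCII space into a hyphen."""
    heading = heading.lower()
    heading = re.sub(r"[^\w\- ]", "", heading)
    return heading.replace(" ", "-")

assert github_anchor("Why Codex?") == "why-codex"
assert github_anchor("Why\u00a0Codex?") == "whycodex"        # U+00A0 is dropped outright
assert github_anchor("Docs & Examples") == "docs--examples"  # '&' vanishes, both spaces remain
```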
The script itself, `scripts/asciicheck.py` (128 lines, 3.9 KiB, executable):
```python
#!/usr/bin/env python3

import argparse
import sys
from pathlib import Path

"""
Utility script that takes a list of files and returns non-zero if any of them
contain non-ASCII characters other than those in the allowed list.

If --fix is used, it will attempt to replace non-ASCII characters with ASCII
equivalents.

The motivation behind this script is that characters like U+00A0 (non-breaking
space) can cause regexes not to match and can result in surprising anchor
values for headings when GitHub renders Markdown as HTML.
"""

"""
When --fix is used, perform the following substitutions.
"""
substitutions: dict[int, str] = {
    0x00A0: " ",  # non-breaking space
    0x2011: "-",  # non-breaking hyphen
    0x2013: "-",  # en dash
    0x2014: "-",  # em dash
    0x2018: "'",  # left single quote
    0x2019: "'",  # right single quote
    0x201C: '"',  # left double quote
    0x201D: '"',  # right double quote
    0x2026: "...",  # ellipsis
    0x202F: " ",  # narrow non-breaking space
}

"""
Unicode codepoints that are allowed in addition to ASCII.
Be conservative with this list.

Note that it is always an option to use the hex HTML representation
instead of the character itself so the source code is ASCII-only.
For example, U+2728 (sparkles) can be written as `&#x2728;`.
"""
allowed_unicode_codepoints = {
    0x2728,  # sparkles
}


def main() -> int:
    parser = argparse.ArgumentParser(
        description="Check for non-ASCII characters in files."
    )
    parser.add_argument(
        "--fix",
        action="store_true",
        help="Rewrite files, replacing non-ASCII characters with ASCII equivalents, where possible.",
    )
    parser.add_argument(
        "files",
        nargs="+",
        help="Files to check for non-ASCII characters.",
    )
    args = parser.parse_args()

    has_errors = False
    for filename in args.files:
        path = Path(filename)
        has_errors |= lint_utf8_ascii(path, fix=args.fix)
    return 1 if has_errors else 0


def lint_utf8_ascii(filename: Path, fix: bool) -> bool:
    """Returns True if an error was printed."""
    try:
        with open(filename, "rb") as f:
            raw = f.read()
        text = raw.decode("utf-8")
    except UnicodeDecodeError as e:
        print("UTF-8 decoding error:")
        print(f"  byte offset: {e.start}")
        print(f"  reason: {e.reason}")
        # Attempt to find line/column
        partial = raw[: e.start]
        line = partial.count(b"\n") + 1
        col = e.start - (partial.rfind(b"\n") if b"\n" in partial else -1)
        print(f"  location: line {line}, column {col}")
        return True

    errors = []
    for lineno, line in enumerate(text.splitlines(keepends=True), 1):
        for colno, char in enumerate(line, 1):
            codepoint = ord(char)
            if char == "\n":
                continue
            if (
                not (0x20 <= codepoint <= 0x7E)
                and codepoint not in allowed_unicode_codepoints
            ):
                errors.append((lineno, colno, char, codepoint))

    if errors:
        for lineno, colno, char, codepoint in errors:
            safe_char = repr(char)[1:-1]  # nicely escape things like \u202f
            print(
                f"Invalid character at line {lineno}, column {colno}: U+{codepoint:04X} ({safe_char})"
            )

    if errors and fix:
        print(f"Attempting to fix {filename}...")
        num_replacements = 0
        new_contents = ""
        for char in text:
            codepoint = ord(char)
            if codepoint in substitutions:
                num_replacements += 1
                new_contents += substitutions[codepoint]
            else:
                new_contents += char
        with open(filename, "w", encoding="utf-8") as f:
            f.write(new_contents)
        print(f"Fixed {num_replacements} of {len(errors)} errors in {filename}.")

    return bool(errors)


if __name__ == "__main__":
    sys.exit(main())
```
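One behavioral note on `--fix`: it only rewrites codepoints that appear in
the substitution table, and `lint_utf8_ascii` still returns `True` for any
file that had violations, so a fixed run exits non-zero. Rerunning the
script on the fixed files is the easy way to confirm they are clean.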