Enforce ASCII in README.md (#513)
This all started because I was going to write a script to autogenerate
the Table of Contents in the root `README.md`, but I noticed that the
`href` for the "Why Codex?" heading was `#whycodex` instead of
`#why-codex`. This piqued my curiosity and it turned out that the space
in "Why Codex?" was not an ASCII space but **U+00A0**, a non-breaking
space, and so GitHub ignored it when generating the `href` for the
heading.
This also meant that when I did a text search for `why codex` in the
`README.md` in VS Code, the "Why Codex" heading did not match because of
the presence of **U+00A0**.
In short, these types of Unicode characters seem like a hazard, so I
decided to introduce this script to flag them, and if desired, to
replace them with "good enough" ASCII equivalents. For now, this only
applies to the root `README.md` file, but I think we should ultimately
apply this across our source code, as well, as we seem to have quite a
lot of non-ASCII Unicode and it's probably going to cause `rg` to miss
things.
Contributions of this PR:
* `./scripts/asciicheck.py`, which takes a list of filepaths and returns
non-zero if any of them contain non-ASCII characters. (Currently, there
is one exception for ✨ aka **U+2728**, though I would like to default to
an empty allowlist and then require all exceptions to be specified as
flags.)
* A `--fix` option that will attempt to rewrite files with violations
using equivalents from a hardcoded substitution list.
* An update to `ci.yml` to verify `./scripts/asciicheck.py README.md`
succeeds.
* A cleanup of `README.md` using the `--fix` option as well as some
editorial decisions on my part.
* I tried to update the `href`s in the Table of Contents to reflect the
changes in the heading titles. (TIL that if a heading has a character
like `&` surrounded by spaces, it becomes `--` in the generated `href`.)
This commit is contained in:
127
scripts/asciicheck.py
Executable file
127
scripts/asciicheck.py
Executable file
@@ -0,0 +1,127 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
"""
|
||||
Utility script that takes a list of files and returns non-zero if any of them
|
||||
contain non-ASCII characters other than those in the allowed list.
|
||||
|
||||
If --fix is used, it will attempt to replace non-ASCII characters with ASCII
|
||||
equivalents.
|
||||
|
||||
The motivation behind this script is that characters like U+00A0 (non-breaking
|
||||
space) can cause regexes not to match and can result in surprising anchor
|
||||
values for headings when GitHub renders Markdown as HTML.
|
||||
"""
|
||||
|
||||
|
||||
"""
|
||||
When --fix is used, perform the following substitutions.
|
||||
"""
|
||||
# Maps each replaceable codepoint to its "good enough" ASCII stand-in.
# Keys are spelled with \N{...} character names so this file stays ASCII-only
# while remaining self-describing; the hex value is noted alongside.
substitutions: dict[int, str] = {
    ord("\N{NO-BREAK SPACE}"): " ",  # U+00A0
    ord("\N{NON-BREAKING HYPHEN}"): "-",  # U+2011
    ord("\N{EN DASH}"): "-",  # U+2013
    ord("\N{EM DASH}"): "-",  # U+2014
    ord("\N{LEFT SINGLE QUOTATION MARK}"): "'",  # U+2018
    ord("\N{RIGHT SINGLE QUOTATION MARK}"): "'",  # U+2019
    ord("\N{LEFT DOUBLE QUOTATION MARK}"): '"',  # U+201C
    ord("\N{RIGHT DOUBLE QUOTATION MARK}"): '"',  # U+201D
    ord("\N{HORIZONTAL ELLIPSIS}"): "...",  # U+2026
    ord("\N{NARROW NO-BREAK SPACE}"): " ",  # U+202F
}
|
||||
|
||||
"""
|
||||
Unicode codepoints that are allowed in addition to ASCII.
|
||||
Be conservative with this list.
|
||||
|
||||
Note that it is always an option to use the hex HTML representation
|
||||
instead of the character itself so the source code is ASCII-only.
|
||||
For example, U+2728 (sparkles) can be written as `&#x2728;`.
|
||||
"""
|
||||
# Non-ASCII codepoints the linter tolerates; currently only U+2728 (sparkles).
allowed_unicode_codepoints: set[int] = {0x2728}
|
||||
|
||||
|
||||
def main() -> int:
    """Parse the command line, lint every listed file, and return the exit status.

    Returns:
        1 if linting reported an error for at least one file, otherwise 0.
    """
    cli = argparse.ArgumentParser(
        description="Check for non-ASCII characters in files."
    )
    cli.add_argument(
        "--fix",
        action="store_true",
        help="Rewrite files, replacing non-ASCII characters with ASCII equivalents, where possible.",
    )
    cli.add_argument(
        "files",
        nargs="+",
        help="Files to check for non-ASCII characters.",
    )
    options = cli.parse_args()

    # Materialize the full list before calling any(): every file must be
    # linted (and reported on), so the check must not short-circuit.
    outcomes = [
        lint_utf8_ascii(Path(name), fix=options.fix) for name in options.files
    ]
    return 1 if any(outcomes) else 0
|
||||
|
||||
|
||||
def lint_utf8_ascii(filename: Path, fix: bool) -> bool:
    """Report disallowed characters in ``filename`` and optionally rewrite it.

    A character is reported unless it is a newline, printable ASCII (U+0020
    through U+007E), or listed in ``allowed_unicode_codepoints``. Other
    control characters, such as tabs and carriage returns, are reported too.

    Args:
        filename: Path of the file to check. Its contents are decoded as UTF-8.
        fix: When true and violations were found, replace every character that
            has an entry in ``substitutions`` and write the result back to
            ``filename``. Characters without a substitution are left in place.

    Returns:
        True if an error was printed (a UTF-8 decoding failure or at least one
        disallowed character), even when ``fix`` replaced every offender;
        False if the file is clean.
    """
    with open(filename, "rb") as f:
        raw = f.read()

    try:
        text = raw.decode("utf-8")
    except UnicodeDecodeError as e:
        print("UTF-8 decoding error:")
        print(f" byte offset: {e.start}")
        print(f" reason: {e.reason}")
        # Attempt to find line/column of the offending byte. rfind() returns
        # -1 when there is no preceding newline, which yields a 1-based column
        # on the first line without any special casing.
        partial = raw[: e.start]
        line = partial.count(b"\n") + 1
        col = e.start - partial.rfind(b"\n")
        print(f" location: line {line}, column {col}")
        return True

    errors = []
    for lineno, line in enumerate(text.splitlines(keepends=True), 1):
        for colno, char in enumerate(line, 1):
            if char == "\n":
                continue
            codepoint = ord(char)
            if (
                not (0x20 <= codepoint <= 0x7E)
                and codepoint not in allowed_unicode_codepoints
            ):
                errors.append((lineno, colno, char, codepoint))

    for lineno, colno, char, codepoint in errors:
        safe_char = repr(char)[1:-1]  # nicely escape things like \u202f
        print(
            f"Invalid character at line {lineno}, column {colno}: U+{codepoint:04X} ({safe_char})"
        )

    if errors and fix:
        print(f"Attempting to fix {filename}...")
        num_replacements = sum(1 for char in text if ord(char) in substitutions)
        # Leave the file untouched when no substitution applies.
        if num_replacements:
            # The text was decoded from raw bytes, so it still carries the
            # file's original line endings. newline="" turns off newline
            # translation on write; without it, "\r\n" would be written as
            # "\r\r\n" on platforms whose os.linesep is "\r\n".
            with open(filename, "w", encoding="utf-8", newline="") as f:
                # substitutions maps ordinals to strings, which is exactly
                # the table shape str.translate() expects.
                f.write(text.translate(substitutions))
        print(f"Fixed {num_replacements} of {len(errors)} errors in {filename}.")

    return bool(errors)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Hand main()'s return value to the interpreter as the process exit code.
    exit_status = main()
    sys.exit(exit_status)
|
||||
Reference in New Issue
Block a user