diff --git a/build_docs.py b/build_docs.py index 63b561c..ff9eb59 100755 --- a/build_docs.py +++ b/build_docs.py @@ -1265,25 +1265,26 @@ def proofread_canonicals( purge(http, *paths_to_purge) +# Python 3.12 onwards doesn't use self-closing tags for +_canonical_re = re.compile( + b"""""" +) + + def _check_canonical_rel(file: Path, www_root: Path): # Check for a canonical relation link in the HTML. # If one exists, ensure that the target exists # or otherwise remove the canonical link element. - prefix = b'' - pfx_len = len(prefix) - sfx_len = len(suffix) html = file.read_bytes() - try: - start = html.index(prefix) - end = html.index(suffix, start + pfx_len) - except ValueError: + canonical = _canonical_re.search(html) + if canonical is None: return None - target = html[start + pfx_len : end].decode(errors="surrogateescape") + target = canonical[1].decode(encoding="UTF-8", errors="surrogateescape") if (www_root / target).exists(): return None logging.info("Removing broken canonical from %s to %s", file, target) - file.write_bytes(html[:start] + html[end + sfx_len :]) + start, end = canonical.span() + file.write_bytes(html[:start] + html[end:]) return file