Files
wordsearch/app/normaliser.py
2026-05-04 09:45:17 +01:00

73 lines
2.2 KiB
Python

"""Word normalisation: produce display + grid forms with prefix stripping."""
from __future__ import annotations

import re
import sys
import unicodedata
from dataclasses import dataclass
# Lower-case titles/honorifics (written without a trailing dot) that are
# recognised as strippable prefixes when they lead a word.
PREFIXES = {
    # courtesy titles
    "mr",
    "mrs",
    "ms",
    "miss",
    # honorifics and styles
    "dr",
    "sir",
    "dame",
    "lord",
    "lady",
    "master",
    # ranks
    "captain",
    "capt",
    "cpt",
    # academic
    "professor",
    "prof",
    # religious
    "saint",
    "st",
}
@dataclass
class Normalised:
    """Outcome of normalising a single input word."""

    # The word exactly as it was supplied by the caller.
    display: str
    # Upper-cased string of ASCII letters and digits; "" when nothing usable remains.
    grid: str
    # Leading prefix tokens that were removed, in their original spelling.
    stripped_prefixes: list[str]
    # True when the word yields no grid form and should be left out.
    skipped: bool = False
    # Human-readable note explaining a skip or an unusual normalisation.
    warning: str | None = None
def _strip_prefix_tokens(tokens: list[str]) -> tuple[list[str], list[str]]:
    """Split *tokens* into its leading prefix tokens and everything after them.

    A token counts as a prefix when, with trailing dots removed and
    lower-cased, it is a member of ``PREFIXES``. Scanning stops at the first
    token that is not a prefix; later prefix-like tokens are left in place.

    Returns:
        ``(stripped_prefixes, remaining_tokens)`` - the removed tokens in
        their original spelling, followed by the untouched tail.
    """
    leading: list[str] = []
    for token in tokens:
        if token.rstrip(".").lower() not in PREFIXES:
            break
        leading.append(token)
    return leading, tokens[len(leading):]
def _grid_form(tokens: list[str]) -> str:
    """Join *tokens* and reduce them to upper-case ASCII letters and digits."""
    joined = "".join(tokens)
    # NFKD splits an accented letter into its base letter plus combining
    # marks, so the ASCII filter below removes only the mark ("José" ->
    # "JOSE") rather than silently deleting the whole letter ("JOS").
    # Letters with no ASCII decomposition (e.g. "ß", "Ø") are still dropped.
    decomposed = unicodedata.normalize("NFKD", joined)
    return re.sub(r"[^A-Za-z0-9]", "", decomposed).upper()


def normalise(word: str) -> Normalised:
    """Normalise one word into its display and grid forms.

    The word is split on whitespace, leading prefix tokens are removed, and
    the remaining tokens are concatenated into an upper-case string of ASCII
    letters and digits. Accented letters are folded to their base letter
    before filtering.

    Args:
        word: Raw input word; may contain spaces, punctuation and prefixes.

    Returns:
        A ``Normalised`` whose ``display`` is *word* unchanged. Special cases:

        * empty or whitespace-only input: ``grid`` is "", ``skipped`` is True;
        * input made up solely of prefix tokens: nothing is stripped, the
          whole word becomes the grid form, and a warning is recorded;
        * nothing usable left after stripping: ``grid`` is "", ``skipped``
          is True, and a warning is recorded.

        Both warning cases also print the message to standard error.
    """
    tokens = word.split()
    if not tokens:
        return Normalised(display=word, grid="", stripped_prefixes=[],
                          skipped=True, warning="empty word")

    stripped, remainder = _strip_prefix_tokens(tokens)

    if not remainder:
        # Word is *only* prefix tokens - keep raw form, warn.
        grid = _grid_form(tokens)
        warning = f"word {word!r} is only prefix tokens; keeping as {grid}"
        print(f"[normaliser] warning: {warning}", file=sys.stderr)
        return Normalised(display=word, grid=grid, stripped_prefixes=[],
                          warning=warning)

    grid = _grid_form(remainder)
    if not grid:
        warning = f"word {word!r} normalises to empty after stripping; skipping"
        print(f"[normaliser] warning: {warning}", file=sys.stderr)
        return Normalised(display=word, grid="", stripped_prefixes=stripped,
                          skipped=True, warning=warning)

    return Normalised(display=word, grid=grid, stripped_prefixes=stripped)
def normalise_all(words: list[str]) -> list[Normalised]:
    """Apply ``normalise`` to every entry of *words*, preserving order."""
    return list(map(normalise, words))