"""Word normalisation: produce display + grid forms with prefix stripping.""" from __future__ import annotations import re import sys from dataclasses import dataclass PREFIXES = { "mr", "mrs", "ms", "miss", "dr", "sir", "dame", "lord", "lady", "master", "captain", "capt", "cpt", "professor", "prof", "saint", "st", } @dataclass class Normalised: display: str grid: str stripped_prefixes: list[str] skipped: bool = False warning: str | None = None def _strip_prefix_tokens(tokens: list[str]) -> tuple[list[str], list[str]]: """Remove leading prefix tokens. Returns (stripped_prefixes, remaining_tokens).""" stripped: list[str] = [] i = 0 while i < len(tokens): bare = tokens[i].rstrip(".").lower() if bare in PREFIXES: stripped.append(tokens[i]) i += 1 else: break return stripped, tokens[i:] def normalise(word: str) -> Normalised: display = word tokens = word.split() if not tokens: return Normalised(display=display, grid="", stripped_prefixes=[], skipped=True, warning="empty word") stripped, remainder = _strip_prefix_tokens(tokens) if not remainder: # Word is *only* prefix tokens — keep raw form, warn. joined = "".join(tokens) grid = re.sub(r"[^A-Za-z0-9]", "", joined).upper() warning = f"word {word!r} is only prefix tokens; keeping as {grid}" print(f"[normaliser] warning: {warning}", file=sys.stderr) return Normalised(display=display, grid=grid, stripped_prefixes=[], warning=warning) joined = "".join(remainder) grid = re.sub(r"[^A-Za-z0-9]", "", joined).upper() if not grid: warning = f"word {word!r} normalises to empty after stripping; skipping" print(f"[normaliser] warning: {warning}", file=sys.stderr) return Normalised(display=display, grid="", stripped_prefixes=stripped, skipped=True, warning=warning) return Normalised(display=display, grid=grid, stripped_prefixes=stripped) def normalise_all(words: list[str]) -> list[Normalised]: return [normalise(w) for w in words]