73 lines
2.2 KiB
Python
73 lines
2.2 KiB
Python
"""Word normalisation: produce display + grid forms with prefix stripping."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
import sys
|
|
from dataclasses import dataclass
|
|
|
|
# Title/honorific tokens recognised as strippable prefixes. Entries are stored
# lower-case and without a trailing full stop; lookups are expected to
# normalise a token the same way before testing membership.
PREFIXES = {
    "mr",
    "mrs",
    "ms",
    "miss",
    "dr",
    "sir",
    "dame",
    "lord",
    "lady",
    "master",
    "captain",
    "capt",
    "cpt",
    "professor",
    "prof",
    "saint",
    "st",
}
|
|
|
|
|
|
@dataclass
class Normalised:
    """Outcome of normalising one word.

    Attributes:
        display: The word as it should be shown to the user.
        grid: The compact form used for the grid; an empty string when the
            word is skipped.
        stripped_prefixes: Leading prefix tokens that were removed to obtain
            ``grid``; empty when nothing was removed.
        skipped: ``True`` when the word yields no usable grid form.
        warning: Human-readable explanation of anything unusual, or ``None``
            when normalisation was uneventful.
    """

    display: str
    grid: str
    stripped_prefixes: list[str]
    skipped: bool = False
    warning: str | None = None
|
|
|
|
|
|
def _strip_prefix_tokens(tokens: list[str]) -> tuple[list[str], list[str]]:
    """Split *tokens* into its leading run of prefix tokens and the rest.

    A token counts as a prefix when, with trailing full stops removed and
    lower-cased, it is a member of ``PREFIXES``. Scanning stops at the first
    token that is not a prefix, so prefix-like tokens later in the list are
    left in place.

    Returns ``(stripped_prefixes, remaining_tokens)``; the stripped tokens
    keep their original spelling (case and dots included).
    """
    # Default to "everything is a prefix"; lowered as soon as a token fails.
    cut = len(tokens)
    for position, token in enumerate(tokens):
        if token.rstrip(".").lower() not in PREFIXES:
            cut = position
            break
    return list(tokens[:cut]), tokens[cut:]
|
|
|
|
|
|
# Anything that is not an ASCII letter or digit is dropped from the grid form.
_NON_GRID_CHARS = re.compile(r"[^A-Za-z0-9]")


def _to_grid(text: str) -> str:
    """Return the grid form of *text*: ASCII letters and digits, upper-cased.

    Accented letters are canonically decomposed first so the base letter
    survives the ASCII filter (``"é"`` contributes ``"E"`` instead of being
    dropped); the combining marks are then removed together with every other
    non-alphanumeric character.
    """
    # Function-scope import keeps this block self-contained; after the first
    # call it is only a ``sys.modules`` lookup.
    import unicodedata

    # NFD (canonical) rather than NFKD, so compatibility characters such as
    # the trademark sign are not expanded into extra letters.
    decomposed = unicodedata.normalize("NFD", text)
    return _NON_GRID_CHARS.sub("", decomposed).upper()


def normalise(word: str) -> Normalised:
    """Build the display and grid forms of *word*, stripping leading prefixes.

    The display form is always *word* unchanged. The grid form is made from
    the whitespace-separated tokens that remain after leading prefix tokens
    are removed, concatenated and reduced to upper-case ASCII letters and
    digits.

    Special cases:

    * Empty or whitespace-only input is returned as skipped with the warning
      ``"empty word"``; nothing is written to stderr for this case.
    * If every token is a prefix, the whole word is kept for the grid,
      ``stripped_prefixes`` is empty, and a warning is recorded and written
      to stderr.
    * If the remaining tokens contain no letters or digits, the word is
      returned as skipped, with a warning recorded and written to stderr.

    Returns:
        A ``Normalised`` record describing the outcome.
    """
    tokens = word.split()
    if not tokens:
        return Normalised(display=word, grid="", stripped_prefixes=[],
                          skipped=True, warning="empty word")

    stripped, remainder = _strip_prefix_tokens(tokens)

    if not remainder:
        # Word is *only* prefix tokens — keep raw form, warn.
        grid = _to_grid("".join(tokens))
        warning = f"word {word!r} is only prefix tokens; keeping as {grid}"
        print(f"[normaliser] warning: {warning}", file=sys.stderr)
        return Normalised(display=word, grid=grid, stripped_prefixes=[],
                          warning=warning)

    grid = _to_grid("".join(remainder))

    if not grid:
        warning = f"word {word!r} normalises to empty after stripping; skipping"
        print(f"[normaliser] warning: {warning}", file=sys.stderr)
        return Normalised(display=word, grid="", stripped_prefixes=stripped,
                          skipped=True, warning=warning)

    return Normalised(display=word, grid=grid, stripped_prefixes=stripped)
|
|
|
|
|
|
def normalise_all(words: list[str]) -> list[Normalised]:
    """Normalise every entry of *words*, preserving input order.

    Each word is passed to ``normalise`` independently; the result holds one
    ``Normalised`` record per input word, including skipped ones.
    """
    return [normalise(entry) for entry in words]
|