Initial commit
This commit is contained in:
72
app/normaliser.py
Normal file
72
app/normaliser.py
Normal file
@@ -0,0 +1,72 @@
|
||||
"""Word normalisation: produce display + grid forms with prefix stripping."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
import sys
|
||||
from dataclasses import dataclass
|
||||
|
||||
# Titles/honorifics that may be stripped from the front of a word.
# Entries are lower-case and carry no trailing dot: tokens are compared
# against this set after their trailing "." characters are removed and
# they are lower-cased.
PREFIXES = {
    # Courtesy titles
    "mr",
    "mrs",
    "ms",
    "miss",
    # Honorifics and ranks of nobility
    "dr",
    "sir",
    "dame",
    "lord",
    "lady",
    "master",
    # "Captain" and its abbreviations
    "captain",
    "capt",
    "cpt",
    # "Professor" and its abbreviation
    "professor",
    "prof",
    # "Saint" and its abbreviation
    "saint",
    "st",
}
|
||||
|
||||
|
||||
@dataclass
class Normalised:
    """Result of normalising a single word.

    Holds the text to display alongside the form used on the grid, plus
    bookkeeping about any leading prefix tokens that were removed and
    whether the word should be skipped.
    """

    # Text to show for the word.
    display: str
    # Grid form of the word; may be the empty string.
    grid: str
    # Leading prefix tokens that were removed, in their original order.
    stripped_prefixes: list[str]
    # True when the word produced no usable grid form.
    skipped: bool = False
    # Human-readable explanation when something unusual happened, else None.
    warning: str | None = None
|
||||
|
||||
|
||||
def _strip_prefix_tokens(tokens: list[str]) -> tuple[list[str], list[str]]:
    """Split *tokens* into its leading run of prefix tokens and the rest.

    A token counts as a prefix when, after removing trailing "." characters
    and lower-casing, it is a member of ``PREFIXES``. Scanning stops at the
    first token that is not a prefix; later tokens are never examined.

    Returns:
        A ``(stripped_prefixes, remaining_tokens)`` pair. The stripped
        tokens are returned in their original spelling (dots and case kept).
    """
    cut = 0
    for token in tokens:
        if token.rstrip(".").lower() not in PREFIXES:
            break
        cut += 1
    return list(tokens[:cut]), tokens[cut:]
|
||||
|
||||
|
||||
def normalise(word: str) -> Normalised:
    """Normalise one word into its display and grid forms.

    The word is split on whitespace and leading prefix tokens are removed.
    The grid form is the remaining tokens joined together with every
    character outside ``A-Z``, ``a-z`` and ``0-9`` dropped, then upper-cased.
    The display form is always the word exactly as given.

    Special cases:
        * No tokens at all: returned as skipped with the warning
          ``"empty word"``; nothing is printed.
        * Only prefix tokens: the grid is built from all tokens, no
          prefixes are reported as stripped, and a warning is printed to
          stderr.
        * Grid is empty after stripping: returned as skipped, and a warning
          is printed to stderr.
    """
    parts = word.split()
    if not parts:
        return Normalised(
            display=word,
            grid="",
            stripped_prefixes=[],
            skipped=True,
            warning="empty word",
        )

    def to_grid(pieces: list[str]) -> str:
        # Glue the pieces together, keep ASCII letters/digits only, upper-case.
        return re.sub(r"[^A-Za-z0-9]", "", "".join(pieces)).upper()

    prefixes, rest = _strip_prefix_tokens(parts)

    if rest:
        grid = to_grid(rest)
        if grid:
            # Ordinary case: something usable is left after stripping.
            return Normalised(display=word, grid=grid, stripped_prefixes=prefixes)
        message = f"word {word!r} normalises to empty after stripping; skipping"
        skip = True
    else:
        # Word is *only* prefix tokens — keep raw form, warn.
        grid = to_grid(parts)
        message = f"word {word!r} is only prefix tokens; keeping as {grid}"
        prefixes = []
        skip = False

    print(f"[normaliser] warning: {message}", file=sys.stderr)
    return Normalised(
        display=word,
        grid=grid,
        stripped_prefixes=prefixes,
        skipped=skip,
        warning=message,
    )
|
||||
|
||||
|
||||
def normalise_all(words: list[str]) -> list[Normalised]:
    """Apply ``normalise`` to each word, returning the results in input order."""
    return list(map(normalise, words))
|
||||
Reference in New Issue
Block a user