# Morpheme Detector from SATE v1
import bisect
import json
import os
import re

import stanza

# Shared Stanza pipeline; built once at import time because construction is
# expensive (model loading).
nlp = stanza.Pipeline(
    lang="en",
    processors="tokenize,pos,lemma",
    tokenize_pretokenized=False,
)

# Canonical surface suffixes accepted for each inflection label.
_EXPECTED_SUFFIXES = {
    "Plural": {"s", "es"},
    "Possessive": {"'s", "s"},
    "Comparative": {"er"},
    "Superlative": {"est"},
    "3rd Person Singular": {"s", "es"},
    "Past Tense": {"ed"},
    "Past Participle": {"ed", "en", "n"},
    "Progressive": {"ing"},
    "Gerund": {"ing"},
}

# Clitic tokens (straight-apostrophe spelling) and the word they stand for.
_CONTRACTION_PARTICLES = {
    "'ll": "will",       # we'll, he'll
    "'d": "would/had",   # I'd, she'd
    "'ve": "have",       # we've, they've
    "'re": "are",        # you're, they're
    "'m": "am",          # I'm
    "n't": "not",        # isn't, didn't
    "'s": "is/has",      # what's, she's
}

_S_TOKENS = {"'s", "’s"}

# Inflection labels that are still reported when surface and lemma coincide.
_REPORT_EVEN_IF_UNCHANGED = {"Possessive", "Comparative", "Superlative"}

# Whitespace-delimited words that start with a "[...]" annotation.
_BRACKETED_WORD = re.compile(r'\[.*\]')


def is_possessive_candidate(tok):
    """Return True when *tok* is an "'s" clitic that the tagger labelled PART."""
    return tok.text in _S_TOKENS and tok.upos == "PART"


def lcp(a: str, b: str) -> str:
    """Return the longest case-insensitive common prefix, sliced from *a*."""
    length = 0
    for ch_a, ch_b in zip(a, b):
        if ch_a.lower() != ch_b.lower():
            break
        length += 1
    return a[:length]


def strip_doubling(lemma: str, suf: str) -> str:
    """Drop a doubled stem consonant from the front of *suf*.

    For "run" -> "running" the raw remainder is "ning"; when the first
    character repeats the lemma's last character and the rest is a known
    suffix, the rest ("ing") is returned.  Otherwise *suf* is returned
    unchanged.  An empty lemma is tolerated (nothing can be doubled).
    """
    if lemma and len(suf) >= 2 and suf[0].lower() == lemma[-1].lower():
        cand = suf[1:]
        if any(cand in expected for expected in _EXPECTED_SUFFIXES.values()):
            return cand
    return suf


def get_suffix(lemma: str, surface: str) -> str:
    """Return what remains of *surface* after its common prefix with *lemma*,
    with consonant doubling removed."""
    return strip_doubling(lemma, surface[len(lcp(lemma, surface)):])


def normalize_suffix(lemma: str, raw_suf: str, expected_set: set) -> str | None:
    """Map a raw suffix onto a member of *expected_set*, or return None.

    Besides an exact match, two English spelling alternations are undone:
    y -> i ("carry" + "ied" -> "ed") and silent-e deletion
    ("bounce" + "d" -> "ed", "nice" + "r" -> "er").
    """
    if raw_suf in expected_set:
        return raw_suf

    low_lemma = lemma.lower()

    if low_lemma.endswith("y") and raw_suf.startswith("i"):
        alt = raw_suf[1:]
        if alt in expected_set:
            return alt

    # Silent-e stems share the "e" with the suffix, so the prefix comparison
    # leaves only the tail of the suffix behind; put the "e" back.
    if low_lemma.endswith("e"):
        restored = "e" + raw_suf
        if restored in expected_set:
            return restored

    return None


def preprocess_text(text: str) -> tuple[str, list[int]]:
    """Remove bracketed annotation words from *text*.

    Returns the cleaned text (remaining words joined by single spaces) and a
    position map where ``position_map[original_index]`` is the word's index
    in the cleaned text, or -1 when the word was removed.
    """
    position_map = []  # position_map[original_index] = cleaned_index
    cleaned_words = []

    for word in text.split():
        if _BRACKETED_WORD.match(word):
            position_map.append(-1)
        else:
            position_map.append(len(cleaned_words))
            cleaned_words.append(word)

    return ' '.join(cleaned_words), position_map


def calculate_adjusted_index(cleaned_index: int, position_map: list[int]) -> int:
    """Translate a cleaned-text word index back to the original word index.

    Falls back to *cleaned_index* itself when the map has no such entry.
    """
    try:
        return position_map.index(cleaned_index)
    except ValueError:
        return cleaned_index


def _word_start_offsets(cleaned_text: str) -> list[int]:
    """Return the character offset at which each space-separated word starts."""
    starts = []
    offset = 0
    for word in cleaned_text.split(" "):
        starts.append(offset)
        offset += len(word) + 1  # +1 for the single joining space
    return starts


def _cleaned_word_index(word, word_starts: list[int], fallback: int) -> int:
    """Return the index of the whitespace word that contains *word*.

    The tokenizer splits punctuation and clitics off words, so token
    positions do not line up with whitespace-word positions; character
    offsets do.  NOTE(review): this relies on Stanza exposing ``start_char``
    on the parent token (or on the word itself) - confirm against the
    installed Stanza version.  When no offset is available, *fallback* is
    returned.
    """
    start = getattr(getattr(word, "parent", None), "start_char", None)
    if start is None:
        start = getattr(word, "start_char", None)
    if start is None:
        return fallback
    return max(bisect.bisect_right(word_starts, start) - 1, 0)


def _parse_feats(feats) -> dict:
    """Turn a "Key=Value|Key=Value" feature string (or None) into a dict."""
    return dict(f.split("=", 1) for f in (feats or "").split("|") if "=" in f)


def _classify_inflection(pos, xpos, feats: dict) -> str | None:
    """Return the inflection label implied by the tags, or None."""
    if pos == "NOUN" and feats.get("Number") == "Plur":
        return "Plural"
    if pos == "ADJ" and feats.get("Degree") == "Cmp":
        return "Comparative"
    if pos == "ADJ" and feats.get("Degree") == "Sup":
        return "Superlative"
    if pos != "VERB":
        return None

    verb_form = feats.get("VerbForm")
    tense = feats.get("Tense")
    if verb_form == "Fin":
        if tense == "Pres" and feats.get("Person") == "3":
            return "3rd Person Singular"
        if tense == "Past":
            return "Past Tense"
        return None
    if verb_form == "Part":
        if tense == "Past" or xpos == "VBN":
            return "Past Participle"
        if tense == "Pres" or xpos == "VBG":
            return "Progressive"
    return None


def _record(word, lemma, index, morpheme, form) -> dict:
    """Build one result entry with the keys in their established order."""
    return {
        "word": word,
        "lemma": lemma,
        "index": index,
        "inflectional_morpheme": morpheme,
        "morpheme_form": form,
    }


def extract_inflectional_morphemes(text: str):
    """Find inflectional morphemes and contractions in *text*.

    Bracketed annotation words are removed before the text is passed to the
    pipeline.  Each result is a dict with keys ``word``, ``lemma``,
    ``index``, ``inflectional_morpheme`` and ``morpheme_form``; ``index`` is
    the position of the word among the whitespace-separated words of the
    original *text*.  Returns an empty list when nothing is left to analyse.
    """
    cleaned_text, position_map = preprocess_text(text)
    if not cleaned_text:
        return []

    doc = nlp(cleaned_text)
    word_starts = _word_start_offsets(cleaned_text)

    def original_index(word, fallback):
        cleaned_idx = _cleaned_word_index(word, word_starts, fallback)
        return calculate_adjusted_index(cleaned_idx, position_map)

    results = []
    doc_offset = 0  # tokens seen in earlier sentences; used only as fallback
    for sent in doc.sentences:
        words = sent.words
        for i, w in enumerate(words):
            surf, lem, pos = w.text, w.lemma, w.upos
            feats = _parse_feats(w.feats)
            low_txt = surf.lower()
            # Contraction keys use the straight apostrophe; accept the
            # typographic one as well, as _S_TOKENS already does.
            particle = low_txt.replace("\u2019", "'")
            prev = words[i - 1] if i > 0 else None

            # "'s" tagged PART: possessive after a noun, contraction otherwise.
            # The entry is attributed to the preceding (host) word.
            if prev is not None and is_possessive_candidate(w):
                label = (
                    "Possessive"
                    if prev.upos in {"NOUN", "PROPN"}
                    else "Contraction"
                )
                results.append(_record(
                    prev.text + surf,
                    prev.lemma,
                    original_index(prev, doc_offset + i - 1),
                    label,
                    "'/s",
                ))
                continue

            if prev is not None and particle in _CONTRACTION_PARTICLES:
                results.append(_record(
                    prev.text + surf,
                    prev.lemma,
                    original_index(prev, doc_offset + i - 1),
                    "Contraction",
                    particle,
                ))
                continue

            # Possessive pronouns/determiners (e.g. forms tagged Poss=Yes).
            if feats.get("Poss") == "Yes" and pos in {"PRON", "DET"}:
                low_lem = lem.lower()
                suf = get_suffix(low_lem, low_txt)
                regular = suf in {"s", "es"} and low_lem + suf == low_txt
                results.append(_record(
                    surf,
                    lem,
                    original_index(w, doc_offset + i),
                    "Possessive",
                    "/s" if regular else "",
                ))
                continue

            inflect_type = _classify_inflection(pos, w.xpos, feats)
            if inflect_type is None:
                continue
            # A form identical to its lemma carries no visible affix.
            if (
                low_txt == lem.lower()
                and inflect_type not in _REPORT_EVEN_IF_UNCHANGED
            ):
                continue

            raw_suffix = get_suffix(lem, low_txt)
            canon = normalize_suffix(
                lem, raw_suffix, _EXPECTED_SUFFIXES[inflect_type]
            )
            results.append(_record(
                surf,
                lem,
                original_index(w, doc_offset + i),
                inflect_type,
                f"/{canon}" if canon else "",
            ))

        doc_offset += len(words)

    return results


if __name__ == "__main__":
    print("First Test")
    print(extract_inflectional_morphemes("I see that the elephant is a bounce bounceing three balls at a ball"))
    print("Second Test")
    print(extract_inflectional_morphemes("I see that the elephant is bounceing a ball"))