# (Page banner captured together with this file, not program code: "Spaces: Runtime error / Runtime error")
# Dependency (install once, e.g. from a notebook cell):
# !pip install gr-nlp-toolkit
from gr_nlp_toolkit import Pipeline

# Instantiate the Pipeline a single time at import; every function in this file
# calls this same object instead of building its own.
# NOTE(review): the processor string presumably enables POS tagging ("pos"),
# named-entity recognition ("ner"), dependency parsing ("dp") and
# Greeklish-to-Greek transliteration ("g2g") -- confirm against the
# gr-nlp-toolkit documentation.
nlp_pos_ner_dp_with_g2g = Pipeline("pos,ner,dp,g2g")
def greeklish_to_greek(text: str) -> str:
    """
    Transliterate Greeklish (Greek typed with Latin characters) into Greek
    script, e.g. "larisa" -> "λαρισα".

    The input is passed through the shared pipeline and the texts of the
    resulting tokens are re-joined with single spaces.

    Args:
        text (str): The Greeklish text to convert.

    Returns:
        str: The pipeline's token texts, separated by single spaces.

    Examples:
        >>> greeklish_to_greek("H thessaloniki einai wraia polh")
        'η θεσσαλονικη ειναι ωραια πολη'
    """
    parsed = nlp_pos_ner_dp_with_g2g(text)
    return " ".join(tok.text for tok in parsed.tokens)
def process_ner(text: str) -> dict:
    """
    Run Named Entity Recognition (NER) over a text.

    Args:
        text (str): The text to process.

    Returns:
        dict: Maps each token's text to its NER label. Because the token text
        is the dictionary key, a word that occurs more than once in the input
        appears only once in the result and carries the label of its last
        occurrence.

    Examples:
        >>> process_ner("Η Αργεντινή κέρδισε το Παγκόσμιο Κύπελλο το 2022")
        {
            'η': 'O',
            'αργεντινη': 'S-ORG',
            'κερδισε': 'O',
            'το': 'O',
            'παγκοσμιο': 'B-EVENT',
            'κυπελλο': 'E-EVENT',
            '2022': 'S-DATE'
        }

        ('το' occurs twice in the sentence but is a single key in the result.)

    NER Possible Labels List:
        ner_labels = [
            'O', 'S-GPE', 'S-ORG', 'S-CARDINAL', 'B-ORG', 'E-ORG', 'B-DATE', 'E-DATE', 'S-NORP',
            'B-GPE', 'E-GPE', 'S-EVENT', 'S-DATE', 'S-PRODUCT', 'S-LOC', 'I-ORG', 'S-PERSON',
            'S-ORDINAL', 'B-PERSON', 'I-PERSON', 'E-PERSON', 'B-LAW', 'I-LAW', 'E-LAW', 'B-MONEY',
            'I-MONEY', 'E-MONEY', 'B-EVENT', 'I-EVENT', 'E-EVENT', 'B-FAC', 'E-FAC', 'I-DATE',
            'S-PERCENT', 'B-QUANTITY', 'E-QUANTITY', 'B-WORK_OF_ART', 'I-WORK_OF_ART', 'E-WORK_OF_ART',
            'I-FAC', 'S-LAW', 'S-TIME', 'B-LOC', 'E-LOC', 'I-LOC', 'S-FAC', 'B-TIME', 'E-TIME',
            'S-WORK_OF_ART', 'B-PRODUCT', 'E-PRODUCT', 'B-CARDINAL', 'E-CARDINAL', 'S-MONEY',
            'S-LANGUAGE', 'I-TIME', 'I-PRODUCT', 'I-GPE', 'I-QUANTITY', 'B-NORP', 'E-NORP',
            'S-QUANTITY', 'B-PERCENT', 'I-PERCENT', 'E-PERCENT', 'I-CARDINAL', 'B-ORDINAL',
            'I-ORDINAL', 'E-ORDINAL'
        ]
    """
    parsed = nlp_pos_ner_dp_with_g2g(text)
    # Keyed by token text: a repeated word overwrites its earlier entry.
    return {tok.text: tok.ner for tok in parsed.tokens}
def process_pos(text: str) -> dict:
    """
    Run Part-of-Speech tagging over a text (UPOS tags and morphological features).

    Complete list of UPOS tags (https://universaldependencies.org/u/pos/ and
    https://github.com/nlpaueb/gr-nlp-toolkit/blob/main/gr_nlp_toolkit/configs/pos_labels.py):
        ADJ: adjective
        ADP: adposition
        ADV: adverb
        AUX: auxiliary
        CCONJ: coordinating conjunction
        DET: determiner
        INTJ: interjection
        NOUN: noun
        NUM: numeral
        PART: particle
        PRON: pronoun
        PROPN: proper noun
        PUNCT: punctuation
        SCONJ: subordinating conjunction
        SYM: symbol
        VERB: verb
        X: other

    The complete list of morphological features can be found at
    https://github.com/nlpaueb/gr-nlp-toolkit/blob/main/gr_nlp_toolkit/configs/pos_labels.py
    Due to the large number of features, only the most common ones are listed here:
        - Aspect
        - Case
        - Definite
        - Mood
        - Number
        - Person
        - PronType
        - Tense
        - Gender
        - VerbForm
        - Voice

    Args:
        text (str): The text to process.

    Returns:
        dict: Maps each token's text to a dictionary with two keys:
            - 'UPOS': the token's UPOS tag.
            - 'Morphological_Features': the token's morphological features.
        Because the token text is the dictionary key, a word that occurs more
        than once in the input appears only once in the result and carries the
        analysis of its last occurrence.

    Examples:
        >>> process_pos("Μου αρέσει να διαβάζω τα post του Andrew Ng στο Twitter.")
        {
            'μου': {'UPOS': 'PRON', 'Morphological_Features': {'Case': 'Gen', 'Gender': 'Masc', 'Number': 'Sing', 'Person': '1', 'Poss': '_', 'PronType': 'Prs'}},
            'αρεσει': {'UPOS': 'VERB', 'Morphological_Features': {'Aspect': 'Imp', 'Case': '_', 'Gender': '_', 'Mood': 'Ind', 'Number': 'Sing', 'Person': '3', 'Tense': 'Pres', 'VerbForm': 'Fin', 'Voice': 'Act'}},
            'να': {'UPOS': 'AUX', 'Morphological_Features': {'Aspect': '_', 'Mood': '_', 'Number': '_', 'Person': '_', 'Tense': '_', 'VerbForm': '_', 'Voice': '_'}},
            'διαβαζω': {'UPOS': 'VERB', 'Morphological_Features': {'Aspect': 'Imp', 'Case': '_', 'Gender': '_', 'Mood': 'Ind', 'Number': 'Sing', 'Person': '1', 'Tense': 'Pres', 'VerbForm': 'Fin', 'Voice': 'Act'}},
            'τα': {'UPOS': 'DET', 'Morphological_Features': {'Case': 'Acc', 'Definite': 'Def', 'Gender': 'Neut', 'Number': 'Plur', 'PronType': 'Art'}},
            'post': {'UPOS': 'X', 'Morphological_Features': {'Foreign': 'Yes'}},
            'του': {'UPOS': 'DET', 'Morphological_Features': {'Case': 'Gen', 'Definite': 'Def', 'Gender': 'Masc', 'Number': 'Sing', 'PronType': 'Art'}},
            'andrew': {'UPOS': 'X', 'Morphological_Features': {'Foreign': 'Yes'}},
            'ng': {'UPOS': 'X', 'Morphological_Features': {'Foreign': 'Yes'}},
            'στο': {'UPOS': '_', 'Morphological_Features': {}},
            'twitter': {'UPOS': 'X', 'Morphological_Features': {'Foreign': 'Yes'}},
            '.': {'UPOS': 'PUNCT', 'Morphological_Features': {}}
        }
    """
    parsed = nlp_pos_ner_dp_with_g2g(text)
    # Keyed by token text: a repeated word overwrites its earlier entry.
    return {
        tok.text: {"UPOS": tok.upos, "Morphological_Features": tok.feats}
        for tok in parsed.tokens
    }
def process_dp(text: str) -> dict:
    """
    Run Dependency Parsing over a text.

    Each word of the input is analysed for its syntactic head and for the
    dependency relation that links it to that head.

    Args:
        text (str): The text to process.

    Returns:
        dict: Maps each token's text to a dictionary containing:
            - 'Head': The position of the syntactic head of the word (0 indicates the root).
            - 'Deprel': The dependency relation to the head.
        Because the token text is the dictionary key, a word that occurs more
        than once in the input appears only once in the result and carries the
        analysis of its last occurrence, while 'Head' values still refer to
        positions in the full token sequence.

    Examples:
        >>> process_dp("Προτιμώ την πρωινή πτήση από την Αθήνα στη Θεσσαλονίκη.")
        {
            'προτιμω': {'Head': 0, 'Deprel': 'root'},
            'την': {'Head': 7, 'Deprel': 'det'},
            'πρωινη': {'Head': 4, 'Deprel': 'amod'},
            'πτηση': {'Head': 1, 'Deprel': 'obj'},
            'απο': {'Head': 7, 'Deprel': 'case'},
            'αθηνα': {'Head': 4, 'Deprel': 'nmod'},
            'στη': {'Head': 9, 'Deprel': 'case'},
            'θεσσαλονικη': {'Head': 4, 'Deprel': 'nmod'},
            '.': {'Head': 1, 'Deprel': 'punct'}
        }

        ('την' occurs twice in the sentence -- first with Head 4, then with
        Head 7 -- and only the later analysis survives under the single key.)

    Dependency Parsing Possible Labels List:
        dp_labels = [
            'obl', 'obj', 'dep', 'mark', 'case', 'flat', 'nummod', 'obl:arg', 'punct', 'cop',
            'acl:relcl', 'expl', 'nsubj', 'csubj:pass', 'root', 'advmod', 'nsubj:pass', 'ccomp',
            'conj', 'amod', 'xcomp', 'aux', 'appos', 'csubj', 'fixed', 'nmod', 'iobj', 'parataxis',
            'orphan', 'det', 'advcl', 'vocative', 'compound', 'cc', 'discourse', 'acl', 'obl:agent'
        ]
    """
    parsed = nlp_pos_ner_dp_with_g2g(text)
    # Keyed by token text: a repeated word overwrites its earlier entry.
    return {
        tok.text: {"Head": tok.head, "Deprel": tok.deprel}
        for tok in parsed.tokens
    }