import re

import gradio as gr
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForTokenClassification

# # Load the trained model and tokenizer from a local checkpoint
# model_checkpoint = "BERTPOS"
# model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)
# tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Load the model from the Hugging Face Hub
tokenizer = AutoTokenizer.from_pretrained("syke9p3/bert-tagalog-base-uncased-pos-tagger")
model = AutoModelForTokenClassification.from_pretrained("syke9p3/bert-tagalog-base-uncased-pos-tagger")

pos_tag_mapping = {
    '[PAD]': 0, 'NNC': 1, 'NNP': 2, 'NNPA': 3, 'NNCA': 4,
    'PR': 5, 'PRS': 6, 'PRP': 7, 'PRSP': 8, 'PRO': 9, 'PRQ': 10, 'PRQP': 11, 'PRL': 12,
    'PRC': 13, 'PRF': 14, 'PRI': 15,
    'DT': 16, 'DTC': 17, 'DTP': 18, 'DTPP': 19, 'LM': 20,
    'CC': 21, 'CCT': 22, 'CCR': 23, 'CCB': 24, 'CCA': 25,
    'PM': 26, 'PMP': 27, 'PME': 28, 'PMQ': 29, 'PMC': 30, 'PMSC': 31, 'PMS': 32,
    'VB': 33, 'VBW': 34, 'VBS': 35, 'VBN': 36, 'VBTS': 37, 'VBTR': 38, 'VBTF': 39,
    'VBTP': 40, 'VBAF': 41, 'VBOF': 42, 'VBOB': 43, 'VBOL': 44, 'VBOI': 45, 'VBRF': 46,
    'JJ': 47, 'JJD': 48, 'JJC': 49, 'JJCC': 50, 'JJCS': 51, 'JJCN': 52, 'JJCF': 53,
    'JJCB': 54, 'JJT': 55,
    'RB': 56, 'RBD': 57, 'RBN': 58, 'RBK': 59, 'RBP': 60, 'RBB': 61, 'RBR': 62,
    'RBQ': 63, 'RBT': 64, 'RBF': 65, 'RBW': 66, 'RBM': 67, 'RBL': 68, 'RBI': 69,
    'RBS': 70, 'RBJ': 71, 'RBY': 72, 'RBLI': 73, 'TS': 74, 'FW': 75, 'CD': 76,
    'CCB_CCP': 77, 'CCR_CCA': 78, 'CCR_CCB': 79, 'CCR_CCP': 80, 'CCR_LM': 81,
    'CCT_CCA': 82, 'CCT_CCP': 83, 'CCT_LM': 84, 'CCU_DTP': 85,
    'CDB_CCA': 86, 'CDB_CCP': 87, 'CDB_LM': 88, 'CDB_NNC': 89, 'CDB_NNC_CCP': 90,
    'JJCC_CCP': 91, 'JJCC_JJD': 92, 'JJCN_CCP': 93, 'JJCN_LM': 94, 'JJCS_CCB': 95,
    'JJCS_CCP': 96, 'JJCS_JJC': 97, 'JJCS_JJC_CCP': 98, 'JJCS_JJD': 99,
    '[UNK]': 100, '[CLS]': 101, '[SEP]': 102,
    'JJCS_JJN': 103, 'JJCS_JJN_CCP': 104, 'JJCS_RBF': 105, 'JJCS_VBAF': 106,
    'JJCS_VBAF_CCP': 107, 'JJCS_VBN_CCP': 108, 'JJCS_VBOF': 109, 'JJCS_VBOF_CCP': 110,
    'JJCS_VBN': 111, 'RBQ_CCP': 112,
    'JJC_CCB': 113, 'JJC_CCP': 114, 'JJC_PRL': 115,
    'JJD_CCA': 116, 'JJD_CCB': 117, 'JJD_CCP': 118, 'JJD_CCT': 119, 'JJD_NNC': 120,
    'JJD_NNP': 121, 'JJN_CCA': 122, 'JJN_CCB': 123, 'JJN_CCP': 124, 'JJN_NNC': 125,
    'JJN_NNC_CCP': 126, 'JJD_NNC_CCP': 127,
    'NNC_CCA': 128, 'NNC_CCB': 129, 'NNC_CCP': 130, 'NNC_NNC_CCP': 131,
    'NN': 132, 'JJN': 133, 'NNP_CCA': 134, 'NNP_CCP': 135, 'NNP_NNP': 136,
    'PRC_CCB': 137, 'PRC_CCP': 138, 'PRF_CCP': 139, 'PRQ_CCP': 140, 'PRQ_LM': 141,
    'PRS_CCB': 142, 'PRS_CCP': 143, 'PRSP_CCP': 144, 'PRSP_CCP_NNP': 145,
    'PRL_CCP': 146, 'PRL_LM': 147, 'PRO_CCB': 148, 'PRO_CCP': 149,
    'VBS_CCP': 150, 'VBTR_CCP': 151, 'VBTS_CCA': 152, 'VBTS_CCP': 153, 'VBTS_JJD': 154,
    'VBTS_LM': 155, 'VBAF_CCP': 156, 'VBOB_CCP': 157, 'VBOF_CCP': 158,
    'VBOF_CCP_NNP': 159, 'VBRF_CCP': 160, 'CCP': 161, 'CDB': 162,
    'RBW_CCP': 163, 'RBD_CCP': 164, 'DTCP': 165, 'VBH': 166, 'VBTS_VBOF': 167,
    'PRI_CCP': 168, 'VBTR_VBAF_CCP': 169, 'DQL': 170, 'DQR': 171, 'RBT_CCP': 172,
    'VBW_CCP': 173, 'RBI_CCP': 174, 'VBN_CCP': 175, 'VBTR_VBAF': 176, 'VBTF_CCP': 177,
    'JJCS_JJD_NNC': 178, 'CCU': 179, 'RBL_CCP': 180, 'VBTR_VBRF_CCP': 181,
    'PRP_CCP': 182, 'VBTR_VBRF': 183, 'VBH_CCP': 184, 'VBTS_VBAF': 185,
    'VBTF_VBOF': 186, 'VBTR_VBOF': 187, 'VBTF_VBAF': 188, 'JJCS_JJD_CCB': 189,
    'JJCS_JJD_CCP': 190, 'RBM_CCP': 191, 'NNCS': 192, 'PRI_CCB': 193, 'NNA': 194,
    'VBTR_VBOB': 195, 'DC': 196, 'JJD_CP': 197, 'NC': 198, 'NC_CCP': 199, 'VBO': 200,
    'JJD_CC': 201, 'VBF': 202, 'CP': 203, 'NP': 204, 'N': 205, 'F': 206, 'CT': 207,
    'MS': 208, 'BTF': 209, 'CA': 210, 'VBOF_RBR': 211, 'DP': 212,
}
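# Added sanity check (illustrative, not part of the original pipeline): the tag
# IDs must be unique, otherwise the id2label/label2id inversions built below
# would silently drop entries.
assert len(set(pos_tag_mapping.values())) == len(pos_tag_mapping)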
num_labels = len(pos_tag_mapping)
id2label = {idx: tag for tag, idx in pos_tag_mapping.items()}
label2id = {tag: idx for tag, idx in pos_tag_mapping.items()}

special_symbols = ['-', '&', '"', '[', ']', '/', '$', '(', ')', '%', ':', "'", '.', '?', ',']


def symbol2token(symbol):
    # Commas and periods get their own punctuation tokens
    if symbol == ',':
        return '[PMC] '
    elif symbol == '.':
        return '[PMP] '
    # Any other special symbol becomes the generic symbol token
    elif symbol in special_symbols:
        return '[PMS] '
    # Otherwise keep the character as it is
    return symbol


def preprocess_untagged_sentence(sentence):
    # Define a regex alternation covering the special symbols (without ',' and '?')
    special_symbols_regex = '|'.join(
        re.escape(sym) for sym in ['-', '&', '"', '[', ']', '/', '$', '(', ')', '%', ':', "'", '.'])
    # Put spaces around every special symbol, then collapse extra whitespace
    sentence = re.sub(rf'({special_symbols_regex})', r' \1 ', sentence)
    sentence = re.sub(r'\s+', ' ', sentence).strip()
    upper = sentence  # keep the original casing for display
    # Convert the sentence to lowercase
    sentence = sentence.lower()

    # Walk the sentence and convert special symbols to the tokens [PMS], [PMC], or [PMP]
    new_sentence = ""
    i = 0
    while i < len(sentence):
        if any(sentence[i:].startswith(symbol) for symbol in special_symbols):
            # Ellipsis found, replace with '[PMS]'
            if i + 2 < len(sentence) and sentence[i:i + 3] == '...':
                new_sentence += '[PMS]'
                i += 3
            # Symbol at the very end of the sentence
            elif i + 1 == len(sentence):
                new_sentence += symbol2token(sentence[i])
                break
            # Symbol at the very start of the sentence
            elif sentence[i + 1] == ' ' and i == 0:
                new_sentence += symbol2token(sentence[i])
                i += 1
            # Symbol standing alone between two spaces
            elif sentence[i - 1] == ' ' and sentence[i + 1] == ' ':
                new_sentence += symbol2token(sentence[i])
                i += 1
            # Symbol glued to the end of a word: drop it and advance so the
            # scan keeps making progress
            elif sentence[i - 1] != ' ':
                i += 1
            # Symbol glued to the start of a word: keep the word, drop the symbol
            else:
                word_after_symbol = ""
                while i + 1 < len(sentence) and sentence[i + 1] != ' ' and not any(
                        sentence[i + 1:].startswith(symbol) for symbol in special_symbols):
                    word_after_symbol += sentence[i + 1]
                    i += 1
                new_sentence += word_after_symbol
                i += 1  # step past the last consumed character
        else:
            new_sentence += sentence[i]
            i += 1
    print("Sentence after:", new_sentence.split())
    print("---")
    return new_sentence, upper
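# Added smoke test (illustrative, not part of the original pipeline): standalone
# punctuation should come back as the placeholder tokens from symbol2token,
# e.g. ',' -> '[PMC]' and a trailing '?' -> '[PMS]'.
_demo_normalized, _demo_original = preprocess_untagged_sentence("Kumusta ka , kaibigan ?")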
def preprocess_sentence(tagged_sentence):
    # Remove the line identifier (e.g., SNT.80188.3)
    sentence = re.sub(r'SNT\.\d+\.\d+\s+', '', tagged_sentence)
    special_symbols = ['-', '&', ',', '"', '[', ']', '/', '$', '(', ')', '%', ':', "'", '.']

    # Construct the regex pattern for extracting the word inside each tag pair,
    # including the special symbols
    special_symbols_regex = '|'.join(re.escape(sym) for sym in special_symbols)
    regex_pattern = r'<(?:[^<>]+? )?([a-zA-Z0-9.,&"!?{}]+)>'.format(special_symbols_regex)
    words = re.findall(regex_pattern, tagged_sentence)

    # Join the words to form a sentence
    sentence = ' '.join(words)
    sentence = sentence.lower()
    # print("---")
    # print("Sentence before:", sentence)

    # Walk the sentence and convert special symbols to the tokens [PMS], [PMC], or [PMP]
    new_sentence = ""
    i = 0
    # print("Length: ", len(sentence))
    while i < len(sentence):
        # print(f"{i + 1} == {len(sentence)}: {sentence[i]}")
        if any(sentence[i:].startswith(symbol) for symbol in special_symbols):
            # Ellipsis found, replace with a single token
            if i + 2 < len(sentence) and sentence[i:i + 3] == '...':
                new_sentence += symbol2token(sentence[i])
                i += 3
            # Symbol at the very end of the sentence
            elif i + 1 == len(sentence):
                new_sentence += symbol2token(sentence[i])
                break
            # Symbol at the very start of the sentence
            elif sentence[i + 1] == ' ' and i == 0:
                new_sentence += symbol2token(sentence[i])
                i += 1
            # Symbol standing alone between two spaces
            elif sentence[i - 1] == ' ' and sentence[i + 1] == ' ':
                new_sentence += symbol2token(sentence[i])
                i += 1
            # Symbol glued to the end of a word: drop it and advance so the
            # scan keeps making progress
            elif sentence[i - 1] != ' ':
                i += 1
            # Symbol glued to the start of a word: keep the word, drop the symbol
            else:
                word_after_symbol = ""
                while i + 1 < len(sentence) and sentence[i + 1] != ' ' and not any(
                        sentence[i + 1:].startswith(symbol) for symbol in special_symbols):
                    word_after_symbol += sentence[i + 1]
                    i += 1
                new_sentence += word_after_symbol
                i += 1  # step past the last consumed character
        else:
            new_sentence += sentence[i]
            i += 1
    print("Sentence after:", new_sentence.split())
    print("---")
    return new_sentence


def extract_tags(input_sentence):
    # Pull the POS tag out of every <TAG word> pair
    tags = re.findall(r'<([A-Z_]+)\s.*?>', input_sentence)
    return tags


def align_tokenization(sentence, tags):
    print("Sentence \n: ", sentence)
    sentence = sentence.split()
    print("Sentence Split\n: ", sentence)
    tokenized_sentence = tokenizer.tokenize(' '.join(sentence))

    aligned_tagging = []
    current_word = ''
    index = 0  # index of the current word in the sentence and tagging
    for token in tokenized_sentence:
        current_word += re.sub(r'^##', '', token)
        print("Current word after replacing ##: ", current_word)
        print("sentence[index]: ", sentence[index])

        if sentence[index] == current_word:  # if we completed a word
            print("completed a word: ", current_word)
            current_word = ''
            aligned_tagging.append(tags[index])
            index += 1
        else:  # otherwise insert padding
            print("incomplete word: ", current_word)
            aligned_tagging.append(0)
        print("---")

    decoded_tags = [id2label[tag_id] for tag_id in aligned_tagging]
    print("Tokenized Sentence\n: ", tokenized_sentence)
    print("Tags\n: ", decoded_tags)

    assert len(tokenized_sentence) == len(aligned_tagging)
    aligned_tagging = [0] + aligned_tagging  # account for the leading [CLS] token
    return tokenized_sentence, aligned_tagging
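# How the alignment behaves (added illustration; the subword split and labels
# below are hypothetical and depend on the actual WordPiece vocabulary):
#
#   words:  ['ang', 'bahay', 'kumikislap']
#   tokens: ['ang', 'bahay', 'kumi', '##kislap']
#   tags:   [ DTC,   NNC,     0,      VB ]
#
# A word's tag is emitted only once its subword pieces complete the word, so
# every earlier piece receives 0 ('[PAD]'); one extra 0 is then prepended for
# the [CLS] token that the tokenizer adds.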
def process_tagged_sentence(tagged_sentence):
    # print(tagged_sentence)
    # Preprocess the input tagged sentence and extract the words and tags
    sentence = preprocess_sentence(tagged_sentence)
    tags = extract_tags(tagged_sentence)  # returns the tags (put these in tags.txt)
    encoded_tags = [pos_tag_mapping[tag] for tag in tags]

    # Align tokens by adding padding if needed
    tokenized_sentence, encoded_tags = align_tokenization(sentence, encoded_tags)

    encoded_sentence = tokenizer(sentence, padding="max_length", truncation=True, max_length=128)

    # Create attention mask (1 for real tokens, 0 for padding)
    attention_mask = [1] * len(encoded_sentence['input_ids'])
    print("len(encoded_sentence['input_ids']):", len(encoded_sentence['input_ids']))
    while len(encoded_sentence['input_ids']) < 128:
        encoded_sentence['input_ids'].append(0)  # pad with zeros
        attention_mask.append(0)  # pad attention mask
    while len(encoded_tags) < 128:
        encoded_tags.append(0)  # pad with the ID of '[PAD]'

    encoded_sentence['encoded_tags'] = encoded_tags

    decoded_sentence = tokenizer.convert_ids_to_tokens(encoded_sentence['input_ids'], skip_special_tokens=False)
    decoded_tags = [id2label[tag_id] for tag_id in encoded_tags]
    word_tag_pairs = list(zip(decoded_sentence, decoded_tags))

    print(encoded_sentence)
    print("Sentence:", decoded_sentence)
    print("Tags:", decoded_tags)
    print("Decoded Sentence and Tags:", word_tag_pairs)
    print("---")
    return encoded_sentence


def tag_sentence(input_sentence):
    # Preprocess the input sentence and keep the original casing for display
    sentence, upper = preprocess_untagged_sentence(input_sentence)

    # Tokenize and encode the sentence
    encoded_sentence = tokenizer(sentence, padding="max_length", truncation=True, max_length=128,
                                 return_tensors="pt")

    # Pass the encoded sentence to the model to get the predicted logits
    with torch.no_grad():
        model_output = model(**encoded_sentence)

    # Apply softmax to the logits to obtain per-token probabilities
    logits = model_output.logits
    probabilities = F.softmax(logits, dim=-1)

    # Pick the most probable tag for each token
    predicted_tags = torch.argmax(probabilities, dim=-1)

    # Convert the predicted tag IDs to labels, dropping padding positions
    labels = [id2label[tag.item()] for tag in predicted_tags[0] if id2label[tag.item()] != '[PAD]']
    return labels


# Example usage:
test_sentence = 'Ang bahay ay maganda na para bang may kumikislap sa bintana .'


def predict_tags(test_sentence):
    sentence, upper = preprocess_untagged_sentence(test_sentence)
    words_list = upper.split()
    print("Words: ", words_list)
    predicted_tags = tag_sentence(test_sentence)
    print(predicted_tags)
    pairs = list(zip(words_list, predicted_tags))
    return pairs


predict_tags(test_sentence)


def get_readme():
    return """
----
# BERT Tagalog Part of Speech Tagger (BERTTPOST)

## 👥 Developed by

- Saya-ang, Kenth G. ([@syke9p3](https://github.com/syke9p3))
- Gozum, Denise Julianne S. ([@Xenoxianne](https://github.com/Xenoxianne))
- Hamor, Mary Grizelle D. ([@mnemoria](https://github.com/mnemoria))
- Mabansag, Ria Karen B. ([@riavx](https://github.com/riavx))

## 📋 Model Details

### Model Description

- **Model type:** BERT Tagalog Base Uncased
- **Languages (NLP):** Tagalog, Filipino
- **Finetuned from model:** [GKLMIP/bert-tagalog-base-uncased](https://huggingface.co/GKLMIP/bert-tagalog-base-uncased)
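A minimal loading example (the same checkpoint this demo loads from the Hugging Face Hub):

```python
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("syke9p3/bert-tagalog-base-uncased-pos-tagger")
model = AutoModelForTokenClassification.from_pretrained("syke9p3/bert-tagalog-base-uncased-pos-tagger")
```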
### Dataset

1,000 annotated sentences from Sagum et al.'s Tagalog corpora, tagged following the MGNN tagset convention.

| Dataset      | Number of Sentences | Percentage |
|--------------|---------------------|------------|
| Training Set | 800                 | 80%        |
| Testing Set  | 200                 | 20%        |

### Preprocessing

The corpus contains tagged sentences in Tagalog, with each word annotated with its corresponding POS tag in the format ```<TAG word>```.

To prepare the corpus for training, the following preprocessing steps were performed:

1. **Removal of Line Identifier**: the line identifier, such as ```SNT.108970.2066```, was removed from each tagged sentence.
2. **Symbol Conversion**: special symbols such as hyphens, quotes, and commas were converted into special tokens (```PMP```, ```PMS```, ```PMC```) so that their meaning is preserved during tokenization.
3. **Alignment of Tokenization**: the BERT-tokenized words and their corresponding POS tags were aligned so that tokenization and tagging stay consistent.

### Training

This model was trained with the PyTorch library using the following hyperparameters:

| **Hyperparameter** | **Value** |
|--------------------|-----------|
| Batch Size         | 8         |
| Training Epochs    | 5         |
| Learning Rate      | 2e-5      |
| Optimizer          | Adam      |
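The training script itself is not bundled with this demo; the sketch below only illustrates how the hyperparameters above would map onto a standard 🤗 `Trainer` setup (`train_dataset` is a placeholder, and the Trainer's default AdamW stands in for the Adam optimizer listed above):

```python
from transformers import Trainer, TrainingArguments

args = TrainingArguments(
    output_dir="bertpos",            # placeholder output path
    per_device_train_batch_size=8,   # Batch Size
    num_train_epochs=5,              # Training Epochs
    learning_rate=2e-5,              # Learning Rate
)
trainer = Trainer(model=model, args=args, train_dataset=train_dataset)
trainer.train()
```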
## ⚒️ Languages and Technologies

[![Hugging Face](https://img.shields.io/badge/Hugging%20Face%20-FFD21E?style=for-the-badge&logo=huggingface&logoColor=black)](https://huggingface.co/)
[![Python](https://img.shields.io/badge/Python-F7CC42?style=for-the-badge&logo=python&logoColor=black)](https://www.python.org/)
[![Gradio](https://img.shields.io/badge/Gradio-F08705?style=for-the-badge&logo=gradio&logoColor=white)](https://www.gradio.app/)
[![Jupyter Notebook](https://img.shields.io/badge/Jupyter%20Notebook-F37626?style=for-the-badge&logo=jupyter&logoColor=white)](https://jupyter.org/)
[![PyTorch](https://img.shields.io/badge/PyTorch-EE4C2C?style=for-the-badge&logo=pytorch&logoColor=white)](https://pytorch.org/)

## 🏷️ Tags

| Part of Speech | Tags |
|-----------------------------------------------|------|
| **Noun** | ![NNC](https://img.shields.io/badge/NNC-84CC16?style=for-the-badge&logoColor=white) |
| Common Noun | NNC |
| Proper Noun | NNP |
| Proper Noun Abbreviation | NNPA |
| Common Noun Abbreviation | NNCA |
| **Pronoun** | ![PR](https://img.shields.io/badge/PR-0D9488?style=for-the-badge&logoColor=white) |
| as Subject (Palagyo)/Personal Pronouns Singular | PRS |
| Personal Pronouns | PRP |
| Possessive Subject (Paari) | PRSP |
| Pointing to an Object Demonstrative (Paturol/Pamatlig) | PRO |
| Question/Interrogative (Pananong)/Singular | PRQ |
| Question/Interrogative Plural | PRQP |
| Location (Panlunan) | PRL |
| Comparison (Panulad) | PRC |
| Found (Pahimaton) | PRF |
| Indefinite | PRI |
| **Determiner** | ![DT](https://img.shields.io/badge/DT-16A34A?style=for-the-badge&logoColor=white) |
| Determiner (Pantukoy) for Common Noun Plural | DTC |
| Determiner (Pantukoy) for Proper Noun | DTP |
| Determiner (Pantukoy) for Proper Noun Plural | DTPP |
| **Conjunctions (Pang-ugnay)** | ![CC](https://img.shields.io/badge/CC-DB2777?style=for-the-badge&logoColor=white) |
| **Lexical Marker** | ![LM](https://img.shields.io/badge/LM-EAB308?style=for-the-badge&logoColor=white) |
| Ligatures (Pang-angkop) | CCP |
| Preposition (Pang-ukol) | CCU |
| **Verb (Pandiwa)** | ![VB](https://img.shields.io/badge/VB-2563EB?style=for-the-badge&logoColor=white) |
| Neutral/Infinitive | VBW |
| Auxiliary, Modal/Pseudo-verbs | VBS |
| Existential | VBH |
| Non-existential | VBN |
| Time Past (Perfective) | VBTS |
| Time Present (Imperfective) | VBTR |
| Time Future (Contemplative) | VBTF |
| Recent Past | VBTP |
| Actor Focus | VBAF |
| Object/Goal Focus | VBOF |
| Benefactive Focus | VBOB |
| Locative Focus | VBOL |
| Instrumental Focus | VBOI |
| Referential/Measurement Focus | VBRF |
| **Adjective** | ![JJ](https://img.shields.io/badge/JJ-0D9488?style=for-the-badge&logoColor=white) |
| Describing (Panlarawan) | JJD |
| Used for Comparison (same level) (Pahambing Magkatulad) | JJC |
| Comparison Comparative (more) (Palamang) | JJCC |
| Comparison Superlative (most) (Pasukdol) | JJCS |
| Comparison Negation (not quite) (Di-Magkatulad) | JJCN |
| Describing Number (Pamilang) | JJN |
| **Adverb (Pang-Abay)** | ![RB](https://img.shields.io/badge/RB-DB2777?style=for-the-badge&logoColor=white) |
| Describing “How” (Pamaraan) | RBD |
| Number (Panggaano/Panukat) | RBN |
| Conditional (Kondisyunal) | RBK |
| Causative (Pananhi) | RBP |
| Benefactive (Benepaktibo) | RBB |
| Referential (Pangkaukulan) | RBR |
| Question (Pananong) | RBQ |
| Agree (Panang-ayon) | RBT |
| Disagree (Pananggi) | RBF |
| Frequency (Pamanahon) | RBW |
| Possibility (Pang-agam) | RBM |
| Place (Panlunan) | RBL |
| Enclitics (Paningit) | RBI |
| Interjections (Sambitla) | RBJ |
| Social Formula (Pormularyong Panlipunan) | RBS |
| **Cardinal Number (Bilang)** | ![CD](https://img.shields.io/badge/CD-2563EB?style=for-the-badge&logoColor=white) |
| Digit, Rank, Count | CDB |
| **Topicless (Walang Paksa)** | ![TS](https://img.shields.io/badge/TS-0891B2?style=for-the-badge&logoColor=white) |
| **Foreign Words** | ![FW](https://img.shields.io/badge/FW-EA580C?style=for-the-badge&logoColor=white) |
| **Punctuation (Pananda)** | ![PM](https://img.shields.io/badge/PM-DB2777?style=for-the-badge&logoColor=white) |
| Period | PMP |
| Exclamation Point | PME |
| Question Mark | PMQ |
| Comma | PMC |
| Semi-colon | PMSC |
| Other Symbols | PMS |

## Bias, Risks, and Limitations

This model has not been fully tested, so please use it with caution.
"""


tagger = gr.Interface(
    predict_tags,
    gr.Textbox(placeholder="Enter sentence here..."),
    ["highlight"],
    title="BERT Filipino Part of Speech Tagger",
    description="Enter a sentence in Tagalog to classify the tag of each word. Words must be space-separated.",
    examples=[
        ["Ang bahay ay lumiliwanag na para bang may kumikislap sa bintana"],
        ["Naisip ko na kumain na lang tayo sa pinakasikat na restaurant sa Manila ."],
    ],
    article=get_readme(),
)

tagger.launch()