Spaces:

syke9p3
/

bert-tagalog-base-uncased-part-of-speech-tagger

Sleeping

bert-tagalog-base-uncased-part-of-speech-tagger

File size: 24,689 Bytes

import re
from transformers import AutoTokenizer, AutoModelForTokenClassification
import gradio as gr

# # Load the trained model and tokenizer
# model_checkpoint = "BERTPOS"
# model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)
# tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


# Fetch the fine-tuned tokenizer and token-classification model from the
# Hugging Face Hub; both come from the same checkpoint id.
model_checkpoint = "syke9p3/bert-tagalog-base-uncased-pos-tagger"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)

# Tag name -> integer id. The id of a tag is its position in the tuple below,
# so the ORDER IS SIGNIFICANT: never insert or reorder entries, only append.
# Each row holds ten tags; the comment in front of a row gives its id range.
pos_tag_mapping = {
    tag: idx
    for idx, tag in enumerate((
        # 0-9
        '[PAD]', 'NNC', 'NNP', 'NNPA', 'NNCA', 'PR', 'PRS', 'PRP', 'PRSP', 'PRO',
        # 10-19
        'PRQ', 'PRQP', 'PRL', 'PRC', 'PRF', 'PRI', 'DT', 'DTC', 'DTP', 'DTPP',
        # 20-29
        'LM', 'CC', 'CCT', 'CCR', 'CCB', 'CCA', 'PM', 'PMP', 'PME', 'PMQ',
        # 30-39
        'PMC', 'PMSC', 'PMS', 'VB', 'VBW', 'VBS', 'VBN', 'VBTS', 'VBTR', 'VBTF',
        # 40-49
        'VBTP', 'VBAF', 'VBOF', 'VBOB', 'VBOL', 'VBOI', 'VBRF', 'JJ', 'JJD', 'JJC',
        # 50-59
        'JJCC', 'JJCS', 'JJCN', 'JJCF', 'JJCB', 'JJT', 'RB', 'RBD', 'RBN', 'RBK',
        # 60-69
        'RBP', 'RBB', 'RBR', 'RBQ', 'RBT', 'RBF', 'RBW', 'RBM', 'RBL', 'RBI',
        # 70-79
        'RBS', 'RBJ', 'RBY', 'RBLI', 'TS', 'FW', 'CD', 'CCB_CCP', 'CCR_CCA', 'CCR_CCB',
        # 80-89
        'CCR_CCP', 'CCR_LM', 'CCT_CCA', 'CCT_CCP', 'CCT_LM',
        'CCU_DTP', 'CDB_CCA', 'CDB_CCP', 'CDB_LM', 'CDB_NNC',
        # 90-99
        'CDB_NNC_CCP', 'JJCC_CCP', 'JJCC_JJD', 'JJCN_CCP', 'JJCN_LM',
        'JJCS_CCB', 'JJCS_CCP', 'JJCS_JJC', 'JJCS_JJC_CCP', 'JJCS_JJD',
        # 100-109 (ids 100-102 are the [UNK], [CLS] and [SEP] entries)
        '[UNK]', '[CLS]', '[SEP]', 'JJCS_JJN', 'JJCS_JJN_CCP',
        'JJCS_RBF', 'JJCS_VBAF', 'JJCS_VBAF_CCP', 'JJCS_VBN_CCP', 'JJCS_VBOF',
        # 110-119
        'JJCS_VBOF_CCP', 'JJCS_VBN', 'RBQ_CCP', 'JJC_CCB', 'JJC_CCP',
        'JJC_PRL', 'JJD_CCA', 'JJD_CCB', 'JJD_CCP', 'JJD_CCT',
        # 120-129
        'JJD_NNC', 'JJD_NNP', 'JJN_CCA', 'JJN_CCB', 'JJN_CCP',
        'JJN_NNC', 'JJN_NNC_CCP', 'JJD_NNC_CCP', 'NNC_CCA', 'NNC_CCB',
        # 130-139
        'NNC_CCP', 'NNC_NNC_CCP', 'NN', 'JJN', 'NNP_CCA',
        'NNP_CCP', 'NNP_NNP', 'PRC_CCB', 'PRC_CCP', 'PRF_CCP',
        # 140-149
        'PRQ_CCP', 'PRQ_LM', 'PRS_CCB', 'PRS_CCP', 'PRSP_CCP',
        'PRSP_CCP_NNP', 'PRL_CCP', 'PRL_LM', 'PRO_CCB', 'PRO_CCP',
        # 150-159
        'VBS_CCP', 'VBTR_CCP', 'VBTS_CCA', 'VBTS_CCP', 'VBTS_JJD',
        'VBTS_LM', 'VBAF_CCP', 'VBOB_CCP', 'VBOF_CCP', 'VBOF_CCP_NNP',
        # 160-169
        'VBRF_CCP', 'CCP', 'CDB', 'RBW_CCP', 'RBD_CCP',
        'DTCP', 'VBH', 'VBTS_VBOF', 'PRI_CCP', 'VBTR_VBAF_CCP',
        # 170-179
        'DQL', 'DQR', 'RBT_CCP', 'VBW_CCP', 'RBI_CCP',
        'VBN_CCP', 'VBTR_VBAF', 'VBTF_CCP', 'JJCS_JJD_NNC', 'CCU',
        # 180-189
        'RBL_CCP', 'VBTR_VBRF_CCP', 'PRP_CCP', 'VBTR_VBRF', 'VBH_CCP',
        'VBTS_VBAF', 'VBTF_VBOF', 'VBTR_VBOF', 'VBTF_VBAF', 'JJCS_JJD_CCB',
        # 190-199
        'JJCS_JJD_CCP', 'RBM_CCP', 'NNCS', 'PRI_CCB', 'NNA',
        'VBTR_VBOB', 'DC', 'JJD_CP', 'NC', 'NC_CCP',
        # 200-209
        'VBO', 'JJD_CC', 'VBF', 'CP', 'NP', 'N', 'F', 'CT', 'MS', 'BTF',
        # 210-212
        'CA', 'VBOF_RBR', 'DP',
    ))
}


# Forward (tag -> id) and reverse (id -> tag) lookups derived from
# pos_tag_mapping, plus the number of distinct labels.
label2id = dict(pos_tag_mapping)
id2label = {tag_id: tag_name for tag_name, tag_id in label2id.items()}
num_labels = len(label2id)

# Characters the preprocessing functions treat as punctuation/symbols.
special_symbols = [
    '-', '&', "\"", "[", "]", "/", "$", "(", ")", "%", ":", "'", '.', '?', ',',
]

def symbol2token(symbol):
    """Map a punctuation character to its bracketed placeholder token.

    Args:
        symbol: The character to convert.

    Returns:
        ``'[PMC] '`` for a comma, ``'[PMP] '`` for a period, ``'[PMS] '`` for
        any other member of ``special_symbols`` (each placeholder carries a
        trailing space), or ``symbol`` itself when it is not a special symbol.
    """
    # Comma and period have dedicated placeholders and are checked first.
    for mark, placeholder in ((',', '[PMC] '), ('.', '[PMP] ')):
        if symbol == mark:
            return placeholder

    # Every other special symbol shares one placeholder; anything else is
    # passed through untouched.
    return '[PMS] ' if symbol in special_symbols else symbol

def preprocess_untagged_sentence(sentence):
    """Normalise a raw sentence into the text that is passed to the tokenizer.

    Steps, in order:
      1. Surround a fixed set of symbols with spaces (note that ',' and '?'
         are NOT in that set, although they are in ``special_symbols``).
      2. Collapse whitespace runs to one space and strip both ends.
      3. Lowercase the text and scan it character by character, handling
         every character found in ``special_symbols``:
           * standing alone (last character, or followed by a space and
             either first or preceded by a space): replaced by the
             placeholder from ``symbol2token``;
           * directly preceded by a non-space character: dropped;
           * preceded by a space and followed by a word: dropped, the word
             is kept.

    Args:
        sentence: Raw input text.

    Returns:
        A ``(new_sentence, upper)`` tuple. ``new_sentence`` is the lowercased
        text with placeholders substituted; ``upper`` is the text after steps
        1-2, i.e. before lowercasing and symbol replacement.
    """
    # Symbols that get padded with spaces so they become separate items.
    spaced_symbols = ['-', '&', "\"", "[", "]", "/", "$", "(", ")", "%", ":", "'", '.']
    special_symbols_regex = '|'.join([re.escape(sym) for sym in spaced_symbols])

    # Replace all of those symbols with themselves surrounded by spaces
    sentence = re.sub(rf'({special_symbols_regex})', r' \1 ', sentence)

    # Remove extra whitespaces
    sentence = re.sub(r'\s+', ' ', sentence).strip()

    upper = sentence

    # Convert the sentence to lowercase
    sentence = sentence.lower()

    # str.startswith accepts a tuple of prefixes and a start offset, which
    # avoids slicing the remainder of the sentence for every character.
    symbols = tuple(special_symbols)
    length = len(sentence)
    pieces = []
    i = 0
    while i < length:
        if sentence.startswith(symbols, i):
            if i + 2 < length and sentence[i:i + 3] == '...':
                # Ellipsis. NOTE(review): every '.' was padded with spaces
                # above, so this branch looks unreachable here - confirm.
                pieces.append('[PMS]')
                i += 3
            elif i + 1 == length:
                # Symbol is the last character of the sentence.
                pieces.append(symbol2token(sentence[i]))
                break
            elif sentence[i + 1] == ' ' and (i == 0 or sentence[i - 1] == ' '):
                # Stand-alone symbol; the extra increment skips the space
                # after it because the placeholder already ends in a space.
                pieces.append(symbol2token(sentence[i]))
                i += 1
            elif sentence[i - 1] != ' ':
                # Symbol attached to the preceding character: dropped.
                # (At i == 0 the index wraps to the last character.)
                pass
            else:
                # Symbol attached to the front of a word: drop the symbol and
                # copy the word up to the next space or special symbol.
                while (i + 1 < length and sentence[i + 1] != ' '
                       and not sentence.startswith(symbols, i + 1)):
                    pieces.append(sentence[i + 1])
                    i += 1
        else:
            pieces.append(sentence[i])
        i += 1

    new_sentence = ''.join(pieces)

    print("Sentence after:", new_sentence.split())
    print("---")

    return new_sentence, upper


def preprocess_sentence(tagged_sentence):
    """Turn a ``<TAG word>`` annotated line into the text given to the tokenizer.

    The word of every ``<TAG word>`` item is extracted, the words are joined
    with single spaces and lowercased. Text outside angle brackets (such as a
    leading ``SNT.80188.3`` line identifier) is ignored because only bracketed
    items are extracted. The result is then scanned character by character
    and each special symbol is handled as follows:
      * ``...``: replaced by ``symbol2token('.')``;
      * standing alone (last character, or followed by a space and either
        first or preceded by a space): replaced by ``symbol2token(symbol)``;
      * directly preceded by a non-space character: dropped;
      * preceded by a space and followed by a word: dropped, the word is kept.

    Args:
        tagged_sentence: One annotated line of the corpus.

    Returns:
        The processed sentence as a single string.
    """
    special_symbols = ['-', '&', ",", "\"", "[", "]", "/", "$", "(", ")", "%", ":", "'", '.']
    # Construct the regex pattern for extracting words inside <TAGS> including special symbols
    special_symbols_regex = '|'.join([re.escape(sym) for sym in special_symbols])
    regex_pattern = r'<(?:[^<>]+? )?([a-zA-Z0-9.,&"!?{}]+)>'.format(special_symbols_regex)
    words = re.findall(regex_pattern, tagged_sentence)

    # Join the words to form a lowercase sentence
    sentence = ' '.join(words).lower()

    # str.startswith accepts a tuple of prefixes and a start offset, which
    # avoids slicing the remainder of the sentence for every character.
    symbols = tuple(special_symbols)
    length = len(sentence)
    pieces = []
    i = 0
    while i < length:
        if sentence.startswith(symbols, i):
            if i + 2 < length and sentence[i:i + 3] == '...':
                # Ellipsis: emit the token for its first character ('.').
                pieces.append(symbol2token(sentence[i]))
                i += 3
            elif i + 1 == length:
                # Symbol is the last character of the sentence.
                pieces.append(symbol2token(sentence[i]))
                break
            elif sentence[i + 1] == ' ' and (i == 0 or sentence[i - 1] == ' '):
                # Stand-alone symbol; the extra increment skips the space
                # after it because the placeholder already ends in a space.
                pieces.append(symbol2token(sentence[i]))
                i += 1
            elif sentence[i - 1] != ' ':
                # Symbol attached to the preceding character: dropped.
                # (At i == 0 the index wraps to the last character.)
                pass
            else:
                # Symbol attached to the front of a word: drop the symbol and
                # copy the word up to the next space or special symbol.
                while (i + 1 < length and sentence[i + 1] != ' '
                       and not sentence.startswith(symbols, i + 1)):
                    pieces.append(sentence[i + 1])
                    i += 1
        else:
            pieces.append(sentence[i])
        i += 1

    new_sentence = ''.join(pieces)

    print("Sentence after:", new_sentence.split())
    print("---")

    return new_sentence
def extract_tags(input_sentence):
    """Return the tag of every ``<TAG word>`` item, in order of appearance.

    A tag is a run of uppercase letters and underscores directly after ``<``
    and followed by whitespace.
    """
    tag_pattern = re.compile(r'<([A-Z_]+)\s.*?>')
    return [found.group(1) for found in tag_pattern.finditer(input_sentence)]

def align_tokenization(sentence, tags):
    """Spread word-level tag ids over the tokenizer's sub-word pieces.

    Pieces are accumulated (with a leading ``##`` removed) until they spell
    the current whitespace-separated word. The piece that completes a word
    receives that word's tag id; every earlier piece of the word receives 0.

    Args:
        sentence: Preprocessed sentence string.
        tags: Tag ids, one per whitespace-separated word of ``sentence``.

    Returns:
        ``(tokenized_sentence, aligned_tagging)``. ``aligned_tagging`` has one
        id per piece plus a leading 0 (presumably for the [CLS] position -
        confirm against the training code).
    """
    print("Sentence \n: ", sentence)
    words = sentence.split()
    print("Sentence Split\n: ", words)

    tokenized_sentence = tokenizer.tokenize(' '.join(words))

    aligned_tagging = []
    partial_word = ''
    word_pos = 0  # position of the word currently being assembled

    for piece in tokenized_sentence:
        partial_word += re.sub(r'^##', '', piece)
        print("Current word after replacing ##: ", partial_word)
        print("sentence[index]: ", words[word_pos])

        if words[word_pos] != partial_word:
            # Word not finished yet: this piece gets the padding id.
            print("incomplete word: ", partial_word)
            aligned_tagging.append(0)
        else:
            # Word finished: this piece carries the word's tag.
            print("completed a word: ", partial_word)
            partial_word = ''
            aligned_tagging.append(tags[word_pos])
            word_pos += 1

        print("---")

    # Reverse lookup (id -> tag name) used only for the debug output below.
    tag_names = list(pos_tag_mapping.keys())
    tag_ids = list(pos_tag_mapping.values())
    decoded_tags = [tag_names[tag_ids.index(tag_id)] for tag_id in aligned_tagging]
    print("Tokenized Sentence\n: ", tokenized_sentence)
    print("Tags\n: ", decoded_tags)

    assert len(tokenized_sentence) == len(aligned_tagging)

    return tokenized_sentence, [0] + aligned_tagging


def process_tagged_sentence(tagged_sentence):
    """Encode one ``<TAG word>`` annotated line together with its tag ids.

    The line is preprocessed, its tags are mapped to ids and aligned with the
    tokenizer's sub-word pieces, and the sentence is tokenized to a fixed
    length of 128. The tag ids are cut/padded to that same length so they
    always line up with ``input_ids``.

    Args:
        tagged_sentence: One annotated line of the corpus.

    Returns:
        The tokenizer's encoding of the sentence with an extra
        ``'encoded_tags'`` entry holding 128 tag ids.

    Raises:
        KeyError: If the line contains a tag missing from ``pos_tag_mapping``.
    """
    max_length = 128

    # Preprocess the input tagged sentence and extract the words and tags
    sentence = preprocess_sentence(tagged_sentence)
    tags = extract_tags(tagged_sentence)  # the tag sequence (this is what goes into tags.txt)

    encoded_tags = [pos_tag_mapping[tag] for tag in tags]

    # Align tags with the sub-word pieces (0 is used for non-final pieces)
    _, encoded_tags = align_tokenization(sentence, encoded_tags)

    # padding="max_length" + truncation=True already yields exactly
    # max_length input ids and a matching attention mask, so no manual
    # padding of the encoding is needed.
    encoded_sentence = tokenizer(sentence, padding="max_length", truncation=True, max_length=max_length)
    print("len(encoded_sentence['input_ids']):", len(encoded_sentence['input_ids']))

    # Mirror the tokenizer's truncation: keep at most max_length - 1 ids so
    # the final position (the closing special token of a truncated input) is
    # labelled 0, then pad with 0, the id of '[PAD]'.
    encoded_tags = encoded_tags[:max_length - 1]
    encoded_tags += [0] * (max_length - len(encoded_tags))

    encoded_sentence['encoded_tags'] = encoded_tags

    decoded_sentence = tokenizer.convert_ids_to_tokens(encoded_sentence['input_ids'], skip_special_tokens=False)
    decoded_tags = [id2label[tag_id] for tag_id in encoded_tags]

    # Debug output: the encoding and the token/tag pairing.
    word_tag_pairs = list(zip(decoded_sentence, decoded_tags))
    print(encoded_sentence)
    print("Sentence:", decoded_sentence)
    print("Tags:", decoded_tags)
    print("Decoded Sentence and Tags:", word_tag_pairs)
    print("---")

    return encoded_sentence

import torch
import torch.nn.functional as F

def tag_sentence(input_sentence):
    """Predict tag labels for a raw sentence.

    Args:
        input_sentence: Raw, untagged text.

    Returns:
        The predicted label names, in position order, for every position
        whose predicted label is not ``'[PAD]'``.
    """
    # Only the normalised text is needed here; the second return value is unused.
    normalized, _ = preprocess_untagged_sentence(input_sentence)

    # Encode to fixed-length PyTorch tensors
    model_inputs = tokenizer(normalized, padding="max_length", truncation=True, max_length=128, return_tensors="pt")

    # Inference only, so gradient tracking is switched off
    with torch.no_grad():
        logits = model(**model_inputs).logits

    # Most probable label id at every position
    best_ids = torch.argmax(F.softmax(logits, dim=-1), dim=-1)

    # Translate ids to label names and discard '[PAD]' predictions
    label_names = (id2label[label_id.item()] for label_id in best_ids[0])
    return [name for name in label_names if name != '[PAD]']

def predict_tags(test_sentence):
    """Pair each whitespace-separated word of the input with a predicted tag.

    Args:
        test_sentence: Raw, untagged text.

    Returns:
        A list of ``(word, tag)`` tuples. Words come from the spaced,
        original-case text; pairing is positional and stops at the shorter
        of the two sequences.
    """
    # Only the original-case text is needed for display.
    _, upper = preprocess_untagged_sentence(test_sentence)
    words_list = upper.split()
    print("Words: ", words_list)

    predicted_tags = tag_sentence(test_sentence)
    print(predicted_tags)

    return [(word, tag) for word, tag in zip(words_list, predicted_tags)]


# Example usage: tag one sample sentence when the module is loaded.
test_sentence = 'Ang bahay ay maganda na para bang may kumikislap sa bintana .'

predict_tags(test_sentence)

def get_readme():
    """Return the Markdown model card that is displayed as the demo's article."""
    return """

    ----

    <!-- ---- -->

    # BERT Tagalog Part of Speech Tagger (BERTTPOST)

    ## 👥 Developed by
    - Saya-ang, Kenth G. ([@syke9p3](https://github.com/syke9p3))
    - Gozum, Denise Julianne S. ([@Xenoxianne](https://github.com/Xenoxianne))
    - Hamor, Mary Grizelle D. ([@mnemoria](https://github.com/mnemoria))
    - Mabansag, Ria Karen B. ([@riavx](https://github.com/riavx))


    ## 📋 Model Details

    <!-- Provide a longer summary of what this model is. -->
    ### Model Description

    - **Model type:** BERT Tagalog Base Uncased
    - **Languages (NLP):** Tagalog, Filipino
    - **Finetuned from model**: [GKLMIP/bert-tagalog-base-uncased](https://huggingface.co/GKLMIP/bert-tagalog-base-uncased)

    ### Dataset

    1000 annotated sentences from Sagum et al.'s Tagalog Corpora based on MGNN Tagset convention.

    | Dataset        | Number of Sentences | Percentage |
    |----------------|-----------------|------------|
    | Training Set   | 800           | 80%        |
    | Testing Set    | 200             | 20%        |

    ### Preprocessing
    A corpus was used containing tagged sentences in Tagalog language. The dataset comprises sentences with each word annotated with its corresponding POS tag in the format of ```<TAG word>```. To prepare the corpus for training, the following preprocessing steps were performed:
    1. **Removal of Line Identifier**: the line identifier, such as ```SNT.108970.2066```, was removed from each tagged sentence.
    2. **Symbol Conversion**: for the BERT model, certain special symbols like hyphens, quotes, commas, etc., were converted into special tokens (```PMP```, ```PMS```, ```PMC```) to preserve their meaning during tokenization.
    3. **Alignment of Tokenization**: the BERT tokenized words and their corresponding POS tags were aligned to ensure that the tokenization and tagging are consistent.


    ### Training

    This model was trained using PyTorch library with the following hyperparameters set:

    | **Hyperparameter**   |  **Value** |   
    |---------------- |---------
    | Batch Size      |  8 |
    | Training Epoch  |  5 |
    | Learning-rate   |  2e-5 |
    | Optimizer       |  Adam |



    ## ⚒️ Languages and Technologies

    [![Hugging Face](https://img.shields.io/badge/Hugging%20Face%20-FFD21E?style=for-the-badge&logo=huggingface&logoColor=black)](https://huggingface.co/)

    [![Python](https://img.shields.io/badge/Python-F7CC42?style=for-the-badge&logo=python&logoColor=black)](https://www.python.org/)

    [![Gradio](https://img.shields.io/badge/Gradio-F08705?style=for-the-badge&logo=gradio&logoColor=white)](https://www.gradio.app/)

    [![Jupyter Notebook](https://img.shields.io/badge/Jupyter%20Notebook-F37626?style=for-the-badge&logo=jupyter&logoColor=white)](https://jupyter.org/)

    [![PyTorch](https://img.shields.io/badge/PyTorch-EE4C2C?style=for-the-badge&logo=pytorch&logoColor=white)](https://pytorch.org/)





    ## 🏷️ Tags

    | Part of Speech                                 | Tags |
    |-----------------------------------------------|------|
    | **Noun**                                       | ![NNC](https://img.shields.io/badge/NNC-84CC16?style=for-the-badge&logoColor=white)  |
    | Common Noun                                   | NNC  |
    | Proper Noun                                   | NNP  |
    | Proper Noun Abbreviation                      | NNPA |
    | Common Noun Abbreviation                      | NNCA |
    | **Pronoun**                                       | ![PR](https://img.shields.io/badge/PR-0D9488?style=for-the-badge&logoColor=white)     |
    | as Subject (Palagyo)/Personal Pronouns Singular | PRS  |
    | Personal Pronouns                             | PRP  |
    | Possessive Subject (Paari)                    | PRSP |
    | Pointing to an Object Demonstrative/(Paturol/Pamatlig) | PRO  |
    | Question/Interrogative (Pananong)/Singular    | PRQ  |
    | Question/Interrogative Plural                 | PRQP |
    | Location (Panlunan)                           | PRL  |
    | Comparison (Panulad)                          | PRC  |
    | Found (Pahimaton)                             | PRF  |
    | Indefinite                                    | PRI  |
    | **Determiner**      | ![DT](https://img.shields.io/badge/DT-16A34A?style=for-the-badge&logoColor=white)      |
    | Determiner (Pantukoy) for Common Noun Plural  | DTC  |
    | Determiner (Pantukoy) for Proper Noun         | DTP  |
    | Determiner (Pantukoy) for Proper Noun Plural  | DTPP |
    | **Conjunctions (Pang-ugnay)**                     | ![CC](https://img.shields.io/badge/CC-16A34ADB2777?style=for-the-badge&logoColor=white)   |
    | **Lexical Marker**                                | ![LM](https://img.shields.io/badge/LM-EAB308?style=for-the-badge&logoColor=white)     |
    | Ligatures (Pang-angkop)                       | CCP  |
    | Preposition (Pang-ukol)                       | CCU  |
    | **Verb (Pandiwa)**                                | ![VB](https://img.shields.io/badge/VB-2563EB?style=for-the-badge&logoColor=white)     |
    | Neutral/Infinitive                            | VBW  |
    | Auxiliary, Modal/Pseudo-verbs                 | VBS  |
    | Existential                                  | VBH  |
    | Non-existential                              | VBN  |
    | Time Past (Perfective)                       | VBTS |
    | Time Present (Imperfective)                  | VBTR |
    | Time Future (Contemplative)                  | VBTF |
    | Recent past                                  | VBTP |
    | Actor Focus                                  | VBAF |
    | Object/Goal Focus                            | VBOF |
    | Benefactive Focus                            | VBOB |
    | Locative Focus                               | VBOL |
    | Instrumental Focus                           | VBOI |
    | Referential/Measurement Focus                | VBRF |
    | **Adjective**                     | ![JJ](https://img.shields.io/badge/JJ-0D9488?style=for-the-badge&logoColor=white)    |
    | Describing (Panlarawan)                      | JJD  |
    | Used for Comparison (same level) (Pahambing Magkatulad) | JJC  |
    | Comparison Comparative (more) (Palamang)     | JJCC |
    | Comparison Superlative (most) (Pasukdol)      | JJCS |
    | Comparison Negation (not quite) (Di-Magkatulad) | JJCN |
    | Describing Number (Pamilang)                 | JJN  |
    | **Adverb (Pang-Abay)**                           | ![RB](https://img.shields.io/badge/RB-DB2777?style=for-the-badge&logoColor=white)    |
    | Describing “How” (Pamaraan)                  | RBD  |
    | Number (Panggaano/Panukat)                   | RBN  |
    | Conditional (Kondisyunal)                   | RBK  |
    | Causative (Pananhi)                          | RBP  |
    | Benefactive (Benepaktibo)                    | RBB  |
    | Referential (Pangkaukulan)                   | RBR  |
    | Question (Pananong)                          | RBQ  |
    | Agree (Panang-ayon)                          | RBT  |
    | Disagree (Pananggi)                          | RBF  |
    | Frequency (Pamanahon)                        | RBW  |
    | Possibility (Pang-agam)                      | RBM  |
    | Place (Panlunan)                             | RBL  |
    | Enclitics (Paningit)                         | RBI  |
    | Interjections (Sambitla)                     | RBJ  |
    | Social Formula (Pormularyong Panlipunan)     | RBS  |
    |**Cardinal Number (Bilang)**                    | ![CD](https://img.shields.io/badge/CD-2563EB?style=for-the-badge&logoColor=white)    |
    | Digit, Rank, Count                           | CDB  |
    | **Topicless (Walang Paksa)**                     | ![TS](https://img.shields.io/badge/TS-0891B2?style=for-the-badge&logoColor=white)    |
    | **Foreign Words**                               | ![FW](https://img.shields.io/badge/FW-EA580C?style=for-the-badge&logoColor=white)    |
    | **Punctuation (Pananda)**                        | ![PM](https://img.shields.io/badge/PM-DB2777?style=for-the-badge&logoColor=white)    |
    | Period                                       | PMP  |
    | Exclamation Point                            | PME  |
    | Question Mark                                | PMQ  |
    | Comma                                        | PMC  |
    | Semi-colon                                   | PMSC |
    | Other Symbols                                      | PMS  |
        
    ## Bias, Risks, and Limitations

    This model has not been fully tested so please use with caution.

    """


# Web demo: a single text box feeds predict_tags and the resulting
# (word, tag) pairs go to the "highlight" output component. The model card
# from get_readme() is attached as the article.
tagger = gr.Interface(
    fn=predict_tags,
    inputs=gr.Textbox(placeholder="Enter sentence here..."),
    outputs=["highlight"],
    title="BERT Filipino Part of Speech Tagger",
    description="Enter a text in Tagalog to classify the tags for each word. Each word to tag needs to be space separated.",
    examples=[
        ["Ang bahay ay lumiliwanag na para bang may kumikislap sa bintana"],
        ["Naisip ko na kumain na lang tayo sa pinakasikat na restaurant sa Manila ."],
    ],
    article=get_readme(),
)

# Start serving the interface.
tagger.launch()