syke9p3 committed on
Commit 1605685 · verified
1 Parent(s): 681a4c2

Create app.py

Files changed (1)
app.py +533 -0
app.py ADDED
@@ -0,0 +1,533 @@
+ import re
+ from transformers import AutoTokenizer, AutoModelForTokenClassification
+ import gradio as gr
+
+ # # Load the trained model and tokenizer
+ # model_checkpoint = "BERTPOS"
+ # model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)
+ # tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
+
+
+ # Load the model and tokenizer from the Hugging Face Hub
+ tokenizer = AutoTokenizer.from_pretrained("syke9p3/bert-tagalog-base-uncased-pos-tagger")
+ model = AutoModelForTokenClassification.from_pretrained("syke9p3/bert-tagalog-base-uncased-pos-tagger")
+
+ pos_tag_mapping = {
+     '[PAD]': 0,
+     'NNC': 1,
+     'NNP': 2,
+     'NNPA': 3,
+     'NNCA': 4,
+     'PR': 5,
+     'PRS': 6,
+     'PRP': 7,
+     'PRSP': 8,
+     'PRO': 9,
+     'PRQ': 10,
+     'PRQP': 11,
+     'PRL': 12,
+     'PRC': 13,
+     'PRF': 14,
+     'PRI': 15,
+     'DT': 16,
+     'DTC': 17,
+     'DTP': 18,
+     'DTPP': 19,
+     'LM': 20,
+     'CC': 21,
+     'CCT': 22,
+     'CCR': 23,
+     'CCB': 24,
+     'CCA': 25,
+     'PM': 26,
+     'PMP': 27,
+     'PME': 28,
+     'PMQ': 29,
+     'PMC': 30,
+     'PMSC': 31,
+     'PMS': 32,
+     'VB': 33,
+     'VBW': 34,
+     'VBS': 35,
+     'VBN': 36,
+     'VBTS': 37,
+     'VBTR': 38,
+     'VBTF': 39,
+     'VBTP': 40,
+     'VBAF': 41,
+     'VBOF': 42,
+     'VBOB': 43,
+     'VBOL': 44,
+     'VBOI': 45,
+     'VBRF': 46,
+     'JJ': 47,
+     'JJD': 48,
+     'JJC': 49,
+     'JJCC': 50,
+     'JJCS': 51,
+     'JJCN': 52,
+     'JJCF': 53,
+     'JJCB': 54,
+     'JJT': 55,
+     'RB': 56,
+     'RBD': 57,
+     'RBN': 58,
+     'RBK': 59,
+     'RBP': 60,
+     'RBB': 61,
+     'RBR': 62,
+     'RBQ': 63,
+     'RBT': 64,
+     'RBF': 65,
+     'RBW': 66,
+     'RBM': 67,
+     'RBL': 68,
+     'RBI': 69,
+     'RBS': 70,
+     'RBJ': 71,
+     'RBY': 72,
+     'RBLI': 73,
+     'TS': 74,
+     'FW': 75,
+     'CD': 76,
+     'CCB_CCP': 77,
+     'CCR_CCA': 78,
+     'CCR_CCB': 79,
+     'CCR_CCP': 80,
+     'CCR_LM': 81,
+     'CCT_CCA': 82,
+     'CCT_CCP': 83,
+     'CCT_LM': 84,
+     'CCU_DTP': 85,
+     'CDB_CCA': 86,
+     'CDB_CCP': 87,
+     'CDB_LM': 88,
+     'CDB_NNC': 89,
+     'CDB_NNC_CCP': 90,
+     'JJCC_CCP': 91,
+     'JJCC_JJD': 92,
+     'JJCN_CCP': 93,
+     'JJCN_LM': 94,
+     'JJCS_CCB': 95,
+     'JJCS_CCP': 96,
+     'JJCS_JJC': 97,
+     'JJCS_JJC_CCP': 98,
+     'JJCS_JJD': 99,
+     '[UNK]': 100,
+     '[CLS]': 101,
+     '[SEP]': 102,
+     'JJCS_JJN': 103,
+     'JJCS_JJN_CCP': 104,
+     'JJCS_RBF': 105,
+     'JJCS_VBAF': 106,
+     'JJCS_VBAF_CCP': 107,
+     'JJCS_VBN_CCP': 108,
+     'JJCS_VBOF': 109,
+     'JJCS_VBOF_CCP': 110,
+     'JJCS_VBN': 111,
+     'RBQ_CCP': 112,
+     'JJC_CCB': 113,
+     'JJC_CCP': 114,
+     'JJC_PRL': 115,
+     'JJD_CCA': 116,
+     'JJD_CCB': 117,
+     'JJD_CCP': 118,
+     'JJD_CCT': 119,
+     'JJD_NNC': 120,
+     'JJD_NNP': 121,
+     'JJN_CCA': 122,
+     'JJN_CCB': 123,
+     'JJN_CCP': 124,
+     'JJN_NNC': 125,
+     'JJN_NNC_CCP': 126,
+     'JJD_NNC_CCP': 127,
+     'NNC_CCA': 128,
+     'NNC_CCB': 129,
+     'NNC_CCP': 130,
+     'NNC_NNC_CCP': 131,
+     'NN': 132,
+     'JJN': 133,
+     'NNP_CCA': 134,
+     'NNP_CCP': 135,
+     'NNP_NNP': 136,
+     'PRC_CCB': 137,
+     'PRC_CCP': 138,
+     'PRF_CCP': 139,
+     'PRQ_CCP': 140,
+     'PRQ_LM': 141,
+     'PRS_CCB': 142,
+     'PRS_CCP': 143,
+     'PRSP_CCP': 144,
+     'PRSP_CCP_NNP': 145,
+     'PRL_CCP': 146,
+     'PRL_LM': 147,
+     'PRO_CCB': 148,
+     'PRO_CCP': 149,
+     'VBS_CCP': 150,
+     'VBTR_CCP': 151,
+     'VBTS_CCA': 152,
+     'VBTS_CCP': 153,
+     'VBTS_JJD': 154,
+     'VBTS_LM': 155,
+     'VBAF_CCP': 156,
+     'VBOB_CCP': 157,
+     'VBOF_CCP': 158,
+     'VBOF_CCP_NNP': 159,
+     'VBRF_CCP': 160,
+     'CCP': 161,
+     'CDB': 162,
+     'RBW_CCP': 163,
+     'RBD_CCP': 164,
+     'DTCP': 165,
+     'VBH': 166,
+     'VBTS_VBOF': 167,
+     'PRI_CCP': 168,
+     'VBTR_VBAF_CCP': 169,
+     'DQL': 170,
+     'DQR': 171,
+     'RBT_CCP': 172,
+     'VBW_CCP': 173,
+     'RBI_CCP': 174,
+     'VBN_CCP': 175,
+     'VBTR_VBAF': 176,
+     'VBTF_CCP': 177,
+     'JJCS_JJD_NNC': 178,
+     'CCU': 179,
+     'RBL_CCP': 180,
+     'VBTR_VBRF_CCP': 181,
+     'PRP_CCP': 182,
+     'VBTR_VBRF': 183,
+     'VBH_CCP': 184,
+     'VBTS_VBAF': 185,
+     'VBTF_VBOF': 186,
+     'VBTR_VBOF': 187,
+     'VBTF_VBAF': 188,
+     'JJCS_JJD_CCB': 189,
+     'JJCS_JJD_CCP': 190,
+     'RBM_CCP': 191,
+     'NNCS': 192,
+     'PRI_CCB': 193,
+     'NNA': 194,
+     'VBTR_VBOB': 195,
+     'DC': 196,
+     'JJD_CP': 197,
+     'NC': 198,
+     'NC_CCP': 199,
+     'VBO': 200,
+     'JJD_CC': 201,
+     'VBF': 202,
+     'CP': 203,
+     'NP': 204,
+     'N': 205,
+     'F': 206,
+     'CT': 207,
+     'MS': 208,
+     'BTF': 209,
+     'CA': 210,
+     'VBOF_RBR': 211,
+     'DP': 212,
+ }
+
+
+ num_labels = len(pos_tag_mapping)
+ id2label = {idx: tag for tag, idx in pos_tag_mapping.items()}
+ label2id = {tag: idx for tag, idx in pos_tag_mapping.items()}
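+ # Illustrative values derived from the mapping above: num_labels == 213,
+ # id2label[1] == 'NNC', label2id['NNC'] == 1.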
+
+ special_symbols = ['-', '&', "\"", "[", "]", "/", "$", "(", ")", "%", ":", "'", '.', '?', ',']
+
+ def symbol2token(symbol):
+
+     # Check if the symbol is a comma
+     if symbol == ',':
+         return '[PMC] '
+
+     elif symbol == '.':
+         return '[PMP] '
+
+     # Check if the symbol is in the list of special symbols
+     elif symbol in special_symbols:
+         return '[PMS] '
+
+     # If the symbol is not a comma or in the special symbols list, keep it as it is
+     return symbol
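+ # Illustrative examples of the mapping above: symbol2token(',') -> '[PMC] ',
+ # symbol2token('.') -> '[PMP] ', symbol2token('(') -> '[PMS] ', symbol2token('a') -> 'a'.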
+
+ def preprocess_untagged_sentence(sentence):
+     # Define regex pattern to capture all special symbols
+     special_symbols_regex = '|'.join([re.escape(sym) for sym in ['-', '&', "\"", "[", "]", "/", "$", "(", ")", "%", ":", "'", '.']])
+
+     # Replace all special symbols with spaces around them
+     sentence = re.sub(rf'({special_symbols_regex})', r' \1 ', sentence)
+
+     # Remove extra whitespace
+     sentence = re.sub(r'\s+', ' ', sentence).strip()
+
+     upper = sentence
+
+     # Convert the sentence to lowercase
+     sentence = sentence.lower()
+
+     # Loop through the sentence and convert special symbols to the tokens [PMS], [PMC], or [PMP]
+     new_sentence = ""
+     i = 0
+     while i < len(sentence):
+         if any(sentence[i:].startswith(symbol) for symbol in special_symbols):
+             # Check for ellipsis and replace with '[PMS]'
+             if i + 2 < len(sentence) and sentence[i:i + 3] == '...':
+                 new_sentence += '[PMS]'
+                 i += 3
+             # Check for single special symbols
+             elif i + 1 == len(sentence):
+                 new_sentence += symbol2token(sentence[i])
+                 break
+             elif sentence[i + 1] == ' ' and i == 0:
+                 new_sentence += symbol2token(sentence[i])
+                 i += 1
+             elif sentence[i - 1] == ' ' and sentence[i + 1] == ' ':
+                 new_sentence += symbol2token(sentence[i])
+                 i += 1
+             elif sentence[i - 1] != ' ':
+                 new_sentence += ''
+             else:
+                 word_after_symbol = ""
+                 while i + 1 < len(sentence) and sentence[i + 1] != ' ' and not any(
+                         sentence[i + 1:].startswith(symbol) for symbol in special_symbols):
+                     word_after_symbol += sentence[i + 1]
+                     i += 1
+                 new_sentence += word_after_symbol
+         # Check for special symbols at the start of the sentence
+         elif any(sentence[i:].startswith(symbol) for symbol in special_symbols):
+             if i + 1 < len(sentence) and (sentence[i + 1] == ' ' and sentence[i - 1] != ' '):
+                 new_sentence += '[PMS] '
+                 i += 1
+             elif i + 1 == len(sentence):
+                 new_sentence += '[PMS] '
+                 break
+             else:
+                 word_after_symbol = ""
+                 while i + 1 < len(sentence) and sentence[i + 1] != ' ' and not any(
+                         sentence[i + 1:].startswith(symbol) for symbol in special_symbols):
+                     word_after_symbol += sentence[i + 1]
+                     i += 1
+                 new_sentence += word_after_symbol
+         else:
+             new_sentence += sentence[i]
+         i += 1
+
+     print("Sentence after:", new_sentence.split())
+     print("---")
+
+     return new_sentence, upper
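+ # Illustrative trace of the function above:
+ # preprocess_untagged_sentence("Maganda ang bahay .") returns
+ # ("maganda ang bahay [PMP] ", "Maganda ang bahay .").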
+
+
+ def preprocess_sentence(tagged_sentence):
+     # Remove the line identifier (e.g., SNT.80188.3)
+     sentence = re.sub(r'SNT\.\d+\.\d+\s+', '', tagged_sentence)
+     special_symbols = ['-', '&', ",", "\"", "[", "]", "/", "$", "(", ")", "%", ":", "'", '.']
+     # Construct the regex pattern for extracting words inside <TAGS>, including special symbols
+     special_symbols_regex = '|'.join([re.escape(sym) for sym in special_symbols])
+     regex_pattern = r'<(?:[^<>]+? )?([a-zA-Z0-9.,&"!?{}]+)>'.format(special_symbols_regex)
+     words = re.findall(regex_pattern, tagged_sentence)
+
+     # Join the words to form a sentence
+     sentence = ' '.join(words)
+     sentence = sentence.lower()
+
+     # print("---")
+     # print("Sentence before:", sentence)
+
+     # Loop through the sentence and convert special symbols to the tokens [PMS], [PMC], or [PMP]
+     new_sentence = ""
+     i = 0
+     # print("Length: ", len(sentence))
+     while i < len(sentence):
+         # print(f"{i+1} == {len(sentence)}: {sentence[i]}")
+
+         if any(sentence[i:].startswith(symbol) for symbol in special_symbols):
+             if i + 2 < len(sentence) and sentence[i:i + 3] == '...':
+                 # Ellipsis found, replace with '[PMS]'
+                 new_sentence += symbol2token(sentence[i])
+                 i += 3
+             elif i + 1 == len(sentence):
+                 new_sentence += symbol2token(sentence[i])
+                 break
+             elif sentence[i + 1] == ' ' and i == 0:
+                 new_sentence += symbol2token(sentence[i])
+                 i += 1
+             elif sentence[i - 1] == ' ' and sentence[i + 1] == ' ':
+                 new_sentence += symbol2token(sentence[i])
+                 i += 1
+             elif sentence[i - 1] != ' ':
+                 new_sentence += ''
+             else:
+                 word_after_symbol = ""
+                 while i + 1 < len(sentence) and sentence[i + 1] != ' ' and not any(
+                         sentence[i + 1:].startswith(symbol) for symbol in special_symbols):
+                     word_after_symbol += sentence[i + 1]
+                     i += 1
+                 new_sentence += word_after_symbol
+         elif any(sentence[i:].startswith(symbol) for symbol in special_symbols):
+             if i + 1 < len(sentence) and (sentence[i + 1] == ' ' and sentence[i - 1] != ' '):
+                 new_sentence += '[PMS] '
+                 i += 1
+             elif i + 1 == len(sentence):
+                 new_sentence += '[PMS] '
+                 break
+             else:
+                 word_after_symbol = ""
+                 while i + 1 < len(sentence) and sentence[i + 1] != ' ' and not any(
+                         sentence[i + 1:].startswith(symbol) for symbol in special_symbols):
+                     word_after_symbol += sentence[i + 1]
+                     i += 1
+                 new_sentence += word_after_symbol
+         else:
+             new_sentence += sentence[i]
+         i += 1
+
+     print("Sentence after:", new_sentence.split())
+     print("---")
+
+     return new_sentence
+
+
+ def extract_tags(input_sentence):
+     tags = re.findall(r'<([A-Z_]+)\s.*?>', input_sentence)
+     return tags
+
+ def align_tokenization(sentence, tags):
+
+     print("Sentence \n: ", sentence)
+     sentence = sentence.split()
+     print("Sentence Split\n: ", sentence)
+
+     tokenized_sentence = tokenizer.tokenize(' '.join(sentence))
+     # tokenized_sentence_string = " ".join(tokenized_sentence)
+     # print("ID2Token_string\n: ", tokenized_sentence_string)
+
+     aligned_tagging = []
+     current_word = ''
+     index = 0  # index of the current word in the sentence and tagging
+
+     for token in tokenized_sentence:
+         current_word += re.sub(r'^##', '', token)
+         print("Current word after replacing ##: ", current_word)
+         print("sentence[index]: ", sentence[index])
+
+         if sentence[index] == current_word:  # if we completed a word
+             print("completed a word: ", current_word)
+             current_word = ''
+             aligned_tagging.append(tags[index])
+             index += 1
+         else:  # otherwise insert padding
+             print("incomplete word: ", current_word)
+             aligned_tagging.append(0)
+
+         print("---")
+
+     decoded_tags = [list(pos_tag_mapping.keys())[list(pos_tag_mapping.values()).index(tag_id)] for tag_id in
+                     aligned_tagging]
+     print("Tokenized Sentence\n: ", tokenized_sentence)
+     print("Tags\n: ", decoded_tags)
+
+     assert len(tokenized_sentence) == len(aligned_tagging)
+
+     aligned_tagging = [0] + aligned_tagging
+     return tokenized_sentence, aligned_tagging
+
+
+ def process_tagged_sentence(tagged_sentence):
+     # print(tagged_sentence)
+
+     # Preprocess the input tagged sentence and extract the words and tags
+     sentence = preprocess_sentence(tagged_sentence)
+     tags = extract_tags(tagged_sentence)  # returns the gold tags (put these in tags.txt)
+
+     encoded_tags = [pos_tag_mapping[tag] for tag in tags]
+
+     # Align tokens by adding padding if needed
+     tokenized_sentence, encoded_tags = align_tokenization(sentence, encoded_tags)
+     encoded_sentence = tokenizer(sentence, padding="max_length", truncation=True, max_length=128)
+
+     # Create attention mask (1 for real tokens, 0 for padding)
+     attention_mask = [1] * len(encoded_sentence['input_ids'])
+     print("len(encoded_sentence['input_ids']):", len(encoded_sentence['input_ids']))
+     while len(encoded_sentence['input_ids']) < 128:
+         encoded_sentence['input_ids'].append(0)  # Pad with zeros
+         attention_mask.append(0)  # Pad attention mask
+
+     while len(encoded_tags) < 128:
+         encoded_tags.append(0)  # Pad with the ID of '[PAD]'
+
+     encoded_sentence['encoded_tags'] = encoded_tags
+
+     decoded_sentence = tokenizer.convert_ids_to_tokens(encoded_sentence['input_ids'], skip_special_tokens=False)
+
+     decoded_tags = [list(pos_tag_mapping.keys())[list(pos_tag_mapping.values()).index(tag_id)] for tag_id in
+                     encoded_tags]
+
+     word_tag_pairs = list(zip(decoded_sentence, decoded_tags))
+     print(encoded_sentence)
+     print("Sentence:", decoded_sentence)
+     print("Tags:", decoded_tags)
+     print("Decoded Sentence and Tags:", word_tag_pairs)
+     print("---")
+
+     return encoded_sentence
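+ # Note: process_tagged_sentence expects an already-tagged sentence (words wrapped as
+ # <TAG word>) with gold tags; it is not called by the Gradio interface defined below.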
+
+ import torch
+ import torch.nn.functional as F
+
+ def tag_sentence(input_sentence):
+     # Preprocess the input sentence and keep an original-cased copy
+     sentence, upper = preprocess_untagged_sentence(input_sentence)
+
+     # Tokenize and encode the sentence
+     encoded_sentence = tokenizer(sentence, padding="max_length", truncation=True, max_length=128, return_tensors="pt")
+
+     # Pass the encoded sentence to the model to get the predicted logits
+     with torch.no_grad():
+         model_output = model(**encoded_sentence)
+
+     # Get the logits and apply softmax to convert them into probabilities
+     logits = model_output.logits
+     probabilities = F.softmax(logits, dim=-1)
+
+     # Get the predicted tag for each token in the sentence
+     predicted_tags = torch.argmax(probabilities, dim=-1)
+
+     # Convert the predicted tags to their corresponding labels using id2label
+     labels = [id2label[tag.item()] for tag in predicted_tags[0] if id2label[tag.item()] != '[PAD]']
+
+     return labels
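+ # Note: the returned labels are per WordPiece position (excluding positions predicted as
+ # '[PAD]'), so their count can differ from the number of whitespace-separated words.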
+
+ # Example usage:
+ test_sentence = 'Ang bahay ay maganda na para bang may kumikislap sa bintana .'
+
+ def predict_tags(test_sentence):
+
+     sentence, upper = preprocess_untagged_sentence(test_sentence)
+     words_list = upper.split()
+     print("Words: ", words_list)
+     predicted_tags = tag_sentence(test_sentence)
+     print(predicted_tags)
+
+     pairs = list(zip(words_list, predicted_tags))
+     return pairs
+
+ # Quick smoke test at startup
+ predict_tags(test_sentence)
+
+
+ tagger = gr.Interface(
+     predict_tags,
+     gr.Textbox(placeholder="Enter sentence here..."),
+     ["highlight"],
+     title="BERT Filipino Part of Speech Tagger",
+     description="Enter a Tagalog sentence to classify the part-of-speech tag of each word. Words must be space-separated.",
+     examples=[
+         ["Ang bahay ay lumiliwanag na para bang may kumikislap sa bintana"],
+         ["Naisip ko na kumain na lang tayo sa pinakasikat na restaurant sa Manila ."],
+     ],
+ )
+
+ tagger.launch()