Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Fix a bug where input text is not Unicode-normalized
Browse files
app.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
import re
|
|
|
2 |
from pathlib import Path
|
3 |
|
4 |
import gradio as gr
|
@@ -72,13 +73,17 @@ class MecabTokenizer:
|
|
72 |
mecab_tokenizer = MecabTokenizer()
|
73 |
|
74 |
|
|
|
|
|
|
|
|
|
75 |
def get_texts_from_file(file_path):
|
76 |
texts = []
|
77 |
with open(file_path) as f:
|
78 |
for line in f:
|
79 |
line = line.strip()
|
80 |
if line:
|
81 |
-
texts.append(line)
|
82 |
|
83 |
return texts
|
84 |
|
@@ -214,7 +219,7 @@ with gr.Blocks() as demo:
|
|
214 |
similar_entities = gr.State([])
|
215 |
|
216 |
text_input = gr.Textbox(label="Input Text")
|
217 |
-
text_input.change(fn=lambda text: [text], inputs=text_input, outputs=texts)
|
218 |
texts_file = gr.File(label="Input Texts")
|
219 |
texts_file.change(fn=get_texts_from_file, inputs=texts_file, outputs=texts)
|
220 |
topk_input = gr.Number(5, label="Top K", interactive=True)
|
|
|
1 |
import re
|
2 |
+
import unicodedata
|
3 |
from pathlib import Path
|
4 |
|
5 |
import gradio as gr
|
|
|
73 |
mecab_tokenizer = MecabTokenizer()
|
74 |
|
75 |
|
76 |
+
def normalize_text(text: str) -> str:
    """Return *text* converted to Unicode normalization form NFKC.

    NFKC applies compatibility decomposition followed by canonical
    composition, so visually equivalent variants (for example full-width
    Latin letters or half-width katakana) collapse to a single form.
    """
    return unicodedata.normalize("NFKC", text)


def get_texts_from_file(file_path):
    """Read a text file and return its non-empty lines, NFKC-normalized.

    Each line is stripped of surrounding whitespace; lines that are empty
    after stripping are skipped, and the rest are passed through
    ``normalize_text``.

    Args:
        file_path: Path of the text file to read, or ``None``.

    Returns:
        A list of normalized, non-empty lines in file order. An empty list
        is returned when ``file_path`` is ``None``.
    """
    # NOTE(review): this function is wired to a Gradio File ``change`` event,
    # which presumably delivers None when the upload is cleared -- confirm
    # against the Gradio version in use. Returning no texts avoids a
    # TypeError from open(None).
    if file_path is None:
        return []

    # "utf-8-sig" decodes plain UTF-8 unchanged and additionally drops a
    # leading byte-order mark, which would otherwise survive both strip()
    # and NFKC normalization as a stray U+FEFF at the start of the first text.
    # An explicit encoding also keeps decoding independent of the host locale.
    with open(file_path, encoding="utf-8-sig") as f:
        stripped_lines = (line.strip() for line in f)
        # Consume the generator while the file is still open.
        return [normalize_text(line) for line in stripped_lines if line]
|
89 |
|
|
|
219 |
similar_entities = gr.State([])
|
220 |
|
221 |
text_input = gr.Textbox(label="Input Text")
|
222 |
+
text_input.change(fn=lambda text: [normalize_text(text)], inputs=text_input, outputs=texts)
|
223 |
texts_file = gr.File(label="Input Texts")
|
224 |
texts_file.change(fn=get_texts_from_file, inputs=texts_file, outputs=texts)
|
225 |
topk_input = gr.Number(5, label="Top K", interactive=True)
|