Spaces:
Sleeping
Sleeping
Fix a bug where input text is not Unicode-normalized
Browse files
app.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
import re
|
|
|
|
| 2 |
from pathlib import Path
|
| 3 |
|
| 4 |
import gradio as gr
|
|
@@ -72,13 +73,17 @@ class MecabTokenizer:
|
|
| 72 |
mecab_tokenizer = MecabTokenizer()
|
| 73 |
|
| 74 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
def get_texts_from_file(file_path):
    """Collect the non-blank lines of a text file.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list with one entry per non-blank line, in file order, each with
        leading and trailing whitespace removed.
    """
    # Decode explicitly as UTF-8 instead of relying on the locale's default
    # encoding, which varies between hosts.
    with open(file_path, encoding="utf-8") as f:
        stripped_lines = (line.strip() for line in f)
        # Materialize the list while the file is still open.
        return [line for line in stripped_lines if line]
|
| 84 |
|
|
@@ -214,7 +219,7 @@ with gr.Blocks() as demo:
|
|
| 214 |
similar_entities = gr.State([])
|
| 215 |
|
| 216 |
text_input = gr.Textbox(label="Input Text")
|
| 217 |
-
text_input.change(fn=lambda text: [text], inputs=text_input, outputs=texts)
|
| 218 |
texts_file = gr.File(label="Input Texts")
|
| 219 |
texts_file.change(fn=get_texts_from_file, inputs=texts_file, outputs=texts)
|
| 220 |
topk_input = gr.Number(5, label="Top K", interactive=True)
|
|
|
|
| 1 |
import re
|
| 2 |
+
import unicodedata
|
| 3 |
from pathlib import Path
|
| 4 |
|
| 5 |
import gradio as gr
|
|
|
|
| 73 |
mecab_tokenizer = MecabTokenizer()
|
| 74 |
|
| 75 |
|
| 76 |
+
def normalize_text(text: str) -> str:
    """Return ``text`` converted to Unicode normalization form NFKC."""
    normalization_form = "NFKC"
    normalized = unicodedata.normalize(normalization_form, text)
    return normalized
|
| 78 |
+
|
| 79 |
+
|
| 80 |
def get_texts_from_file(file_path):
    """Collect the non-blank lines of a text file, normalized.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list with one entry per non-blank line, in file order. Each line
        has leading and trailing whitespace removed and is then passed
        through ``normalize_text``.
    """
    # Decode explicitly as UTF-8 instead of relying on the locale's default
    # encoding, which varies between hosts.
    with open(file_path, encoding="utf-8") as f:
        stripped_lines = (line.strip() for line in f)
        # Materialize the list while the file is still open.
        return [normalize_text(line) for line in stripped_lines if line]
|
| 89 |
|
|
|
|
| 219 |
similar_entities = gr.State([])
|
| 220 |
|
| 221 |
text_input = gr.Textbox(label="Input Text")
|
| 222 |
+
text_input.change(fn=lambda text: [normalize_text(text)], inputs=text_input, outputs=texts)
|
| 223 |
texts_file = gr.File(label="Input Texts")
|
| 224 |
texts_file.change(fn=get_texts_from_file, inputs=texts_file, outputs=texts)
|
| 225 |
topk_input = gr.Number(5, label="Top K", interactive=True)
|