Spaces:

studio-ousia
/

luxe-demo

Sleeping

singletongue committed on Feb 13

Commit

17276eb

verified ·

1 Parent(s): aaaa32a

Fix a bug where input text is not unicode-normalized

Files changed (1) hide show

app.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import re
 from pathlib import Path
 import gradio as gr
@@ -72,13 +73,17 @@ class MecabTokenizer:
 mecab_tokenizer = MecabTokenizer()
 def get_texts_from_file(file_path):
     texts = []
     with open(file_path) as f:
         for line in f:
             line = line.strip()
             if line:
-                texts.append(line)
     return texts
@@ -214,7 +219,7 @@ with gr.Blocks() as demo:
     similar_entities = gr.State([])
     text_input = gr.Textbox(label="Input Text")
-    text_input.change(fn=lambda text: [text], inputs=text_input, outputs=texts)
     texts_file = gr.File(label="Input Texts")
     texts_file.change(fn=get_texts_from_file, inputs=texts_file, outputs=texts)
     topk_input = gr.Number(5, label="Top K", interactive=True)

 import re
+import unicodedata
 from pathlib import Path
 import gradio as gr
 mecab_tokenizer = MecabTokenizer()
+def normalize_text(text: str) -> str:
+    return unicodedata.normalize("NFKC", text)
 def get_texts_from_file(file_path):
     texts = []
     with open(file_path) as f:
         for line in f:
             line = line.strip()
             if line:
+                texts.append(normalize_text(line))
     return texts
     similar_entities = gr.State([])
     text_input = gr.Textbox(label="Input Text")
+    text_input.change(fn=lambda text: [normalize_text(text)], inputs=text_input, outputs=texts)
     texts_file = gr.File(label="Input Texts")
     texts_file.change(fn=get_texts_from_file, inputs=texts_file, outputs=texts)
     topk_input = gr.Number(5, label="Top K", interactive=True)