Spaces:

studio-ousia
/

luxe-demo

Running

App Files Community

singletongue committed on Mar 25

Commit

a3c6550

verified ·

1 Parent(s): fd70e43

Enable truncation in tokenization

Browse files

Files changed (1) hide show

app.py +4 -5

app.py CHANGED Viewed

@@ -160,13 +160,13 @@ def get_topk_entities_from_texts(
     for text in texts:
         text = normalize_text(text).strip()
-        tokenized_examples = tokenizer(text, return_tensors="pt")
         model_outputs = model(**tokenized_examples)
         token_spans = get_token_spans(tokenizer, text)
         entity_spans = get_predicted_entity_spans(model_outputs.ner_logits[0], token_spans, entity_span_sensitivity)
         batch_entity_spans.append(entity_spans)
-        tokenized_examples = tokenizer(text, entity_spans=entity_spans or None, return_tensors="pt")
         model_outputs = model(**tokenized_examples)
         if model_outputs.topic_entity_logits is not None:
@@ -258,7 +258,7 @@ def replace_entities(
                     normal_entity_counts[entity] = 1
     for entity, text in new_entity_text_pairs:
-        tokenized_inputs = tokenizer(text[:MAX_TEXT_LENGTH], return_tensors="pt")
         model_outputs = model(**tokenized_inputs)
         entity_embedding = model.entity_predictions.transform(model_outputs.last_hidden_state[:, 0])[0]
         if entity.startswith(CATEGORY_ENTITY_PREFIX):
@@ -368,7 +368,6 @@ with gr.Blocks() as demo:
     gr.Markdown(
         """Studio Ousia で開発中の次世代知識強化言語モデル **LUXE** の動作デモです。
         入力されたテキストに対して、テキスト中に出現するエンティティ（事物）と、テキスト全体の主題となるエンティティおよびカテゴリを予測します。
         デフォルトのLUXEは、エンティティおよびカテゴリとして、それぞれ日本語 Wikipedia における被リンク数上位50万件および10万件の項目を使用しています。
         予測対象のエンティティを任意のものに置き換えて推論を行うことも可能です（下記「LUXE のエンティティ語彙を置き換える」を参照してください）。""",
         line_breaks=True,
@@ -430,7 +429,7 @@ with gr.Blocks() as demo:
             line_breaks=True,
         )
         gr.Markdown(
-            f"「エンティティ」と「エンティティの説明文」の2列からなる CSV ファイル（最大{MAX_ENTITY_FILE_LINES}行）をアップロードできます。"
         )
         new_entity_text_pairs_file = gr.File(label="エンティティと説明文の CSV ファイル", height="128px")
         gr.Markdown("CSV ファイルから読み込まれた項目が以下の表に表示されます。表の内容を直接編集することも可能です。")

     for text in texts:
         text = normalize_text(text).strip()
+        tokenized_examples = tokenizer(text, truncation=True, return_tensors="pt")
         model_outputs = model(**tokenized_examples)
         token_spans = get_token_spans(tokenizer, text)
         entity_spans = get_predicted_entity_spans(model_outputs.ner_logits[0], token_spans, entity_span_sensitivity)
         batch_entity_spans.append(entity_spans)
+        tokenized_examples = tokenizer(text, entity_spans=entity_spans or None, truncation=True, return_tensors="pt")
         model_outputs = model(**tokenized_examples)
         if model_outputs.topic_entity_logits is not None:
                     normal_entity_counts[entity] = 1
     for entity, text in new_entity_text_pairs:
+        tokenized_inputs = tokenizer(text[:MAX_TEXT_LENGTH], truncation=True, return_tensors="pt")
         model_outputs = model(**tokenized_inputs)
         entity_embedding = model.entity_predictions.transform(model_outputs.last_hidden_state[:, 0])[0]
         if entity.startswith(CATEGORY_ENTITY_PREFIX):
     gr.Markdown(
         """Studio Ousia で開発中の次世代知識強化言語モデル **LUXE** の動作デモです。
         入力されたテキストに対して、テキスト中に出現するエンティティ（事物）と、テキスト全体の主題となるエンティティおよびカテゴリを予測します。
         デフォルトのLUXEは、エンティティおよびカテゴリとして、それぞれ日本語 Wikipedia における被リンク数上位50万件および10万件の項目を使用しています。
         予測対象のエンティティを任意のものに置き換えて推論を行うことも可能です（下記「LUXE のエンティティ語彙を置き換える」を参照してください）。""",
         line_breaks=True,
             line_breaks=True,
         )
         gr.Markdown(
+            f"「エンティティ」と「エンティティの説明文（最大{MAX_TEXT_LENGTH}文字）」の2列からなる CSV ファイル（最大{MAX_ENTITY_FILE_LINES}行）をアップロードできます。"
         )
         new_entity_text_pairs_file = gr.File(label="エンティティと説明文の CSV ファイル", height="128px")
         gr.Markdown("CSV ファイルから読み込まれた項目が以下の表に表示されます。表の内容を直接編集することも可能です。")