Change MAX_TEXT_FILE_LINES to 10, clean up entity names, modify some UI components
app.py CHANGED
@@ -2,22 +2,21 @@ import csv
 import re
 import unicodedata
 from collections import defaultdict
-from
+from itertools import chain

 import gradio as gr
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-import unidic_lite
 from bm25s.hf import BM25HF, TokenizerHF
-from fugashi import GenericTagger
 from transformers import AutoModelForPreTraining, AutoTokenizer


 ALIAS_SEP = "|"
+CATEGORY_ENTITY_PREFIX = "Category:"
 ENTITY_SPECIAL_TOKENS = ["[PAD]", "[UNK]", "[MASK]", "[MASK2]"]
 MAX_TEXT_LENGTH = 800
-MAX_TEXT_FILE_LINES =
+MAX_TEXT_FILE_LINES = 10
 MAX_ENTITY_FILE_LINES = 1000

 repo_id = "studio-ousia/luxe"
@@ -37,32 +36,21 @@ ignore_category_patterns = [
 ]


-
-
-
-
-
-
-
-        outputs = []
-
-        for node in self.tagger(text):
-            word = node.surface.strip()
-            pos = node.feature[0]
-            start = text.index(word, end)
-            end = start + len(word)
-            outputs.append((word, pos, (start, end)))
-
-        return outputs
-
-
-mecab_tokenizer = MecabTokenizer()
+def clean_default_entity_vocab(tokenizer):
+    entity_vocab = {}
+    for entity, entity_id in tokenizer.entity_vocab.items():
+        if entity.startswith("ja:"):
+            entity = entity.removeprefix("ja:")
+        elif entity.startswith("Category:ja:"):
+            entity = "Category:" + entity.removeprefix("Category:ja:")
+
+        entity_vocab[entity] = entity_id
+
+    tokenizer.entity_vocab = entity_vocab


 def normalize_text(text: str) -> str:
-    return unicodedata.normalize("NFKC", text)
+    return unicodedata.normalize("NFKC", text).strip()


 def get_texts_from_file(file_path: str | None):
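The new `clean_default_entity_vocab` ("clean up entity names" in the commit title) strips the language prefix from the default vocabulary keys, mapping `ja:東京` to `東京` and `Category:ja:...` to `Category:...` while keeping each entity's id. A standalone sketch of the same mapping, using a made-up toy vocabulary (not the actual LUXE vocabulary):

```python
# Toy illustration of the prefix cleanup in the diff above.
def clean_entity_name(entity: str) -> str:
    # str.removeprefix requires Python 3.9+
    if entity.startswith("ja:"):
        return entity.removeprefix("ja:")
    if entity.startswith("Category:ja:"):
        return "Category:" + entity.removeprefix("Category:ja:")
    return entity

vocab = {"[PAD]": 0, "ja:東京": 1, "Category:ja:日本の首都": 2}  # hypothetical entries
print({clean_entity_name(e): i for e, i in vocab.items()})
# {'[PAD]': 0, '東京': 1, 'Category:日本の首都': 2}
```

Note that if two keys collide after stripping, the later entry overwrites the earlier id, which is also the behavior of the dict assignment in the committed function.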
@@ -73,36 +61,20 @@ def get_texts_from_file(file_path: str | None):
             reader = csv.DictReader(f, fieldnames=["text"])
             for i, row in enumerate(reader):
                 if i >= MAX_TEXT_FILE_LINES:
-                    gr.Info(f"{MAX_TEXT_FILE_LINES}行目までのデータを読み込みました。")
+                    gr.Info(f"{MAX_TEXT_FILE_LINES}行目までのデータを読み込みました。", duration=5)
                     break

-                text =
-                if text != "":
+                text = row["text"]
+                if text.strip() != "":
                     texts.append(text[:MAX_TEXT_LENGTH])
     except Exception as e:
-        gr.Warning("ファイルを正しく読み込めませんでした。")
+        gr.Warning("ファイルを正しく読み込めませんでした。", duration=5)
         print(e)
         texts = []

     return texts


-def get_noun_spans_from_text(text: str) -> list[tuple[int, int]]:
-    last_pos = None
-    noun_spans = []
-
-    for word, pos, (start, end) in mecab_tokenizer(text):
-        if pos == "名詞":
-            if len(noun_spans) > 0 and last_pos == "名詞":
-                noun_spans[-1] = (noun_spans[-1][0], end)
-            else:
-                noun_spans.append((start, end))
-
-        last_pos = pos
-
-    return noun_spans
-
-
 def get_token_spans(tokenizer, text: str) -> list[tuple[int, int]]:
     token_spans = []
     end = 0
@@ -147,12 +119,17 @@ def get_predicted_entity_spans(

 def get_topk_entities_from_texts(
     models,
-    texts: list[str],
+    texts: str | list[str],
     k: int = 5,
     entity_span_sensitivity: float = 1.0,
     nayose_coef: float = 1.0,
     entity_replaced_counts: bool = False,
 ) -> tuple[list[list[tuple[int, int]]], list[list[str]], list[list[str]], list[list[list[str]]]]:
+    gr.Info("LUXEによる予測を実行しています。", duration=5)
+
+    if isinstance(texts, str):
+        texts = [texts]
+
     model, tokenizer, bm25_tokenizer, bm25_retriever = models

     batch_entity_spans: list[list[tuple[int, int]]] = []
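`get_topk_entities_from_texts` is now wired directly to both the Textbox (a `str`) and the file-upload State (a `list[str]`), so it coerces its input to a list first. The same idiom in isolation:

```python
# Sketch of the str | list[str] coercion used above; the helper name is ours.
def as_text_list(texts: str | list[str]) -> list[str]:
    if isinstance(texts, str):
        texts = [texts]
    return texts

assert as_text_list("東京タワーは港区にある。") == ["東京タワーは港区にある。"]
assert as_text_list(["a", "b"]) == ["a", "b"]
```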
@@ -177,7 +154,12 @@ def get_topk_entities_from_texts(
         and any(re.search(pattern, entity) for pattern in ignore_category_patterns)
     ]

+    entity_k = min(k, len(id2normal_entity))
+    category_k = min(k, len(id2category_entity))
+
     for text in texts:
+        text = normalize_text(text).strip()
+
         tokenized_examples = tokenizer(text, return_tensors="pt")
         model_outputs = model(**tokenized_examples)
         token_spans = get_token_spans(tokenizer, text)
@@ -188,14 +170,14 @@ def get_topk_entities_from_texts(
         model_outputs = model(**tokenized_examples)

         if model_outputs.topic_entity_logits is not None:
-            _, topk_normal_entity_ids = model_outputs.topic_entity_logits[0].topk(k)
+            _, topk_normal_entity_ids = model_outputs.topic_entity_logits[0].topk(entity_k)
             topk_normal_entities.append([id2normal_entity[id_] for id_ in topk_normal_entity_ids.tolist()])
         else:
             topk_normal_entities.append([])

         if model_outputs.topic_category_logits is not None:
             model_outputs.topic_category_logits[:, ignore_category_entity_ids] = float("-inf")
-            _, topk_category_entity_ids = model_outputs.topic_category_logits[0].topk(k)
+            _, topk_category_entity_ids = model_outputs.topic_category_logits[0].topk(category_k)
             topk_category_entities.append([id2category_entity[id_] for id_ in topk_category_entity_ids.tolist()])
         else:
             topk_category_entities.append([])
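`entity_k` and `category_k` clamp the requested k to the vocabulary size, presumably because `torch.Tensor.topk` raises a `RuntimeError` when k exceeds the size of the selected dimension, which can now happen once the vocabulary is replaced with a small custom set. In isolation:

```python
import torch

logits = torch.randn(3)                 # e.g. only 3 entities remain after replacement
k = 5
entity_k = min(k, logits.size(-1))      # clamp as in the diff above
values, ids = logits.topk(entity_k)     # logits.topk(5) would raise a RuntimeError here
print(ids.tolist())
```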
@@ -211,7 +193,7 @@ def get_topk_entities_from_texts(
             )
             span_entity_logits += nayose_coef * nayose_scores

-            _, topk_span_entity_ids = span_entity_logits.topk(k)
+            _, topk_span_entity_ids = span_entity_logits.topk(entity_k)
             topk_span_entities.append(
                 [[id2normal_entity[id_] for id_ in ids] for ids in topk_span_entity_ids.tolist()]
             )
@@ -221,51 +203,6 @@ def get_topk_entities_from_texts(
     return texts, batch_entity_spans, topk_normal_entities, topk_category_entities, topk_span_entities


-def get_selected_entity(evt: gr.SelectData):
-    return evt.value[0]
-
-
-def get_similar_entities(models, query_entity: str, k: int = 10) -> list[str]:
-    model, tokenizer, _, _ = models
-
-    query_entity_id = tokenizer.entity_vocab[query_entity]
-
-    id2normal_entity = {
-        entity_id: entity
-        for entity, entity_id in tokenizer.entity_vocab.items()
-        if entity_id < model.config.num_normal_entities
-    }
-    id2category_entity = {
-        entity_id - model.config.num_normal_entities: entity
-        for entity, entity_id in tokenizer.entity_vocab.items()
-        if entity_id >= model.config.num_normal_entities
-    }
-    ignore_category_entity_ids = [
-        entity_id - model.config.num_normal_entities
-        for entity, entity_id in tokenizer.entity_vocab.items()
-        if entity_id >= model.config.num_normal_entities
-        and any(re.search(pattern, entity) for pattern in ignore_category_patterns)
-    ]
-    entity_embeddings = model.luke.entity_embeddings.entity_embeddings.weight
-    normal_entity_embeddings = entity_embeddings[: model.config.num_normal_entities]
-    category_entity_embeddings = entity_embeddings[model.config.num_normal_entities :]
-
-    if query_entity_id < model.config.num_normal_entities:
-        topk_entity_scores = normal_entity_embeddings[query_entity_id] @ normal_entity_embeddings.T
-        topk_entity_ids = topk_entity_scores.topk(k + 1).indices[1:]
-        topk_entities = [id2normal_entity[entity_id] for entity_id in topk_entity_ids.tolist()]
-    else:
-        query_entity_id -= model.config.num_normal_entities
-        topk_entity_scores = category_entity_embeddings[query_entity_id] @ category_entity_embeddings.T
-
-        topk_entity_scores[ignore_category_entity_ids] = float("-inf")
-
-        topk_entity_ids = topk_entity_scores.topk(k + 1).indices[1:]
-        topk_entities = [id2category_entity[entity_id] for entity_id in topk_entity_ids.tolist()]
-
-    return topk_entities
-
-
 def get_new_entity_text_pairs_from_file(file_path: str | None) -> list[list[str]]:
     new_entity_text_pairs = []
     if file_path is not None:
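The removed `get_similar_entities` ranked neighbors by the dot product of the query row against the whole embedding table, then dropped the top hit because it is the query itself. A self-contained sketch of that trick with random stand-in embeddings (note the `[1:]` relies on the query ranking first, which is guaranteed for normalized embeddings but only typical for unnormalized ones; the removed code made the same assumption):

```python
import torch

embeddings = torch.nn.functional.normalize(torch.randn(100, 32))  # stand-in table
query_id = 7
k = 5

scores = embeddings[query_id] @ embeddings.T     # similarity of the query to every row
topk_ids = scores.topk(k + 1).indices[1:]        # rank 0 is the query itself; skip it
print(topk_ids.tolist())
```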
@@ -274,7 +211,7 @@ def get_new_entity_text_pairs_from_file(file_path: str | None) -> list[list[str]]:
             reader = csv.DictReader(f, fieldnames=["entity", "text"])
             for i, row in enumerate(reader):
                 if i >= MAX_ENTITY_FILE_LINES:
-                    gr.Info(f"{MAX_ENTITY_FILE_LINES}行目までのデータを読み込みました。")
+                    gr.Info(f"{MAX_ENTITY_FILE_LINES}行目までのデータを読み込みました。", duration=5)
                     break

                 entity = normalize_text(row["entity"]).strip()
@@ -282,7 +219,7 @@ def get_new_entity_text_pairs_from_file(file_path: str | None) -> list[list[str]]:
                 if entity != "" and text != "":
                     new_entity_text_pairs.append([entity, text])
     except Exception as e:
-        gr.Warning("ファイルを正しく読み込めませんでした。")
+        gr.Warning("ファイルを正しく読み込めませんでした。", duration=5)
         print(e)
         new_entity_text_pairs = []

@@ -290,90 +227,109 @@ def get_new_entity_text_pairs_from_file(file_path: str | None) -> list[list[str]]:


 def replace_entities(
-    models,
-
-
-
-    new_entity_counts: list[int] | None = None,
-    new_padding_idx: int = 0,
-) -> True:
-    model, tokenizer, bm25_tokenizer, bm25_retriever = models
-
-    gr.Info("トークナイザのエンティティの語彙を置き換えています...", duration=5)
-    new_entity_tokens = ENTITY_SPECIAL_TOKENS + [entity for entity, _ in new_entity_text_pairs]
-
-    new_entity_vocab = {}
-    for entity in new_entity_tokens:
-        if entity not in new_entity_vocab:
-            new_entity_vocab[entity] = len(new_entity_vocab)
-
-
-    tokenizer
-    tokenizer.entity_pad_token_id = tokenizer.entity_vocab["[PAD]"]
-    tokenizer.entity_unk_token_id = tokenizer.entity_vocab["[UNK]"]
-    tokenizer.entity_mask_token_id = tokenizer.entity_vocab["[MASK]"]
-    tokenizer.entity_mask2_token_id = tokenizer.entity_vocab["[MASK2]"]
-
-    gr.Info("モデルのエンティティの埋め込みを置き換えています...", duration=5)
-    new_entity_embeddings_dict = defaultdict(list)
-
-
-
+    models, new_entity_text_pairs: list[tuple[str, str]], entity_replaced_counts: int, preserve_default_entities: bool
+) -> int:
+    if len(new_entity_text_pairs) == 0:
+        return entity_replaced_counts
+
+    gr.Info("LUXEのモデルとトークナイザのエンティティ語彙を更新しています。完了までお待ちください。", duration=5)
+
+    model, tokenizer, bm25_tokenizer, bm25_retriever = models
+
+    normal_entity_embeddings = defaultdict(list)  # entity -> list of embeddings
+    category_entity_embeddings = defaultdict(list)  # entity -> list of embeddings
+    normal_entity_counts = {}  # entity -> count (int)
+    category_entity_counts = {}  # entity -> count (int)
+
+    for entity, entity_id in sorted(tokenizer.entity_vocab.items(), key=lambda x: x[1]):
+        if entity in ENTITY_SPECIAL_TOKENS or preserve_default_entities:
+            entity_embedding = model.luke.entity_embeddings.entity_embeddings.weight.data[entity_id]
+            if entity.startswith(CATEGORY_ENTITY_PREFIX):
+                category_entity_embeddings[entity].append(entity_embedding)
+                if model.config.entity_counts is not None:
+                    category_entity_counts[entity] = model.config.entity_counts[entity_id]
+                else:
+                    category_entity_counts[entity] = 1
+            else:
+                normal_entity_embeddings[entity].append(entity_embedding)
+                if model.config.entity_counts is not None:
+                    normal_entity_counts[entity] = model.config.entity_counts[entity_id]
+                else:
+                    normal_entity_counts[entity] = 1

     for entity, text in new_entity_text_pairs:
-        entity_id = tokenizer.entity_vocab[entity]
         tokenized_inputs = tokenizer(text[:MAX_TEXT_LENGTH], return_tensors="pt")
         model_outputs = model(**tokenized_inputs)
-
-
-
-
-        raise ValueError("All items in new_entity_counts must be greater than zero")
+        entity_embedding = model.entity_predictions.transform(model_outputs.last_hidden_state[:, 0])[0]
+        if entity.startswith(CATEGORY_ENTITY_PREFIX):
+            category_entity_embeddings[entity].append(entity_embedding)
+            category_entity_counts.setdefault(entity, 1)
+        else:
+            normal_entity_embeddings[entity].append(entity_embedding)
+            normal_entity_counts.setdefault(entity, 1)
+
+    num_normal_entities = len(normal_entity_embeddings)
+    num_category_entities = len(category_entity_embeddings)
+
+    entity_embeddings = {
+        entity: sum(embeddings) / len(embeddings)
+        for entity, embeddings in chain(normal_entity_embeddings.items(), category_entity_embeddings.items())
+    }
+    entity_vocab = {entity: entity_id for entity_id, entity in enumerate(entity_embeddings.keys())}
+
+    entity_counts = [
+        category_entity_counts[entity] if entity.startswith(CATEGORY_ENTITY_PREFIX) else normal_entity_counts[entity]
+        for entity in entity_vocab.keys()
+    ]
+
+    tokenizer.entity_vocab = entity_vocab
+    tokenizer.entity_pad_token_id = entity_vocab["[PAD]"]
+    tokenizer.entity_unk_token_id = entity_vocab["[UNK]"]
+    tokenizer.entity_mask_token_id = entity_vocab["[MASK]"]
+    tokenizer.entity_mask2_token_id = entity_vocab["[MASK2]"]
+
+    entity_embeddings_tensor = torch.vstack(list(entity_embeddings.values()))

     if model.config.normalize_entity_embeddings:
-
+        entity_embeddings_tensor = F.normalize(entity_embeddings_tensor)

-
-
-
-
+    entity_vocab_size, entity_emb_size = entity_embeddings_tensor.size()
+
+    entity_embeddings_module = nn.Embedding(
+        entity_vocab_size,
+        entity_emb_size,
+        padding_idx=tokenizer.entity_pad_token_id,
         device=model.luke.entity_embeddings.entity_embeddings.weight.device,
         dtype=model.luke.entity_embeddings.entity_embeddings.weight.dtype,
     )
-
-    model.luke.entity_embeddings.entity_embeddings =
+    entity_embeddings_module.weight.data = entity_embeddings_tensor.data
+    model.luke.entity_embeddings.entity_embeddings = entity_embeddings_module

-    model.entity_predictions.decoder =
-    model.entity_predictions.bias = nn.Parameter(torch.zeros(
+    entity_decoder_module = nn.Linear(entity_emb_size, entity_vocab_size, bias=False)
+    model.entity_predictions.decoder = entity_decoder_module
+    model.entity_predictions.bias = nn.Parameter(torch.zeros(entity_vocab_size))
     model.tie_weights()

-    if
-
+    if model.config.entity_counts is not None:
+        total_normal_entity_count = sum(entity_counts[:num_normal_entities])
+        total_category_entity_count = sum(entity_counts[num_normal_entities:])
+
+        entity_counts_tensor = torch.tensor(entity_counts, dtype=model.dtype, device=model.device)
+        total_entity_counts = torch.tensor(
+            [total_normal_entity_count] * num_normal_entities + [total_category_entity_count] * num_category_entities,
+            dtype=model.dtype,
+            device=model.device,
+        )
+        entity_log_probs = torch.log(entity_counts_tensor / total_entity_counts)
+        model.entity_log_probs = entity_log_probs

-    model.config.entity_vocab_size =
-    model.config.num_normal_entities =
-    model.config.num_category_entities =
-    model.config.entity_counts
+    model.config.entity_vocab_size = entity_vocab_size
+    model.config.num_normal_entities = num_normal_entities
+    model.config.num_category_entities = num_category_entities
+    if model.config.entity_counts is not None:
+        model.config.entity_counts = entity_counts

-    gr.Info("
+    gr.Info("LUXEのモデルとトークナイザのエンティティ語彙の更新が完了しました。", duration=5)

     return entity_replaced_counts + 1
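The rewritten `replace_entities` averages one or more description embeddings per entity, stacks the averages into a fresh `nn.Embedding`, and rebuilds the bias-free decoder so that weight tying can share the new table. A minimal standalone sketch of that swap, with made-up entities and 4-dimensional vectors (the comment about tying describes what the diff relies on `model.tie_weights()` to do; it is not LUXE's actual code):

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

# Hypothetical per-entity embedding lists; "東京" has two description embeddings.
entity_embeddings = {
    "[PAD]": [torch.zeros(4)],
    "東京": [torch.ones(4), torch.full((4,), 3.0)],
}
mean_vectors = {e: sum(vs) / len(vs) for e, vs in entity_embeddings.items()}
table = F.normalize(torch.vstack(list(mean_vectors.values())))  # one row per entity

vocab_size, emb_size = table.size()
embedding = nn.Embedding(vocab_size, emb_size, padding_idx=0)
embedding.weight.data = table

decoder = nn.Linear(emb_size, vocab_size, bias=False)
decoder.weight = embedding.weight  # tie decoder to the embedding table
print(decoder(torch.ones(emb_size)).shape)  # torch.Size([2]): one logit per entity
```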
@@ -385,14 +341,15 @@ with gr.Blocks() as demo:
     bm25_tokenizer.load_vocab_from_hub("studio-ousia/luxe-nayose-bm25")
     bm25_retriever = BM25HF.load_from_hub("studio-ousia/luxe-nayose-bm25")

+    clean_default_entity_vocab(tokenizer)
+
     # Hint: gr.State に callable を渡すと、それが state の初期値を設定するための関数とみなされて
     # __call__ が引数なしで実行されてしまうため、gr.State の引数に model や tokenizer を単体で渡すとエラーになってしまう。
     # ここでは、モデル一式のタプル(callable でない)を渡すことで、そのようなエラーを回避している。
     # cf. https://www.gradio.app/docs/gradio/state#param-state-value
     models = gr.State((model, tokenizer, bm25_tokenizer, bm25_retriever))

-
-    output_texts = gr.State([])
+    texts_input = gr.State([])

     entity_replaced_counts = gr.State(0)
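The retained comment (in Japanese above) warns that `gr.State` treats a callable initial value as a factory and calls it with no arguments to compute the default, so a model or tokenizer (both callable) must not be passed on its own; wrapping everything in a non-callable tuple sidesteps this. A minimal sketch, with a lambda standing in for the model:

```python
import gradio as gr

model = lambda x: x * 2  # stand-in for a callable model object

with gr.Blocks() as demo:
    # gr.State(model) would invoke model() with no arguments to build the
    # initial value and fail; the (non-callable) tuple wrapper avoids that.
    models = gr.State((model,))
```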
@@ -400,26 +357,59 @@ with gr.Blocks() as demo:
     entity_span_sensitivity = gr.State(1.0)
     nayose_coef = gr.State(1.0)

+    texts = gr.State([])
     batch_entity_spans = gr.State([])
     topk_normal_entities = gr.State([])
     topk_category_entities = gr.State([])
     topk_span_entities = gr.State([])

-
-
-
+    gr.Markdown("# 📝 LUXE Demo (β版)")
+
+    gr.Markdown(
+        """Studio Ousia で開発中の次世代知識強化言語モデル **LUXE** の動作デモです。
+入力されたテキストに対して、テキスト中に出現するエンティティ(事物)と、テキスト全体の主題となるエンティティおよびカテゴリを予測します。
+
+デフォルトのLUXEは、エンティティおよびカテゴリとして、それぞれ日本語 Wikipedia における被リンク数上位50万件および10万件の項目を使用しています。
+予測対象のエンティティを任意のものに置き換えて推論を行うことも可能です(下記「LUXE のエンティティ語彙を置き換える」を参照してください)。""",
+        line_breaks=True,
+    )

     gr.Markdown("## 入力テキスト")

     with gr.Tab(label="直接入力"):
         text_input = gr.Textbox(label=f"入力テキスト(最大{MAX_TEXT_LENGTH}文字)", max_length=MAX_TEXT_LENGTH)
+        text_submit_button = gr.Button(value="予測実行", variant="huggingface")
     with gr.Tab(label="ファイルアップロード"):
-        gr.Markdown(
+        gr.Markdown(
+            f"""1行1事例のテキストファイル(最大{MAX_TEXT_FILE_LINES}行)をアップロードできます。
+アップロードされたテキストのそれぞれに対して推論が実行されます。""",
+            line_breaks=True,
+        )
         texts_file = gr.File(label="入力テキストファイル")
+        texts_submit_button = gr.Button(value="予測実行", variant="huggingface")
+
+    text_input.submit(
+        fn=get_topk_entities_from_texts,
+        inputs=[models, text_input, topk, entity_span_sensitivity, nayose_coef, entity_replaced_counts],
+        outputs=[texts, batch_entity_spans, topk_normal_entities, topk_category_entities, topk_span_entities],
+    )
+    text_submit_button.click(
+        fn=get_topk_entities_from_texts,
+        inputs=[models, text_input, topk, entity_span_sensitivity, nayose_coef, entity_replaced_counts],
+        outputs=[texts, batch_entity_spans, topk_normal_entities, topk_category_entities, topk_span_entities],
+    )
+
+    texts_file.change(fn=get_texts_from_file, inputs=texts_file, outputs=texts_input)
+    texts_submit_button.click(
+        fn=get_topk_entities_from_texts,
+        inputs=[models, texts_input, topk, entity_span_sensitivity, nayose_coef, entity_replaced_counts],
+        outputs=[texts, batch_entity_spans, topk_normal_entities, topk_category_entities, topk_span_entities],
+    )
+
+    gr.Markdown("---")

     with gr.Accordion(label="ハイパーパラメータ", open=False):
-        topk_input = gr.Number(5, label="
+        topk_input = gr.Number(5, label="予測するエンティティの件数 (Top K)", interactive=True)
         entity_span_sensitivity_input = gr.Slider(
             minimum=0.0, maximum=5.0, value=1.0, step=0.1, label="エンティティ検出の積極度", interactive=True
         )
@@ -427,25 +417,23 @@ with gr.Blocks() as demo:
             minimum=0.0, maximum=2.0, value=1.0, step=0.1, label="文字列一致の優先度", interactive=True
         )

-    text_input.change(fn=lambda text: [normalize_text(text)], inputs=text_input, outputs=input_texts)
-    texts_file.change(fn=get_texts_from_file, inputs=texts_file, outputs=input_texts)
     topk_input.change(fn=lambda val: val, inputs=topk_input, outputs=topk)
     entity_span_sensitivity_input.change(
         fn=lambda val: val, inputs=entity_span_sensitivity_input, outputs=entity_span_sensitivity
     )
     nayose_coef_input.change(fn=lambda val: val, inputs=nayose_coef_input, outputs=nayose_coef)

-    with gr.Accordion(label="LUXEのエンティティ語彙を置き換える", open=False):
+    with gr.Accordion(label="LUXE のエンティティ語彙を置き換える", open=False):
         gr.Markdown(
-            """LUXE
-
+            """LUXE のモデルとトークナイザのエンティティ語彙を任意のエンティティ集合に置き換えます。
+エンティティとともに与えられるエンティティの説明文から、エンティティの埋め込みが計算され、LUXE の推論に利用されます。""",
             line_breaks=True,
         )
         gr.Markdown(
-            f"「エンティティ」と「エンティティの説明文」の2列からなるCSVファイル(最大{MAX_ENTITY_FILE_LINES}行)をアップロードできます。"
+            f"「エンティティ」と「エンティティの説明文」の2列からなる CSV ファイル(最大{MAX_ENTITY_FILE_LINES}行)をアップロードできます。"
         )
-        new_entity_text_pairs_file = gr.File(label="エンティティと説明文のCSVファイル", height="128px")
-        gr.Markdown("CSV
+        new_entity_text_pairs_file = gr.File(label="エンティティと説明文の CSV ファイル", height="128px")
+        gr.Markdown("CSV ファイルから読み込まれた項目が以下の表に表示されます。表の内容を直接編集することも可能です。")
         new_entity_text_pairs_input = gr.Dataframe(
             # value=sample_new_entity_text_pairs,
             headers=["entity", "text"],
@@ -454,41 +442,28 @@ with gr.Blocks() as demo:
             label="エンティティと説明文",
             interactive=True,
         )
+        preserve_default_entities_checkbox = gr.Checkbox(label="既存のエンティティを保持する", value=True)
         replace_entity_button = gr.Button(value="エンティティ語彙を置き換える")
-        gr.Markdown("LUXEのモデルのエンティティ語彙は、デモページの再読み込み時にリセットされます。")
+        gr.Markdown("LUXE のモデルのエンティティ語彙は、デモページの再読み込み時にリセットされます。")

     new_entity_text_pairs_file.change(
         fn=get_new_entity_text_pairs_from_file, inputs=new_entity_text_pairs_file, outputs=new_entity_text_pairs_input
     )
     replace_entity_button.click(
         fn=replace_entities,
-        inputs=[models, new_entity_text_pairs_input, entity_replaced_counts],
+        inputs=[models, new_entity_text_pairs_input, entity_replaced_counts, preserve_default_entities_checkbox],
         outputs=entity_replaced_counts,
     )

-    submit_button = gr.Button(value="予測実行", variant="huggingface")
-    submit_button.click(
-        fn=get_topk_entities_from_texts,
-        inputs=[models, input_texts, topk, entity_span_sensitivity, nayose_coef, entity_replaced_counts],
-        outputs=[output_texts, batch_entity_spans, topk_normal_entities, topk_category_entities, topk_span_entities],
-    )
-    text_input.submit(
-        fn=get_topk_entities_from_texts,
-        inputs=[models, input_texts, topk, entity_span_sensitivity, nayose_coef, entity_replaced_counts],
-        outputs=[output_texts, batch_entity_spans, topk_normal_entities, topk_category_entities, topk_span_entities],
-    )
-
     gr.Markdown("---")
-    gr.Markdown("##
+    gr.Markdown("## 予測されたエンティティとカテゴリ")

-    @gr.render(
-        inputs=[output_texts, batch_entity_spans, topk_normal_entities, topk_category_entities, topk_span_entities]
-    )
+    @gr.render(inputs=[texts, batch_entity_spans, topk_normal_entities, topk_category_entities, topk_span_entities])
     def render_topk_entities(
-
+        texts, batch_entity_spans, topk_normal_entities, topk_category_entities, topk_span_entities
     ):
         for text, entity_spans, normal_entities, category_entities, span_entities in zip(
-
+            texts, batch_entity_spans, topk_normal_entities, topk_category_entities, topk_span_entities
         ):
             highlighted_text_value = []
             cur = 0
@@ -503,7 +478,10 @@ with gr.Blocks() as demo:
                 highlighted_text_value.append((text[cur:], None))

             gr.HighlightedText(
-                value=highlighted_text_value,
+                value=highlighted_text_value,
+                color_map={"Entity": "green"},
+                combine_adjacent=False,
+                label="予測されたエンティティのスパン",
             )

             # gr.Textbox(text, label="Text")
@@ -512,31 +490,22 @@ with gr.Blocks() as demo:
                     label="テキスト全体に関連するエンティティ",
                     components=["text"],
                     samples=[[entity] for entity in normal_entities],
-                )
+                )
             if category_entities:
                 gr.Dataset(
                     label="テキスト全体に関連するカテゴリ",
                     components=["text"],
                     samples=[[entity] for entity in category_entities],
-                )
-
-            span_texts = [text[start:end] for start, end in entity_spans]
-            for span_text, entities in zip(span_texts, span_entities):
-                gr.Dataset(
-                    label=f"「{span_text}」に対応するエンティティ",
-                    components=["text"],
-                    samples=[[entity] for entity in entities],
-                ).select(fn=get_selected_entity, outputs=selected_entity)
-
-            # gr.Markdown("---")
-            # gr.Markdown("## 選択されたエンティティの類似エンティティ")
-
-            # selected_entity.change(fn=get_similar_entities, inputs=[models, selected_entity], outputs=similar_entities)
+                )
+
+            with gr.Accordion(label="テキスト中のスパンに対応するエンティティ", open=len(texts) == 1):
+                span_texts = [text[start:end] for start, end in entity_spans]
+                for span_text, entities in zip(span_texts, span_entities):
+                    gr.Dataset(
+                        label=f"「{span_text}」に対応するエンティティ",
+                        components=["text"],
+                        samples=[[entity] for entity in entities],
+                    )

-
-
-

 demo.launch()
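The `@gr.render` block above re-runs whenever any of its State inputs change, which is how a fresh set of HighlightedText and Dataset components appears after each prediction. A minimal self-contained sketch of the same pattern (component names here are illustrative, not from the app):

```python
import gradio as gr

with gr.Blocks() as demo:
    items = gr.State([])
    box = gr.Textbox(label="Add item")
    box.submit(fn=lambda xs, x: xs + [x], inputs=[items, box], outputs=items)

    @gr.render(inputs=items)
    def render_items(xs):
        # Re-executed whenever `items` changes, rebuilding one Dataset per item,
        # mirroring how render_topk_entities rebuilds its components above.
        for x in xs:
            gr.Dataset(components=["text"], samples=[[x]], label=x)

demo.launch()
```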