Spaces:

powerpuf-bot
/

web-qa

Sleeping

App Files Community

Chananchida committed on Feb 22, 2024

Commit

2e5824a

verified ·

1 Parent(s): 0f46926

Update app.py

Browse files

Files changed (1) hide show

app.py +11 -10

app.py CHANGED Viewed

@@ -13,10 +13,6 @@ import re
 from pythainlp.tokenize import sent_tokenize
 from unstructured.partition.html import partition_html
-url = "https://www.dataxet.co/media-landscape/2024-th"
-elements = partition_html(url=url)
-context = [str(element) for element in elements  if len(str(element)) >60]
 DEFAULT_MODEL = 'wangchanberta'
 DEFAULT_SENTENCE_EMBEDDING_MODEL = 'intfloat/multilingual-e5-base'
@@ -25,8 +21,6 @@ MODEL_DICT = {
     'wangchanberta-hyp': 'Chananchida/wangchanberta-xet_hyp-params',
 }
-EMBEDDINGS_PATH = 'data/embeddings.pkl'
 def load_model(model_name=DEFAULT_MODEL):
     model = AutoModelForQuestionAnswering.from_pretrained(MODEL_DICT[model_name])
     tokenizer = AutoTokenizer.from_pretrained(MODEL_DICT[model_name])
@@ -87,7 +81,7 @@ def model_pipeline(model, tokenizer, question, context):
     Answer = tokenizer.decode(predict_answer_tokens)
     return Answer.replace('<unk>','@')
-def predict_test(model, tokenizer, embedding_model, context, question, index):  # sent_tokenize pythainlp
     t = time.time()
     question = question.strip()
     question_vector = get_embeddings(embedding_model, question)
@@ -99,17 +93,22 @@ def predict_test(model, tokenizer, embedding_model, context, question, index):
         most_sim_context = context[indices[0][i]].strip()
         # most_similar_contexts.append(most_sim_context)
         most_similar_contexts += str(i)+': '+most_sim_context + "\n\n"
     return most_similar_contexts
 if __name__ == "__main__":
     embedding_model = load_embedding_model()
     index = set_index(prepare_sentences_vector(get_embeddings(embedding_model, context)))
     def chat_interface(question, history):
-        response = predict_test(model, tokenizer, embedding_model, context, question, index)
         return response
     examples=['ภูมิทัศน์สื่อไทยในปี 2567 มีแนวโน้มว่า ',
@@ -117,7 +116,9 @@ if __name__ == "__main__":
                'ติ๊กต๊อก คือ',
                'รายงานจาก Reuters Institute'
               ]
     interface = gr.ChatInterface(fn=chat_interface,
                                     examples=examples)
-    interface.launch()

 from pythainlp.tokenize import sent_tokenize
 from unstructured.partition.html import partition_html
 DEFAULT_MODEL = 'wangchanberta'
 DEFAULT_SENTENCE_EMBEDDING_MODEL = 'intfloat/multilingual-e5-base'
     'wangchanberta-hyp': 'Chananchida/wangchanberta-xet_hyp-params',
 }
 def load_model(model_name=DEFAULT_MODEL):
     model = AutoModelForQuestionAnswering.from_pretrained(MODEL_DICT[model_name])
     tokenizer = AutoTokenizer.from_pretrained(MODEL_DICT[model_name])
     Answer = tokenizer.decode(predict_answer_tokens)
     return Answer.replace('<unk>','@')
+def predict_test(embedding_model, context, question, index):  # sent_tokenize pythainlp
     t = time.time()
     question = question.strip()
     question_vector = get_embeddings(embedding_model, question)
         most_sim_context = context[indices[0][i]].strip()
         # most_similar_contexts.append(most_sim_context)
         most_similar_contexts += str(i)+': '+most_sim_context + "\n\n"
+    print(most_similar_contexts)
     return most_similar_contexts
 if __name__ == "__main__":
+    url = "https://www.dataxet.co/media-landscape/2024-th"
+    elements = partition_html(url=url)
+    context = [str(element) for element in elements  if len(str(element)) >60]
     embedding_model = load_embedding_model()
     index = set_index(prepare_sentences_vector(get_embeddings(embedding_model, context)))
     def chat_interface(question, history):
+        response = predict_test(embedding_model, context, question, index)
         return response
     examples=['ภูมิทัศน์สื่อไทยในปี 2567 มีแนวโน้มว่า ',
                'ติ๊กต๊อก คือ',
                'รายงานจาก Reuters Institute'
               ]
     interface = gr.ChatInterface(fn=chat_interface,
                                     examples=examples)
+    interface.launch()