Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -13,10 +13,6 @@ import re
|
|
13 |
from pythainlp.tokenize import sent_tokenize
|
14 |
from unstructured.partition.html import partition_html
|
15 |
|
16 |
-
url = "https://www.dataxet.co/media-landscape/2024-th"
|
17 |
-
elements = partition_html(url=url)
|
18 |
-
context = [str(element) for element in elements if len(str(element)) >60]
|
19 |
-
|
20 |
DEFAULT_MODEL = 'wangchanberta'
|
21 |
DEFAULT_SENTENCE_EMBEDDING_MODEL = 'intfloat/multilingual-e5-base'
|
22 |
|
@@ -25,8 +21,6 @@ MODEL_DICT = {
|
|
25 |
'wangchanberta-hyp': 'Chananchida/wangchanberta-xet_hyp-params',
|
26 |
}
|
27 |
|
28 |
-
EMBEDDINGS_PATH = 'data/embeddings.pkl'
|
29 |
-
|
30 |
def load_model(model_name=DEFAULT_MODEL):
|
31 |
model = AutoModelForQuestionAnswering.from_pretrained(MODEL_DICT[model_name])
|
32 |
tokenizer = AutoTokenizer.from_pretrained(MODEL_DICT[model_name])
|
@@ -87,7 +81,7 @@ def model_pipeline(model, tokenizer, question, context):
|
|
87 |
Answer = tokenizer.decode(predict_answer_tokens)
|
88 |
return Answer.replace('<unk>','@')
|
89 |
|
90 |
-
def predict_test(model, tokenizer, embedding_model, context, question, index):
|
91 |
t = time.time()
|
92 |
question = question.strip()
|
93 |
question_vector = get_embeddings(embedding_model, question)
|
@@ -99,17 +93,22 @@ def predict_test(model, tokenizer, embedding_model, context, question, index):
|
|
99 |
most_sim_context = context[indices[0][i]].strip()
|
100 |
# most_similar_contexts.append(most_sim_context)
|
101 |
most_similar_contexts += str(i)+': '+most_sim_context + "\n\n"
|
102 |
-
|
103 |
return most_similar_contexts
|
104 |
|
105 |
|
106 |
|
107 |
if __name__ == "__main__":
|
|
|
|
|
|
|
|
|
|
|
108 |
embedding_model = load_embedding_model()
|
109 |
index = set_index(prepare_sentences_vector(get_embeddings(embedding_model, context)))
|
110 |
|
111 |
def chat_interface(question, history):
|
112 |
-
response = predict_test(
|
113 |
return response
|
114 |
|
115 |
examples=['ภูมิทัศน์สื่อไทยในปี 2567 มีแนวโน้มว่า ',
|
@@ -117,7 +116,9 @@ if __name__ == "__main__":
|
|
117 |
'ติ๊กต๊อก คือ',
|
118 |
'รายงานจาก Reuters Institute'
|
119 |
]
|
|
|
|
|
120 |
interface = gr.ChatInterface(fn=chat_interface,
|
121 |
examples=examples)
|
122 |
|
123 |
-
interface.launch()
|
|
|
13 |
from pythainlp.tokenize import sent_tokenize
|
14 |
from unstructured.partition.html import partition_html
|
15 |
|
|
|
|
|
|
|
|
|
16 |
DEFAULT_MODEL = 'wangchanberta'
|
17 |
DEFAULT_SENTENCE_EMBEDDING_MODEL = 'intfloat/multilingual-e5-base'
|
18 |
|
|
|
21 |
'wangchanberta-hyp': 'Chananchida/wangchanberta-xet_hyp-params',
|
22 |
}
|
23 |
|
|
|
|
|
24 |
def load_model(model_name=DEFAULT_MODEL):
|
25 |
model = AutoModelForQuestionAnswering.from_pretrained(MODEL_DICT[model_name])
|
26 |
tokenizer = AutoTokenizer.from_pretrained(MODEL_DICT[model_name])
|
|
|
81 |
Answer = tokenizer.decode(predict_answer_tokens)
|
82 |
return Answer.replace('<unk>','@')
|
83 |
|
84 |
+
def predict_test(embedding_model, context, question, index): # sent_tokenize pythainlp
|
85 |
t = time.time()
|
86 |
question = question.strip()
|
87 |
question_vector = get_embeddings(embedding_model, question)
|
|
|
93 |
most_sim_context = context[indices[0][i]].strip()
|
94 |
# most_similar_contexts.append(most_sim_context)
|
95 |
most_similar_contexts += str(i)+': '+most_sim_context + "\n\n"
|
96 |
+
print(most_similar_contexts)
|
97 |
return most_similar_contexts
|
98 |
|
99 |
|
100 |
|
101 |
if __name__ == "__main__":
|
102 |
+
|
103 |
+
url = "https://www.dataxet.co/media-landscape/2024-th"
|
104 |
+
elements = partition_html(url=url)
|
105 |
+
context = [str(element) for element in elements if len(str(element)) >60]
|
106 |
+
|
107 |
embedding_model = load_embedding_model()
|
108 |
index = set_index(prepare_sentences_vector(get_embeddings(embedding_model, context)))
|
109 |
|
110 |
def chat_interface(question, history):
|
111 |
+
response = predict_test(embedding_model, context, question, index)
|
112 |
return response
|
113 |
|
114 |
examples=['ภูมิทัศน์สื่อไทยในปี 2567 มีแนวโน้มว่า ',
|
|
|
116 |
'ติ๊กต๊อก คือ',
|
117 |
'รายงานจาก Reuters Institute'
|
118 |
]
|
119 |
+
|
120 |
+
|
121 |
interface = gr.ChatInterface(fn=chat_interface,
|
122 |
examples=examples)
|
123 |
|
124 |
+
interface.launch()
|