Chananchida committed · commit 2e5824a · verified · 1 parent: 0f46926

Update app.py

Files changed (1): app.py (+11, −10)
app.py CHANGED
@@ -13,10 +13,6 @@ import re
 from pythainlp.tokenize import sent_tokenize
 from unstructured.partition.html import partition_html
 
-url = "https://www.dataxet.co/media-landscape/2024-th"
-elements = partition_html(url=url)
-context = [str(element) for element in elements if len(str(element)) >60]
-
 DEFAULT_MODEL = 'wangchanberta'
 DEFAULT_SENTENCE_EMBEDDING_MODEL = 'intfloat/multilingual-e5-base'
 
@@ -25,8 +21,6 @@ MODEL_DICT = {
     'wangchanberta-hyp': 'Chananchida/wangchanberta-xet_hyp-params',
 }
 
-EMBEDDINGS_PATH = 'data/embeddings.pkl'
-
 def load_model(model_name=DEFAULT_MODEL):
     model = AutoModelForQuestionAnswering.from_pretrained(MODEL_DICT[model_name])
     tokenizer = AutoTokenizer.from_pretrained(MODEL_DICT[model_name])
@@ -87,7 +81,7 @@ def model_pipeline(model, tokenizer, question, context):
     Answer = tokenizer.decode(predict_answer_tokens)
     return Answer.replace('<unk>','@')
 
-def predict_test(model, tokenizer, embedding_model, context, question, index): # sent_tokenize pythainlp
+def predict_test(embedding_model, context, question, index): # sent_tokenize pythainlp
     t = time.time()
     question = question.strip()
     question_vector = get_embeddings(embedding_model, question)
@@ -99,17 +93,22 @@ def predict_test(model, tokenizer, embedding_model, context, question, index):
         most_sim_context = context[indices[0][i]].strip()
         # most_similar_contexts.append(most_sim_context)
         most_similar_contexts += str(i)+': '+most_sim_context + "\n\n"
-
+    print(most_similar_contexts)
     return most_similar_contexts
 
 
 
 if __name__ == "__main__":
+
+    url = "https://www.dataxet.co/media-landscape/2024-th"
+    elements = partition_html(url=url)
+    context = [str(element) for element in elements if len(str(element)) >60]
+
     embedding_model = load_embedding_model()
     index = set_index(prepare_sentences_vector(get_embeddings(embedding_model, context)))
 
     def chat_interface(question, history):
-        response = predict_test(model, tokenizer, embedding_model, context, question, index)
+        response = predict_test(embedding_model, context, question, index)
         return response
 
     examples=['ภูมิทัศน์สื่อไทยในปี 2567 มีแนวโน้มว่า ',
@@ -117,7 +116,9 @@ if __name__ == "__main__":
              'ติ๊กต๊อก คือ',
              'รายงานจาก Reuters Institute'
             ]
+
+
     interface = gr.ChatInterface(fn=chat_interface,
                                  examples=examples)
 
-    interface.launch()
+    interface.launch()
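
The commit moves the Dataxet page scraping and index construction under the __main__ guard and trims predict_test down to the arguments the retrieval step actually uses; the QA model and tokenizer are no longer passed. The sketch below shows how those pieces plausibly fit together at runtime. It assumes the app's helpers (load_embedding_model, get_embeddings, prepare_sentences_vector, set_index) wrap a sentence-transformers encoder and a FAISS inner-product index, which the indices[0][i] lookup in predict_test suggests but this diff does not show; build_index and top_k_contexts are illustrative names, not part of app.py.

# Hypothetical sketch of the retrieval path this commit rearranges.
# Helper names here (build_index, top_k_contexts) are illustrative only.
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from unstructured.partition.html import partition_html

E5_MODEL = 'intfloat/multilingual-e5-base'  # matches DEFAULT_SENTENCE_EMBEDDING_MODEL

def build_index(embedding_model, passages):
    # Encode every passage and store the vectors in a flat inner-product index.
    vectors = embedding_model.encode(passages, normalize_embeddings=True)
    vectors = np.asarray(vectors, dtype='float32')
    index = faiss.IndexFlatIP(vectors.shape[1])
    index.add(vectors)
    return index

def top_k_contexts(embedding_model, index, passages, question, k=3):
    # Embed the question the same way and return the k closest passages,
    # mirroring the context[indices[0][i]] loop in predict_test.
    k = min(k, index.ntotal)
    q = embedding_model.encode([question.strip()], normalize_embeddings=True)
    _, indices = index.search(np.asarray(q, dtype='float32'), k)
    return "\n\n".join(f"{i}: {passages[idx].strip()}" for i, idx in enumerate(indices[0]))

if __name__ == "__main__":
    # Same scraping step the commit relocates under __main__: keep only
    # elements long enough to be useful passages.
    url = "https://www.dataxet.co/media-landscape/2024-th"
    passages = [str(el) for el in partition_html(url=url) if len(str(el)) > 60]

    model = SentenceTransformer(E5_MODEL)
    index = build_index(model, passages)
    print(top_k_contexts(model, index, "ภูมิทัศน์สื่อไทยในปี 2567 มีแนวโน้มว่า", k=3))

Ranking contexts needs only the sentence-embedding model, the passage list, and the index, which is why dropping model and tokenizer from the predict_test signature is safe for the chat_interface callback.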