Sirinoot committed on
Commit
f6a46b7
·
verified ·
1 Parent(s): 5b11bcc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -2
app.py CHANGED
@@ -155,8 +155,17 @@ def predict_test(model, tokenizer, embedding_model, df, question, index): # sen
155
  mostSimContext = mostSimContext.strip()
156
  mostSimContext = re.sub(r'\s+', ' ', mostSimContext)
157
 
158
-
159
  segments = sent_tokenize(mostSimContext, engine="crfcut")
 
 
 
 
 
 
 
 
 
 
160
  segments_index = set_index(get_embeddings(embedding_model,segments))
161
  _distances,_indices = faiss_search(segments_index, question_vector)
162
  mostSimSegment = segments[_indices[0][0]]
@@ -166,8 +175,10 @@ def predict_test(model, tokenizer, embedding_model, df, question, index): # sen
166
  # Find the start and end indices of mostSimSegment within mostSimContext
167
  start_index = mostSimContext.find(Answer)
168
  end_index = start_index + len(Answer)
169
- print(f"startIndex =>{ start_index} endIndex =>{ end_index}")
 
170
  print(f"mostSimContext{len(mostSimContext)}=>{mostSimContext}\nsegments{len(segments)}=>{segments}\nmostSimSegment{len(mostSimSegment)}=>{mostSimSegment}")
 
171
  _time = time.time() - t
172
  output = {
173
  "user_question": question,
 
155
  mostSimContext = mostSimContext.strip()
156
  mostSimContext = re.sub(r'\s+', ' ', mostSimContext)
157
 
 
158
  segments = sent_tokenize(mostSimContext, engine="crfcut")
159
+
160
+ #=====add
161
+ if (len(segments)==1):
162
+ segments = ' '.join(segments)
163
+
164
+ segments = segments.split('และ')
165
+ segments = [segment.split('หรือ') for segment in segments]
166
+ segments = [sentence for segment in segments for sentence in segment]
167
+ #=====end
168
+
169
  segments_index = set_index(get_embeddings(embedding_model,segments))
170
  _distances,_indices = faiss_search(segments_index, question_vector)
171
  mostSimSegment = segments[_indices[0][0]]
 
175
  # Find the start and end indices of mostSimSegment within mostSimContext
176
  start_index = mostSimContext.find(Answer)
177
  end_index = start_index + len(Answer)
178
+
179
+ print(f"answer {len(answer)} => {answer} || startIndex =>{start_index} || endIndex =>{end_index}")
180
  print(f"mostSimContext{len(mostSimContext)}=>{mostSimContext}\nsegments{len(segments)}=>{segments}\nmostSimSegment{len(mostSimSegment)}=>{mostSimSegment}")
181
+
182
  _time = time.time() - t
183
  output = {
184
  "user_question": question,