Update app.py
app.py CHANGED
@@ -353,53 +353,39 @@ import nltk
 # Load a pre-trained embedding model
 embedding_model = SentenceTransformer('all-MiniLM-L6-v2')  # Use a lightweight model for speed
 
-
+from sentence_transformers import SentenceTransformer
+from sklearn.metrics.pairwise import cosine_similarity
+
+# Load the embedding model globally for efficiency
+embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
+
+def extract_relevant_portions(document_texts, query, max_portions=3, chunk_size=500):
     try:
-        #
+        # Embed the query once
         query_embedding = embedding_model.encode([query])
-
+
         relevant_portions = {}
         for doc_id, doc_text in enumerate(document_texts):
-            #
-
+            # Split document into chunks (e.g., 500 characters per chunk)
+            chunks = [doc_text[i:i + chunk_size] for i in range(0, len(doc_text), chunk_size)]
 
-            #
-
+            # Embed all chunks in a single batch
+            chunk_embeddings = embedding_model.encode(chunks)
 
-            # Compute cosine
-            similarities = cosine_similarity(query_embedding,
+            # Compute cosine similarity between query and all chunks
+            similarities = cosine_similarity(query_embedding, chunk_embeddings)[0]
 
-            # Rank
-
-            enumerate(
+            # Rank chunks by similarity
+            ranked_chunks = sorted(
+                enumerate(chunks),
                 key=lambda x: similarities[x[0]],
                 reverse=True
             )
 
-
-
-
-            for idx, (sentence_idx, sentence) in enumerate(ranked_sentences):
-                if idx >= max_portions:  # Stop if we've reached the max number of portions
-                    break
-
-                # Get the surrounding sentences for context
-                start_idx = max(0, sentence_idx - portion_size // 2)
-                end_idx = min(len(sentences), sentence_idx + portion_size // 2 + 1)
-
-                # Avoid selecting overlapping portions
-                if any(i in selected_indices for i in range(start_idx, end_idx)):
-                    continue
-
-                portion = " ".join(sentences[start_idx:end_idx])
-                doc_relevant_portions.append(portion)
-
-                # Mark indices as selected
-                selected_indices.update(range(start_idx, end_idx))
-
-            # Add results to the final output
+            # Select top chunks based on similarity
+            doc_relevant_portions = [chunk for _, chunk in ranked_chunks[:max_portions]]
             relevant_portions[f"Document_{doc_id}"] = doc_relevant_portions
-
+
         return relevant_portions
     except Exception as e:
         print(f"Error in extracting relevant portions: {e}")
|