thechaiexperiment committed on
Commit 4a50eaf · verified · 1 Parent(s): 053b384

Update app.py

Files changed (1):
  app.py (+21 -35)
app.py CHANGED
@@ -353,53 +353,39 @@ import nltk
  # Load a pre-trained embedding model
  embedding_model = SentenceTransformer('all-MiniLM-L6-v2')  # Use a lightweight model for speed

- def extract_relevant_portions(document_texts, query, max_portions=3, portion_size=1, min_query_words=2):
      try:
-         # Generate embedding for the query
          query_embedding = embedding_model.encode([query])
-
          relevant_portions = {}
          for doc_id, doc_text in enumerate(document_texts):
-             # Tokenize document into sentences
-             sentences = nltk.sent_tokenize(doc_text)

-             # Generate embeddings for all sentences
-             sentence_embeddings = embedding_model.encode(sentences)

-             # Compute cosine similarities between the query and all sentences
-             similarities = cosine_similarity(query_embedding, sentence_embeddings)[0]

-             # Rank sentences by similarity scores
-             ranked_sentences = sorted(
-                 enumerate(sentences),
                  key=lambda x: similarities[x[0]],
                  reverse=True
              )

-             doc_relevant_portions = []
-             selected_indices = set()
-
-             for idx, (sentence_idx, sentence) in enumerate(ranked_sentences):
-                 if idx >= max_portions:  # Stop if we've reached the max number of portions
-                     break
-
-                 # Get the surrounding sentences for context
-                 start_idx = max(0, sentence_idx - portion_size // 2)
-                 end_idx = min(len(sentences), sentence_idx + portion_size // 2 + 1)
-
-                 # Avoid selecting overlapping portions
-                 if any(i in selected_indices for i in range(start_idx, end_idx)):
-                     continue
-
-                 portion = " ".join(sentences[start_idx:end_idx])
-                 doc_relevant_portions.append(portion)
-
-                 # Mark indices as selected
-                 selected_indices.update(range(start_idx, end_idx))
-
-             # Add results to the final output
              relevant_portions[f"Document_{doc_id}"] = doc_relevant_portions
-
          return relevant_portions
      except Exception as e:
          print(f"Error in extracting relevant portions: {e}")
 
  # Load a pre-trained embedding model
  embedding_model = SentenceTransformer('all-MiniLM-L6-v2')  # Use a lightweight model for speed

+ from sentence_transformers import SentenceTransformer
+ from sklearn.metrics.pairwise import cosine_similarity
+
+ # Load the embedding model globally for efficiency
+ embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
+
+ def extract_relevant_portions(document_texts, query, max_portions=3, chunk_size=500):
      try:
+         # Embed the query once
          query_embedding = embedding_model.encode([query])
+
          relevant_portions = {}
          for doc_id, doc_text in enumerate(document_texts):
+             # Split document into chunks (e.g., 500 characters per chunk)
+             chunks = [doc_text[i:i + chunk_size] for i in range(0, len(doc_text), chunk_size)]

+             # Embed all chunks in a single batch
+             chunk_embeddings = embedding_model.encode(chunks)

+             # Compute cosine similarity between query and all chunks
+             similarities = cosine_similarity(query_embedding, chunk_embeddings)[0]

+             # Rank chunks by similarity
+             ranked_chunks = sorted(
+                 enumerate(chunks),
                  key=lambda x: similarities[x[0]],
                  reverse=True
              )

+             # Select top chunks based on similarity
+             doc_relevant_portions = [chunk for _, chunk in ranked_chunks[:max_portions]]
              relevant_portions[f"Document_{doc_id}"] = doc_relevant_portions
+
          return relevant_portions
      except Exception as e:
          print(f"Error in extracting relevant portions: {e}")