thechaiexperiment committed on
Commit 4a50eaf · verified · 1 Parent(s): 053b384

Update app.py

Files changed (1):
  app.py (+21 -35)
app.py CHANGED
@@ -353,53 +353,39 @@ import nltk
  # Load a pre-trained embedding model
  embedding_model = SentenceTransformer('all-MiniLM-L6-v2')  # Use a lightweight model for speed

- def extract_relevant_portions(document_texts, query, max_portions=3, portion_size=1, min_query_words=2):
      try:
-         # Generate embedding for the query
          query_embedding = embedding_model.encode([query])
-
          relevant_portions = {}
          for doc_id, doc_text in enumerate(document_texts):
-             # Tokenize document into sentences
-             sentences = nltk.sent_tokenize(doc_text)

-             # Generate embeddings for all sentences
-             sentence_embeddings = embedding_model.encode(sentences)

-             # Compute cosine similarities between the query and all sentences
-             similarities = cosine_similarity(query_embedding, sentence_embeddings)[0]

-             # Rank sentences by similarity scores
-             ranked_sentences = sorted(
-                 enumerate(sentences),
                  key=lambda x: similarities[x[0]],
                  reverse=True
              )

-             doc_relevant_portions = []
-             selected_indices = set()
-
-             for idx, (sentence_idx, sentence) in enumerate(ranked_sentences):
-                 if idx >= max_portions:  # Stop if we've reached the max number of portions
-                     break
-
-                 # Get the surrounding sentences for context
-                 start_idx = max(0, sentence_idx - portion_size // 2)
-                 end_idx = min(len(sentences), sentence_idx + portion_size // 2 + 1)
-
-                 # Avoid selecting overlapping portions
-                 if any(i in selected_indices for i in range(start_idx, end_idx)):
-                     continue
-
-                 portion = " ".join(sentences[start_idx:end_idx])
-                 doc_relevant_portions.append(portion)
-
-                 # Mark indices as selected
-                 selected_indices.update(range(start_idx, end_idx))
-
-             # Add results to the final output
              relevant_portions[f"Document_{doc_id}"] = doc_relevant_portions
-
          return relevant_portions
      except Exception as e:
          print(f"Error in extracting relevant portions: {e}")
 
  # Load a pre-trained embedding model
  embedding_model = SentenceTransformer('all-MiniLM-L6-v2')  # Use a lightweight model for speed

+ from sentence_transformers import SentenceTransformer
+ from sklearn.metrics.pairwise import cosine_similarity
+
+ # Load the embedding model globally for efficiency
+ embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
+
+ def extract_relevant_portions(document_texts, query, max_portions=3, chunk_size=500):
      try:
+         # Embed the query once
          query_embedding = embedding_model.encode([query])
+
          relevant_portions = {}
          for doc_id, doc_text in enumerate(document_texts):
+             # Split document into chunks (e.g., 500 characters per chunk)
+             chunks = [doc_text[i:i + chunk_size] for i in range(0, len(doc_text), chunk_size)]

+             # Embed all chunks in a single batch
+             chunk_embeddings = embedding_model.encode(chunks)

+             # Compute cosine similarity between query and all chunks
+             similarities = cosine_similarity(query_embedding, chunk_embeddings)[0]

+             # Rank chunks by similarity
+             ranked_chunks = sorted(
+                 enumerate(chunks),
                  key=lambda x: similarities[x[0]],
                  reverse=True
              )

+             # Select top chunks based on similarity
+             doc_relevant_portions = [chunk for _, chunk in ranked_chunks[:max_portions]]
              relevant_portions[f"Document_{doc_id}"] = doc_relevant_portions
+
          return relevant_portions
      except Exception as e:
          print(f"Error in extracting relevant portions: {e}")