tahirsher committed · verified
Commit b07d002 · 1 Parent(s): c494e5a

Update app.py

Files changed (1)
  1. app.py +18 -19
app.py CHANGED
@@ -3,6 +3,7 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
 from sentence_transformers import SentenceTransformer, util
 import PyPDF2
 from docx import Document
+import numpy as np
 
 # Load the tokenizer and model for sentence embeddings
 @st.cache_resource
@@ -41,31 +42,29 @@ def extract_text_from_word(docx_file):
         st.error(f"Error reading Word document: {e}")
     return ""
 
-# Compare sentences for similarity
+# Optimized comparison using embeddings and matrix operations
 def compare_sentences(doc1_sentences, doc2_sentences, sentence_model):
+    # Encode all sentences in batches to get embeddings
+    doc1_embeddings = sentence_model.encode(doc1_sentences, convert_to_tensor=True, batch_size=16)
+    doc2_embeddings = sentence_model.encode(doc2_sentences, convert_to_tensor=True, batch_size=16)
+
+    # Compute cosine similarity matrix between all pairs
+    similarity_matrix = util.pytorch_cos_sim(doc1_embeddings, doc2_embeddings)
+
+    # Extract pairs with similarity > threshold
+    threshold = 0.6  # Adjust this for stricter or looser matching
     similar_sentences = []
-    for i, sent1 in enumerate(doc1_sentences):
-        if not sent1.strip():
-            continue  # Skip empty sentences
-        best_match = None
-        best_score = 0
-        for j, sent2 in enumerate(doc2_sentences):
-            if not sent2.strip():
-                continue
-            try:
-                score = util.pytorch_cos_sim(sentence_model.encode(sent1), sentence_model.encode(sent2)).item()
-                if score > best_score:  # Higher similarity score
-                    best_score = score
-                    best_match = (i, j, score, sent1, sent2)
-            except Exception as e:
-                st.error(f"Error comparing sentences: {e}")
-        if best_match and best_score > 0.6:  # Threshold for similarity
-            similar_sentences.append(best_match)
+
+    for i, row in enumerate(similarity_matrix):
+        for j, score in enumerate(row):
+            if score >= threshold:
+                similar_sentences.append((i, j, score.item(), doc1_sentences[i], doc2_sentences[j]))
+
     return similar_sentences
 
 # Streamlit UI
 def main():
-    st.title("Comparative Analysis of Two Documents")
+    st.title("Optimized Comparative Analysis of Two Documents")
     st.sidebar.header("Upload Files")
 
     # Upload files
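
For context, the batched comparison introduced in this commit can be exercised outside Streamlit. The snippet below is a minimal sketch of the same technique; the model name and example sentences are illustrative and not part of the commit.

from sentence_transformers import SentenceTransformer, util

# Illustrative model choice; app.py may load a different checkpoint.
model = SentenceTransformer("all-MiniLM-L6-v2")

doc1_sentences = ["The cat sat on the mat.", "Payment is due within 30 days."]
doc2_sentences = ["Invoices must be settled within thirty days.", "Dogs bark loudly."]

# Encode each document once in a batch, then compare every pair in a single matrix operation.
emb1 = model.encode(doc1_sentences, convert_to_tensor=True, batch_size=16)
emb2 = model.encode(doc2_sentences, convert_to_tensor=True, batch_size=16)
similarity_matrix = util.pytorch_cos_sim(emb1, emb2)  # shape: (len(doc1_sentences), len(doc2_sentences))

threshold = 0.6
for i, row in enumerate(similarity_matrix):
    for j, score in enumerate(row):
        if score >= threshold:
            print(f"{score.item():.2f}: {doc1_sentences[i]!r} ~ {doc2_sentences[j]!r}")

The practical difference from the removed loop is that each sentence is encoded once rather than once per candidate pair, so the number of model forward passes drops from len(doc1) * len(doc2) to len(doc1) + len(doc2), with the pairwise scores computed as one cosine-similarity matrix.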