tahirsher committed · verified
Commit b07d002 · 1 Parent(s): c494e5a

Update app.py

Files changed (1)
  1. app.py +18 -19
app.py CHANGED
@@ -3,6 +3,7 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
 from sentence_transformers import SentenceTransformer, util
 import PyPDF2
 from docx import Document
+import numpy as np
 
 # Load the tokenizer and model for sentence embeddings
 @st.cache_resource
@@ -41,31 +42,29 @@ def extract_text_from_word(docx_file):
         st.error(f"Error reading Word document: {e}")
     return ""
 
-# Compare sentences for similarity
+# Optimized comparison using embeddings and matrix operations
 def compare_sentences(doc1_sentences, doc2_sentences, sentence_model):
+    # Encode all sentences in batches to get embeddings
+    doc1_embeddings = sentence_model.encode(doc1_sentences, convert_to_tensor=True, batch_size=16)
+    doc2_embeddings = sentence_model.encode(doc2_sentences, convert_to_tensor=True, batch_size=16)
+
+    # Compute cosine similarity matrix between all pairs
+    similarity_matrix = util.pytorch_cos_sim(doc1_embeddings, doc2_embeddings)
+
+    # Extract pairs with similarity > threshold
+    threshold = 0.6  # Adjust this for stricter or looser matching
     similar_sentences = []
-    for i, sent1 in enumerate(doc1_sentences):
-        if not sent1.strip():
-            continue  # Skip empty sentences
-        best_match = None
-        best_score = 0
-        for j, sent2 in enumerate(doc2_sentences):
-            if not sent2.strip():
-                continue
-            try:
-                score = util.pytorch_cos_sim(sentence_model.encode(sent1), sentence_model.encode(sent2)).item()
-                if score > best_score:  # Higher similarity score
-                    best_score = score
-                    best_match = (i, j, score, sent1, sent2)
-            except Exception as e:
-                st.error(f"Error comparing sentences: {e}")
-        if best_match and best_score > 0.6:  # Threshold for similarity
-            similar_sentences.append(best_match)
+
+    for i, row in enumerate(similarity_matrix):
+        for j, score in enumerate(row):
+            if score >= threshold:
+                similar_sentences.append((i, j, score.item(), doc1_sentences[i], doc2_sentences[j]))
+
     return similar_sentences
 
 # Streamlit UI
 def main():
-    st.title("Comparative Analysis of Two Documents")
+    st.title("Optimized Comparative Analysis of Two Documents")
     st.sidebar.header("Upload Files")
 
     # Upload files
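
For context, the batched comparison introduced in this commit can be exercised outside Streamlit. The snippet below is a minimal sketch of the same technique; the model name and example sentences are illustrative and not part of the commit.

from sentence_transformers import SentenceTransformer, util

# Illustrative model choice; app.py may load a different checkpoint.
model = SentenceTransformer("all-MiniLM-L6-v2")

doc1_sentences = ["The cat sat on the mat.", "Payment is due within 30 days."]
doc2_sentences = ["Invoices must be settled within thirty days.", "Dogs bark loudly."]

# Encode each document once in a batch, then compare every pair in a single matrix operation.
emb1 = model.encode(doc1_sentences, convert_to_tensor=True, batch_size=16)
emb2 = model.encode(doc2_sentences, convert_to_tensor=True, batch_size=16)
similarity_matrix = util.pytorch_cos_sim(emb1, emb2)  # shape: (len(doc1_sentences), len(doc2_sentences))

threshold = 0.6
for i, row in enumerate(similarity_matrix):
    for j, score in enumerate(row):
        if score >= threshold:
            print(f"{score.item():.2f}: {doc1_sentences[i]!r} ~ {doc2_sentences[j]!r}")

The practical difference from the removed loop is that each sentence is encoded once rather than once per candidate pair, so the number of model forward passes drops from len(doc1) * len(doc2) to len(doc1) + len(doc2), with the pairwise scores computed as one cosine-similarity matrix.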