tahirsher committed · verified
Commit c494e5a · 1 Parent(s): 16ab85d

Update app.py

Files changed (1)
  1. app.py +72 -22
app.py CHANGED
@@ -7,38 +7,58 @@ from docx import Document
 # Load the tokenizer and model for sentence embeddings
 @st.cache_resource
 def load_model():
-    tokenizer = AutoTokenizer.from_pretrained("rakeshkiriyath/gpt2Medium_text_to_sql")
-    model = AutoModelForCausalLM.from_pretrained("rakeshkiriyath/gpt2Medium_text_to_sql")
-    sentence_model = SentenceTransformer('all-MiniLM-L6-v2')  # Smaller, faster sentence embeddings model
-    return tokenizer, model, sentence_model
+    try:
+        tokenizer = AutoTokenizer.from_pretrained("rakeshkiriyath/gpt2Medium_text_to_sql")
+        model = AutoModelForCausalLM.from_pretrained("rakeshkiriyath/gpt2Medium_text_to_sql")
+        sentence_model = SentenceTransformer('all-MiniLM-L6-v2')  # Smaller, faster sentence embeddings model
+        st.success("Model loaded successfully!")
+        return tokenizer, model, sentence_model
+    except Exception as e:
+        st.error(f"Error loading models: {e}")
+        return None, None, None

 # Extract text from a PDF file
 def extract_text_from_pdf(pdf_file):
-    pdf_reader = PyPDF2.PdfReader(pdf_file)
-    text = ""
-    for page in pdf_reader.pages:
-        text += page.extract_text()
-    return text
+    try:
+        pdf_reader = PyPDF2.PdfReader(pdf_file)
+        text = ""
+        for page in pdf_reader.pages:
+            text += page.extract_text()
+        return text
+    except Exception as e:
+        st.error(f"Error reading PDF: {e}")
+        return ""

 # Extract text from a Word document
 def extract_text_from_word(docx_file):
-    doc = Document(docx_file)
-    text = ""
-    for paragraph in doc.paragraphs:
-        text += paragraph.text + "\n"
-    return text
+    try:
+        doc = Document(docx_file)
+        text = ""
+        for paragraph in doc.paragraphs:
+            text += paragraph.text + "\n"
+        return text
+    except Exception as e:
+        st.error(f"Error reading Word document: {e}")
+        return ""

 # Compare sentences for similarity
 def compare_sentences(doc1_sentences, doc2_sentences, sentence_model):
     similar_sentences = []
     for i, sent1 in enumerate(doc1_sentences):
+        if not sent1.strip():
+            continue  # Skip empty sentences
         best_match = None
         best_score = 0
         for j, sent2 in enumerate(doc2_sentences):
-            score = util.pytorch_cos_sim(sentence_model.encode(sent1), sentence_model.encode(sent2)).item()
-            if score > best_score:  # Higher similarity score
-                best_score = score
-                best_match = (i, j, score, sent1, sent2)
+            if not sent2.strip():
+                continue
+            try:
+                score = util.pytorch_cos_sim(sentence_model.encode(sent1), sentence_model.encode(sent2)).item()
+                if score > best_score:  # Higher similarity score
+                    best_score = score
+                    best_match = (i, j, score, sent1, sent2)
+            except Exception as e:
+                st.error(f"Error comparing sentences: {e}")
         if best_match and best_score > 0.6:  # Threshold for similarity
             similar_sentences.append(best_match)
     return similar_sentences
@@ -54,22 +74,53 @@ def main():

     if uploaded_file1 and uploaded_file2:
         # Extract text from the uploaded documents
-        text1 = extract_text_from_pdf(uploaded_file1) if uploaded_file1.name.endswith(".pdf") else extract_text_from_word(uploaded_file1)
-        text2 = extract_text_from_pdf(uploaded_file2) if uploaded_file2.name.endswith(".pdf") else extract_text_from_word(uploaded_file2)
+        if uploaded_file1.name.endswith(".pdf"):
+            text1 = extract_text_from_pdf(uploaded_file1)
+        else:
+            text1 = extract_text_from_word(uploaded_file1)
+
+        if uploaded_file2.name.endswith(".pdf"):
+            text2 = extract_text_from_pdf(uploaded_file2)
+        else:
+            text2 = extract_text_from_word(uploaded_file2)
+
+        if not text1.strip():
+            st.error("The first document is empty or could not be read.")
+            return
+        if not text2.strip():
+            st.error("The second document is empty or could not be read.")
+            return
+
+        st.write("### Preview of Document 1:")
+        st.text(text1[:500])  # Display a preview of Document 1
+        st.write("### Preview of Document 2:")
+        st.text(text2[:500])  # Display a preview of Document 2

         # Split text into sentences
         doc1_sentences = text1.split('. ')
         doc2_sentences = text2.split('. ')

-        # Load model
+        # Limit sentences for testing purposes (optional)
+        doc1_sentences = doc1_sentences[:50]  # Remove this line for full processing
+        doc2_sentences = doc2_sentences[:50]  # Remove this line for full processing
+
+        # Load models
         tokenizer, model, sentence_model = load_model()
+        if not sentence_model:
+            st.error("Failed to load the sentence embedding model.")
+            return

         # Perform sentence comparison
+        st.info("Comparing sentences, this may take a moment...")
         similar_sentences = compare_sentences(doc1_sentences, doc2_sentences, sentence_model)

         # Display results
         st.header("Comparative Analysis Results")
+        st.write(f"Number of sentences in Document 1: {len(doc1_sentences)}")
+        st.write(f"Number of sentences in Document 2: {len(doc2_sentences)}")
+
         if similar_sentences:
+            st.success(f"Found {len(similar_sentences)} similar sentences!")
             for match in similar_sentences:
                 doc1_index, doc2_index, score, sent1, sent2 = match
                 st.markdown(f"**Document 1 Sentence {doc1_index + 1}:** {sent1}")
@@ -78,7 +129,6 @@ def main():
                 st.markdown("---")
         else:
             st.info("No significantly similar sentences found.")
-
     else:
         st.warning("Please upload two documents to compare.")

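Note on the comparison step: even after this commit, `compare_sentences` calls `sentence_model.encode` on both sentences inside the nested loop, so every sentence is re-embedded once per pair. Below is a minimal sketch of an equivalent batched version using the same sentence-transformers API (`encode` on a list of sentences plus `util.cos_sim`); the function name `compare_sentences_batched`, its `threshold` parameter, and the example texts are illustrative assumptions, not part of this commit.

```python
from sentence_transformers import SentenceTransformer, util


def compare_sentences_batched(doc1_sentences, doc2_sentences, sentence_model, threshold=0.6):
    """Sketch: encode each document once, then compute a single cosine-similarity matrix."""
    # Keep original indices so results match the app's (i, j, score, sent1, sent2) shape
    idx1 = [(i, s) for i, s in enumerate(doc1_sentences) if s.strip()]
    idx2 = [(j, s) for j, s in enumerate(doc2_sentences) if s.strip()]
    if not idx1 or not idx2:
        return []

    # One encode call per document instead of one per sentence pair
    emb1 = sentence_model.encode([s for _, s in idx1], convert_to_tensor=True)
    emb2 = sentence_model.encode([s for _, s in idx2], convert_to_tensor=True)

    # scores[r][c] = cosine similarity between idx1[r] and idx2[c]
    scores = util.cos_sim(emb1, emb2)

    similar_sentences = []
    for row, (i, sent1) in enumerate(idx1):
        best_col = int(scores[row].argmax())      # best match in Document 2
        best_score = float(scores[row][best_col])
        if best_score > threshold:                # same 0.6-style cutoff as the commit
            j, sent2 = idx2[best_col]
            similar_sentences.append((i, j, best_score, sent1, sent2))
    return similar_sentences


if __name__ == "__main__":
    # Hypothetical standalone usage with two short texts
    model = SentenceTransformer("all-MiniLM-L6-v2")
    a = "The cat sat on the mat. Dogs are loyal animals."
    b = "A cat was sitting on a mat. Planes fly high."
    print(compare_sentences_batched(a.split(". "), b.split(". "), model))
```

The sketch returns the same `(i, j, score, sent1, sent2)` tuples the app's display loop unpacks, so it could slot in without touching the results section; the per-pair `st.error` reporting from the commit is left out to keep the example self-contained.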