Spaces:
Sleeping
Sleeping
File size: 5,349 Bytes
8ca60f2 c494e5a 8ca60f2 c494e5a 8ca60f2 c494e5a 8ca60f2 c494e5a 8ca60f2 c494e5a 8ca60f2 c494e5a 8ca60f2 c494e5a 8ca60f2 c494e5a 8ca60f2 c494e5a 8ca60f2 c494e5a 8ca60f2 c494e5a 8ca60f2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 |
import streamlit as st
from transformers import AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer, util
import PyPDF2
from docx import Document
# Load the text-to-SQL tokenizer/model and the sentence-embedding model
@st.cache_resource
def _load_models_cached():
    """Load all models once and cache them; loading errors propagate.

    This is kept separate from ``load_model`` so that only a successful
    load is stored by ``st.cache_resource``. If the error were handled
    here, the ``(None, None, None)`` fallback would be returned normally,
    cached, and served on every rerun until the cache is cleared.

    Returns:
        A ``(tokenizer, model, sentence_model)`` tuple.
    """
    tokenizer = AutoTokenizer.from_pretrained("rakeshkiriyath/gpt2Medium_text_to_sql")
    model = AutoModelForCausalLM.from_pretrained("rakeshkiriyath/gpt2Medium_text_to_sql")
    # Smaller, faster sentence embeddings model
    sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
    st.success("Model loaded successfully!")
    return tokenizer, model, sentence_model


def load_model():
    """Return the cached models, or a tuple of ``None`` values on failure.

    Returns:
        ``(tokenizer, model, sentence_model)`` on success. If any model
        fails to load, the error is shown with ``st.error`` and
        ``(None, None, None)`` is returned. The failure is not cached, so
        the next rerun retries the load.
    """
    try:
        return _load_models_cached()
    except Exception as e:  # UI boundary: report the error instead of crashing the app
        st.error(f"Error loading models: {e}")
        return None, None, None
# Extract text from a PDF file
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page in a PDF.

    Args:
        pdf_file: A path or binary file-like object that
            ``PyPDF2.PdfReader`` accepts.

    Returns:
        The text of all pages joined with no separator, or ``""`` if the
        file could not be read (the error is shown with ``st.error``).
    """
    try:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # Guard against a page yielding no text object (presumably possible
        # for image-only pages in some PyPDF2 releases - TODO confirm), so
        # one such page does not discard the text of the whole document.
        return "".join(page.extract_text() or "" for page in pdf_reader.pages)
    except Exception as e:  # UI boundary: report the error and fall back to empty text
        st.error(f"Error reading PDF: {e}")
        return ""
# Extract text from a Word document
def extract_text_from_word(docx_file):
    """Return the text of a Word document, one paragraph per line.

    Args:
        docx_file: A path or file-like object that ``Document`` accepts.

    Returns:
        Every paragraph's text followed by a newline, concatenated in
        document order, or ``""`` if the file could not be read (the error
        is shown with ``st.error``).
    """
    try:
        word_doc = Document(docx_file)
        lines = [f"{para.text}\n" for para in word_doc.paragraphs]
        return "".join(lines)
    except Exception as e:
        st.error(f"Error reading Word document: {e}")
        return ""
# Compare sentences for similarity
def compare_sentences(doc1_sentences, doc2_sentences, sentence_model, threshold=0.6):
    """Find, for each sentence of document 1, its best match in document 2.

    Each document is encoded once and all pairs are scored from a single
    cosine-similarity matrix, rather than re-encoding both sentences for
    every pair.

    Args:
        doc1_sentences: Sentences of the first document. Blank entries are
            skipped but still count towards the reported indices.
        doc2_sentences: Sentences of the second document, handled the same way.
        sentence_model: Object whose ``encode`` method turns a list of
            strings into embeddings accepted by ``util.pytorch_cos_sim``.
        threshold: Minimum similarity (exclusive) for a best match to be
            reported. Defaults to 0.6.

    Returns:
        A list of ``(doc1_index, doc2_index, score, sent1, sent2)`` tuples,
        one per document-1 sentence whose best score exceeds ``threshold``.
        Indices are zero-based positions in the input lists. If encoding or
        scoring fails, the error is shown with ``st.error`` and an empty
        list is returned.
    """
    # Keep the original positions so reported indices refer to the input lists.
    doc1_items = [(i, sent) for i, sent in enumerate(doc1_sentences) if sent.strip()]
    doc2_items = [(j, sent) for j, sent in enumerate(doc2_sentences) if sent.strip()]
    if not doc1_items or not doc2_items:
        return []

    try:
        doc1_embeddings = sentence_model.encode([sent for _, sent in doc1_items])
        doc2_embeddings = sentence_model.encode([sent for _, sent in doc2_items])
        # Row r, column c holds the similarity of doc1_items[r] and doc2_items[c].
        score_rows = util.pytorch_cos_sim(doc1_embeddings, doc2_embeddings).tolist()
    except Exception as e:  # UI boundary: report the error and return no matches
        st.error(f"Error comparing sentences: {e}")
        return []

    similar_sentences = []
    for (i, sent1), row in zip(doc1_items, score_rows):
        best_match = None
        best_score = 0
        for (j, sent2), score in zip(doc2_items, row):
            # Strict '>' keeps the earliest sentence on ties and ignores
            # non-positive scores.
            if score > best_score:
                best_score = score
                best_match = (i, j, score, sent1, sent2)
        if best_match and best_score > threshold:
            similar_sentences.append(best_match)
    return similar_sentences
# Streamlit UI
def _extract_uploaded_text(uploaded_file):
    """Return the text of an uploaded file, choosing the reader by extension."""
    # Compare case-insensitively so names such as "REPORT.PDF" are not
    # sent to the Word reader.
    if uploaded_file.name.lower().endswith(".pdf"):
        return extract_text_from_pdf(uploaded_file)
    return extract_text_from_word(uploaded_file)


def main():
    """Render the document-comparison page.

    Asks for two PDF/Word uploads, shows a 500-character preview of each,
    splits both texts on ``'. '``, compares the first 50 sentences of each
    document and lists every sufficiently similar sentence pair.
    """
    st.title("Comparative Analysis of Two Documents")
    st.sidebar.header("Upload Files")

    # Upload files
    uploaded_file1 = st.sidebar.file_uploader("Upload the First Document (PDF/Word)", type=["pdf", "docx"])
    uploaded_file2 = st.sidebar.file_uploader("Upload the Second Document (PDF/Word)", type=["pdf", "docx"])

    if not (uploaded_file1 and uploaded_file2):
        st.warning("Please upload two documents to compare.")
        return

    # Extract text from the uploaded documents
    text1 = _extract_uploaded_text(uploaded_file1)
    text2 = _extract_uploaded_text(uploaded_file2)

    if not text1.strip():
        st.error("The first document is empty or could not be read.")
        return
    if not text2.strip():
        st.error("The second document is empty or could not be read.")
        return

    st.write("### Preview of Document 1:")
    st.text(text1[:500])  # Display a preview of Document 1
    st.write("### Preview of Document 2:")
    st.text(text2[:500])  # Display a preview of Document 2

    # Split text into sentences, capped for testing purposes.
    # Raise or remove the cap for full processing.
    max_sentences = 50
    doc1_sentences = text1.split('. ')[:max_sentences]
    doc2_sentences = text2.split('. ')[:max_sentences]

    # Load models. Only the sentence model is used on this page.
    tokenizer, model, sentence_model = load_model()
    # Test for None explicitly rather than relying on the model's truthiness.
    if sentence_model is None:
        st.error("Failed to load the sentence embedding model.")
        return

    # Perform sentence comparison
    st.info("Comparing sentences, this may take a moment...")
    similar_sentences = compare_sentences(doc1_sentences, doc2_sentences, sentence_model)

    # Display results
    st.header("Comparative Analysis Results")
    st.write(f"Number of sentences in Document 1: {len(doc1_sentences)}")
    st.write(f"Number of sentences in Document 2: {len(doc2_sentences)}")

    if not similar_sentences:
        st.info("No significantly similar sentences found.")
        return

    st.success(f"Found {len(similar_sentences)} similar sentences!")
    for doc1_index, doc2_index, score, sent1, sent2 in similar_sentences:
        # Indices are zero-based; show them one-based.
        st.markdown(f"**Document 1 Sentence {doc1_index + 1}:** {sent1}")
        st.markdown(f"**Document 2 Sentence {doc2_index + 1}:** {sent2}")
        st.markdown(f"**Similarity Score:** {score:.2f}")
        st.markdown("---")
# Run the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|