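"""Streamlit app that compares two uploaded documents (PDF or Word) sentence by
sentence using SentenceTransformer embeddings and reports similar sentence pairs.

Dependencies: streamlit, transformers, sentence-transformers, PyPDF2, python-docx.

To run locally (assuming this file is saved as app.py):
    streamlit run app.py
"""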
import streamlit as st
from transformers import AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer, util
import PyPDF2
from docx import Document

# Load the GPT-2 text-to-SQL tokenizer/model and the sentence embedding model
# (only the sentence embedding model is used for the comparison below)
@st.cache_resource
def load_model():
    try:
        tokenizer = AutoTokenizer.from_pretrained("rakeshkiriyath/gpt2Medium_text_to_sql")
        model = AutoModelForCausalLM.from_pretrained("rakeshkiriyath/gpt2Medium_text_to_sql")
        sentence_model = SentenceTransformer('all-MiniLM-L6-v2')  # Smaller, faster sentence embeddings model
        st.success("Model loaded successfully!")
        return tokenizer, model, sentence_model
    except Exception as e:
        st.error(f"Error loading models: {e}")
        return None, None, None

# Extract text from a PDF file
def extract_text_from_pdf(pdf_file):
    try:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        text = ""
        for page in pdf_reader.pages:
            # extract_text() can yield no usable text on image-only pages; guard the concatenation
            text += page.extract_text() or ""
        return text
    except Exception as e:
        st.error(f"Error reading PDF: {e}")
        return ""

# Extract text from a Word document
def extract_text_from_word(docx_file):
    try:
        doc = Document(docx_file)
        text = ""
        for paragraph in doc.paragraphs:
            text += paragraph.text + "\n"
        return text
    except Exception as e:
        st.error(f"Error reading Word document: {e}")
        return ""

# Compare sentences for similarity
def compare_sentences(doc1_sentences, doc2_sentences, sentence_model):
    similar_sentences = []
    for i, sent1 in enumerate(doc1_sentences):
        if not sent1.strip():
            continue  # Skip empty sentences
        try:
            # Encode the Document 1 sentence once instead of once per candidate pair
            emb1 = sentence_model.encode(sent1)
        except Exception as e:
            st.error(f"Error encoding sentence: {e}")
            continue
        best_match = None
        best_score = 0
        for j, sent2 in enumerate(doc2_sentences):
            if not sent2.strip():
                continue
            try:
                score = util.pytorch_cos_sim(emb1, sentence_model.encode(sent2)).item()
                if score > best_score:  # Keep the highest-scoring match
                    best_score = score
                    best_match = (i, j, score, sent1, sent2)
            except Exception as e:
                st.error(f"Error comparing sentences: {e}")
        if best_match and best_score > 0.6:  # Threshold for similarity
            similar_sentences.append(best_match)
    return similar_sentences

# Streamlit UI
def main():
    st.title("Comparative Analysis of Two Documents")
    st.sidebar.header("Upload Files")

    # Upload files
    uploaded_file1 = st.sidebar.file_uploader("Upload the First Document (PDF/Word)", type=["pdf", "docx"])
    uploaded_file2 = st.sidebar.file_uploader("Upload the Second Document (PDF/Word)", type=["pdf", "docx"])

    if uploaded_file1 and uploaded_file2:
        # Extract text from the uploaded documents
        if uploaded_file1.name.endswith(".pdf"):
            text1 = extract_text_from_pdf(uploaded_file1)
        else:
            text1 = extract_text_from_word(uploaded_file1)

        if uploaded_file2.name.endswith(".pdf"):
            text2 = extract_text_from_pdf(uploaded_file2)
        else:
            text2 = extract_text_from_word(uploaded_file2)

        if not text1.strip():
            st.error("The first document is empty or could not be read.")
            return
        if not text2.strip():
            st.error("The second document is empty or could not be read.")
            return

        st.write("### Preview of Document 1:")
        st.text(text1[:500])  # Display a preview of Document 1
        st.write("### Preview of Document 2:")
        st.text(text2[:500])  # Display a preview of Document 2

        # Naive sentence split on '. ' (sufficient for a rough comparison)
        doc1_sentences = text1.split('. ')
        doc2_sentences = text2.split('. ')

        # Limit sentences for testing purposes (optional)
        doc1_sentences = doc1_sentences[:50]  # Remove this line for full processing
        doc2_sentences = doc2_sentences[:50]  # Remove this line for full processing

        # Load models
        tokenizer, model, sentence_model = load_model()
        if not sentence_model:
            st.error("Failed to load the sentence embedding model.")
            return

        # Perform sentence comparison
        st.info("Comparing sentences, this may take a moment...")
        similar_sentences = compare_sentences(doc1_sentences, doc2_sentences, sentence_model)

        # Display results
        st.header("Comparative Analysis Results")
        st.write(f"Number of sentences in Document 1: {len(doc1_sentences)}")
        st.write(f"Number of sentences in Document 2: {len(doc2_sentences)}")

        if similar_sentences:
            st.success(f"Found {len(similar_sentences)} similar sentences!")
            for match in similar_sentences:
                doc1_index, doc2_index, score, sent1, sent2 = match
                st.markdown(f"**Document 1 Sentence {doc1_index + 1}:** {sent1}")
                st.markdown(f"**Document 2 Sentence {doc2_index + 1}:** {sent2}")
                st.markdown(f"**Similarity Score:** {score:.2f}")
                st.markdown("---")
        else:
            st.info("No significantly similar sentences found.")
    else:
        st.warning("Please upload two documents to compare.")

if __name__ == "__main__":
    main()