File size: 5,027 Bytes
e107ee4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import plotly.express as px
import plotly.graph_objects as go
from collections import defaultdict

def load_and_preprocess_data(uploaded_file):
    """Load the papers CSV and add a ``combined_text`` column.

    ``combined_text`` is the space-separated concatenation of the ``Title``,
    ``Abstract`` and ``Keywords`` columns. Missing values are treated as
    empty strings and non-string values are converted to text, so a blank
    cell in one field does not turn the whole combined text into NaN.
    The original three columns are left unmodified.

    Args:
        uploaded_file: A path or file-like object accepted by ``pd.read_csv``.

    Returns:
        The loaded DataFrame with the extra ``combined_text`` column.

    Raises:
        KeyError: If any of ``Title``, ``Abstract`` or ``Keywords`` is absent.
    """
    df = pd.read_csv(uploaded_file)

    # Work on an object-typed copy so missing cells can be replaced by ''
    # regardless of the dtype pandas inferred for each column.
    text = df[['Title', 'Abstract', 'Keywords']].astype(object)
    text = text.where(text.notna(), '').astype(str)

    # Combine relevant text fields for similarity comparison
    df['combined_text'] = (
        text['Title'] + ' ' + text['Abstract'] + ' ' + text['Keywords']
    )
    return df

def calculate_similarity_matrix(df):
    """Return the pairwise cosine similarity of the papers' TF-IDF vectors.

    The ``combined_text`` column of ``df`` is vectorized with a
    ``TfidfVectorizer`` that drops English stop words, and the resulting
    document vectors are compared with each other via ``cosine_similarity``.
    """
    vectorizer = TfidfVectorizer(stop_words='english')
    document_vectors = vectorizer.fit_transform(df['combined_text'])
    return cosine_similarity(document_vectors)

def find_similar_papers(similarity_matrix, df, threshold=0.7):
    """Collect every pair of distinct papers whose similarity meets the threshold.

    Each unordered pair is visited once (first index lower than the second),
    in row-by-row order. The result has one row per qualifying pair with the
    two titles under 'Paper 1' / 'Paper 2' and the score under 'Similarity'.
    """
    paper_count = len(similarity_matrix)
    # Indices strictly above the diagonal enumerate each pair exactly once.
    first_idx, second_idx = np.triu_indices(paper_count, k=1)

    matches = [
        {
            'Paper 1': df.iloc[a]['Title'],
            'Paper 2': df.iloc[b]['Title'],
            'Similarity': similarity_matrix[a][b],
        }
        for a, b in zip(first_idx, second_idx)
        if similarity_matrix[a][b] >= threshold
    ]
    return pd.DataFrame(matches)

def find_outliers(similarity_matrix, df, threshold=0.3):
    """Find papers with low average similarity to the *other* papers.

    A paper's own diagonal entry is excluded from its average, so the score
    reflects only how it relates to the rest of the collection rather than
    being inflated by its self-similarity.

    Args:
        similarity_matrix: Square matrix of pairwise similarities.
        df: DataFrame with a 'Title' column, row-aligned with the matrix.
        threshold: Papers whose average similarity to the others is strictly
            below this value are reported.

    Returns:
        DataFrame with columns 'Title' and 'Average Similarity', one row per
        outlier. It is empty (but still has both columns) when there are no
        outliers or fewer than two papers to compare.
    """
    columns = ['Title', 'Average Similarity']
    sim = np.asarray(similarity_matrix, dtype=float)
    paper_count = sim.shape[0]

    # With fewer than two papers there are no "others" to average over.
    if paper_count < 2:
        return pd.DataFrame(columns=columns)

    # Row sum minus the diagonal entry, averaged over the remaining papers.
    avg_similarities = (sim.sum(axis=1) - np.diag(sim)) / (paper_count - 1)

    outliers = [
        {'Title': df.iloc[i]['Title'], 'Average Similarity': avg_sim}
        for i, avg_sim in enumerate(avg_similarities)
        if avg_sim < threshold
    ]
    return pd.DataFrame(outliers, columns=columns)

def create_similarity_heatmap(similarity_matrix, df):
    """Build a Plotly heatmap figure of the pairwise similarity matrix.

    Both axes are labelled with the papers' titles taken from ``df['Title']``;
    the cell values come straight from ``similarity_matrix``.
    """
    paper_titles = df['Title']
    heatmap_trace = go.Heatmap(
        z=similarity_matrix,
        x=paper_titles,
        y=paper_titles,
        colorscale='Viridis',
    )

    figure = go.Figure(data=heatmap_trace)
    # Slant the x tick labels and fix the figure height.
    figure.update_layout(
        title='Paper Similarity Heatmap',
        xaxis_tickangle=-45,
        height=800,
    )
    return figure

def analyze_keywords(df):
    """Count how many times each keyword occurs across all papers.

    Each string entry of ``df['Keywords']`` is split on commas and every
    piece is stripped of surrounding whitespace. Non-string entries (for
    example missing values) and empty pieces produced by trailing or doubled
    commas are ignored.

    Args:
        df: DataFrame with a 'Keywords' column of comma-separated strings.

    Returns:
        DataFrame with columns 'Keyword' and 'Frequency', sorted by
        'Frequency' in descending order. When no keywords are found the
        frame is empty but still carries both columns.
    """
    keyword_freq = defaultdict(int)
    for keywords in df['Keywords']:
        if not isinstance(keywords, str):
            continue
        for keyword in keywords.split(','):
            keyword = keyword.strip()
            if keyword:  # skip blanks such as the tail of "a, b,"
                keyword_freq[keyword] += 1

    # Naming the columns explicitly keeps 'Frequency' present even when no
    # keywords were collected, so the sort below cannot fail on a missing key.
    keyword_df = pd.DataFrame(
        list(keyword_freq.items()),
        columns=['Keyword', 'Frequency'],
    )
    return keyword_df.sort_values('Frequency', ascending=False)

def main():
    """Render the Streamlit page for research-paper similarity analysis.

    After a CSV is uploaded, the page shows a similarity heatmap, the pairs
    of papers above an adjustable similarity threshold, the papers below an
    adjustable outlier threshold, a keyword-frequency bar chart and a few
    summary metrics. Problems with the upload (unreadable file, missing
    columns, fewer than two papers, text that cannot be vectorized) are
    reported on the page instead of raising.
    """
    st.title('Research Papers Similarity Analysis')

    uploaded_file = st.file_uploader("Upload your research papers CSV file", type=['csv'])
    if uploaded_file is None:
        return

    try:
        df = load_and_preprocess_data(uploaded_file)
    except KeyError as exc:
        st.error(
            "The CSV must contain 'Title', 'Abstract' and 'Keywords' "
            f"columns ({exc})."
        )
        return
    except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as exc:
        st.error(f"Could not read the CSV file: {exc}")
        return

    # Pairwise statistics need at least one pair; with a single row the
    # off-diagonal selection used for the metrics below would be empty.
    if len(df) < 2:
        st.warning("Please upload a CSV with at least two papers to compare.")
        return

    try:
        similarity_matrix = calculate_similarity_matrix(df)
    except ValueError as exc:
        # Raised by the vectorizer, e.g. when the text yields no usable terms.
        st.error(f"Could not compute text similarity: {exc}")
        return

    st.header('Document Similarity Analysis')

    # Similarity Heatmap
    st.subheader('Similarity Heatmap')
    heatmap = create_similarity_heatmap(similarity_matrix, df)
    st.plotly_chart(heatmap, use_container_width=True)

    # Similar Papers
    st.subheader('Similar Papers')
    similarity_threshold = st.slider('Similarity Threshold', 0.0, 1.0, 0.7)
    similar_papers = find_similar_papers(similarity_matrix, df, similarity_threshold)
    if not similar_papers.empty:
        st.dataframe(similar_papers)
    else:
        st.write("No papers found above the similarity threshold.")

    # Outliers
    st.subheader('Outlier Papers')
    outlier_threshold = st.slider('Outlier Threshold', 0.0, 1.0, 0.3)
    outliers = find_outliers(similarity_matrix, df, outlier_threshold)
    if not outliers.empty:
        st.dataframe(outliers)
    else:
        st.write("No outliers found below the threshold.")

    # Keyword Analysis
    st.header('Keyword Analysis')
    keyword_freq = analyze_keywords(df)
    if not keyword_freq.empty:
        fig = px.bar(keyword_freq, x='Keyword', y='Frequency',
                     title='Keyword Frequency Across Papers')
        fig.update_xaxes(tickangle=45)
        st.plotly_chart(fig, use_container_width=True)

    # Basic Statistics
    st.header('Basic Statistics')
    # Both summary metrics use only the off-diagonal entries, so a paper's
    # similarity to itself does not skew the average or the maximum.
    off_diagonal = similarity_matrix[~np.eye(similarity_matrix.shape[0], dtype=bool)]
    col1, col2 = st.columns(2)
    with col1:
        st.metric("Total Papers", len(df))
        st.metric("Average Similarity", f"{np.mean(off_diagonal):.2f}")
    with col2:
        st.metric("Unique Keywords", len(keyword_freq))
        st.metric("Max Similarity", f"{np.max(off_diagonal):.2f}")

# Call main() only when this file is executed as the main module,
# not when it is imported from elsewhere.
if __name__ == "__main__":
    main()