Mattral committed on
Commit
9bb02cd
·
verified ·
1 Parent(s): e8fa585

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +22 -8
app.py CHANGED
@@ -1,7 +1,8 @@
1
  import pandas as pd
2
  import streamlit as st
3
  from difflib import SequenceMatcher
4
-
 
5
 
6
  ms = st.session_state
7
  if "themes" not in ms:
@@ -62,16 +63,29 @@ def find_exact_matches(df1, df2, column_name):
62
  def find_similar_texts(df1, df2, column_name, exact_matches, threshold=0.8):
63
  # Find rows with similar texts in the specified column, excluding exact matches
64
  similar_texts = []
65
- for index1, row1 in df1.iterrows():
66
- for index2, row2 in df2.iterrows():
67
- if (index1, index2) not in exact_matches:
68
- similarity = SequenceMatcher(None, str(row1[column_name]), str(row2[column_name])).ratio()
69
- if similarity >= threshold:
70
- similar_texts.append((index1, index2, row1[column_name], row2[column_name]))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
  return similar_texts
72
 
73
 
74
-
75
  def main():
76
  st.title("Item Comparison App")
77
 
 
1
  import pandas as pd
2
  import streamlit as st
3
  from difflib import SequenceMatcher
4
+ from sklearn.feature_extraction.text import TfidfVectorizer
5
+ from sklearn.metrics.pairwise import cosine_similarity
6
 
7
  ms = st.session_state
8
  if "themes" not in ms:
 
63
  def find_similar_texts(df1, df2, column_name, exact_matches, threshold=0.8):
64
  # Find rows with similar texts in the specified column, excluding exact matches
65
  similar_texts = []
66
+ exact_match_indices = set(exact_matches.index.tolist())
67
+
68
+ # Concatenate texts from both dataframes
69
+ all_texts = df1[column_name].tolist() + df2[column_name].tolist()
70
+
71
+ # Compute TF-IDF vectors
72
+ vectorizer = TfidfVectorizer()
73
+ tfidf_matrix = vectorizer.fit_transform(all_texts)
74
+
75
+ # Compute cosine similarity matrix
76
+ similarity_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)
77
+
78
+ # Iterate over pairs of rows to find similar texts
79
+ for i, row1 in df1.iterrows():
80
+ for j, row2 in df2.iterrows():
81
+ if i not in exact_match_indices and j not in exact_match_indices:
82
+ similarity = similarity_matrix[i, len(df1) + j]
83
+ if similarity >= threshold and similarity < 1: # Exclude exact matches
84
+ similar_texts.append((i, j, row1[column_name], row2[column_name]))
85
+
86
  return similar_texts
87
 
88
 
 
89
  def main():
90
  st.title("Item Comparison App")
91