Mattral committed on
Commit
42ac9eb
·
verified ·
1 Parent(s): 2ead8af

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -1
app.py CHANGED
@@ -3,6 +3,8 @@ import streamlit as st
3
  from difflib import SequenceMatcher
4
  from sklearn.feature_extraction.text import TfidfVectorizer
5
  from sklearn.metrics.pairwise import cosine_similarity
 
 
6
 
7
  ms = st.session_state
8
  if "themes" not in ms:
@@ -81,7 +83,12 @@ def find_similar_texts(df1, df2, column_name, exact_matches, threshold=0.5):
81
  if i not in exact_match_indices and j not in exact_match_indices:
82
  similarity = similarity_matrix[i, len(df1) + j]
83
  if similarity >= threshold and similarity < 1: # Exclude exact matches
84
- similar_texts.append((i, j, row1[column_name], row2[column_name]))
 
 
 
 
 
85
 
86
  return similar_texts
87
 
 
3
  from difflib import SequenceMatcher
4
  from sklearn.feature_extraction.text import TfidfVectorizer
5
  from sklearn.metrics.pairwise import cosine_similarity
6
+ from Levenshtein import distance as levenshtein_distance
7
+
8
 
9
  ms = st.session_state
10
  if "themes" not in ms:
 
83
  if i not in exact_match_indices and j not in exact_match_indices:
84
  similarity = similarity_matrix[i, len(df1) + j]
85
  if similarity >= threshold and similarity < 1: # Exclude exact matches
86
+ # Calculate Levenshtein distance between strings
87
+ distance = levenshtein_distance(row1[column_name], row2[column_name])
88
+ max_length = max(len(row1[column_name]), len(row2[column_name]))
89
+ similarity_score = 1 - (distance / max_length)
90
+ if similarity_score >= threshold:
91
+ similar_texts.append((i, j, row1[column_name], row2[column_name]))
92
 
93
  return similar_texts
94