Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -3,6 +3,8 @@ import streamlit as st
|
|
3 |
from difflib import SequenceMatcher
|
4 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
5 |
from sklearn.metrics.pairwise import cosine_similarity
|
|
|
|
|
6 |
|
7 |
ms = st.session_state
|
8 |
if "themes" not in ms:
|
@@ -81,7 +83,12 @@ def find_similar_texts(df1, df2, column_name, exact_matches, threshold=0.5):
|
|
81 |
if i not in exact_match_indices and j not in exact_match_indices:
|
82 |
similarity = similarity_matrix[i, len(df1) + j]
|
83 |
if similarity >= threshold and similarity < 1: # Exclude exact matches
|
84 |
-
|
|
|
|
|
|
|
|
|
|
|
85 |
|
86 |
return similar_texts
|
87 |
|
|
|
3 |
from difflib import SequenceMatcher
|
4 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
5 |
from sklearn.metrics.pairwise import cosine_similarity
|
6 |
+
from Levenshtein import distance as levenshtein_distance
|
7 |
+
|
8 |
|
9 |
ms = st.session_state
|
10 |
if "themes" not in ms:
|
|
|
83 |
if i not in exact_match_indices and j not in exact_match_indices:
|
84 |
similarity = similarity_matrix[i, len(df1) + j]
|
85 |
if similarity >= threshold and similarity < 1: # Exclude exact matches
|
86 |
+
# Calculate Levenshtein distance between strings
|
87 |
+
distance = levenshtein_distance(row1[column_name], row2[column_name])
|
88 |
+
max_length = max(len(row1[column_name]), len(row2[column_name]))
|
89 |
+
similarity_score = 1 - (distance / max_length)
|
90 |
+
if similarity_score >= threshold:
|
91 |
+
similar_texts.append((i, j, row1[column_name], row2[column_name]))
|
92 |
|
93 |
return similar_texts
|
94 |
|