Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -7,6 +7,7 @@ from Levenshtein import distance as levenshtein_distance
|
|
7 |
import matplotlib.pyplot as plt
|
8 |
import seaborn as sns
|
9 |
|
|
|
10 |
ms = st.session_state
|
11 |
if "themes" not in ms:
|
12 |
ms.themes = {"current_theme": "light",
|
@@ -56,13 +57,20 @@ def read_csv_or_excel(file):
|
|
56 |
return pd.read_excel(file)
|
57 |
else:
|
58 |
raise ValueError("Unsupported file format. Only CSV and Excel files are supported.")
|
|
|
59 |
|
60 |
def find_exact_match(df1, df2, column_name):
    """Return the inner join of ``df1`` and ``df2`` on ``column_name``.

    Args:
        df1: Left-hand DataFrame.
        df2: Right-hand DataFrame.
        column_name: Name of the column, present in both frames, whose
            values must be equal for a pair of rows to be kept.

    Returns:
        A DataFrame with one row per pair of rows from ``df1`` and ``df2``
        that share the same value in ``column_name``.
    """
    # An inner join keeps only the key values that occur in both frames.
    return pd.merge(left=df1, right=df2, how='inner', on=column_name)
|
64 |
|
65 |
|
|
|
|
|
66 |
def find_similar_texts(df1, df2, column_name, threshold=0.3):
|
67 |
# Find rows with similar texts in the specified column, excluding exact matches
|
68 |
similar_texts = []
|
@@ -108,6 +116,7 @@ def plot_correlation(df, column):
|
|
108 |
return plt.gcf() # Return the matplotlib figure
|
109 |
|
110 |
st.set_option('deprecation.showPyplotGlobalUse', False)
|
|
|
111 |
def plot_correlation_matrix(df):
|
112 |
# Filter for numeric columns, if the DataFrame has non-numeric columns
|
113 |
numeric_df = df.select_dtypes(include=['number'])
|
@@ -160,6 +169,8 @@ def main():
|
|
160 |
# Display exact matches
|
161 |
st.header("Exact Matches Compare")
|
162 |
for match in exact_matches:
|
|
|
|
|
163 |
st.write(f"Row {match[0]} in warehouse item stocks is exactly the same as Row {match[1]} in industry item stocks:")
|
164 |
st.write(f"Warehouse: {match[2]}")
|
165 |
st.write(f"Industry: {match[3]}")
|
@@ -169,6 +180,9 @@ def main():
|
|
169 |
# Display similar texts
|
170 |
st.header("Similar (but Not Same) Texts")
|
171 |
for text_pair in similar_texts:
|
|
|
|
|
|
|
172 |
st.write(f"Row {text_pair[0]} in warehouse item stocks is similar to Row {text_pair[1]} in industry item stocks:")
|
173 |
st.write(f"Warehouse: {text_pair[2]}")
|
174 |
st.write(f"Industry: {text_pair[3]}")
|
@@ -202,4 +216,4 @@ def main():
|
|
202 |
plot_correlation_matrix(industry_df)
|
203 |
|
204 |
if __name__ == "__main__":
|
205 |
-
main()
|
|
|
7 |
import matplotlib.pyplot as plt
|
8 |
import seaborn as sns
|
9 |
|
10 |
+
|
11 |
ms = st.session_state
|
12 |
if "themes" not in ms:
|
13 |
ms.themes = {"current_theme": "light",
|
|
|
57 |
return pd.read_excel(file)
|
58 |
else:
|
59 |
raise ValueError("Unsupported file format. Only CSV and Excel files are supported.")
|
60 |
+
|
61 |
|
62 |
def find_exact_match(df1, df2, column_name):
    """Return rows of ``df1`` and ``df2`` whose ``column_name`` values match.

    The key column of each frame is converted to ``str`` and stripped of
    leading/trailing whitespace before merging, so keys stored with
    different dtypes (e.g. ``3`` and ``"3"``) or with stray spaces still
    match. The normalization is done on copies: the DataFrames passed in
    are left unmodified.

    Args:
        df1: Left-hand DataFrame.
        df2: Right-hand DataFrame.
        column_name: Name of the column, present in both frames, to match on.

    Returns:
        The inner join of the two normalized frames on ``column_name``; the
        key column in the result holds the normalized (stripped string)
        values.
    """
    # Work on copies so the caller's frames keep their original dtype and
    # whitespace; assigning into df1/df2 directly would rewrite the caller's
    # data as a hidden side effect.
    left = df1.copy()
    right = df2.copy()

    # Ensure the merge key has the same data type and spacing on both sides.
    left[column_name] = left[column_name].astype(str).str.strip()
    right[column_name] = right[column_name].astype(str).str.strip()

    # Find rows with exact matches in the specified column.
    return pd.merge(left, right, on=column_name, how='inner')
|
70 |
|
71 |
|
72 |
+
|
73 |
+
|
74 |
def find_similar_texts(df1, df2, column_name, threshold=0.3):
|
75 |
# Find rows with similar texts in the specified column, excluding exact matches
|
76 |
similar_texts = []
|
|
|
116 |
return plt.gcf() # Return the matplotlib figure
|
117 |
|
118 |
st.set_option('deprecation.showPyplotGlobalUse', False)
|
119 |
+
|
120 |
def plot_correlation_matrix(df):
|
121 |
# Filter for numeric columns, if the DataFrame has non-numeric columns
|
122 |
numeric_df = df.select_dtypes(include=['number'])
|
|
|
169 |
# Display exact matches
|
170 |
st.header("Exact Matches Compare")
|
171 |
for match in exact_matches:
|
172 |
+
warehouse_index = text_pair[0] + 2
|
173 |
+
industry_index = text_pair[1] + 2
|
174 |
st.write(f"Row {match[0]} in warehouse item stocks is exactly the same as Row {match[1]} in industry item stocks:")
|
175 |
st.write(f"Warehouse: {match[2]}")
|
176 |
st.write(f"Industry: {match[3]}")
|
|
|
180 |
# Display similar texts
|
181 |
st.header("Similar (but Not Same) Texts")
|
182 |
for text_pair in similar_texts:
|
183 |
+
warehouse_index = text_pair[0] + 2
|
184 |
+
industry_index = text_pair[1] + 2
|
185 |
+
|
186 |
st.write(f"Row {text_pair[0]} in warehouse item stocks is similar to Row {text_pair[1]} in industry item stocks:")
|
187 |
st.write(f"Warehouse: {text_pair[2]}")
|
188 |
st.write(f"Industry: {text_pair[3]}")
|
|
|
216 |
plot_correlation_matrix(industry_df)
|
217 |
|
218 |
if __name__ == "__main__":
|
219 |
+
main()
|