buelfhood committed on
Commit
f38468f
·
verified ·
1 Parent(s): 6b0cacc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +72 -3
app.py CHANGED
@@ -9,7 +9,6 @@ import zipfile
9
  import os
10
  import io
11
  from gradio_huggingfacehub_search import HuggingfaceHubSearch
12
- from matheel.similarity import get_sim_list
13
 
14
  def calculate_similarity(code1, code2, Ws, Wl, Wj, model_name):
15
  model = SentenceTransformer(model_name)
@@ -22,8 +21,78 @@ def calculate_similarity(code1, code2, Ws, Wl, Wj, model_name):
22
 
23
  return "The similarity score between the two codes is: %.2f" % overall_similarity
24
 
25
- def get_sim_list_gradio(zipped_file,Ws, Wl, Wj, model_name,threshold,number_results):
26
- result = get_sim_list(zipped_file,Ws, Wl, Wj, model_name,threshold,number_results)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  return result
28
 
29
  # Define the Gradio app
 
9
  import os
10
  import io
11
  from gradio_huggingfacehub_search import HuggingfaceHubSearch
 
12
 
13
  def calculate_similarity(code1, code2, Ws, Wl, Wj, model_name):
14
  model = SentenceTransformer(model_name)
 
21
 
22
  return "The similarity score between the two codes is: %.2f" % overall_similarity
23
 
24
# Read every file stored in an uploaded .zip archive
def extract_and_read_compressed_file(file_path):
    """Return the member names and decoded contents of a .zip archive.

    Args:
        file_path: Path of the archive on disk, as a string. The ``.zip``
            extension is matched case-insensitively.

    Returns:
        A ``(file_names, codes)`` pair of equally long lists in archive
        order, where ``codes[k]`` is the text of ``file_names[k]``.
        Directory entries are left out so only real files are listed.

    Raises:
        ValueError: If ``file_path`` does not end in ``.zip``.
    """
    if not file_path.lower().endswith('.zip'):
        raise ValueError("Unsupported file type. Only .zip is supported.")

    file_names = []
    codes = []
    with zipfile.ZipFile(file_path, 'r') as archive:
        for member in archive.infolist():
            # Folder entries carry no content; reading them would add an
            # empty "file" to the list that is later compared pairwise.
            if member.is_dir():
                continue
            file_names.append(member.filename)
            # Undecodable bytes are dropped instead of aborting the read.
            codes.append(archive.read(member).decode('utf-8', errors='ignore'))

    return file_names, codes
39
+
40
def filter_and_return_top(df, similarity_threshold, returned_results):
    """Keep rows scoring strictly above the threshold and return the first few.

    Args:
        df: DataFrame with a ``similarity_score`` column.
        similarity_threshold: Rows whose score is not greater than this
            value are dropped.
        returned_results: Maximum number of rows to return.

    Returns:
        The first ``returned_results`` surviving rows, in the order they
        appear in ``df``; no sorting happens here.
    """
    above_threshold = df['similarity_score'] > similarity_threshold
    return df.loc[above_threshold].head(returned_results)
43
+
44
# Thin adapter that forwards the three weights to the combined-score miner
def perform_paraphrase_mining(model, codes_list, weight_semantic, weight_levenshtein, weight_jaro_winkler):
    """Run combined-score paraphrase mining over ``codes_list``.

    Args:
        model: Encoder object handed straight to
            ``paraphrase_mining_with_combined_score``.
        codes_list: The texts to compare with each other.
        weight_semantic: Weight passed on as ``weight_semantic``.
        weight_levenshtein: Weight passed on as ``weight_levenshtein``.
        weight_jaro_winkler: Weight passed on as ``weight_jaro_winkler``.

    Returns:
        Whatever ``paraphrase_mining_with_combined_score`` returns.
    """
    weights = {
        'weight_semantic': weight_semantic,
        'weight_levenshtein': weight_levenshtein,
        'weight_jaro_winkler': weight_jaro_winkler,
    }
    return paraphrase_mining_with_combined_score(model, codes_list, **weights)
53
+
54
def paraphrase_mining_with_combined_score(
    model,
    sentences: List[str],
    show_progress_bar: bool = False,
    weight_semantic: float = 1.0,
    weight_levenshtein: float = 0.0,
    weight_jaro_winkler: float = 0.0
):
    """Score mined text pairs with a weighted blend of three similarities.

    The texts are encoded with ``model.encode``, candidate pairs come from
    ``util.paraphrase_mining_embeddings`` using ``util.cos_sim``, and each
    pair's mined score is combined with the ``Levenshtein`` and
    ``JaroWinkler`` normalized similarities of the two raw texts.

    NOTE(review): ``util``, ``Levenshtein`` and ``JaroWinkler`` are not
    defined in this block; they are presumably imported at the top of the
    file - confirm.

    Args:
        model: Object exposing ``encode(sentences, show_progress_bar=...,
            convert_to_tensor=True)``.
        sentences: Texts to compare; the returned indices refer to
            positions in this list.
        show_progress_bar: Forwarded to ``model.encode``.
        weight_semantic: Multiplier for the mined (cosine) score.
        weight_levenshtein: Multiplier for the Levenshtein similarity.
        weight_jaro_winkler: Multiplier for the Jaro-Winkler similarity.

    Returns:
        A list of ``[combined_score, i, j]`` lists ordered from highest to
        lowest combined score.
    """
    embeddings = model.encode(
        sentences,
        show_progress_bar=show_progress_bar,
        convert_to_tensor=True,
    )
    mined_pairs = util.paraphrase_mining_embeddings(embeddings, score_function=util.cos_sim)

    def blend(cosine, left, right):
        # Both string metrics are computed on the raw texts of the pair.
        edit_similarity = Levenshtein.normalized_similarity(left, right)
        jaro_similarity = JaroWinkler.normalized_similarity(left, right)
        return (weight_semantic * cosine) + \
               (weight_levenshtein * edit_similarity) + \
               (weight_jaro_winkler * jaro_similarity)

    scored = [
        [blend(cosine, sentences[i], sentences[j]), i, j]
        for cosine, i, j in mined_pairs
    ]
    # The weights can change the ranking, so order by the blended score.
    scored.sort(key=lambda entry: entry[0], reverse=True)
    return scored
79
+
80
def get_sim_list(zipped_file, Ws, Wl, Wj, model_name, threshold, number_results):
    """Rank the files in a .zip archive by pairwise similarity.

    Args:
        zipped_file: Path to the uploaded ``.zip`` archive.
        Ws: Weight of the semantic (embedding) score.
        Wl: Weight of the Levenshtein similarity.
        Wj: Weight of the Jaro-Winkler similarity.
        model_name: Name handed to ``SentenceTransformer``.
        threshold: Only pairs scoring strictly above this value are kept.
        number_results: Maximum number of pairs to return.

    Returns:
        A DataFrame with columns ``file_name_1``, ``file_name_2`` and
        ``similarity_score`` (rounded to two decimals), sorted from most to
        least similar. The frame is empty, with the same columns, when the
        archive holds fewer than two files or no pair is produced.
    """
    columns = ['file_name_1', 'file_name_2', 'similarity_score']
    file_names, codes = extract_and_read_compressed_file(zipped_file)

    # A pair needs two files; skip loading the model when none can exist.
    if len(codes) < 2:
        return pd.DataFrame(columns=columns)

    model = SentenceTransformer(model_name)
    code_pairs = perform_paraphrase_mining(model, codes, Ws, Wl, Wj)

    rows = [(file_names[i], file_names[j], score) for score, i, j in code_pairs]
    # Explicit columns keep 'similarity_score' present even with zero rows,
    # so the sort and filter below cannot fail with a KeyError.
    similarity_df = pd.DataFrame(rows, columns=columns)
    similarity_df = similarity_df.sort_values(by='similarity_score', ascending=False)
    result = filter_and_return_top(similarity_df, threshold, number_results).round(2)
    return result
97
 
98
  # Define the Gradio app