subashdvorak committed on
Commit
57f6880
·
verified ·
1 Parent(s): c4b1e95

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +40 -43
app.py CHANGED
@@ -7,81 +7,78 @@ from sentence_transformers import SentenceTransformer
7
  import pandas as pd
8
  import re
9
 
 
10
  encoded_df = pd.read_csv('encoded_df.csv').drop(columns=['Unnamed: 0'])
11
 
12
  # Initialize the Sentence Transformer model
13
  model = SentenceTransformer('all-MiniLM-L6-v2')
14
- # Function to preprocess text (if required)
 
15
  def preprocess_text(text):
16
- # Your text preprocessing logic here (e.g., lowercasing, removing special characters)
17
- text.lower() # Placeholder for text preprocessing
18
- text = re.sub(r'[^a-zA-Z\s]', '', text)
19
  return text
20
 
21
- # Function to generate the graphs
22
  def generate_graphs(new_story):
23
  # Preprocess the new story
24
  new_story = preprocess_text(new_story)
25
 
26
- global model
27
-
28
  # Encode the new story
29
  new_story_vector = model.encode([new_story])[0]
30
 
31
- # Calculate the similarity with all existing stories in the knowledge base
32
  knowledge_base_vectors = encoded_df.iloc[:, :-7].values # Exclude 'likesCount'
33
- print(f"New Story Vector Shape: {new_story_vector.shape}")
34
- print(f"Knowledge Base Vector Shape: {knowledge_base_vectors.shape}")
35
- similarities = cosine_similarity([new_story_vector], knowledge_base_vectors)
36
 
37
- # Find the indices of the 5 most similar stories
38
- top_5_indices = np.argsort(similarities[0])[::-1][:5] # Sort similarities and get top 5
39
 
40
- likes_distribution=[]
41
- for i in top_5_indices:
42
- print(f"Row {i+1}: Similarity = {similarities[0][i]:.4f}, LikesCount = {encoded_df.iloc[i]['likesCount']}")
43
- likes_distribution.append(encoded_df.iloc[i]['likesCount'].astype(int))
44
-
45
- # Plot the similarity distribution for the 5 most similar stories
46
- plt.figure(figsize=(10, 6))
47
- sns.kdeplot([new_story_vector], shade=False, label="New Story", color='blue')
48
 
49
- for i in top_5_indices:
50
- most_similar_vector = encoded_df.iloc[i, :-7].values
51
- sns.kdeplot(most_similar_vector, shade=False, label=f"Most Similar Story: {top_5_indices.tolist().index(i)+1}", alpha=0.5)
52
 
53
- plt.title("Similarity Distribution of New Story and Top 5 Similar Stories", fontsize=14)
54
- plt.xlabel("Value", fontsize=12)
 
 
 
 
 
 
55
  plt.ylabel("Density", fontsize=12)
56
  plt.legend(title="Stories")
57
  sim_dist_plot = plt.gcf()
58
-
59
- # Create a bar graph for the distribution of the 5 most similar stories
60
- # top_5_stories = [0,1,2,3,4]
61
  plt.figure(figsize=(10, 6))
62
- sns.barplot(x=[f"Story {i+1}" for i in range(5)], y=likes_distribution, palette="viridis")
63
- plt.title("LikesCount Distribution for the 5 Most Similar Stories", fontsize=14)
64
- plt.xlabel("Story Similarity (Most Similar to Least)", fontsize=12)
65
  plt.ylabel("LikesCount", fontsize=12)
 
66
  likes_dist_plot = plt.gcf()
67
 
68
-
69
-
70
- return sim_dist_plot,likes_dist_plot
71
 
72
  # Gradio interface
73
  def gradio_interface(new_story):
74
- # Generate and return both plots
75
- likes_dist_plot, sim_dist_plot = generate_graphs(new_story)
76
- return likes_dist_plot, sim_dist_plot
 
77
 
78
  # Create the Gradio interface
79
  iface = gr.Interface(
80
- fn=gradio_interface,
81
- inputs=gr.Textbox(label="Enter a story", lines=10, placeholder="Enter the story here..."),
82
- outputs=[gr.Plot(), gr.Plot()],
83
- title="Story Similarity and Likes Distribution",
84
- description="Enter a new story to compare it with the knowledge base and get analytics on similarity and likes distribution of the most similar stories."
 
85
  )
86
 
87
  # Launch the interface
 
7
  import pandas as pd
8
  import re
9
 
10
+ # Load the knowledge base
11
  encoded_df = pd.read_csv('encoded_df.csv').drop(columns=['Unnamed: 0'])
12
 
13
  # Initialize the Sentence Transformer model
14
  model = SentenceTransformer('all-MiniLM-L6-v2')
15
+
16
+ # Function to preprocess text
17
  def preprocess_text(text):
18
+ text = text.lower() # Lowercase
19
+ text = re.sub(r'[^a-zA-Z\s]', '', text) # Remove special characters
 
20
  return text
21
 
22
+ # Function to generate graphs for stories with similarity > 0.8
23
  def generate_graphs(new_story):
24
  # Preprocess the new story
25
  new_story = preprocess_text(new_story)
26
 
 
 
27
  # Encode the new story
28
  new_story_vector = model.encode([new_story])[0]
29
 
30
+ # Calculate similarity with knowledge base stories
31
  knowledge_base_vectors = encoded_df.iloc[:, :-7].values # Exclude 'likesCount'
32
+ similarities = cosine_similarity([new_story_vector], knowledge_base_vectors)[0]
 
 
33
 
34
+ # Filter indices with similarity > 0.8
35
+ similar_indexes = np.where(similarities > 0.8)[0]
36
 
37
+ if len(similar_indexes) == 0:
38
+ return None, "No stories have a similarity > 0.85."
 
 
 
 
 
 
39
 
40
+ # Get likesCount for stories with similarity > 0.8
41
+ likes_distribution = encoded_df.iloc[similar_indexes]['likesCount'].values
42
+ story_labels = [f"Story {i+1}" for i in similar_indexes]
43
 
44
+ # Plot similarity distribution for all similar stories
45
+ plt.figure(figsize=(10, 6))
46
+ sns.kdeplot(new_story_vector, shade=False, label="New Story", color='blue', linewidth=2)
47
+ for idx in similar_indexes:
48
+ most_similar_vector = encoded_df.iloc[idx, :-7].values
49
+ sns.kdeplot(most_similar_vector, shade=False, label=f"Story {idx+1}", alpha=0.5)
50
+ plt.title("Similarity Distribution: New Story vs Similar Stories", fontsize=14)
51
+ plt.xlabel("Vector Values", fontsize=12)
52
  plt.ylabel("Density", fontsize=12)
53
  plt.legend(title="Stories")
54
  sim_dist_plot = plt.gcf()
55
+
56
+ # Create a bar graph for likes distribution
 
57
  plt.figure(figsize=(10, 6))
58
+ sns.barplot(x=story_labels, y=likes_distribution, palette="viridis")
59
+ plt.title("LikesCount Distribution for Similar Stories", fontsize=14)
60
+ plt.xlabel("Story Index (Similarity > 0.8)", fontsize=12)
61
  plt.ylabel("LikesCount", fontsize=12)
62
+ plt.xticks(rotation=90)
63
  likes_dist_plot = plt.gcf()
64
 
65
+ return sim_dist_plot, likes_dist_plot
 
 
66
 
67
  # Gradio interface
68
  def gradio_interface(new_story):
69
+ sim_dist_plot, likes_dist_plot = generate_graphs(new_story)
70
+ if sim_dist_plot is None:
71
+ return "No stories have a similarity > 0.8.", None
72
+ return sim_dist_plot, likes_dist_plot
73
 
74
  # Create the Gradio interface
75
  iface = gr.Interface(
76
+ fn=gradio_interface,
77
+ inputs=gr.Textbox(label="Enter a story", lines=10, placeholder="Enter the story here..."),
78
+ outputs=[gr.Plot(label="Similarity Distribution"), gr.Plot(label="Likes Distribution")],
79
+ title="Story Similarity and Likes Analysis",
80
+ description="Enter a new story to compare with the knowledge base. "
81
+ "View similarity distributions and likes of stories with similarity > 0.8."
82
  )
83
 
84
  # Launch the interface