subashdvorak committed on
Commit
b6bf4ee
·
verified ·
1 Parent(s): 609a2e1

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +84 -0
app.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import numpy as np
3
+ import seaborn as sns
4
+ import matplotlib.pyplot as plt
5
+ from sklearn.metrics.pairwise import cosine_similarity
6
+ from sentence_transformers import SentenceTransformer
7
+ import pandas as pd
8
+ import re
9
+
10
# Load the precomputed knowledge base. The 'Unnamed: 0' column is dropped
# (presumably a saved DataFrame index -- confirm against how the CSV was
# written). generate_graphs() uses every column except the last as the
# embedding and reads the 'likesCount' column by name.
encoded_df = pd.read_csv('encoded_df.csv')
encoded_df = encoded_df.drop(columns=['Unnamed: 0'])

# Sentence-embedding model shared by all requests.
model = SentenceTransformer('all-MiniLM-L6-v2')
14
+ # Function to preprocess text (if required)
15
def preprocess_text(text):
    """Normalise raw story text before it is embedded.

    The text is lowercased and every character that is not an ASCII letter
    or whitespace is removed (digits, punctuation and non-ASCII characters
    are dropped; whitespace is left untouched, so runs of spaces may remain).

    Args:
        text: The raw story string.

    Returns:
        The cleaned, lowercased string.
    """
    # str.lower() returns a new string; the result must be re-assigned,
    # otherwise the lowercasing is silently lost.
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text
20
+
21
+ # Function to generate the graphs
22
def generate_graphs(new_story):
    """Build the two comparison plots for a newly entered story.

    The story is cleaned with ``preprocess_text``, embedded with the
    module-level ``model`` and compared by cosine similarity against every
    row of the module-level ``encoded_df``. All columns of ``encoded_df``
    except the last are used as the embedding; likes are read from the
    ``likesCount`` column by name.

    Args:
        new_story: Raw story text entered by the user.

    Returns:
        A ``(sim_dist_plot, likes_dist_plot)`` tuple of matplotlib figures,
        in that order: a KDE of the embedding values of the new story and
        of its closest matches, and a bar chart of ``likesCount`` for those
        matches. Up to 5 matches are used (fewer if the knowledge base has
        fewer rows).
    """
    # Preprocess and encode the new story.
    new_story = preprocess_text(new_story)
    new_story_vector = model.encode([new_story])[0]

    # Similarity of the new story to every stored story. The last column
    # is excluded (the original author notes it is 'likesCount').
    knowledge_base_vectors = encoded_df.iloc[:, :-1].values
    print(f"New Story Vector Shape: {new_story_vector.shape}")
    print(f"Knowledge Base Vector Shape: {knowledge_base_vectors.shape}")
    similarities = cosine_similarity([new_story_vector], knowledge_base_vectors)[0]

    # Row positions of the most similar stories, best match first. The
    # slice yields fewer than 5 entries when the knowledge base is small.
    top_indices = np.argsort(similarities)[::-1][:5]

    # NOTE(review): the likes are sorted in descending order, so the bars
    # are ordered by likes rather than by similarity rank, even though the
    # x-axis label says "Most Similar to Least". Kept as in the original.
    likes_distribution = encoded_df.iloc[top_indices]['likesCount'].values
    sorted_likes_distribution = sorted(likes_distribution, reverse=True)

    # Bar chart of likes. Labels are derived from the actual number of
    # matches so x and y always have the same length.
    likes_dist_plot, likes_ax = plt.subplots(figsize=(10, 6))
    bar_labels = [f"Story {rank}" for rank in range(1, len(sorted_likes_distribution) + 1)]
    sns.barplot(x=bar_labels, y=sorted_likes_distribution, palette="viridis", ax=likes_ax)
    likes_ax.set_title("LikesCount Distribution for the 5 Most Similar Stories", fontsize=14)
    likes_ax.set_xlabel("Story Similarity (Most Similar to Least)", fontsize=12)
    likes_ax.set_ylabel("LikesCount", fontsize=12)

    # KDE of embedding values: the new story plus each of its neighbours.
    # Each vector is passed as a plain 1-D array so every curve is drawn
    # and labelled the same way.
    sim_dist_plot, sim_ax = plt.subplots(figsize=(10, 6))
    sns.kdeplot(new_story_vector, label="New Story", color='blue', ax=sim_ax)
    for rank, row in enumerate(top_indices, start=1):
        sns.kdeplot(
            knowledge_base_vectors[row],
            label=f"Most Similar Story: {rank}",
            alpha=0.5,
            ax=sim_ax,
        )
    sim_ax.set_title("Similarity Distribution of New Story and Top 5 Similar Stories", fontsize=14)
    sim_ax.set_xlabel("Value", fontsize=12)
    sim_ax.set_ylabel("Density", fontsize=12)
    sim_ax.legend(title="Stories")

    # Detach both figures from pyplot's global registry so repeated
    # requests do not accumulate open figures; the Figure objects remain
    # usable (e.g. savefig) by the caller after being closed.
    plt.close(likes_dist_plot)
    plt.close(sim_dist_plot)

    return sim_dist_plot, likes_dist_plot
67
+
68
+ # Gradio interface
69
def gradio_interface(new_story):
    """Gradio callback: produce both plots for the submitted story.

    Args:
        new_story: Story text from the input textbox.

    Returns:
        The tuple returned by ``generate_graphs``, unchanged and in the same
        order: the similarity-distribution figure first, then the
        likes-distribution figure.
    """
    # generate_graphs returns (similarity plot, likes plot); name the
    # values accordingly so the names match what they actually hold.
    sim_dist_plot, likes_dist_plot = generate_graphs(new_story)
    return sim_dist_plot, likes_dist_plot
73
+
74
+ # Create the Gradio interface
75
# Gradio UI: one multi-line textbox as input, two plot panels as output
# (filled, in order, by the two figures gradio_interface returns).
iface = gr.Interface(
    fn=gradio_interface,
    inputs=gr.Textbox(
        label="Enter a story",
        lines=10,
        placeholder="Enter the story here...",
    ),
    outputs=[gr.Plot(), gr.Plot()],
    title="Story Similarity and Likes Distribution",
    description=(
        "Enter a new story to compare it with the knowledge base and get analytics on similarity and likes distribution of the most similar stories."
    ),
)

# Start serving the app, requesting a public share link.
iface.launch(share=True)