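"""Gradio app for story similarity analytics.

Embeds a user-supplied story with a SentenceTransformer model, finds the five
most similar stories in a pre-encoded knowledge base via cosine similarity,
and returns two plots: a KDE comparison of the embedding vectors and the
likesCount distribution of the top matches.
"""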
import gradio as gr
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import pandas as pd
import re

# Pre-computed story embeddings; the last column is 'likesCount', the rest are embedding dimensions
encoded_df = pd.read_csv('encoded_df.csv').drop(columns=['Unnamed: 0'])

# Initialize the Sentence Transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')
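# Note: all-MiniLM-L6-v2 produces 384-dimensional embeddings, so the knowledge
# base vectors must have the same dimensionality.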
# Basic text preprocessing: lowercase and remove non-letter characters
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text

# Function to generate the graphs
def generate_graphs(new_story):
    # Preprocess the new story
    new_story = preprocess_text(new_story)

    # Encode the new story
    new_story_vector = model.encode([new_story])[0]

    # Calculate the similarity with all existing stories in the knowledge base
    knowledge_base_vectors = encoded_df.iloc[:, :-1].values  # Exclude 'likesCount'
    print(f"New Story Vector Shape: {new_story_vector.shape}")
    print(f"Knowledge Base Vector Shape: {knowledge_base_vectors.shape}")
    similarities = cosine_similarity([new_story_vector], knowledge_base_vectors)

    # Find the indices of the 5 most similar stories
    top_5_indices = np.argsort(similarities[0])[::-1][:5]  # Sort similarities and get top 5

    # Retrieve the LikesCount for the top 5 most similar stories
    likes_distribution = encoded_df.iloc[top_5_indices]['likesCount'].values
    sorted_likes_distribution = sorted(likes_distribution, reverse=True)
    
    # Create a bar graph for the distribution of the 5 most similar stories
    plt.figure(figsize=(10, 6))
    sns.barplot(x=[f"Story {i+1}" for i in range(5)], y=sorted_likes_distribution, palette="viridis")
    plt.title("LikesCount Distribution for the 5 Most Similar Stories", fontsize=14)
    plt.xlabel("Story Similarity (Most Similar to Least)", fontsize=12)
    plt.ylabel("LikesCount", fontsize=12)
    likes_dist_plot = plt.gcf()

    # Plot the similarity distribution for the 5 most similar stories
    plt.figure(figsize=(10, 6))
    sns.kdeplot(new_story_vector, label="New Story", color='blue')

    # Overlay the KDE of each of the top 5 similar stories' embedding vectors
    for rank, i in enumerate(top_5_indices, start=1):
        most_similar_vector = encoded_df.iloc[i, :-1].values
        sns.kdeplot(most_similar_vector, label=f"Similar Story {rank}", alpha=0.5)

    plt.title("Similarity Distribution of New Story and Top 5 Similar Stories", fontsize=14)
    plt.xlabel("Value", fontsize=12)
    plt.ylabel("Density", fontsize=12)
    plt.legend(title="Stories")
    sim_dist_plot = plt.gcf()

    return sim_dist_plot, likes_dist_plot

# Gradio interface
def gradio_interface(new_story):
    # Generate both plots: similarity distribution first, then likes distribution
    sim_dist_plot, likes_dist_plot = generate_graphs(new_story)
    return sim_dist_plot, likes_dist_plot

# Create the Gradio interface
iface = gr.Interface(
    fn=gradio_interface, 
    inputs=gr.Textbox(label="Enter a story", lines=10, placeholder="Enter the story here..."), 
    outputs=[gr.Plot(label="Similarity Distribution"), gr.Plot(label="Likes Distribution")],
    title="Story Similarity and Likes Distribution",
    description="Enter a new story to compare it with the knowledge base and get analytics on similarity and likes distribution of the most similar stories."
)

# Launch the interface
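# share=True additionally creates a temporary public Gradio link alongside the local server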
iface.launch(share=True)