Spaces:

DvorakInnovationAI
/

Story-Analytics

Sleeping

App Files Files Community

Story-Analytics / app.py

subashdvorak

Create app.py

b6bf4ee verified 10 months ago

raw

history blame

3.45 kB

	import gradio as gr
	import numpy as np
	import seaborn as sns
	import matplotlib.pyplot as plt
	from sklearn.metrics.pairwise import cosine_similarity
	from sentence_transformers import SentenceTransformer
	import pandas as pd
	import re

	# Knowledge base: read the pre-encoded stories from disk, then discard the
	# 'Unnamed: 0' column that comes along with the CSV.
	encoded_df = pd.read_csv('encoded_df.csv')
	encoded_df = encoded_df.drop(columns=['Unnamed: 0'])

	# Sentence Transformer model shared by every request to embed new stories
	model = SentenceTransformer('all-MiniLM-L6-v2')
	# Function to preprocess text before it is embedded
	def preprocess_text(text):
	    """Normalize a story: lowercase it and strip non-letter characters.

	    Args:
	        text: Raw story text. ``None`` is treated as an empty story.

	    Returns:
	        The lowercased text with every character other than ASCII letters
	        and whitespace removed.
	    """
	    if text is None:
	        return ''

	    # str.lower() returns a new string; the result has to be assigned back,
	    # otherwise the lowercasing is silently thrown away.
	    text = text.lower()
	    return re.sub(r'[^a-zA-Z\s]', '', text)

	# Function to generate the graphs
	def generate_graphs(new_story, top_k=5):
	    """Build the similarity and likes plots for a new story.

	    The story is preprocessed, embedded with the module-level ``model`` and
	    compared by cosine similarity against every row of ``encoded_df``
	    (all columns except the last one, 'likesCount', form the row vector).

	    Args:
	        new_story: Raw story text.
	        top_k: How many of the most similar stories to plot. Defaults to 5
	            and is capped at the number of rows in the knowledge base.

	    Returns:
	        A ``(sim_dist_plot, likes_dist_plot)`` tuple of matplotlib figures:
	        first the KDE plot of the vector values of the new story and its
	        nearest stories, then the bar plot of their 'likesCount' values.

	    Raises:
	        ValueError: If ``top_k`` is smaller than 1.
	    """
	    if top_k < 1:
	        raise ValueError("top_k must be at least 1")

	    # Preprocess and encode the new story
	    new_story = preprocess_text(new_story)
	    new_story_vector = model.encode([new_story])[0]

	    # Calculate the similarity with all existing stories in the knowledge base
	    knowledge_base_vectors = encoded_df.iloc[:, :-1].values  # Exclude 'likesCount'
	    print(f"New Story Vector Shape: {new_story_vector.shape}")
	    print(f"Knowledge Base Vector Shape: {knowledge_base_vectors.shape}")
	    similarities = cosine_similarity([new_story_vector], knowledge_base_vectors)

	    # Row positions of the most similar stories, most similar first. The
	    # slice may yield fewer than top_k rows for a small knowledge base, so
	    # everything below is sized from the actual count.
	    top_indices = np.argsort(similarities[0])[::-1][:top_k]
	    shown = len(top_indices)

	    # Likes are kept in similarity order so that bar "Story N" really is the
	    # N-th most similar story, as the x-axis caption states.
	    likes_distribution = encoded_df.iloc[top_indices]['likesCount'].values

	    # Explicit Figure/Axes objects are used instead of pyplot's implicit
	    # "current figure" so that overlapping requests cannot draw into each
	    # other's plots.
	    likes_dist_plot, likes_ax = plt.subplots(figsize=(10, 6))
	    sns.barplot(
	        x=[f"Story {rank}" for rank in range(1, shown + 1)],
	        y=likes_distribution,
	        palette="viridis",
	        ax=likes_ax,
	    )
	    likes_ax.set_title(f"LikesCount Distribution for the {shown} Most Similar Stories", fontsize=14)
	    likes_ax.set_xlabel("Story Similarity (Most Similar to Least)", fontsize=12)
	    likes_ax.set_ylabel("LikesCount", fontsize=12)

	    # Density of the vector values for the new story and each neighbour.
	    # The 1-D vector is passed directly (not wrapped in a list) so that
	    # `label` and `color` apply to the curve. The deprecated `shade=False`
	    # is omitted because an unfilled curve is already the default.
	    sim_dist_plot, sim_ax = plt.subplots(figsize=(10, 6))
	    sns.kdeplot(new_story_vector, label="New Story", color='blue', ax=sim_ax)
	    for rank, row in enumerate(top_indices, start=1):
	        sns.kdeplot(
	            knowledge_base_vectors[row],
	            label=f"Most Similar Story: {rank}",
	            alpha=0.5,
	            ax=sim_ax,
	        )
	    sim_ax.set_title(f"Similarity Distribution of New Story and Top {shown} Similar Stories", fontsize=14)
	    sim_ax.set_xlabel("Value", fontsize=12)
	    sim_ax.set_ylabel("Density", fontsize=12)
	    sim_ax.legend(title="Stories")

	    # Drop pyplot's references to the figures; otherwise every request
	    # leaves two figures alive for the lifetime of the process. The Figure
	    # objects themselves stay usable by the caller.
	    plt.close(likes_dist_plot)
	    plt.close(sim_dist_plot)

	    return sim_dist_plot, likes_dist_plot

	# Gradio interface
	def gradio_interface(new_story):
	    """Gradio callback: produce both plots for the submitted story.

	    Args:
	        new_story: Story text entered by the user.

	    Returns:
	        The two figures from ``generate_graphs`` in the order it returns
	        them: the similarity distribution plot first, the likes
	        distribution plot second.
	    """
	    # generate_graphs returns (similarity plot, likes plot); the names here
	    # follow that order so they describe what each variable really holds.
	    sim_dist_plot, likes_dist_plot = generate_graphs(new_story)
	    return sim_dist_plot, likes_dist_plot

	# Components of the Gradio interface: one multi-line text input for the
	# story and two plot outputs, filled in the order gradio_interface returns.
	_story_input = gr.Textbox(
	    label="Enter a story",
	    lines=10,
	    placeholder="Enter the story here...",
	)
	_plot_outputs = [gr.Plot(), gr.Plot()]

	# Assemble the interface around the callback
	iface = gr.Interface(
	    fn=gradio_interface,
	    inputs=_story_input,
	    outputs=_plot_outputs,
	    title="Story Similarity and Likes Distribution",
	    description=(
	        "Enter a new story to compare it with the knowledge base and get "
	        "analytics on similarity and likes distribution of the most similar stories."
	    ),
	)

	# Start serving the interface (share=True is passed through unchanged)
	iface.launch(share=True)