Spaces:
Running
Running
import os | |
import sys | |
src_directory = os.path.abspath(os.path.join( | |
os.path.dirname(__file__), "../..", "src")) | |
sys.path.append(src_directory) | |
from services import data_processing | |
import pandas as pd | |
from collections import Counter | |
from sqlalchemy.orm import Session | |
def get_trending_videos_count():
    """Count entries per ``trending_date``.

    Returns:
        A Series indexed by ``trending_date`` whose values are the number of
        ``trending_date`` entries in each group of the frame returned by
        ``data_processing.get_updated_df()``.
    """
    frame = data_processing.get_updated_df()
    by_date = frame.groupby(["trending_date"])
    return by_date["trending_date"].count()
def get_most_popular_categories():
    """Count entries per ``category_name``.

    Returns:
        A Series indexed by ``category_name`` whose values are the number of
        ``category_name`` entries in each group of the frame returned by
        ``data_processing.get_updated_df()``.
    """
    frame = data_processing.get_updated_df()
    by_category = frame.groupby(["category_name"])
    return by_category["category_name"].count()
def get_views_vs_likes():
    """Return the ``views`` and ``likes`` columns with incomplete rows removed.

    Returns:
        A DataFrame holding only ``views`` and ``likes``; any row with a
        missing value in either column is dropped.
    """
    wanted_columns = ["views", "likes"]
    frame = data_processing.get_updated_df()
    return frame[wanted_columns].dropna()
def get_like_ratio_distribution():
    """Compute the likes-per-view ratio for every row with positive views.

    Returns:
        A DataFrame with ``video_id``, ``views``, ``likes`` and a derived
        ``like_ratio`` column (``likes / views``). Rows whose ``views`` is
        not greater than zero are excluded, so the division never hits zero.
    """
    frame = data_processing.get_updated_df()
    subset = frame[["video_id", "views", "likes"]]
    # Boolean indexing already yields a new frame, so the source is untouched.
    viewed = subset[subset["views"] > 0]
    return viewed.assign(like_ratio=viewed["likes"] / viewed["views"])
def get_top_liked_videos(top_n=10):
    """Return the most-liked videos.

    Args:
        top_n: Number of rows to return. Defaults to 10.

    Returns:
        A DataFrame with ``title`` and ``likes``, sorted by ``likes`` in
        descending order and truncated to the first ``top_n`` rows. Rows with
        a missing ``title`` or ``likes`` value are dropped before sorting.
    """
    frame = data_processing.get_updated_df()
    ranked = (
        frame[["title", "likes"]]
        .dropna()
        .sort_values(by="likes", ascending=False)
    )
    # Honour the caller's limit; the previous code always returned 10 rows.
    return ranked.head(top_n)
def get_trending_channels():
    """Return a copy of the ``channelTitle`` and ``publishedAt`` columns.

    Returns:
        A new DataFrame containing only ``channelTitle`` and ``publishedAt``
        from the frame returned by ``data_processing.get_updated_df()``.
    """
    frame = data_processing.get_updated_df()
    channel_columns = ["channelTitle", "publishedAt"]
    return frame.loc[:, channel_columns].copy()
def calculate_channel_growth():
    """Count videos per channel per publication month.

    ``publishedAt`` is parsed with ``pd.to_datetime(..., errors="coerce")``;
    rows whose timestamp cannot be parsed are dropped. The month is rendered
    as the string form of a monthly period (e.g. ``"2021-03"``).

    Returns:
        A DataFrame with columns ``published_month``, ``channelTitle`` and
        ``video_count`` (the number of rows in each month/channel group).
    """
    frame = data_processing.get_updated_df()
    # Work on a private copy of the two columns that are needed. The previous
    # code rewrote ``publishedAt``, dropped rows in place and added a column
    # directly on the frame handed back by get_updated_df(); if that object
    # is shared between calls (not visible from here), every later caller
    # would see the altered data.
    videos = frame[["channelTitle", "publishedAt"]].copy()
    videos["publishedAt"] = pd.to_datetime(videos["publishedAt"], errors="coerce")
    videos = videos.dropna(subset=["publishedAt"])
    videos["published_month"] = (
        videos["publishedAt"].dt.to_period("M").astype(str)
    )
    return (
        videos.groupby(["published_month", "channelTitle"])
        .size()
        .reset_index(name="video_count")
    )
def process_tags():
    """Tally how often each tag appears in the ``tags`` column.

    Each non-null ``tags`` value is lower-cased and split on ``"|"``; the
    pieces are stripped of surrounding whitespace and empty pieces are
    ignored.

    Returns:
        ``{"tags": [{"tag": <str>, "count": <int>}, ...]}`` with one entry per
        distinct tag, in order of first appearance.
    """
    frame = data_processing.get_updated_df()
    split_tags = frame["tags"].dropna().str.lower().str.split("|")

    frequencies = Counter()
    for pieces in split_tags:
        for piece in pieces:
            cleaned = piece.strip()
            if cleaned:
                frequencies[cleaned] += 1

    entries = []
    for name, occurrences in frequencies.items():
        entries.append({"tag": name, "count": occurrences})
    return {"tags": entries}
def analyze_trending_duration():
    """Summarise trending lifespan per video and mean views per trending date.

    Returns:
        A dict with two lists of records:

        * ``"lifespan"`` - one record per ``video_id`` with ``days_trending``,
          the count of ``trending_date`` entries for that video (this equals
          a number of days only if each video appears once per date -
          TODO confirm against the data).
        * ``"views_growth"`` - one record per ``trending_date`` with the mean
          of ``views`` on that date.
    """
    frame = data_processing.get_updated_df()

    lifespan = (
        frame.groupby("video_id")["trending_date"]
        .count()
        .reset_index(name="days_trending")
    )
    mean_views = frame.groupby("trending_date")["views"].mean().reset_index()

    result = {}
    result["lifespan"] = lifespan.to_dict(orient="records")
    result["views_growth"] = mean_views.to_dict(orient="records")
    return result
def analyze_upload_patterns(mode: str):
    """Count uploads by hour of day or by day of week.

    ``publishedAt`` is parsed with ``pd.to_datetime(..., errors="coerce")``;
    values that cannot be parsed are ignored.

    Args:
        mode: ``"hour"`` to bucket by hour of day, ``"day"`` to bucket by
            weekday name.

    Returns:
        For ``"hour"``: a list of ``{"hour": int, "count": int}`` records
        sorted by hour (only hours that occur are listed).
        For ``"day"``: a list of ``{"day": str, "count": int}`` records in
        Monday-to-Sunday order; weekdays with no uploads have a count of 0.
        For any other mode: ``{"error": "Invalid mode"}``.
    """
    frame = data_processing.get_updated_df()
    # Parse into a standalone Series and discard unparseable timestamps.
    # This leaves the frame returned by get_updated_df() unmodified and keeps
    # the hour values integral (NaT would otherwise force a float dtype).
    published = pd.to_datetime(frame["publishedAt"], errors="coerce").dropna()

    if mode == "hour":
        hourly = published.dt.hour.value_counts().sort_index()
        return (
            hourly.rename_axis("hour")
            .reset_index(name="count")
            .to_dict(orient="records")
        )

    if mode == "day":
        weekdays = [
            "Monday", "Tuesday", "Wednesday", "Thursday",
            "Friday", "Saturday", "Sunday",
        ]
        # fill_value=0 reports empty weekdays as 0 instead of NaN, which also
        # keeps the counts as integers.
        daily = published.dt.day_name().value_counts().reindex(
            weekdays, fill_value=0
        )
        return (
            daily.rename_axis("day")
            .reset_index(name="count")
            .to_dict(orient="records")
        )

    return {"error": "Invalid mode"}
def category_like_view_ratio():
    """Compute the mean likes-per-view ratio for each category.

    Rows whose ``views`` is not greater than zero are excluded from the
    average, matching the ``views > 0`` guard used by
    ``get_like_ratio_distribution``. A category with no usable rows still
    appears in the output, with a NaN ratio.

    Returns:
        A list of ``{"category_name": ..., "like_view_ratio": float}``
        records, one per category.
    """
    frame = data_processing.get_updated_df()
    views = frame["views"]
    # Mask non-positive view counts to NaN before dividing. Dividing by zero
    # would yield +/-inf, and a single inf makes the whole category mean inf;
    # NaN entries are simply skipped by ``mean``. Computing the ratio as a
    # standalone Series also avoids adding a column to the source frame.
    ratio = frame["likes"] / views.where(views > 0)
    per_category = ratio.groupby(frame["category_name"]).mean()
    return per_category.reset_index(name="like_view_ratio").to_dict(
        orient="records"
    )
def category_comment_engagement():
    """Return per-row engagement figures as a list of records.

    Returns:
        A list of dicts with the keys ``category_name``, ``comment_count``,
        ``views`` and ``likes``; rows missing any of these values are dropped.
    """
    engagement_columns = ["category_name", "comment_count", "views", "likes"]
    frame = data_processing.get_updated_df()
    complete_rows = frame[engagement_columns].dropna()
    return complete_rows.to_dict(orient="records")
if __name__ == "__main__": | |
pass |