molehh commited on
Commit
74bdacd
·
1 Parent(s): 879815b

YouTube trend analyzer project

Browse files
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ venv/
2
+ data
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ fastapi[standard]
2
+ pandas
3
+ sqlalchemy
4
+ streamlit
5
+ requests
6
+ plotly
7
+ matplotlib
src/backend/__pycache__/main.cpython-313.pyc ADDED
Binary file (530 Bytes). View file
 
src/backend/__pycache__/trending_videos_page.cpython-313.pyc ADDED
Binary file (7.28 kB). View file
 
src/backend/main.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""FastAPI entry point for the YouTube trend-analyzer backend.

Creates the application object and mounts the homepage and
data-visualization routers. The tags only group the routes in the
generated OpenAPI docs; they do not change any URL paths.
"""
from fastapi import FastAPI
from routes import homepage_api
from routes import data_visualization_page_api

app = FastAPI()

app.include_router(homepage_api.router, tags=["Home"])
app.include_router(data_visualization_page_api.router, tags=["Data"])
src/backend/routes/__init__.py ADDED
File without changes
src/backend/routes/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (171 Bytes). View file
 
src/backend/routes/__pycache__/data_visualization_page_api.cpython-313.pyc ADDED
Binary file (3.9 kB). View file
 
src/backend/routes/__pycache__/homepage_api.cpython-313.pyc ADDED
Binary file (455 Bytes). View file
 
src/backend/routes/__pycache__/time_trend.cpython-313.pyc ADDED
Binary file (279 Bytes). View file
 
src/backend/routes/data_visualization_page_api.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""HTTP routes exposing the dashboard's data-visualization queries.

Every endpoint delegates to a helper in ``trending_videos_page`` and
converts the resulting pandas object into a JSON-serializable payload.
"""
from fastapi import APIRouter
import trending_videos_page
from sqlalchemy.orm import Session  # NOTE(review): unused here -- confirm before removing

router = APIRouter()


@router.get("/trending_videos_count")
def trending_videos_count():
    """Count of trending videos per trending date."""
    counts = trending_videos_page.get_trending_videos_count()
    return {"trending_video_counts": counts.to_dict()}


@router.get("/most_popular_categories")
def most_popular_categories():
    """Count of trending videos per category."""
    counts = trending_videos_page.get_most_popular_categories()
    return {"most_popular_categories": counts.to_dict()}


@router.get("/engagement/like_ratio_distribution")
def like_ratio_distribution():
    """Per-video likes/views ratio rows."""
    ratios = trending_videos_page.get_like_ratio_distribution()
    return {"like_ratio_distribution": ratios.to_dict(orient="records")}


@router.get("/engagement/top_liked_videos")
def top_liked_videos():
    """Most-liked videos (title + likes)."""
    top = trending_videos_page.get_top_liked_videos()
    return {"top_liked_videos": top.to_dict(orient="records")}


@router.get("/channel-performance/top-trending")
def top_trending_channels():
    """How often each channel appears in the trending data."""
    channels = trending_videos_page.get_trending_channels()
    counts = channels["channelTitle"].value_counts().to_dict()
    return {"top_trending_channels": counts}


@router.get("/channel-performance/growth-over-time")
def channel_growth_over_time():
    """Monthly video counts per channel."""
    growth = trending_videos_page.calculate_channel_growth()
    return growth.to_dict(orient="records")


@router.get("/analysis")
def get_tags_analysis():
    """Frequency of every tag across trending videos."""
    return trending_videos_page.process_tags()


@router.get("/duration_vs_likes")
def get_trending_by_day():
    # NOTE(review): despite the path, this returns upload counts per
    # weekday -- confirm the intended route name against the frontend
    # before renaming; the path is kept as-is to avoid breaking clients.
    return trending_videos_page.analyze_upload_patterns("day")


@router.get("/category-like-view-ratio")
def get_category_like_view_ratio():
    """Mean like/view ratio per category."""
    return {"data": trending_videos_page.category_like_view_ratio()}


@router.get("/category-comment-engagement")
def get_category_comment_engagement():
    """Raw comment/view/like rows per category."""
    return {"data": trending_videos_page.category_comment_engagement()}
src/backend/routes/homepage_api.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
"""Homepage route for the backend API."""
from fastapi import APIRouter

router = APIRouter()


@router.get("/")
def home():
    """Landing / health-check endpoint.

    Returns a small static JSON payload so callers can confirm the
    backend is up.
    """
    # Bug fix: corrected the typo "welome" in the user-facing message.
    return {"Home": "welcome home page"}
src/backend/trending_videos_page.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ src_directory = os.path.abspath(os.path.join(
4
+ os.path.dirname(__file__), "../..", "src"))
5
+ sys.path.append(src_directory)
6
+ from services import data_processing
7
+ import pandas as pd
8
+ from collections import Counter
9
+ from sqlalchemy.orm import Session
10
+
11
+
12
def get_trending_videos_count():
    """Return a Series mapping each trending_date to its video count."""
    frame = data_processing.get_updated_df()
    return frame.groupby(["trending_date"])["trending_date"].count()
16
+
17
def get_most_popular_categories():
    """Return a Series mapping each category_name to its video count."""
    frame = data_processing.get_updated_df()
    return frame.groupby(["category_name"])["category_name"].count()
21
+
22
+
23
def get_views_vs_likes():
    """Return (views, likes) pairs with missing values dropped.

    Intended for scatter-plot style visualizations.
    """
    frame = data_processing.get_updated_df()
    return frame[["views", "likes"]].dropna()
27
+
28
+
29
def get_like_ratio_distribution():
    """Return video_id/views/likes rows plus a ``like_ratio`` column.

    Rows with zero views are excluded first so the ratio is always defined.
    """
    frame = data_processing.get_updated_df()
    subset = frame[["video_id", "views", "likes"]].copy()
    subset = subset[subset["views"] > 0]
    subset["like_ratio"] = subset["likes"] / subset["views"]
    return subset
35
+
36
def get_top_liked_videos(top_n=10, df=None):
    """Return the ``top_n`` most-liked videos as (title, likes) rows.

    Args:
        top_n: number of rows to return. Bug fix: this parameter was
            previously ignored -- the body hard-coded ``head(10)``.
        df: optional pre-loaded DataFrame; defaults to the merged dataset
            (added with a default so existing callers are unaffected).

    Returns:
        DataFrame with columns ``title`` and ``likes``, sorted by likes
        descending, NaN rows dropped.
    """
    if df is None:
        df = data_processing.get_updated_df()
    return (
        df[["title", "likes"]]
        .dropna()
        .sort_values(by="likes", ascending=False)
        .head(top_n)  # bug fix: honor top_n instead of a literal 10
    )
40
+
41
def get_trending_channels():
    """Return channelTitle/publishedAt columns for channel-level analysis."""
    frame = data_processing.get_updated_df()
    return frame.loc[:, ["channelTitle", "publishedAt"]].copy()
45
+
46
def calculate_channel_growth():
    """Return per-channel monthly video counts.

    Columns: published_month ("YYYY-MM" string), channelTitle, video_count.
    Rows whose publish date cannot be parsed are dropped.
    """
    frame = data_processing.get_updated_df()
    frame["publishedAt"] = pd.to_datetime(frame["publishedAt"], errors="coerce")
    frame = frame.dropna(subset=["publishedAt"])
    frame["published_month"] = frame["publishedAt"].dt.to_period("M").astype(str)
    return (
        frame.groupby(["published_month", "channelTitle"])
        .size()
        .reset_index(name="video_count")
    )
53
+
54
+
55
def process_tags():
    """Return ``{"tags": [{"tag": ..., "count": ...}, ...]}`` over all videos.

    Tags are pipe-separated in the source data; they are lower-cased,
    stripped, and empty entries are discarded before counting.
    """
    frame = data_processing.get_updated_df()
    split_tags = frame["tags"].dropna().str.lower().str.split("|")
    counter = Counter(
        tag.strip()
        for tag_list in split_tags
        for tag in tag_list
        if tag.strip()
    )
    return {"tags": [{"tag": t, "count": c} for t, c in counter.items()]}
62
+
63
+
64
def analyze_trending_duration():
    """Return trending lifespan per video and mean views per trending date.

    Returns a dict with:
        lifespan: [{"video_id": ..., "days_trending": ...}, ...]
        views_growth: [{"trending_date": ..., "views": <mean>}, ...]
    """
    frame = data_processing.get_updated_df()

    lifespan = frame.groupby("video_id")["trending_date"].count().reset_index()
    lifespan.columns = ["video_id", "days_trending"]

    growth = frame.groupby("trending_date")["views"].mean().reset_index()

    return {
        "lifespan": lifespan.to_dict(orient="records"),
        "views_growth": growth.to_dict(orient="records"),
    }
74
+
75
+
76
def analyze_upload_patterns(mode: str, df=None):
    """Count uploads per hour of day or per weekday.

    Args:
        mode: "hour" or "day"; anything else yields {"error": "Invalid mode"}.
        df: optional pre-loaded DataFrame; defaults to the merged dataset
            (added with a default so existing callers are unaffected).

    Returns:
        A list of {"hour"/"day": ..., "count": ...} records, or the error
        dict for an unknown mode. Unparseable publish dates are ignored.
    """
    if df is None:
        df = data_processing.get_updated_df()
    # Work on a parsed copy of the column instead of mutating the caller's
    # frame in place (the original added helper columns to ``df``).
    published = pd.to_datetime(df["publishedAt"], errors="coerce")

    if mode == "hour":
        hourly = published.dt.hour.value_counts().sort_index().reset_index()
        hourly.columns = ["hour", "count"]
        return hourly.to_dict(orient="records")

    if mode == "day":
        weekdays = ["Monday", "Tuesday", "Wednesday", "Thursday",
                    "Friday", "Saturday", "Sunday"]
        # Bug fix: fill_value=0 keeps absent weekdays as integer 0 rather
        # than NaN, which previously leaked non-JSON-serializable NaNs
        # into the API response.
        daily = (
            published.dt.day_name()
            .value_counts()
            .reindex(weekdays, fill_value=0)
            .reset_index()
        )
        daily.columns = ["day", "count"]
        return daily.to_dict(orient="records")

    return {"error": "Invalid mode"}
94
+
95
+
96
def category_like_view_ratio(df=None):
    """Return the mean like/view ratio per category as a list of records.

    Args:
        df: optional pre-loaded DataFrame; defaults to the merged dataset
            (added with a default so existing callers are unaffected).

    Bug fix: rows with zero views are excluded before dividing. The
    original divided unconditionally, so a zero-view row produced
    inf/NaN and corrupted the category mean. This matches the filtering
    already done in get_like_ratio_distribution.
    """
    if df is None:
        df = data_processing.get_updated_df()
    valid = df[df["views"] > 0].copy()
    valid["like_view_ratio"] = valid["likes"] / valid["views"]
    per_category = (
        valid.groupby("category_name")["like_view_ratio"].mean().reset_index()
    )
    return per_category.to_dict(orient="records")
102
+
103
def category_comment_engagement():
    """Return category/comment/view/like rows (NaNs dropped) as records."""
    frame = data_processing.get_updated_df()
    columns = ["category_name", "comment_count", "views", "likes"]
    return frame[columns].dropna().to_dict(orient="records")


if __name__ == "__main__":
    # Module is imported by the API layer; nothing to run directly.
    pass
src/frontend/app.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Streamlit dashboard for exploring YouTube trending-video analytics.

Fetches pre-computed aggregates from the FastAPI backend and renders
them with Plotly. Start the backend first, then run:
    streamlit run src/frontend/app.py
"""
import streamlit as st
import requests
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt  # NOTE(review): unused -- confirm before removing

API_BASE_URL = "http://127.0.0.1:8000"
CSV_FILE_PATH = "src/data/merged_yt_data.csv"
KAGGLE_LINK = "https://www.kaggle.com/datasets/rsrishav/youtube-trending-video-dataset?select=IN_category_id.json"
# Bug fix: a request timeout so a dead/unstarted backend can't hang the page.
REQUEST_TIMEOUT_SECONDS = 10

st.set_page_config(
    page_title="YouTube Trending Insights",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Global CSS theme (red accent, rounded buttons).
st.markdown("""
    <style>
    .main {
        background-color: #f5f5f5;
        padding: 20px;
        border-radius: 10px;
    }
    .title {
        color: #ff0000;
        font-family: 'Arial', sans-serif;
        text-align: center;
        padding: 20px 0;
    }
    .subtitle {
        color: #333333;
        font-family: 'Arial', sans-serif;
        padding: 10px 0;
    }
    .stButton>button {
        background-color: #ff0000;
        color: white;
        border-radius: 5px;
        padding: 10px 20px;
    }
    .stButton>button:hover {
        background-color: #cc0000;
    }
    </style>
""", unsafe_allow_html=True)

# Sidebar with Logo and Select Box
with st.sidebar:
    st.markdown("""
        <div style='text-align:center;'>
            <img src='https://upload.wikimedia.org/wikipedia/commons/b/b8/YouTube_Logo_2017.svg' width='80%'>
        </div>
    """, unsafe_allow_html=True)

    st.markdown("<h2 class='subtitle'>πŸ“Š YouTube Analytics</h2>", unsafe_allow_html=True)

# Kaggle Dataset Link
st.sidebar.markdown("""
    <h3 style='text-align:center;'>πŸ“‚ Kaggle Dataset</h3>
    <p style='text-align:center;'>
        <a href='{}' target='_blank' style='text-decoration:none;'>
            <button style='background-color:#ff0000; color:white; padding:10px 20px; border:none; border-radius:5px; cursor:pointer;'>
                πŸ”— Open Kaggle Dataset
            </button>
        </a>
    </p>
""".format(KAGGLE_LINK), unsafe_allow_html=True)

# Analysis options: label shown to the user -> backend endpoint path.
options = {
    "πŸ“ˆ Trending Videos Over Time": "/trending_videos_count",
    "πŸ₯§ Most Popular Categories": "/most_popular_categories",
    "πŸ“Š Like Ratio Distribution": "/engagement/like_ratio_distribution",
    "πŸ‘ Top Liked Videos": "/engagement/top_liked_videos",
    "πŸ† Top Trending Channels": "/channel-performance/top-trending",
    "πŸ“… Channel Growth Over Time": "/channel-performance/growth-over-time",
    "❀️ Category Like-View Ratio": "/category-like-view-ratio",
    "πŸ’¬ Category Comment Engagement": "/category-comment-engagement"
}

selected_option = st.selectbox("Choose an analysis:", list(options.keys()),
                               help="Select a visualization to explore YouTube trends")


def fetch_data(endpoint):
    """GET ``endpoint`` from the backend and return its parsed JSON.

    Shows a Streamlit error and returns None on any request failure
    (connection error, timeout, or HTTP error status).
    """
    try:
        response = requests.get(f"{API_BASE_URL}{endpoint}",
                                timeout=REQUEST_TIMEOUT_SECONDS)
        response.raise_for_status()
        return response.json()
    except requests.RequestException as e:
        st.error(f"Failed to fetch data: {e}")
        return None


st.markdown(f"<h2 class='subtitle'>{selected_option}</h2>", unsafe_allow_html=True)
data = fetch_data(options[selected_option])

# Each branch matches on a distinctive substring of the option label and
# builds the appropriate Plotly figure from the endpoint's payload shape.
if data:
    if "Trending Videos" in selected_option:
        df = pd.DataFrame(data["trending_video_counts"].items(), columns=["Date", "Count"])
        df["Date"] = pd.to_datetime(df["Date"])
        fig = px.line(df, x="Date", y="Count", title="Trending Videos Over Time")
        st.plotly_chart(fig, use_container_width=True)

    elif "Popular Categories" in selected_option:
        df = pd.DataFrame.from_dict(data["most_popular_categories"], orient='index', columns=["Count"])
        fig = px.pie(df, names=df.index, values="Count", title="Popular Categories",
                     hole=0.4, color_discrete_sequence=px.colors.sequential.RdBu)
        st.plotly_chart(fig, use_container_width=True)

    elif "Like Ratio" in selected_option:
        df = pd.DataFrame(data["like_ratio_distribution"])
        fig = px.histogram(df, x="like_ratio", nbins=50, title="Like Ratio Distribution",
                           color_discrete_sequence=['#ff0000'])
        st.plotly_chart(fig, use_container_width=True)

    elif "Top Liked Videos" in selected_option:
        df = pd.DataFrame(data["top_liked_videos"])
        fig = px.bar(df, x="title", y="likes", title="πŸ” Top Liked Videos",
                     color="likes", color_continuous_scale="Reds")
        st.plotly_chart(fig, use_container_width=True)

    elif "Top Trending Channels" in selected_option:
        df = pd.DataFrame(data["top_trending_channels"].items(), columns=["Channel", "Trending Count"])
        df = df.sort_values(by="Trending Count", ascending=False).head(10)
        fig = px.bar(df, x="Channel", y="Trending Count", title="Top Trending Channels",
                     color="Trending Count", color_continuous_scale="Reds")
        st.plotly_chart(fig, use_container_width=True)

    elif "Channel Growth" in selected_option:
        df = pd.DataFrame(data)
        fig = px.line(df, x="published_month", y="video_count", color="channelTitle",
                      title="Channel Growth Over Time", line_shape="spline")
        st.plotly_chart(fig, use_container_width=True)

    elif "Like-View Ratio" in selected_option:
        df = pd.DataFrame(data["data"])
        fig = px.sunburst(df, path=["category_name"], values="like_view_ratio",
                          title="Category Like-View Ratio", color="like_view_ratio",
                          color_continuous_scale="RdYlBu")
        st.plotly_chart(fig, use_container_width=True)

    elif "Comment Engagement" in selected_option:
        df = pd.DataFrame(data["data"])
        fig = px.treemap(df, path=["category_name"], values="comment_count",
                         title="Category Comment Engagement", color="comment_count",
                         color_continuous_scale="Blues")
        st.plotly_chart(fig, use_container_width=True)

# Dataset Preview
st.sidebar.markdown("<h2 class='subtitle'>πŸ“‹ Dataset Preview</h2>", unsafe_allow_html=True)
with st.sidebar.expander("View Raw Dataset", expanded=False):
    if st.button("Show Dataset Preview"):
        try:
            df_csv = pd.read_csv(CSV_FILE_PATH)
            # Cap the preview at 1000 rows to keep the sidebar responsive.
            st.dataframe(df_csv.head(1000), use_container_width=True)
        except Exception as e:
            st.error(f"Error loading dataset: {e}")
src/services/__pycache__/data_processing.cpython-313.pyc ADDED
Binary file (505 Bytes). View file
 
src/services/data_processing.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""CSV loading helpers for the YouTube trending dataset."""
import pandas as pd

# Raw export and the category-enriched file produced by the merge step.
csv_path = "src/data/yt_data.csv"
UPDATED_CSV_PATH = "src/data/merged_yt_data.csv"


def get_updated_df(path=UPDATED_CSV_PATH):
    """Load the merged trending dataset.

    Args:
        path: CSV file to read. Defaults to the merged dataset so all
            existing callers are unaffected; parameterized to allow
            loading alternate files and to ease testing.

    Returns:
        A fresh DataFrame. The file is re-read on every call, so edits to
        it are picked up, at the cost of repeated I/O.
    """
    return pd.read_csv(path)
12
+
13
+
14
+ # df = get_updated_df()
15
+ # print(df.columns)
16
+
17
+ # # Load JSON file
18
+ # json_path = "category_id.json"
19
+ # json_df = pd.read_json(json_path)
20
+
21
+ # # Extract category ID and title from JSON
22
+ # category_df = pd.json_normalize(json_df["items"])
23
+ # category_df = category_df[["id", "snippet.title"]].rename(
24
+ # columns={"id": "category_id", "snippet.title": "category_name"})
25
+
26
+ # # Merge CSV data with category data on category_id
27
+ # csv_df["category_id"] = csv_df["category_id"].astype(
28
+ # str) # Ensure category_id is a string for merging
29
+ # merged_df = pd.merge(csv_df, category_df, on="category_id",
30
+ # how="left") # Left join to keep all video data
31
+
32
+
33
+ # # Save the merged DataFrame as a new CSV file
34
+ # merged_path = "merged_yt_data.csv"
35
+ # merged_df.to_csv(merged_path, index=False)