Spaces:
Sleeping
Sleeping
youtube trend analyzer project
Browse files- .gitignore +2 -0
- requirements.txt +7 -0
- src/backend/__pycache__/main.cpython-313.pyc +0 -0
- src/backend/__pycache__/trending_videos_page.cpython-313.pyc +0 -0
- src/backend/main.py +18 -0
- src/backend/routes/__init__.py +0 -0
- src/backend/routes/__pycache__/__init__.cpython-313.pyc +0 -0
- src/backend/routes/__pycache__/data_visualization_page_api.cpython-313.pyc +0 -0
- src/backend/routes/__pycache__/homepage_api.cpython-313.pyc +0 -0
- src/backend/routes/__pycache__/time_trend.cpython-313.pyc +0 -0
- src/backend/routes/data_visualization_page_api.py +61 -0
- src/backend/routes/homepage_api.py +7 -0
- src/backend/trending_videos_page.py +113 -0
- src/frontend/app.py +164 -0
- src/services/__pycache__/data_processing.cpython-313.pyc +0 -0
- src/services/data_processing.py +35 -0
.gitignore
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
venv/
|
2 |
+
data
|
requirements.txt
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
fastapi[standard]
|
2 |
+
pandas
|
3 |
+
sqlalchemy
|
4 |
+
streamlit
|
5 |
+
requests
|
6 |
+
plotly
|
7 |
+
matplotlib
|
src/backend/__pycache__/main.cpython-313.pyc
ADDED
Binary file (530 Bytes). View file
|
|
src/backend/__pycache__/trending_videos_page.cpython-313.pyc
ADDED
Binary file (7.28 kB). View file
|
|
src/backend/main.py
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""FastAPI application entry point: builds the app and mounts the routers."""
from fastapi import FastAPI
from routes import homepage_api, data_visualization_page_api

app = FastAPI()

# Register page routers; the tags only group endpoints in the OpenAPI docs.
app.include_router(homepage_api.router, tags=["Home"])
app.include_router(data_visualization_page_api.router, tags=["Data"])
|
9 |
+
|
10 |
+
|
11 |
+
|
12 |
+
|
13 |
+
|
14 |
+
|
15 |
+
|
16 |
+
|
17 |
+
|
18 |
+
|
src/backend/routes/__init__.py
ADDED
File without changes
|
src/backend/routes/__pycache__/__init__.cpython-313.pyc
ADDED
Binary file (171 Bytes). View file
|
|
src/backend/routes/__pycache__/data_visualization_page_api.cpython-313.pyc
ADDED
Binary file (3.9 kB). View file
|
|
src/backend/routes/__pycache__/homepage_api.cpython-313.pyc
ADDED
Binary file (455 Bytes). View file
|
|
src/backend/routes/__pycache__/time_trend.cpython-313.pyc
ADDED
Binary file (279 Bytes). View file
|
|
src/backend/routes/data_visualization_page_api.py
ADDED
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Read-only analytics endpoints backed by trending_videos_page helpers."""
from fastapi import APIRouter
import trending_videos_page
from sqlalchemy.orm import Session

router = APIRouter()


@router.get("/trending_videos_count")
def trending_videos_count():
    """Number of trending videos per trending date."""
    counts = trending_videos_page.get_trending_videos_count()
    return {"trending_video_counts": counts.to_dict()}


@router.get("/most_popular_categories")
def most_popular_categories():
    """Trending-video counts per category name."""
    counts = trending_videos_page.get_most_popular_categories()
    return {"most_popular_categories": counts.to_dict()}


@router.get("/engagement/like_ratio_distribution")
def like_ratio_distribution():
    """Per-video likes/views ratios as a list of records."""
    ratios = trending_videos_page.get_like_ratio_distribution()
    return {"like_ratio_distribution": ratios.to_dict(orient="records")}


@router.get("/engagement/top_liked_videos")
def top_liked_videos():
    """Most-liked trending videos (title + likes)."""
    top = trending_videos_page.get_top_liked_videos()
    return {"top_liked_videos": top.to_dict(orient="records")}


@router.get("/channel-performance/top-trending")
def top_trending_channels():
    """How many trending rows each channel accounts for."""
    channels = trending_videos_page.get_trending_channels()
    counts_by_channel = channels["channelTitle"].value_counts().to_dict()
    return {"top_trending_channels": counts_by_channel}


@router.get("/channel-performance/growth-over-time")
def channel_growth_over_time():
    """Videos per channel per month; returns a bare list of records."""
    growth = trending_videos_page.calculate_channel_growth()
    return growth.to_dict(orient="records")


@router.get("/analysis")
def get_tags_analysis():
    """Tag frequency analysis across all trending videos."""
    return trending_videos_page.process_tags()


@router.get("/duration_vs_likes")
def get_trending_by_day():
    """Upload counts per weekday.

    NOTE(review): the path says "duration_vs_likes" but the handler returns
    daily upload patterns; kept as-is so existing clients don't break.
    """
    return trending_videos_page.analyze_upload_patterns("day")


@router.get("/category-like-view-ratio")
def get_category_like_view_ratio():
    """Mean likes/views ratio per category."""
    return {"data": trending_videos_page.category_like_view_ratio()}


@router.get("/category-comment-engagement")
def get_category_comment_engagement():
    """Per-video engagement columns, keyed for category-level treemaps."""
    return {"data": trending_videos_page.category_comment_engagement()}
|
src/backend/routes/homepage_api.py
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Root ("/") landing endpoint for the API."""
from fastapi import APIRouter

router = APIRouter()


@router.get("/")
def home():
    """Simple landing payload confirming the API is up."""
    # Fixed user-facing typo: "welome" -> "welcome".
    return {"Home": "welcome home page"}
|
src/backend/trending_videos_page.py
ADDED
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import sys
|
3 |
+
src_directory = os.path.abspath(os.path.join(
|
4 |
+
os.path.dirname(__file__), "../..", "src"))
|
5 |
+
sys.path.append(src_directory)
|
6 |
+
from services import data_processing
|
7 |
+
import pandas as pd
|
8 |
+
from collections import Counter
|
9 |
+
from sqlalchemy.orm import Session
|
10 |
+
|
11 |
+
|
12 |
+
def get_trending_videos_count():
    """Return a Series counting how many videos trended on each date."""
    frame = data_processing.get_updated_df()
    per_date = frame.groupby(["trending_date"])["trending_date"].count()
    return per_date
|
16 |
+
|
17 |
+
def get_most_popular_categories():
    """Return a Series counting trending videos per category name."""
    frame = data_processing.get_updated_df()
    per_category = frame.groupby(["category_name"])["category_name"].count()
    return per_category
|
21 |
+
|
22 |
+
|
23 |
+
def get_views_vs_likes():
    """Return a views/likes frame for scatter plotting, NaN rows dropped."""
    frame = data_processing.get_updated_df()
    pairs = frame[["views", "likes"]].dropna()
    return pairs
|
27 |
+
|
28 |
+
|
29 |
+
def get_like_ratio_distribution():
    """Compute likes/views per video, skipping videos with zero views."""
    frame = data_processing.get_updated_df()
    subset = frame[["video_id", "views", "likes"]].copy()
    # Guard the division: a zero view count would yield inf.
    subset = subset[subset["views"] > 0]
    subset["like_ratio"] = subset["likes"] / subset["views"]
    return subset
|
35 |
+
|
36 |
+
def get_top_liked_videos(top_n=10):
    """Return the `top_n` most-liked videos (title + likes), sorted descending.

    Bug fix: the previous version hard-coded `.head(10)` and silently
    ignored the `top_n` parameter.
    """
    df = data_processing.get_updated_df()
    top_videos = (
        df[["title", "likes"]]
        .dropna()
        .sort_values(by="likes", ascending=False)
        .head(top_n)
    )
    return top_videos
|
40 |
+
|
41 |
+
def get_trending_channels():
    """Return channel title + publish timestamp for every trending row."""
    frame = data_processing.get_updated_df()
    channels = frame[["channelTitle", "publishedAt"]].copy()
    return channels
|
45 |
+
|
46 |
+
def calculate_channel_growth():
    """Count videos per channel per publication month ("YYYY-MM")."""
    frame = data_processing.get_updated_df()
    # Coerce unparseable timestamps to NaT, then drop them before bucketing.
    frame["publishedAt"] = pd.to_datetime(frame["publishedAt"], errors="coerce")
    frame.dropna(subset=["publishedAt"], inplace=True)
    frame["published_month"] = frame["publishedAt"].dt.to_period("M").astype(str)
    monthly = (
        frame.groupby(["published_month", "channelTitle"])
        .size()
        .reset_index(name="video_count")
    )
    return monthly
|
53 |
+
|
54 |
+
|
55 |
+
def process_tags():
    """Tally tag frequencies across all videos ('|'-separated tag strings)."""
    frame = data_processing.get_updated_df()
    split_tags = frame["tags"].dropna().str.lower().str.split("|")
    flattened = [t.strip() for row in split_tags for t in row if t.strip()]
    counts = Counter(flattened)
    payload = [{"tag": tag, "count": n} for tag, n in counts.items()]
    return {"tags": payload}
|
62 |
+
|
63 |
+
|
64 |
+
def analyze_trending_duration():
    """Summarise how long videos stay trending and mean views per date."""
    frame = data_processing.get_updated_df()
    # Days each video appeared in the trending snapshot.
    lifespan = frame.groupby("video_id")["trending_date"].count().reset_index()
    lifespan.columns = ["video_id", "days_trending"]
    # Average view count across all videos trending on a given date.
    growth = frame.groupby("trending_date")["views"].mean().reset_index()
    return {
        "lifespan": lifespan.to_dict(orient="records"),
        "views_growth": growth.to_dict(orient="records"),
    }
|
74 |
+
|
75 |
+
|
76 |
+
def analyze_upload_patterns(mode: str):
    """Count uploads per hour ("hour") or per weekday ("day").

    Returns a list of {"hour"/"day", "count"} records, or
    {"error": "Invalid mode"} for any other mode.

    Bug fix: weekdays absent from the data previously came back as NaN
    from `.reindex(...)`, which is not JSON-serialisable and broke the
    API response; they now count as 0.
    """
    data = data_processing.get_updated_df()
    data["publishedAt"] = pd.to_datetime(data["publishedAt"], errors="coerce")

    if mode == "hour":
        data["upload_hour"] = data["publishedAt"].dt.hour
        hourly_counts = data["upload_hour"].value_counts().sort_index().reset_index()
        hourly_counts.columns = ["hour", "count"]
        return hourly_counts.to_dict(orient="records")

    elif mode == "day":
        data["upload_day"] = data["publishedAt"].dt.day_name()
        weekdays = ["Monday", "Tuesday", "Wednesday", "Thursday",
                    "Friday", "Saturday", "Sunday"]
        daily_counts = (
            data["upload_day"]
            .value_counts()
            .reindex(weekdays, fill_value=0)  # fix: 0, not NaN, for absent days
            .reset_index()
        )
        daily_counts.columns = ["day", "count"]
        return daily_counts.to_dict(orient="records")

    return {"error": "Invalid mode"}
|
94 |
+
|
95 |
+
|
96 |
+
def category_like_view_ratio():
    """Mean likes/views ratio per category, as a list of records.

    Bug fix: rows with zero views previously produced inf ratios that
    poisoned the category means (and inf is not JSON-serialisable).
    This mirrors the `views > 0` guard already used by
    get_like_ratio_distribution().
    """
    data = data_processing.get_updated_df()
    data = data[data["views"] > 0].copy()
    data["like_view_ratio"] = data["likes"] / data["views"]
    category_data = data.groupby("category_name")["like_view_ratio"].mean().reset_index()

    return category_data.to_dict(orient="records")
|
102 |
+
|
103 |
+
def category_comment_engagement():
    """Per-video engagement columns, NaN rows removed, as records."""
    frame = data_processing.get_updated_df()
    wanted = ["category_name", "comment_count", "views", "likes"]
    return frame[wanted].dropna().to_dict(orient="records")
|
106 |
+
|
107 |
+
|
108 |
+
|
109 |
+
|
110 |
+
|
111 |
+
|
112 |
+
# No CLI behaviour: this module only exposes analysis helpers for the API.
if __name__ == "__main__":
    pass
|
src/frontend/app.py
ADDED
@@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import requests
|
3 |
+
import pandas as pd
|
4 |
+
import plotly.express as px
|
5 |
+
import matplotlib.pyplot as plt
|
6 |
+
|
7 |
+
# Base URL of the FastAPI backend (src/backend/main.py, default uvicorn port).
API_BASE_URL = "http://127.0.0.1:8000"
# Merged dataset used for the raw-data preview; path is relative to repo root.
CSV_FILE_PATH = "src/data/merged_yt_data.csv"
# Upstream Kaggle dataset this CSV was built from.
KAGGLE_LINK = "https://www.kaggle.com/datasets/rsrishav/youtube-trending-video-dataset?select=IN_category_id.json"
|
10 |
+
|
11 |
+
# Page chrome: wide layout with the sidebar open by default.
st.set_page_config(
    page_title="YouTube Trending Insights",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Global CSS overrides for the Streamlit theme (YouTube-red accents).
st.markdown("""
<style>
.main {
    background-color: #f5f5f5;
    padding: 20px;
    border-radius: 10px;
}
.title {
    color: #ff0000;
    font-family: 'Arial', sans-serif;
    text-align: center;
    padding: 20px 0;
}
.subtitle {
    color: #333333;
    font-family: 'Arial', sans-serif;
    padding: 10px 0;
}
.stButton>button {
    background-color: #ff0000;
    color: white;
    border-radius: 5px;
    padding: 10px 20px;
}
.stButton>button:hover {
    background-color: #cc0000;
}
</style>
""", unsafe_allow_html=True)
|
46 |
+
|
47 |
+
# Sidebar with Logo and Select Box
with st.sidebar:
    # YouTube wordmark at the top of the sidebar.
    st.markdown("""
    <div style='text-align:center;'>
        <img src='https://upload.wikimedia.org/wikipedia/commons/b/b8/YouTube_Logo_2017.svg' width='80%'>
    </div>
    """, unsafe_allow_html=True)

    st.markdown("<h2 class='subtitle'>π YouTube Analytics</h2>", unsafe_allow_html=True)


# Kaggle Dataset Link
# Styled button linking out to the source dataset on Kaggle.
st.sidebar.markdown("""
<h3 style='text-align:center;'>π Kaggle Dataset</h3>
<p style='text-align:center;'>
    <a href='{}' target='_blank' style='text-decoration:none;'>
        <button style='background-color:#ff0000; color:white; padding:10px 20px; border:none; border-radius:5px; cursor:pointer;'>
            π Open Kaggle Dataset
        </button>
    </a>
</p>
""".format(KAGGLE_LINK), unsafe_allow_html=True)
|
69 |
+
# Analysis Options
# Maps each menu label to the backend endpoint that serves it.
# NOTE: the rendering branches below dispatch on SUBSTRINGS of these labels,
# so keep labels and branch keywords in sync.
# Fix: the "Channel Growth Over Time" key literal was garbled across two
# lines (embedded line break); rejoined into a single-line label.
options = {
    "π Trending Videos Over Time": "/trending_videos_count",
    "π₯§ Most Popular Categories": "/most_popular_categories",
    "π Like Ratio Distribution": "/engagement/like_ratio_distribution",
    "π Top Liked Videos": "/engagement/top_liked_videos",
    "π Top Trending Channels": "/channel-performance/top-trending",
    "π Channel Growth Over Time": "/channel-performance/growth-over-time",
    "β€οΈ Category Like-View Ratio": "/category-like-view-ratio",
    "π¬ Category Comment Engagement": "/category-comment-engagement"
}

selected_option = st.selectbox("Choose an analysis:", list(options.keys()),
                               help="Select a visualization to explore YouTube trends")
|
83 |
+
|
84 |
+
def fetch_data(endpoint):
    """GET `endpoint` from the backend API and return the parsed JSON.

    On any request failure the error is shown to the user via st.error
    and None is returned instead of raising.
    """
    url = f"{API_BASE_URL}{endpoint}"
    try:
        resp = requests.get(url)
        resp.raise_for_status()
        return resp.json()
    except requests.RequestException as exc:
        st.error(f"Failed to fetch data: {exc}")
        return None
|
92 |
+
|
93 |
+
# Page body: heading for the chosen analysis, then fetch its payload.
st.markdown(f"<h2 class='subtitle'>{selected_option}</h2>", unsafe_allow_html=True)
data = fetch_data(options[selected_option])

# Render the chart matching the selected menu label.
# NOTE(review): dispatch is by substring of the label, so the labels in
# `options` and the keywords below must stay in sync.
if data:
    if "Trending Videos" in selected_option:
        # Payload: {"trending_video_counts": {date: count}}.
        df = pd.DataFrame(data["trending_video_counts"].items(), columns=["Date", "Count"])
        df["Date"] = pd.to_datetime(df["Date"])
        fig = px.line(df, x="Date", y="Count", title="Trending Videos Over Time")
        st.plotly_chart(fig, use_container_width=True)

    elif "Popular Categories" in selected_option:
        # Payload: {"most_popular_categories": {category: count}}.
        df = pd.DataFrame.from_dict(data["most_popular_categories"], orient='index', columns=["Count"])
        fig = px.pie(df, names=df.index, values="Count", title="Popular Categories",
                     hole=0.4, color_discrete_sequence=px.colors.sequential.RdBu)
        st.plotly_chart(fig, use_container_width=True)

    elif "Like Ratio" in selected_option:
        df = pd.DataFrame(data["like_ratio_distribution"])
        fig = px.histogram(df, x="like_ratio", nbins=50, title="Like Ratio Distribution",
                           color_discrete_sequence=['#ff0000'])
        st.plotly_chart(fig, use_container_width=True)

    elif "Top Liked Videos" in selected_option:
        df = pd.DataFrame(data["top_liked_videos"])
        fig = px.bar(df, x="title", y="likes", title="π Top Liked Videos",
                     color="likes", color_continuous_scale="Reds")
        st.plotly_chart(fig, use_container_width=True)

    elif "Top Trending Channels" in selected_option:
        # Payload: {"top_trending_channels": {channel: count}}; keep top 10.
        df = pd.DataFrame(data["top_trending_channels"].items(), columns=["Channel", "Trending Count"])
        df = df.sort_values(by="Trending Count", ascending=False).head(10)
        fig = px.bar(df, x="Channel", y="Trending Count", title="Top Trending Channels",
                     color="Trending Count", color_continuous_scale="Reds")
        st.plotly_chart(fig, use_container_width=True)

    elif "Channel Growth" in selected_option:
        # Growth endpoint returns a bare list of records (no wrapper key).
        df = pd.DataFrame(data)
        fig = px.line(df, x="published_month", y="video_count", color="channelTitle",
                      title="Channel Growth Over Time", line_shape="spline")
        st.plotly_chart(fig, use_container_width=True)

    elif "Like-View Ratio" in selected_option:
        df = pd.DataFrame(data["data"])
        fig = px.sunburst(df, path=["category_name"], values="like_view_ratio",
                          title="Category Like-View Ratio", color="like_view_ratio",
                          color_continuous_scale="RdYlBu")
        st.plotly_chart(fig, use_container_width=True)

    elif "Comment Engagement" in selected_option:
        df = pd.DataFrame(data["data"])
        fig = px.treemap(df, path=["category_name"], values="comment_count",
                         title="Category Comment Engagement", color="comment_count",
                         color_continuous_scale="Blues")
        st.plotly_chart(fig, use_container_width=True)
|
149 |
+
|
150 |
+
# Dataset Preview
# Sidebar expander that lazily loads the raw CSV on button press.
st.sidebar.markdown("<h2 class='subtitle'>π Dataset Preview</h2>", unsafe_allow_html=True)
with st.sidebar.expander("View Raw Dataset", expanded=False):
    if st.button("Show Dataset Preview"):
        try:
            df_csv = pd.read_csv(CSV_FILE_PATH)
            # Cap the preview at 1000 rows to keep the UI responsive.
            st.dataframe(df_csv.head(1000), use_container_width=True)
        except Exception as e:
            st.error(f"Error loading dataset: {e}")
|
159 |
+
|
160 |
+
|
161 |
+
|
162 |
+
|
163 |
+
|
164 |
+
|
src/services/__pycache__/data_processing.cpython-313.pyc
ADDED
Binary file (505 Bytes). View file
|
|
src/services/data_processing.py
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Data access helpers: load the merged YouTube trending dataset."""
import pandas as pd

# Source CSV locations; paths are relative to the repository root.
csv_path = "src/data/yt_data.csv"
UPDATED_CSV_PATH = "src/data/merged_yt_data.csv"


def get_updated_df():
    """Load the merged dataset (videos + category names) as a DataFrame.

    NOTE(review): re-reads the CSV on every call; consider caching if the
    file grows large.
    """
    return pd.read_csv(UPDATED_CSV_PATH)
|
12 |
+
|
13 |
+
|
14 |
+
# df = get_updated_df()
|
15 |
+
# print(df.columns)
|
16 |
+
|
17 |
+
# # Load JSON file
|
18 |
+
# json_path = "category_id.json"
|
19 |
+
# json_df = pd.read_json(json_path)
|
20 |
+
|
21 |
+
# # Extract category ID and title from JSON
|
22 |
+
# category_df = pd.json_normalize(json_df["items"])
|
23 |
+
# category_df = category_df[["id", "snippet.title"]].rename(
|
24 |
+
# columns={"id": "category_id", "snippet.title": "category_name"})
|
25 |
+
|
26 |
+
# # Merge CSV data with category data on category_id
|
27 |
+
# csv_df["category_id"] = csv_df["category_id"].astype(
|
28 |
+
# str) # Ensure category_id is a string for merging
|
29 |
+
# merged_df = pd.merge(csv_df, category_df, on="category_id",
|
30 |
+
# how="left") # Left join to keep all video data
|
31 |
+
|
32 |
+
|
33 |
+
# # Save the merged DataFrame as a new CSV file
|
34 |
+
# merged_path = "merged_yt_data.csv"
|
35 |
+
# merged_df.to_csv(merged_path, index=False)
|