Stat_2025 / app.py
fruitpicker01's picture
Update app.py
263298d verified
raw
history blame
16.7 kB
import gradio as gr
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from prophet import Prophet
import io
from PIL import Image
# =====================
# Первый набор CSV-файлов
# =====================
URL_DASHA = "https://raw.githubusercontent.com/fruitpicker01/Storage_Dasha_2025/main/messages.csv"
URL_LERA = "https://raw.githubusercontent.com/fruitpicker01/Storage_Lera_2025/main/messages.csv"
URL_SVETA = "https://raw.githubusercontent.com/fruitpicker01/Storage_Sveta_2025/main/messages.csv"
# =====================
# Второй набор CSV-файлов
# =====================
URL_DASHA_2 = "https://raw.githubusercontent.com/fruitpicker01/Storage_2_Dasha_2025/main/messages.csv"
URL_LERA_2 = "https://raw.githubusercontent.com/fruitpicker01/Storage_2_Lera_2025/main/messages.csv"
URL_SVETA_2 = "https://raw.githubusercontent.com/fruitpicker01/Storage_2_Sveta_2025/main/messages.csv"
# =====================
# Третий набор CSV-файлов (messages_2.csv)
# =====================
URL_DASHA_3 = "https://raw.githubusercontent.com/fruitpicker01/Storage_2_Dasha_2025/main/messages_2.csv"
URL_LERA_3 = "https://raw.githubusercontent.com/fruitpicker01/Storage_2_Lera_2025/main/messages_2.csv"
URL_SVETA_3 = "https://raw.githubusercontent.com/fruitpicker01/Storage_2_Sveta_2025/main/messages_2.csv"
def read_and_process_data(url, user_name):
"""
Считывает CSV, отбирает нужные столбцы,
удаляет дубликаты (gender, generation, industry, opf),
приводит timestamp -> date.
Возвращает:
- unique_count (кол-во уникальных записей)
- df_daily: [date, count, user]
"""
import requests, base64, io
print(f"\n=== [{user_name}] чтение CSV ===")
import re
pattern = re.compile(r"https://raw\.githubusercontent\.com/([^/]+)/([^/]+)/([^/]+)/(.+)")
m = pattern.match(url)
if not m:
# Если URL не совпадает с raw.githubusercontent.com, пробуем напрямую
print(f"[{user_name}] URL не совпадает с raw.githubusercontent.com, читаем напрямую...")
try:
df = pd.read_csv(url, na_values=["Не выбрано"])
except Exception as e:
print(f"[{user_name}] Ошибка при pd.read_csv напрямую: {e}")
return 0, pd.DataFrame(columns=["date", "count", "user"])
else:
owner = m.group(1)
repo_name = m.group(2)
branch = m.group(3)
file_path = m.group(4)
api_url = f"https://api.github.com/repos/{owner}/{repo_name}/contents/{file_path}?ref={branch}"
print(f"[{user_name}] Пытаемся Contents API: {api_url}")
resp = requests.get(api_url)
if resp.status_code != 200:
print(f"[{user_name}] Не удалось получить JSON (статус={resp.status_code}), читаем напрямую...")
try:
df = pd.read_csv(url, na_values=["Не выбрано"])
except Exception as e:
print(f"[{user_name}] Ошибка при pd.read_csv напрямую: {e}")
return 0, pd.DataFrame(columns=["date", "count", "user"])
else:
data_json = resp.json()
size = data_json.get("size", 0)
file_content_encoded = data_json.get("content")
download_url = data_json.get("download_url")
if not file_content_encoded or size > 1_000_000:
# Большой файл или отсутствует content => используем download_url
print(f"[{user_name}] Файл крупнее 1 МБ или content отсутствует, скачиваем по download_url={download_url}")
try:
resp2 = requests.get(download_url)
resp2.raise_for_status()
csv_text = resp2.text
df = pd.read_csv(io.StringIO(csv_text), na_values=["Не выбрано"])
except Exception as e:
print(f"[{user_name}] Ошибка при чтении по download_url: {e}")
return 0, pd.DataFrame(columns=["date", "count", "user"])
else:
# Получаем Base64 и декодируем
try:
file_bytes = base64.b64decode(file_content_encoded)
df = pd.read_csv(io.StringIO(file_bytes.decode("utf-8")), na_values=["Не выбрано"])
except Exception as e:
print(f"[{user_name}] Ошибка декодирования Base64: {e}")
return 0, pd.DataFrame(columns=["date", "count", "user"])
print(f"[{user_name}] Исходное кол-во строк: {len(df)}")
cols = ["gender", "generation", "industry", "opf", "timestamp"]
df = df[[c for c in cols if c in df.columns]].copy()
print(f"[{user_name}] После отбора столбцов: {df.shape}")
df_unique = df.drop_duplicates(subset=["gender", "generation", "industry", "opf"]).copy()
print(f"[{user_name}] После drop_duplicates: {df_unique.shape}")
df_unique["timestamp"] = pd.to_numeric(df_unique["timestamp"], errors='coerce')
df_unique["date"] = pd.to_datetime(df_unique["timestamp"], unit="s", origin="unix", errors='coerce').dt.date
count_nat = df_unique["date"].isna().sum()
print(f"[{user_name}] Кол-во NaT дат: {count_nat}")
unique_count = len(df_unique)
# Группировка по датам
df_daily = df_unique.groupby("date").size().reset_index(name="count")
df_daily["user"] = user_name
return unique_count, df_daily
def make_average_forecast(total_by_date, end_date_str="2025-03-31"):
"""
Делает «прогноз по среднему» до указанной даты (end_date_str).
Берём средний дневной прирост count и
добавляем его день за днём (не учитывая выходные).
Возвращает DataFrame: [ds, yhat]
ds - дата (Timestamp)
yhat - прогноз накопленной суммы
"""
if total_by_date.empty:
return pd.DataFrame(columns=["ds", "yhat"])
df_tmp = total_by_date.copy()
df_tmp["date"] = pd.to_datetime(df_tmp["date"])
avg_inc = df_tmp["count"].mean() if len(df_tmp) else 0
last_date = df_tmp["date"].max()
last_cumulative = df_tmp["cumulative"].iloc[-1]
end_date = pd.to_datetime(end_date_str)
forecast_data = []
running_total = last_cumulative
current_date = last_date
while current_date < end_date:
current_date += pd.Timedelta(days=1)
if current_date > end_date:
break
running_total += avg_inc
forecast_data.append({"ds": current_date, "yhat": running_total})
return pd.DataFrame(forecast_data)
def process_data():
print("\n=== Начинаем process_data (Seaborn + Prophet + средний) ===")
# ====== Чтение данных (первый набор) ======
dasha_count, dasha_daily = read_and_process_data(URL_DASHA, "Даша")
lera_count, lera_daily = read_and_process_data(URL_LERA, "Лера")
sveta_count, sveta_daily = read_and_process_data(URL_SVETA, "Света")
# ====== Чтение (второй набор) ======
try:
dasha_count2, dasha_daily2 = read_and_process_data(URL_DASHA_2, "Даша (2)")
dasha_daily2["user"] = "Даша"
except Exception as e:
print(f"[Даша (2)] Ошибка при чтении дополнительного CSV: {e}")
dasha_count2, dasha_daily2 = 0, pd.DataFrame(columns=["date", "count", "user"])
try:
lera_count2, lera_daily2 = read_and_process_data(URL_LERA_2, "Лера (2)")
lera_daily2["user"] = "Лера"
except Exception as e:
print(f"[Лера (2)] Ошибка при чтении дополнительного CSV: {e}")
lera_count2, lera_daily2 = 0, pd.DataFrame(columns=["date", "count", "user"])
try:
sveta_count2, sveta_daily2 = read_and_process_data(URL_SVETA_2, "Света (2)")
sveta_daily2["user"] = "Света"
except Exception as e:
print(f"[Света (2)] Ошибка при чтении дополнительного CSV: {e}")
sveta_count2, sveta_daily2 = 0, pd.DataFrame(columns=["date", "count", "user"])
# ====== Чтение (третий набор: messages_2.csv) ======
try:
dasha_count3, dasha_daily3 = read_and_process_data(URL_DASHA_3, "Даша (3)")
# Объединяем с "Дашей"
dasha_daily3["user"] = "Даша"
except Exception as e:
print(f"[Даша (3)] Ошибка при чтении messages_2.csv: {e}")
dasha_count3, dasha_daily3 = 0, pd.DataFrame(columns=["date", "count", "user"])
try:
lera_count3, lera_daily3 = read_and_process_data(URL_LERA_3, "Лера (3)")
lera_daily3["user"] = "Лера"
except Exception as e:
print(f"[Лера (3)] Ошибка при чтении messages_2.csv: {e}")
lera_count3, lera_daily3 = 0, pd.DataFrame(columns=["date", "count", "user"])
try:
sveta_count3, sveta_daily3 = read_and_process_data(URL_SVETA_3, "Света (3)")
sveta_daily3["user"] = "Света"
except Exception as e:
print(f"[Света (3)] Ошибка при чтении messages_2.csv: {e}")
sveta_count3, sveta_daily3 = 0, pd.DataFrame(columns=["date", "count", "user"])
# ====== Итоговые суммы ======
dasha_count_total = dasha_count + dasha_count2 + dasha_count3
lera_count_total = lera_count + lera_count2 + lera_count3
sveta_count_total = sveta_count + sveta_count2 + sveta_count3
dasha_daily_total = pd.concat([dasha_daily, dasha_daily2, dasha_daily3], ignore_index=True)
lera_daily_total = pd.concat([lera_daily, lera_daily2, lera_daily3 ], ignore_index=True)
sveta_daily_total = pd.concat([sveta_daily, sveta_daily2, sveta_daily3], ignore_index=True)
total_count = dasha_count_total + lera_count_total + sveta_count_total
print(f"Суммарное количество (Д+Л+С): {total_count}")
# ====== Проценты ======
dasha_percent = round((dasha_count_total / 234) * 100) if 234 else 0
lera_percent = round((lera_count_total / 234) * 100) if 234 else 0
sveta_percent = round((sveta_count_total / 234) * 100) if 234 else 0
total_percent = round((total_count / 702) * 100) if 702 else 0
def get_progress_bar(label, abs_val, pct):
capacity = 234 if label in ["Даша", "Лера", "Света"] else 702
return f"""
<div style='margin-bottom: 1em;'>
<div><strong>{label}</strong></div>
<div style='width: 100%; background-color: #ddd; text-align: left;'>
<div style='width: {pct}%; background-color: #4CAF50; padding: 5px 0;'>
&nbsp;{abs_val} SMS ({pct}% из {capacity})
</div>
</div>
</div>
"""
bars_html = (
get_progress_bar("Даша", dasha_count_total, dasha_percent) +
get_progress_bar("Лера", lera_count_total, lera_percent) +
get_progress_bar("Света", sveta_count_total, sveta_percent) +
get_progress_bar("Всего", total_count, total_percent)
)
# ====== Ежедневные данные + накопительное ======
daily_all = pd.concat([dasha_daily_total, lera_daily_total, sveta_daily_total], ignore_index=True)
daily_all = daily_all.dropna(subset=["date"])
daily_all = daily_all.sort_values(["user", "date"])
daily_all["count"] = pd.to_numeric(daily_all["count"], errors="coerce").fillna(0)
daily_all["cumulative"] = daily_all.groupby("user")["count"].cumsum()
# «Всего» по датам
total_by_date = daily_all.groupby("date")["count"].sum().reset_index(name="count")
total_by_date = total_by_date.sort_values("date")
total_by_date["cumulative"] = total_by_date["count"].cumsum()
total_by_date["user"] = "Всего"
# ====== Первый график (накопительные кривые) ======
daily_all_final = pd.concat([daily_all, total_by_date], ignore_index=True)
daily_all_final["date_dt"] = pd.to_datetime(daily_all_final["date"])
# Сортируем легенду по убыванию финальной точки
last_values = daily_all_final.groupby("user")["cumulative"].last().sort_values(ascending=False)
sorted_users = last_values.index.tolist()
fig1, ax1 = plt.subplots(figsize=(8,5))
sns.lineplot(
data=daily_all_final,
x="date_dt", y="cumulative",
hue="user",
hue_order=sorted_users,
ax=ax1, marker="o"
)
ax1.set_title("Накопительное количество SMS")
ax1.set_xlabel("Дата")
ax1.set_ylabel("Накопленное число SMS")
fig1.autofmt_xdate(rotation=30)
buf1 = io.BytesIO()
plt.savefig(buf1, format="png")
buf1.seek(0)
image1_pil = Image.open(buf1)
# ====== Prophet + Прогноз по среднему (всего) ======
df_prophet = total_by_date[["date", "cumulative"]].copy()
df_prophet.columns = ["ds", "y"]
df_prophet["ds"] = pd.to_datetime(df_prophet["ds"])
model = Prophet()
model.fit(df_prophet)
end_date = pd.to_datetime("2025-03-31")
last_date = df_prophet["ds"].max()
additional_days = (end_date - last_date).days
future = model.make_future_dataframe(periods=additional_days if additional_days>0 else 0)
forecast = model.predict(future)
# Подготовка данных для графика
df_plot = pd.merge(
forecast[["ds", "yhat"]],
df_prophet[["ds", "y"]],
on="ds",
how="left"
)
df_history = df_plot.dropna(subset=["y"]).copy()
df_future = df_plot[df_plot["y"].isna()].copy()
# Прогноз по среднему
df_avg = make_average_forecast(total_by_date, "2025-03-31")
df_history["type"] = "История"
df_history["value"] = df_history["y"]
df_future["type"] = "Прогноз (Prophet)"
df_future["value"] = df_future["yhat"]
df_avg["type"] = "Прогноз (среднее)"
df_avg["value"] = df_avg["yhat"]
df_avg.rename(columns={"ds":"ds"}, inplace=True)
# Сшиваем
df_combined = pd.concat([df_history, df_future, df_avg], ignore_index=True)
df_combined["ds"] = pd.to_datetime(df_combined["ds"])
# Второй график
line_styles = {
"История": "",
"Прогноз (Prophet)": (2,2),
"Прогноз (среднее)": (2,2)
}
line_colors = {
"История": "blue",
"Прогноз (Prophet)": "red",
"Прогноз (среднее)": "green"
}
fig2, ax2 = plt.subplots(figsize=(8,5))
sns.lineplot(
data=df_combined,
x="ds", y="value",
hue="type",
style="type",
dashes=line_styles,
palette=line_colors,
markers=False,
ax=ax2
)
ax2.set_title("Прогноз до конца марта 2025 (Prophet & По среднему)")
ax2.set_xlabel("Дата")
ax2.set_ylabel("Накопленное число SMS (Всего)")
fig2.autofmt_xdate(rotation=30)
buf2 = io.BytesIO()
plt.savefig(buf2, format="png")
buf2.seek(0)
image2_pil = Image.open(buf2)
# Результат
return bars_html, image1_pil, image2_pil
# Gradio-интерфейс
with gr.Blocks() as demo:
gr.Markdown("<h2>Количество сохраненных SMS (Даша, Лера, Света, Всего) + Прогноз</h2>")
btn = gr.Button("Обновить данные и показать результат")
html_output = gr.HTML(label="Прогресс-бары: количество SMS и %")
image_output1 = gr.Image(type="pil", label="Накопительный график")
image_output2 = gr.Image(type="pil", label="Прогноз: Prophet & По среднему")
btn.click(fn=process_data, outputs=[html_output, image_output1, image_output2])
if __name__ == "__main__":
demo.launch()