Update app.py
app.py (CHANGED)
@@ -19,94 +19,80 @@ def read_and_process_data(url, user_name):
     cols = ["gender", "generation", "industry", "opf", "timestamp"]
     df = df[[c for c in cols if c in df.columns]].copy()

-    # Drop duplicates by
+    # Drop duplicates by the key columns
     df_unique = df.drop_duplicates(subset=["gender", "generation", "industry", "opf"])

     # Number of unique SMS
     unique_count = len(df_unique)

-    #
+    # Convert timestamp -> date
     if "timestamp" in df_unique.columns:
         df_unique["timestamp"] = pd.to_numeric(df_unique["timestamp"], errors='coerce')
         df_unique["date"] = pd.to_datetime(df_unique["timestamp"], unit="s", origin="unix", errors='coerce').dt.date
     else:
         df_unique["date"] = pd.NaT

-    # Group by date to get
+    # Group by date to get the count for each day
     df_daily = df_unique.groupby("date").size().reset_index(name="count")
     df_daily["user"] = user_name

     return unique_count, df_daily

-def make_naive_forecast(total_by_date, end_date_str="2025-02-28"):
+def make_average_forecast(total_by_date, end_date_str="2025-02-28"):
     """
-    Builds
-    counting
-    On weekends (Sat/Sun) the increase = 0.
+    Builds the "average-based" forecast up to the given date (end_date_str),
+    using the mean daily increase (over all days, weekends not excluded).

-    Returns a DataFrame with columns ["ds", "yhat"],
+    Returns a DataFrame with columns ["ds", "yhat"],
+    covering (last_date + 1) through end_date_str (inclusive).
     """
     if total_by_date.empty:
         return pd.DataFrame(columns=["ds", "yhat"])

-    # Convert date -> datetime so we can work with the day of the week
     df_tmp = total_by_date.copy()
     df_tmp["date"] = pd.to_datetime(df_tmp["date"])

-    # Mean daily increase over
-
-    # Keep only weekdays (Monday=0 ... Friday=4)
-    df_weekdays = df_tmp[df_tmp["date"].dt.weekday < 5]
-    if len(df_weekdays) == 0:
-        avg_inc = 0
-    else:
-        avg_inc = df_weekdays["count"].mean()
+    # Mean daily increase (the "count" column) over all days
+    avg_inc = df_tmp["count"].mean() if len(df_tmp) else 0

     last_date = df_tmp["date"].max()
-    last_cumulative = df_tmp["cumulative"].iloc[-1]
+    last_cumulative = df_tmp["cumulative"].iloc[-1]

     end_date = pd.to_datetime(end_date_str)

-    #
+    # Walk the calendar day by day
     current_date = last_date
-
+    forecast_data = []
     running_total = last_cumulative

     while current_date < end_date:
-        current_date
+        current_date += pd.Timedelta(days=1)
         if current_date > end_date:
             break
-        #
-
-
-
-
-
-    # Convert to a DataFrame
-    df_naive = pd.DataFrame(naive_data)
-    return df_naive
+        # Add the average increase regardless of weekday or weekend
+        running_total += avg_inc
+        forecast_data.append({"ds": current_date, "yhat": running_total})
+
+    return pd.DataFrame(forecast_data)

 def process_data():
-    # Read
+    # Step 1: Read the CSV for each repository (Даша, Лера, Света)
     dasha_count, dasha_daily = read_and_process_data(URL_DASHA, "Даша")
     lera_count, lera_daily = read_and_process_data(URL_LERA, "Лера")
     sveta_count, sveta_daily = read_and_process_data(URL_SVETA, "Света")

-    #
+    # Total
     total_count = dasha_count + lera_count + sveta_count

-    #
+    # Compute the percentages
     dasha_percent = round((dasha_count / 234) * 100) if 234 else 0
     lera_percent = round((lera_count / 234) * 100) if 234 else 0
     sveta_percent = round((sveta_count / 234) * 100) if 234 else 0
     total_percent = round((total_count / 702) * 100) if 702 else 0

-    # Generate the HTML
+    # Generate the HTML for the progress bars
     def get_progress_bar(label, abs_val, pct):
-        if label in ["Даша", "Лера", "Света"]:
-            capacity = 234
-        else:
-            capacity = 702
+        capacity = 234 if label in ["Даша", "Лера", "Света"] else 702
         return f"""
         <div style='margin-bottom: 1em;'>
         <div><strong>{label}</strong></div>
@@ -125,36 +111,35 @@ def process_data():
         get_progress_bar("Всего", total_count, total_percent)
     )

-    #
+    # Step 2: Build the combined dataframe by date
     daily_all = pd.concat([dasha_daily, lera_daily, sveta_daily], ignore_index=True)
     daily_all = daily_all.dropna(subset=["date"])  # drop NaT

-    # Compute the cumulative
+    # Compute the cumulative value for each user
     daily_all = daily_all.sort_values(by=["user", "date"])
     daily_all["cumulative"] = daily_all.groupby("user")["count"].cumsum()

-    #
+    # The "Всего" (total) series
     total_by_date = daily_all.groupby("date")["count"].sum().reset_index(name="count")
     total_by_date = total_by_date.sort_values(by="date")
     total_by_date["cumulative"] = total_by_date["count"].cumsum()
     total_by_date["user"] = "Всего"

-    #
+    # Combine
     daily_all_final = pd.concat([daily_all, total_by_date], ignore_index=True)

-    #
+    # Sort the legend: whoever has the larger total goes on top
     last_values = daily_all_final.groupby("user")["cumulative"].last().sort_values(ascending=False)
     sorted_users = last_values.index.tolist()

-    # Set the colors explicitly
     color_map = {
-        "Даша": "#1f77b4",
-        "Лера": "#2ca02c",
-        "Света": "#d62728",
-        "Всего": "#9467bd"
+        "Даша": "#1f77b4",
+        "Лера": "#2ca02c",
+        "Света": "#d62728",
+        "Всего": "#9467bd"
     }

-    # Build the cumulative
+    # Build the cumulative chart
     fig = px.line(
         daily_all_final,
         x="date",
@@ -166,20 +151,16 @@ def process_data():
         color_discrete_map=color_map
     )

-    #
-    # Forecasts (Prophet + naive)
-    # -----------------------
-
+    # Step 3: Two forecasts
     forecast_fig = None

-    #
-    # (If there is little data, make sure it is not empty)
+    # If there is "Всего" data, build the forecast
     if not total_by_date.empty:
         df_prophet = total_by_date[["date", "cumulative"]].copy()
         df_prophet.columns = ["ds", "y"]
         df_prophet["ds"] = pd.to_datetime(df_prophet["ds"])

-        #
+        # Prophet forecast
         model = Prophet()
         model.fit(df_prophet)

@@ -187,28 +168,26 @@ def process_data():
         last_date = df_prophet["ds"].max()
         additional_days = (end_date - last_date).days

-        future = model.make_future_dataframe(periods=0)
+        future = model.make_future_dataframe(periods=0)  # if already past the end date
         if additional_days > 0:
             future = model.make_future_dataframe(periods=additional_days)

         forecast = model.predict(future)

-        #
+        # Merge forecast and history
         df_plot = pd.merge(
             forecast[["ds", "yhat"]],
             df_prophet[["ds", "y"]],
             on="ds",
             how="left"
         )
-        df_history = df_plot.dropna(subset=["y"])
-        df_future = df_plot[df_plot["y"].isna()]
+        df_history = df_plot.dropna(subset=["y"])
+        df_future = df_plot[df_plot["y"].isna()]

-        #
-
-        # The function returns df_naive: ds, yhat
-        df_naive = make_naive_forecast(total_by_date, "2025-02-28")
+        # Average-based forecast (weekends not excluded, i.e. every calendar day)
+        df_avg = make_average_forecast(total_by_date, "2025-02-28")

-        #
+        # Combined chart for comparison
        forecast_fig = px.line(
             df_history,
             x="ds",
@@ -216,6 +195,7 @@ def process_data():
             title="Прогноз до конца февраля 2025 (всего)",
             labels={"ds": "Дата", "y": "Накопленное число SMS"}
         )
+        # Dashed Prophet line
         forecast_fig.add_scatter(
             x=df_future["ds"],
             y=df_future["yhat"],
@@ -224,25 +204,26 @@ def process_data():
             line=dict(dash="dash", color="red")
         )

-        #
-        if not df_naive.empty:
+        # Dashed average line
+        if not df_avg.empty:
             forecast_fig.add_scatter(
-                x=df_naive["ds"],
-                y=df_naive["yhat"],
+                x=df_avg["ds"],
+                y=df_avg["yhat"],
                 mode="lines",
-                name="Прогноз по среднему",
+                name="Прогноз (по среднему)",
                 line=dict(dash="dash", color="green")
             )

         forecast_fig.update_layout(showlegend=True)

+    # Return everything to Gradio
     return (bars_html, fig, forecast_fig)

 with gr.Blocks() as demo:
     gr.Markdown("<h2>Количество сохраненных SMS (Даша, Лера, Света)</h2>")
     btn = gr.Button("Обновить данные и показать результат")
     html_output = gr.HTML(label="Прогресс-бары: количество SMS и %")
-    plot_output = gr.Plot(label="Накопительный график
+    plot_output = gr.Plot(label="Накопительный график (Даша, Лера, Света, Всего)")
     forecast_output = gr.Plot(label="Прогноз до конца февраля 2025 (всего)")

     btn.click(fn=process_data, outputs=[html_output, plot_output, forecast_output])
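For reference, the average-based extrapolation this commit introduces can be exercised on its own. The sketch below mirrors the logic of make_average_forecast on a tiny invented history; the helper name average_forecast and the toy dates and counts are made up for illustration and are not part of the commit.

import pandas as pd

def average_forecast(total_by_date, end_date_str):
    # Same idea as make_average_forecast in the commit: extend the cumulative
    # total by the mean daily increase, one calendar day at a time.
    if total_by_date.empty:
        return pd.DataFrame(columns=["ds", "yhat"])

    df_tmp = total_by_date.copy()
    df_tmp["date"] = pd.to_datetime(df_tmp["date"])

    avg_inc = df_tmp["count"].mean() if len(df_tmp) else 0
    current_date = df_tmp["date"].max()
    running_total = df_tmp["cumulative"].iloc[-1]
    end_date = pd.to_datetime(end_date_str)

    rows = []
    while current_date < end_date:
        current_date += pd.Timedelta(days=1)
        if current_date > end_date:
            break
        running_total += avg_inc
        rows.append({"ds": current_date, "yhat": running_total})
    return pd.DataFrame(rows)

# Toy history (values invented): 10, 20 and 15 new SMS on three consecutive days.
history = pd.DataFrame({"date": ["2025-01-01", "2025-01-02", "2025-01-03"],
                        "count": [10, 20, 15]})
history["cumulative"] = history["count"].cumsum()

print(average_forecast(history, "2025-01-06"))
# Mean increase is 15 per day, so yhat should read 60, 75, 90 for Jan 4-6.

In the app itself this logic runs on the real total_by_date built inside process_data and is drawn as the green dashed "Прогноз (по среднему)" trace next to the red dashed Prophet forecast.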