BarngSJ committed
Commit 34d098b · 1 Parent(s): e0dd26e
ew
Browse files:
- app.py +34 -0
- make_csv.py +50 -0
- scrap_data_from_FN.py +184 -0
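The commit itself does not show a dependency file. Judging only from the imports in the three added files, a Space running them would need something like the listing below (package names inferred from the imports, versions left unpinned; not part of this commit):

# requirements.txt (inferred, hypothetical)
streamlit
schedule
pandas
requests
beautifulsoup4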
app.py
ADDED
@@ -0,0 +1,34 @@
import streamlit as st
import schedule
import time
import subprocess
import threading

# Function to execute your scripts
def job():
    st.write("Running scheduled scripts...")
    subprocess.run(["python", "scrap_data_from_FN.py"])
    time.sleep(20)
    subprocess.run(["python", "run_at_morning.py"])
    st.write("Scripts executed successfully!")

# Schedule the job at 04:00 AM daily
schedule.every().day.at("04:00").do(job)

# Background scheduler function
def run_scheduler():
    while True:
        schedule.run_pending()
        time.sleep(60)

# Start the scheduler in a separate thread
threading.Thread(target=run_scheduler, daemon=True).start()

# Streamlit UI
st.title("Automated Script Runner on Hugging Face Spaces")
st.write("This app automatically runs scripts at 04:00 AM.")

if st.button("Run Now"):
    job()
    st.success("Scripts executed manually!")
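One caveat with this pattern: Streamlit re-executes app.py on every interaction and for every new session, so the module-level threading.Thread(...).start() can end up spawning several scheduler threads. A minimal guard, assuming a Streamlit version that provides st.cache_resource (the helper name start_scheduler is illustrative, not part of the committed code), might look like this sketch:

import threading
import time

import schedule
import streamlit as st

@st.cache_resource  # cached once per process, so the thread is only created on the first run
def start_scheduler():
    def run_scheduler():
        while True:
            schedule.run_pending()
            time.sleep(60)
    thread = threading.Thread(target=run_scheduler, daemon=True)
    thread.start()
    return thread

start_scheduler()

Calls to st.write from inside the background thread may also be dropped, since they run outside a Streamlit script context; printing or logging there is a safer default.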
make_csv.py
ADDED
@@ -0,0 +1,50 @@
import ast
import pandas as pd

import datetime

today_format = datetime.datetime.today().strftime("%Y%m%d")

# Load the CSV into a DataFrame
df = pd.read_csv("all_data.csv")
df_name = pd.read_csv("tempo_list.csv")

# Split the 'ratings' column into separate columns
ratings_df = df["ratings"].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else {})
ratings_split = pd.json_normalize(ratings_df)

# Merge the new ratings columns with the original DataFrame
df = df.drop("ratings", axis=1).join(ratings_split)
df = df.dropna(subset=["name"])

# Extract tempo_code from the source_url
df["tempo_code"] = df["source_url"].apply(lambda x: x.split("/")[4] if isinstance(x, str) else "")
df["tempo_code"] = df["tempo_code"].apply(lambda x: x[3:])  # drop the leading "sln" prefix

# Merge with df_name based on tempo_code
df = pd.merge(df, df_name, left_on="tempo_code", right_on="サロンID", how="left")
df["review_date"] = df["review_date"].str.replace("[投稿日] ", "", regex=False)  # literal match, not a regex character class
df["review_date"] = pd.to_datetime(df["review_date"])

# If you need the date and time separately:
df["date"] = df["review_date"].dt.date
df["time"] = df["review_date"].dt.time

df = df.dropna(subset=["review_date"])

df = df[["サロンID", "サロン名", "name", "details", "date", "time", "total_rating", "雰囲気", "接客サービス", "技術・仕上がり", "メニュー・料金", "review"]]
df = df.rename(
    columns={
        "name": "客様",
        "details": "詳細情報",
        "date": "日程",
        "time": "時刻",
        "total_rating": "総合評価",
        "review": "コメント",
    }
)


df = df.drop_duplicates()
# Save the new DataFrame to a new CSV file
df.to_csv(f"{today_format}.csv", index=False)
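For reference, the ratings column written by the scraper is a stringified dict, so the ast.literal_eval plus pd.json_normalize step expands it into one column per rating label. A small self-contained illustration of that step, using made-up rows rather than real data:

import ast
import pandas as pd

# Hypothetical rows in the same shape as all_data.csv
sample = pd.DataFrame({
    "name": ["A", "B"],
    "ratings": ["{'雰囲気': '5', '接客サービス': '4'}", "{'雰囲気': '3'}"],
})

parsed = sample["ratings"].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else {})
expanded = pd.json_normalize(parsed)
print(sample.drop("ratings", axis=1).join(expanded))
# The join yields one column per rating label; labels missing from a row become NaN.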
scrap_data_from_FN.py
ADDED
@@ -0,0 +1,184 @@
# Data manipulation libraries
import os
import pandas as pd

# Time-related library
import time

# Libraries for fetching data from URLs
import requests
from bs4 import BeautifulSoup

# User agent
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"}

# List of salon IDs
tempo_list = [
    "H000139654",
    "H000143665",
    "H000160021",
    "H000209382",
    "H000223934",
    "H000225867",
    "H000229159",
    "H000231759",
    "H000233312",
    "H000237335",
    "H000237561",
    "H000265843",
    "H000273518",
    "H000286411",
    "H000307248",
    "H000307249",
    "H000307251",
    "H000307252",
    "H000307254",
    "H000307256",
    "H000307404",
    "H000316742",
    "H000319805",
    "H000319837",
    "H000348209",
    "H000356610",
    "H000361649",
    "H000368241",
    "H000377123",
    "H000391152",
    "H000396645",
    "H000396756",
    "H000402609",
    "H000402612",
    "H000406857",
    "H000407525",
    "H000410429",
    "H000410434",
    "H000416986",
    "H000419242",
    "H000434472",
    "H000449155",
    "H000449351",
    "H000477350",
    "H000491208",
    "H000494046",
    "H000500991",
    "H000503062",
    "H000511837",
    "H000522696",
    "H000553193",
    "H000585265",
    "H000585268",
    "H000610008",
    "H000628393",
    "H000640388",
    "H000640401",
    "H000649747",
    "H000655543",
    "H000707971",
    "H000715770",
]

# Build the list of URLs to scrape
urls = []
for tempo in tempo_list:
    for j in range(1, 20):  # pages 1 through 19
        urls.append(f"https://beauty.hotpepper.jp/kr/sln{tempo}/review/PN{j}.html")

# Load partially collected data if it exists
csv_filename = "all_data.csv"
if os.path.exists(csv_filename):
    existing_df = pd.read_csv(csv_filename, encoding="utf-8-sig")
    scraped_urls = set(existing_df["source_url"])  # URLs that have already been scraped
    print(f"Loaded existing data: {len(existing_df)} rows")
else:
    existing_df = pd.DataFrame()
    scraped_urls = set()


# Scraping function
def scrap_data(url, headers):
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        print(f"Failed to fetch {url}, status code:", response.status_code)
        return pd.DataFrame()  # return an empty DataFrame

    soup = BeautifulSoup(response.text, "html.parser")
    customers = []

    # Process each review
    for review in soup.find_all("div", class_="fr"):
        customer = {"source_url": url}  # record which URL the review came from

        # Name and details
        name_tag = review.find_previous("p", class_="fl w580 pL25")
        if name_tag:
            customer["name"] = name_tag.find("span", class_="b").text.strip() if name_tag.find("span", class_="b") else "N/A"
            # print(customer["name"])
            customer["details"] = name_tag.find("span", class_="mL5 fs10 fgGray").text.strip() if name_tag.find("span", class_="mL5 fs10 fgGray") else "N/A"
            # print(customer["details"])

        total_rating = review.find_next("li", class_="bdN fgGray b")
        if total_rating:
            # Correct the reference from name_tag to total_rating
            total_rating_span = total_rating.find("span", class_="mL5 mR10 fgPurple4")
            customer["total_rating"] = total_rating_span.text.strip() if total_rating_span else "N/A"
            # print(customer["total_rating"])

        # Review date
        date_tag = review.find("p", class_="fs10 fgGray")
        customer["review_date"] = date_tag.text.strip() if date_tag else "N/A"

        # Ratings
        ratings = {}
        ratings_section = review.find_next("ul", class_="judgeList cFix")
        if ratings_section:
            for li in ratings_section.find_all("li"):
                label = li.find("span", class_="fgGray")
                value = li.find("span", class_="mL10 fgPurple4 b")
                if label and value:
                    ratings[label.text.strip()] = value.text.strip()
        customer["ratings"] = ratings

        # Review text
        review_text = review.find_next("p", class_="mT10 wwbw")
        customer["review"] = review_text.text.strip() if review_text else "N/A"

        customers.append(customer)

    return pd.DataFrame(customers)


# List of collected DataFrames
df_list = []
count = 0  # count how many pages have been scraped this run

for url in urls:
    if url in scraped_urls:
        print(f"Skipping already scraped: {url}")
        continue  # skip URLs that have already been fetched

    print(f"Scraping: {url}")
    data_df = scrap_data(url, headers)
    time.sleep(5)  # sleep so as not to overload the server

    if not data_df.empty:
        df_list.append(data_df)
        scraped_urls.add(url)  # add to the set of fetched URLs
        count += 1

    # Save every 10 pages
    if count % 10 == 0 and df_list:
        temp_df = pd.concat(df_list, ignore_index=True)
        total_df = pd.concat([existing_df, temp_df], ignore_index=True)
        total_df.to_csv(csv_filename, index=False, encoding="utf-8-sig")
        print(f"Temporary save: {len(total_df)} rows written")
        df_list = []  # reset the list to save memory
        existing_df = total_df  # update the existing data

# Save the remaining data
if df_list:
    temp_df = pd.concat(df_list, ignore_index=True)
    total_df = pd.concat([existing_df, temp_df], ignore_index=True)
    total_df.to_csv(csv_filename, index=False, encoding="utf-8-sig")
    print(f"Final save: {len(total_df)} rows written")

print("Scraping completed.")
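Since the script fetches up to 19 pages for 61 salons in one run, transient HTTP errors are likely at some point. One possible hardening, not part of the committed script, is to reuse a requests.Session with automatic retries; the helper name make_session below is illustrative:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def make_session(total_retries=3, backoff=2.0):
    # Session that retries transient HTTP errors with exponential backoff
    session = requests.Session()
    retry = Retry(
        total=total_retries,
        backoff_factor=backoff,
        status_forcelist=[429, 500, 502, 503, 504],
    )
    session.mount("https://", HTTPAdapter(max_retries=retry))
    return session

# Inside scrap_data(), requests.get(url, headers=headers) could then become
# session.get(url, headers=headers, timeout=30) with session = make_session().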