BarngSJ committed on
Commit 34d098b · 1 Parent(s): e0dd26e
Files changed (3)
  1. app.py +34 -0
  2. make_csv.py +50 -0
  3. scrap_data_from_FN.py +184 -0
app.py ADDED
@@ -0,0 +1,34 @@
+ import streamlit as st
+ import schedule
+ import time
+ import subprocess
+ import threading
+
+ # Function to execute your scripts
+ def job():
+     st.write("Running scheduled scripts...")
+     subprocess.run(["python", "scrap_data_from_FN.py"])
+     time.sleep(20)
+     subprocess.run(["python", "run_at_morning.py"])
+     st.write("Scripts executed successfully!")
+
+ # Schedule the job at 04:00 AM daily
+ schedule.every().day.at("04:00").do(job)
+
+ # Background scheduler function
+ def run_scheduler():
+     while True:
+         schedule.run_pending()
+         time.sleep(60)
+
+ # Start the scheduler in a separate thread
+ threading.Thread(target=run_scheduler, daemon=True).start()
+
+ # Streamlit UI
+ st.title("Automated Script Runner on Hugging Face Spaces")
+ st.write("This app automatically runs scripts at 04:00 AM.")
+
+ if st.button("Run Now"):
+     job()
+     st.success("Scripts executed manually!")
+
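For reference, the Space also needs the third-party packages imported by the three scripts in this commit (streamlit, schedule, pandas, requests, beautifulsoup4). A minimal requirements.txt sketch; this file is not part of the commit and the unpinned package names are an assumption:

streamlit
schedule
pandas
requests
beautifulsoup4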
make_csv.py ADDED
@@ -0,0 +1,50 @@
+ import ast
+ import pandas as pd
+
+ import datetime
+
+ today_format = datetime.datetime.today().strftime("%Y%m%d")
+
+ # Load the CSV into a DataFrame
+ df = pd.read_csv("all_data.csv")
+ df_name = pd.read_csv("tempo_list.csv")
+
+ # Split the 'ratings' column into separate columns
+ ratings_df = df["ratings"].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else {})
+ ratings_split = pd.json_normalize(ratings_df)
+
+ # Merge the new ratings columns with the original DataFrame
+ df = df.drop("ratings", axis=1).join(ratings_split)
+ df = df.dropna(subset=["name"])
+
+ # Extract tempo_code from the source_url
+ df["tempo_code"] = df["source_url"].apply(lambda x: x.split("/")[4] if isinstance(x, str) else "")
+ df["tempo_code"] = df["tempo_code"].apply(lambda x: x[3:])
+
+ # Merge with df_name based on tempo_code
+ df = pd.merge(df, df_name, left_on="tempo_code", right_on="サロンID", how="left")
+ df["review_date"] = df["review_date"].str.replace("[投稿日] ", "", regex=False)  # strip the literal "[投稿日] " prefix
+ df["review_date"] = pd.to_datetime(df["review_date"])
+
+ # If you need the date and time separately:
+ df["date"] = df["review_date"].dt.date
+ df["time"] = df["review_date"].dt.time
+
+ df = df.dropna(subset=["review_date"])
+
+ df = df[["サロンID", "サロン名", "name", "details", "date", "time", "total_rating", "雰囲気", "接客サービス", "技術・仕上がり", "メニュー・料金", "review"]]
+ df = df.rename(
+     columns={
+         "name": "客様",
+         "details": "詳細情報",
+         "date": "日程",
+         "time": "時刻",
+         "total_rating": "総合評価",
+         "review": "コメント",
+     }
+ )
+
+
+ df = df.drop_duplicates()
+ # Save the new DataFrame to a new CSV file
+ df.to_csv(f"{today_format}.csv", index=False)
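For reference, a small sketch (not part of the commit) of how the stringified ratings dicts written by the scraper are expanded into columns by the ast.literal_eval plus pd.json_normalize step above; the sample rating values are hypothetical:

import ast
import pandas as pd

# Hypothetical row as it would appear in the "ratings" column of all_data.csv
sample = pd.DataFrame({"ratings": ["{'雰囲気': '5', '接客サービス': '4'}"]})
parsed = sample["ratings"].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else {})
print(pd.json_normalize(parsed))  # one column per rating key: 雰囲気, 接客サービス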
scrap_data_from_FN.py ADDED
@@ -0,0 +1,184 @@
+ # Data manipulation libraries
+ import os
+ import pandas as pd
+
+ # Time-related libraries
+ import time
+
+ # Libraries for fetching data from URLs
+ import requests
+ from bs4 import BeautifulSoup
+
+ # User agent
+ headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"}
+
+ # List of salon IDs
+ tempo_list = [
+     "H000139654",
+     "H000143665",
+     "H000160021",
+     "H000209382",
+     "H000223934",
+     "H000225867",
+     "H000229159",
+     "H000231759",
+     "H000233312",
+     "H000237335",
+     "H000237561",
+     "H000265843",
+     "H000273518",
+     "H000286411",
+     "H000307248",
+     "H000307249",
+     "H000307251",
+     "H000307252",
+     "H000307254",
+     "H000307256",
+     "H000307404",
+     "H000316742",
+     "H000319805",
+     "H000319837",
+     "H000348209",
+     "H000356610",
+     "H000361649",
+     "H000368241",
+     "H000377123",
+     "H000391152",
+     "H000396645",
+     "H000396756",
+     "H000402609",
+     "H000402612",
+     "H000406857",
+     "H000407525",
+     "H000410429",
+     "H000410434",
+     "H000416986",
+     "H000419242",
+     "H000434472",
+     "H000449155",
+     "H000449351",
+     "H000477350",
+     "H000491208",
+     "H000494046",
+     "H000500991",
+     "H000503062",
+     "H000511837",
+     "H000522696",
+     "H000553193",
+     "H000585265",
+     "H000585268",
+     "H000610008",
+     "H000628393",
+     "H000640388",
+     "H000640401",
+     "H000649747",
+     "H000655543",
+     "H000707971",
+     "H000715770",
+ ]
+
+ # Build the list of URLs to scrape
+ urls = []
+ for tempo in tempo_list:
+     for j in range(1, 20):  # review pages 1 through 19
+         urls.append(f"https://beauty.hotpepper.jp/kr/sln{tempo}/review/PN{j}.html")
+
+ # Load partially collected data if it exists
+ csv_filename = "all_data.csv"
+ if os.path.exists(csv_filename):
+     existing_df = pd.read_csv(csv_filename, encoding="utf-8-sig")
+     scraped_urls = set(existing_df["source_url"])  # URLs that have already been scraped
+     print(f"Loaded existing data: {len(existing_df)} rows")
+ else:
+     existing_df = pd.DataFrame()
+     scraped_urls = set()
+
+
+ # Scraping function
+ def scrap_data(url, headers):
+     response = requests.get(url, headers=headers)
+     if response.status_code != 200:
+         print(f"Failed to fetch {url}, status code:", response.status_code)
+         return pd.DataFrame()  # Return an empty DataFrame
+
+     soup = BeautifulSoup(response.text, "html.parser")
+     customers = []
+
+     # Process each review
+     for review in soup.find_all("div", class_="fr"):
+         customer = {"source_url": url}  # Record which URL the review came from
+
+         # Name and details
+         name_tag = review.find_previous("p", class_="fl w580 pL25")
+         if name_tag:
+             customer["name"] = name_tag.find("span", class_="b").text.strip() if name_tag.find("span", class_="b") else "N/A"
+             # print(customer["name"])
+             customer["details"] = name_tag.find("span", class_="mL5 fs10 fgGray").text.strip() if name_tag.find("span", class_="mL5 fs10 fgGray") else "N/A"
+             # print(customer["details"])
+
+         total_rating = review.find_next("li", class_="bdN fgGray b")
+         if total_rating:
+             # Overall rating value
+             total_rating_span = total_rating.find("span", class_="mL5 mR10 fgPurple4")
+             customer["total_rating"] = total_rating_span.text.strip() if total_rating_span else "N/A"
+             # print(customer["total_rating"])
+
+         # Review date
+         date_tag = review.find("p", class_="fs10 fgGray")
+         customer["review_date"] = date_tag.text.strip() if date_tag else "N/A"
+
+         # Ratings
+         ratings = {}
+         ratings_section = review.find_next("ul", class_="judgeList cFix")
+         if ratings_section:
+             for li in ratings_section.find_all("li"):
+                 label = li.find("span", class_="fgGray")
+                 value = li.find("span", class_="mL10 fgPurple4 b")
+                 if label and value:
+                     ratings[label.text.strip()] = value.text.strip()
+         customer["ratings"] = ratings
+
+         # Review text
+         review_text = review.find_next("p", class_="mT10 wwbw")
+         customer["review"] = review_text.text.strip() if review_text else "N/A"
+
+         customers.append(customer)
+
+     return pd.DataFrame(customers)
+
+
+ # List of collected DataFrames
+ df_list = []
+ count = 0  # Count how many new pages have been scraped in this run
+
+ for url in urls:
+     if url in scraped_urls:
+         print(f"Skipping already scraped: {url}")
+         continue  # Skip pages that are already in the CSV
+
+     print(f"Scraping: {url}")
+     data_df = scrap_data(url, headers)
+     time.sleep(5)  # Sleep to avoid overloading the server
+
+     if not data_df.empty:
+         df_list.append(data_df)
+         scraped_urls.add(url)  # Mark this URL as scraped
+         count += 1
+
+     # Save every 10 scraped pages
+     if count % 10 == 0 and df_list:
+         temp_df = pd.concat(df_list, ignore_index=True)
+         total_df = pd.concat([existing_df, temp_df], ignore_index=True)
+         total_df.to_csv(csv_filename, index=False, encoding="utf-8-sig")
+         print(f"Temporary save: {len(total_df)} rows written")
+         df_list = []  # Reset the list to save memory
+         existing_df = total_df  # Update the existing data
+
+ # Save any remaining data
+ if df_list:
+     temp_df = pd.concat(df_list, ignore_index=True)
+     total_df = pd.concat([existing_df, temp_df], ignore_index=True)
+     total_df.to_csv(csv_filename, index=False, encoding="utf-8-sig")
+     print(f"Final save: {len(total_df)} rows written")
+
+ print("Scraping completed.")