# Data manipulation libraries
import os
import pandas as pd
# Time-related library
import time
# Libraries for fetching and parsing pages
import requests
from bs4 import BeautifulSoup

# User-Agent header (mimics a desktop browser)
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"}
# List of salon IDs
tempo_list = [
    "H000139654",
    "H000143665",
    "H000160021",
    "H000209382",
    "H000223934",
    "H000225867",
    "H000229159",
    "H000231759",
    "H000233312",
    "H000237335",
    "H000237561",
    "H000265843",
    "H000273518",
    "H000286411",
    "H000307248",
    "H000307249",
    "H000307251",
    "H000307252",
    "H000307254",
    "H000307256",
    "H000307404",
    "H000316742",
    "H000319805",
    "H000319837",
    "H000348209",
    "H000356610",
    "H000361649",
    "H000368241",
    "H000377123",
    "H000391152",
    "H000396645",
    "H000396756",
    "H000402609",
    "H000402612",
    "H000406857",
    "H000407525",
    "H000410429",
    "H000410434",
    "H000416986",
    "H000419242",
    "H000434472",
    "H000449155",
    "H000449351",
    "H000477350",
    "H000491208",
    "H000494046",
    "H000500991",
    "H000503062",
    "H000511837",
    "H000522696",
    "H000553193",
    "H000585265",
    "H000585268",
    "H000610008",
    "H000628393",
    "H000640388",
    "H000640401",
    "H000649747",
    "H000655543",
    "H000707971",
    "H000715770",
]
# Build the list of review-page URLs to collect
urls = []
for tempo in tempo_list:
    for j in range(1, 20):  # review pages 1 through 19 per salon
        urls.append(f"https://beauty.hotpepper.jp/kr/sln{tempo}/review/PN{j}.html")
# Resume from partially collected data, if any
csv_filename = "all_data.csv"
if os.path.exists(csv_filename):
    existing_df = pd.read_csv(csv_filename, encoding="utf-8-sig")
    scraped_urls = set(existing_df["source_url"])  # URLs already collected
    print(f"Loaded existing data: {len(existing_df)} rows")
else:
    existing_df = pd.DataFrame()
    scraped_urls = set()
# Scrape one review page into a DataFrame
def scrap_data(url, headers):
    try:
        response = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException as e:
        print(f"Request failed for {url}: {e}")
        return pd.DataFrame()  # return an empty DataFrame
    if response.status_code != 200:
        print(f"Failed to fetch {url}, status code:", response.status_code)
        return pd.DataFrame()  # return an empty DataFrame
    soup = BeautifulSoup(response.text, "html.parser")
    customers = []
    # Process each review block
    for review in soup.find_all("div", class_="fr"):
        customer = {"source_url": url}  # record which URL this row came from
        # Reviewer name and details
        name_tag = review.find_previous("p", class_="fl w580 pL25")
        if name_tag:
            customer["name"] = name_tag.find("span", class_="b").text.strip() if name_tag.find("span", class_="b") else "N/A"
            customer["details"] = name_tag.find("span", class_="mL5 fs10 fgGray").text.strip() if name_tag.find("span", class_="mL5 fs10 fgGray") else "N/A"
        # Overall rating
        total_rating = review.find_next("li", class_="bdN fgGray b")
        if total_rating:
            total_rating_span = total_rating.find("span", class_="mL5 mR10 fgPurple4")
            customer["total_rating"] = total_rating_span.text.strip() if total_rating_span else "N/A"
        # Review date
        date_tag = review.find("p", class_="fs10 fgGray")
        customer["review_date"] = date_tag.text.strip() if date_tag else "N/A"
        # Per-category ratings
        ratings = {}
        ratings_section = review.find_next("ul", class_="judgeList cFix")
        if ratings_section:
            for li in ratings_section.find_all("li"):
                label = li.find("span", class_="fgGray")
                value = li.find("span", class_="mL10 fgPurple4 b")
                if label and value:
                    ratings[label.text.strip()] = value.text.strip()
        customer["ratings"] = ratings
        # Review body text
        review_text = review.find_next("p", class_="mT10 wwbw")
        customer["review"] = review_text.text.strip() if review_text else "N/A"
        customers.append(customer)
    return pd.DataFrame(customers)
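
# Optional: a minimal retry sketch with exponential backoff. This helper is an
# assumption, not part of the original flow; fetch_with_retry is a hypothetical
# name. To use it, swap the requests.get call inside scrap_data for
# fetch_with_retry and treat a None result as a failed fetch.
def fetch_with_retry(url, headers, retries=3, backoff=5):
    for attempt in range(retries):
        try:
            response = requests.get(url, headers=headers, timeout=30)
            if response.status_code == 200:
                return response
        except requests.RequestException:
            pass  # network error: fall through to the backoff sleep
        time.sleep(backoff * (attempt + 1))  # wait longer after each failure
    return None  # all attempts failed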
# DataFrames collected since the last checkpoint save
df_list = []
count = 0  # counts how many pages have been scraped this session
for url in urls:
    if url in scraped_urls:
        print(f"Skipping already scraped: {url}")
        continue  # skip pages that were already collected
    print(f"Scraping: {url}")
    data_df = scrap_data(url, headers)
    time.sleep(5)  # sleep to avoid overloading the server
    if not data_df.empty:
        df_list.append(data_df)
        scraped_urls.add(url)  # mark this URL as collected
        count += 1
        # Save a checkpoint every 10 pages
        if count % 10 == 0 and df_list:
            temp_df = pd.concat(df_list, ignore_index=True)
            total_df = pd.concat([existing_df, temp_df], ignore_index=True)
            total_df.to_csv(csv_filename, index=False, encoding="utf-8-sig")
            print(f"Temporary save: {len(total_df)} rows written")
            df_list = []  # reset the buffer to save memory
            existing_df = total_df  # update the accumulated data
# Save any remaining data
if df_list:
    temp_df = pd.concat(df_list, ignore_index=True)
    total_df = pd.concat([existing_df, temp_df], ignore_index=True)
    total_df.to_csv(csv_filename, index=False, encoding="utf-8-sig")
    print(f"Final save: {len(total_df)} rows written")
print("Scraping completed.")