# データ操作ライブラリ import os import pandas as pd # 時間関連ライブラリ import time # URLデータ収集ライブラリ import requests from bs4 import BeautifulSoup # ユーザーエージェント headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"} # サロンのIDリスト tempo_list = [ "H000139654", "H000143665", "H000160021", "H000209382", "H000223934", "H000225867", "H000229159", "H000231759", "H000233312", "H000237335", "H000237561", "H000265843", "H000273518", "H000286411", "H000307248", "H000307249", "H000307251", "H000307252", "H000307254", "H000307256", "H000307404", "H000316742", "H000319805", "H000319837", "H000348209", "H000356610", "H000361649", "H000368241", "H000377123", "H000391152", "H000396645", "H000396756", "H000402609", "H000402612", "H000406857", "H000407525", "H000410429", "H000410434", "H000416986", "H000419242", "H000434472", "H000449155", "H000449351", "H000477350", "H000491208", "H000494046", "H000500991", "H000503062", "H000511837", "H000522696", "H000553193", "H000585265", "H000585268", "H000610008", "H000628393", "H000640388", "H000640401", "H000649747", "H000655543", "H000707971", "H000715770", ] # 収集するURLリスト作成 urls = [] for tempo in tempo_list: for j in range(1, 20): # ページ1から14まで urls.append(f"https://beauty.hotpepper.jp/kr/sln{tempo}/review/PN{j}.html") # 途中までのデータがあれば読み込む csv_filename = "all_data.csv" if os.path.exists(csv_filename): existing_df = pd.read_csv(csv_filename, encoding="utf-8-sig") scraped_urls = set(existing_df["source_url"]) # すでに取得済みのURL print(f"Loaded existing data: {len(existing_df)} rows") else: existing_df = pd.DataFrame() scraped_urls = set() # データスクレイピング関数 def scrap_data(url, headers): response = requests.get(url, headers=headers) if response.status_code != 200: print(f"Failed to fetch {url}, status code:", response.status_code) return pd.DataFrame() # 空のDataFrameを返す soup = BeautifulSoup(response.text, "html.parser") customers = [] # 各レビューを処理 for review in soup.find_all("div", class_="fr"): customer = {"source_url": url} # どのURLから取得したか記録 # 名前と詳細 name_tag = review.find_previous("p", class_="fl w580 pL25") if name_tag: customer["name"] = name_tag.find("span", class_="b").text.strip() if name_tag.find("span", class_="b") else "N/A" # print(customer["name"]) customer["details"] = name_tag.find("span", class_="mL5 fs10 fgGray").text.strip() if name_tag.find("span", class_="mL5 fs10 fgGray") else "N/A" # print(customer["details"]) total_rating = review.find_next("li", class_="bdN fgGray b") if total_rating: # Correct the reference from name_tag to total_rating total_rating_span = total_rating.find("span", class_="mL5 mR10 fgPurple4") customer["total_rating"] = total_rating_span.text.strip() if total_rating_span else "N/A" # print(customer["total_rating"]) # レビュー日 date_tag = review.find("p", class_="fs10 fgGray") customer["review_date"] = date_tag.text.strip() if date_tag else "N/A" # 評価 ratings = {} ratings_section = review.find_next("ul", class_="judgeList cFix") if ratings_section: for li in ratings_section.find_all("li"): label = li.find("span", class_="fgGray") value = li.find("span", class_="mL10 fgPurple4 b") if label and value: ratings[label.text.strip()] = value.text.strip() customer["ratings"] = ratings # レビュー内容 review_text = review.find_next("p", class_="mT10 wwbw") customer["review"] = review_text.text.strip() if review_text else "N/A" customers.append(customer) return pd.DataFrame(customers) # 収集データのリスト df_list = [] count = 0 # 何回目のスクレイピングかをカウント for url in urls: if url in scraped_urls: print(f"Skipping already scraped: {url}") continue # 既に取得済みならスキップ print(f"Scraping: {url}") data_df = scrap_data(url, headers) time.sleep(5) # 負荷をかけないようにスリープ if not data_df.empty: df_list.append(data_df) scraped_urls.add(url) # 取得済みリストに追加 count += 1 # 10回ごとに保存 if count % 10 == 0 and df_list: temp_df = pd.concat(df_list, ignore_index=True) total_df = pd.concat([existing_df, temp_df], ignore_index=True) total_df.to_csv(csv_filename, index=False, encoding="utf-8-sig") print(f"Temporary save: {len(total_df)} rows written") df_list = [] # メモリ節約のためリストをリセット existing_df = total_df # 既存データを更新 # 残りのデータを保存 if df_list: temp_df = pd.concat(df_list, ignore_index=True) total_df = pd.concat([existing_df, temp_df], ignore_index=True) total_df.to_csv(csv_filename, index=False, encoding="utf-8-sig") print(f"Final save: {len(total_df)} rows written") print("Scraping completed.")