# FN_scrapping/scrap_data_from_FN.py
# Data handling libraries
import os
import pandas as pd
# Time utilities
import time
# HTTP and HTML parsing libraries
import requests
from bs4 import BeautifulSoup
# User agent sent with every request
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"}
# List of salon IDs to scrape
tempo_list = [
"H000139654",
"H000143665",
"H000160021",
"H000209382",
"H000223934",
"H000225867",
"H000229159",
"H000231759",
"H000233312",
"H000237335",
"H000237561",
"H000265843",
"H000273518",
"H000286411",
"H000307248",
"H000307249",
"H000307251",
"H000307252",
"H000307254",
"H000307256",
"H000307404",
"H000316742",
"H000319805",
"H000319837",
"H000348209",
"H000356610",
"H000361649",
"H000368241",
"H000377123",
"H000391152",
"H000396645",
"H000396756",
"H000402609",
"H000402612",
"H000406857",
"H000407525",
"H000410429",
"H000410434",
"H000416986",
"H000419242",
"H000434472",
"H000449155",
"H000449351",
"H000477350",
"H000491208",
"H000494046",
"H000500991",
"H000503062",
"H000511837",
"H000522696",
"H000553193",
"H000585265",
"H000585268",
"H000610008",
"H000628393",
"H000640388",
"H000640401",
"H000649747",
"H000655543",
"H000707971",
"H000715770",
]
# Build the list of review-page URLs to collect
urls = []
for tempo in tempo_list:
    for j in range(1, 20):  # review pages 1 through 19 for each salon
        urls.append(f"https://beauty.hotpepper.jp/kr/sln{tempo}/review/PN{j}.html")
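# For reference, the first generated URL looks like:
#   https://beauty.hotpepper.jp/kr/slnH000139654/review/PN1.html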
# Resume from partially collected data if a previous run left a CSV behind
csv_filename = "all_data.csv"
if os.path.exists(csv_filename):
    existing_df = pd.read_csv(csv_filename, encoding="utf-8-sig")
    scraped_urls = set(existing_df["source_url"])  # URLs already scraped
    print(f"Loaded existing data: {len(existing_df)} rows")
else:
    existing_df = pd.DataFrame()
    scraped_urls = set()
# Scraping function: parse one review page into a DataFrame
def scrap_data(url, headers):
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        print(f"Failed to fetch {url}, status code:", response.status_code)
        return pd.DataFrame()  # return an empty DataFrame on failure
    soup = BeautifulSoup(response.text, "html.parser")
    customers = []
    # Process each review block
    for review in soup.find_all("div", class_="fr"):
        customer = {"source_url": url}  # record which URL this row came from
        # Reviewer name and details
        name_tag = review.find_previous("p", class_="fl w580 pL25")
        if name_tag:
            name_span = name_tag.find("span", class_="b")
            customer["name"] = name_span.text.strip() if name_span else "N/A"
            details_span = name_tag.find("span", class_="mL5 fs10 fgGray")
            customer["details"] = details_span.text.strip() if details_span else "N/A"
        # Overall rating (span inside the total-rating <li>)
        total_rating = review.find_next("li", class_="bdN fgGray b")
        if total_rating:
            total_rating_span = total_rating.find("span", class_="mL5 mR10 fgPurple4")
            customer["total_rating"] = total_rating_span.text.strip() if total_rating_span else "N/A"
        # Review date
        date_tag = review.find("p", class_="fs10 fgGray")
        customer["review_date"] = date_tag.text.strip() if date_tag else "N/A"
        # Per-category ratings
        ratings = {}
        ratings_section = review.find_next("ul", class_="judgeList cFix")
        if ratings_section:
            for li in ratings_section.find_all("li"):
                label = li.find("span", class_="fgGray")
                value = li.find("span", class_="mL10 fgPurple4 b")
                if label and value:
                    ratings[label.text.strip()] = value.text.strip()
        customer["ratings"] = ratings
        # Review body text
        review_text = review.find_next("p", class_="mT10 wwbw")
        customer["review"] = review_text.text.strip() if review_text else "N/A"
        customers.append(customer)
    return pd.DataFrame(customers)
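# Optional sketch: scrape a single page first to confirm the expected columns
# (source_url, name, details, total_rating, review_date, ratings, review)
# before launching the full run. Uncomment to try:
# sample_df = scrap_data(f"https://beauty.hotpepper.jp/kr/sln{tempo_list[0]}/review/PN1.html", headers)
# print(sample_df.columns.tolist())
# print(sample_df.head())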
# Buffer of DataFrames collected since the last save
df_list = []
count = 0  # number of pages scraped in this run
for url in urls:
    if url in scraped_urls:
        print(f"Skipping already scraped: {url}")
        continue  # skip pages already present in the CSV
    print(f"Scraping: {url}")
    data_df = scrap_data(url, headers)
    time.sleep(5)  # sleep between requests to avoid overloading the server
    if not data_df.empty:
        df_list.append(data_df)
        scraped_urls.add(url)  # mark this URL as scraped
    count += 1
    # Save to CSV every 10 pages
    if count % 10 == 0 and df_list:
        temp_df = pd.concat(df_list, ignore_index=True)
        total_df = pd.concat([existing_df, temp_df], ignore_index=True)
        total_df.to_csv(csv_filename, index=False, encoding="utf-8-sig")
        print(f"Temporary save: {len(total_df)} rows written")
        df_list = []  # reset the buffer to save memory
        existing_df = total_df  # saved data becomes the new baseline
# Save any remaining data
if df_list:
    temp_df = pd.concat(df_list, ignore_index=True)
    total_df = pd.concat([existing_df, temp_df], ignore_index=True)
    total_df.to_csv(csv_filename, index=False, encoding="utf-8-sig")
    print(f"Final save: {len(total_df)} rows written")
print("Scraping completed.")