# Data manipulation libraries
import os
import pandas as pd
# Time-related library
import time
# Libraries for fetching and parsing pages
import requests
from bs4 import BeautifulSoup

# User-Agent header (mimics a desktop browser)
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"}
# List of salon IDs
tempo_list = [
    "H000139654",
    "H000143665",
    "H000160021",
    "H000209382",
    "H000223934",
    "H000225867",
    "H000229159",
    "H000231759",
    "H000233312",
    "H000237335",
    "H000237561",
    "H000265843",
    "H000273518",
    "H000286411",
    "H000307248",
    "H000307249",
    "H000307251",
    "H000307252",
    "H000307254",
    "H000307256",
    "H000307404",
    "H000316742",
    "H000319805",
    "H000319837",
    "H000348209",
    "H000356610",
    "H000361649",
    "H000368241",
    "H000377123",
    "H000391152",
    "H000396645",
    "H000396756",
    "H000402609",
    "H000402612",
    "H000406857",
    "H000407525",
    "H000410429",
    "H000410434",
    "H000416986",
    "H000419242",
    "H000434472",
    "H000449155",
    "H000449351",
    "H000477350",
    "H000491208",
    "H000494046",
    "H000500991",
    "H000503062",
    "H000511837",
    "H000522696",
    "H000553193",
    "H000585265",
    "H000585268",
    "H000610008",
    "H000628393",
    "H000640388",
    "H000640401",
    "H000649747",
    "H000655543",
    "H000707971",
    "H000715770",
]
# Build the list of review-page URLs to collect
urls = []
for tempo in tempo_list:
    for j in range(1, 20):  # review pages 1 through 19 per salon
        urls.append(f"https://beauty.hotpepper.jp/kr/sln{tempo}/review/PN{j}.html")
# Resume from partially collected data, if any
csv_filename = "all_data.csv"
if os.path.exists(csv_filename):
    existing_df = pd.read_csv(csv_filename, encoding="utf-8-sig")
    scraped_urls = set(existing_df["source_url"])  # URLs already collected
    print(f"Loaded existing data: {len(existing_df)} rows")
else:
    existing_df = pd.DataFrame()
    scraped_urls = set()
# Scrape one review page into a DataFrame
def scrap_data(url, headers):
    try:
        response = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException as e:
        print(f"Request failed for {url}: {e}")
        return pd.DataFrame()  # return an empty DataFrame
    if response.status_code != 200:
        print(f"Failed to fetch {url}, status code:", response.status_code)
        return pd.DataFrame()  # return an empty DataFrame
    soup = BeautifulSoup(response.text, "html.parser")
    customers = []
    # Process each review block
    for review in soup.find_all("div", class_="fr"):
        customer = {"source_url": url}  # record which URL this row came from
        # Reviewer name and details
        name_tag = review.find_previous("p", class_="fl w580 pL25")
        if name_tag:
            customer["name"] = name_tag.find("span", class_="b").text.strip() if name_tag.find("span", class_="b") else "N/A"
            customer["details"] = name_tag.find("span", class_="mL5 fs10 fgGray").text.strip() if name_tag.find("span", class_="mL5 fs10 fgGray") else "N/A"
        # Overall rating
        total_rating = review.find_next("li", class_="bdN fgGray b")
        if total_rating:
            total_rating_span = total_rating.find("span", class_="mL5 mR10 fgPurple4")
            customer["total_rating"] = total_rating_span.text.strip() if total_rating_span else "N/A"
        # Review date
        date_tag = review.find("p", class_="fs10 fgGray")
        customer["review_date"] = date_tag.text.strip() if date_tag else "N/A"
        # Per-category ratings
        ratings = {}
        ratings_section = review.find_next("ul", class_="judgeList cFix")
        if ratings_section:
            for li in ratings_section.find_all("li"):
                label = li.find("span", class_="fgGray")
                value = li.find("span", class_="mL10 fgPurple4 b")
                if label and value:
                    ratings[label.text.strip()] = value.text.strip()
        customer["ratings"] = ratings
        # Review body text
        review_text = review.find_next("p", class_="mT10 wwbw")
        customer["review"] = review_text.text.strip() if review_text else "N/A"
        customers.append(customer)
    return pd.DataFrame(customers)
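
# Optional: a minimal retry sketch with exponential backoff. This helper is an
# assumption, not part of the original flow; fetch_with_retry is a hypothetical
# name. To use it, swap the requests.get call inside scrap_data for
# fetch_with_retry and treat a None result as a failed fetch.
def fetch_with_retry(url, headers, retries=3, backoff=5):
    for attempt in range(retries):
        try:
            response = requests.get(url, headers=headers, timeout=30)
            if response.status_code == 200:
                return response
        except requests.RequestException:
            pass  # network error: fall through to the backoff sleep
        time.sleep(backoff * (attempt + 1))  # wait longer after each failure
    return None  # all attempts failed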
# DataFrames collected since the last checkpoint save
df_list = []
count = 0  # counts how many pages have been scraped this session
for url in urls:
    if url in scraped_urls:
        print(f"Skipping already scraped: {url}")
        continue  # skip pages that were already collected
    print(f"Scraping: {url}")
    data_df = scrap_data(url, headers)
    time.sleep(5)  # sleep to avoid overloading the server
    if not data_df.empty:
        df_list.append(data_df)
        scraped_urls.add(url)  # mark this URL as collected
        count += 1
        # Save a checkpoint every 10 pages
        if count % 10 == 0 and df_list:
            temp_df = pd.concat(df_list, ignore_index=True)
            total_df = pd.concat([existing_df, temp_df], ignore_index=True)
            total_df.to_csv(csv_filename, index=False, encoding="utf-8-sig")
            print(f"Temporary save: {len(total_df)} rows written")
            df_list = []  # reset the buffer to save memory
            existing_df = total_df  # update the accumulated data
# Save any remaining data
if df_list:
    temp_df = pd.concat(df_list, ignore_index=True)
    total_df = pd.concat([existing_df, temp_df], ignore_index=True)
    total_df.to_csv(csv_filename, index=False, encoding="utf-8-sig")
    print(f"Final save: {len(total_df)} rows written")
print("Scraping completed.")