# データ操作ライブラリ
import os
import pandas as pd

# 時間関連ライブラリ
import time

# URLデータ収集ライブラリ
import requests
from bs4 import BeautifulSoup

# ユーザーエージェント
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"}

# サロンのIDリスト
tempo_list = [
    "H000139654",
    "H000143665",
    "H000160021",
    "H000209382",
    "H000223934",
    "H000225867",
    "H000229159",
    "H000231759",
    "H000233312",
    "H000237335",
    "H000237561",
    "H000265843",
    "H000273518",
    "H000286411",
    "H000307248",
    "H000307249",
    "H000307251",
    "H000307252",
    "H000307254",
    "H000307256",
    "H000307404",
    "H000316742",
    "H000319805",
    "H000319837",
    "H000348209",
    "H000356610",
    "H000361649",
    "H000368241",
    "H000377123",
    "H000391152",
    "H000396645",
    "H000396756",
    "H000402609",
    "H000402612",
    "H000406857",
    "H000407525",
    "H000410429",
    "H000410434",
    "H000416986",
    "H000419242",
    "H000434472",
    "H000449155",
    "H000449351",
    "H000477350",
    "H000491208",
    "H000494046",
    "H000500991",
    "H000503062",
    "H000511837",
    "H000522696",
    "H000553193",
    "H000585265",
    "H000585268",
    "H000610008",
    "H000628393",
    "H000640388",
    "H000640401",
    "H000649747",
    "H000655543",
    "H000707971",
    "H000715770",
]

# 収集するURLリスト作成
urls = []
for tempo in tempo_list:
    for j in range(1, 20):  # ページ1から14まで
        urls.append(f"https://beauty.hotpepper.jp/kr/sln{tempo}/review/PN{j}.html")

# 途中までのデータがあれば読み込む
csv_filename = "all_data.csv"
if os.path.exists(csv_filename):
    existing_df = pd.read_csv(csv_filename, encoding="utf-8-sig")
    scraped_urls = set(existing_df["source_url"])  # すでに取得済みのURL
    print(f"Loaded existing data: {len(existing_df)} rows")
else:
    existing_df = pd.DataFrame()
    scraped_urls = set()


# データスクレイピング関数
def scrap_data(url, headers):
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        print(f"Failed to fetch {url}, status code:", response.status_code)
        return pd.DataFrame()  # 空のDataFrameを返す

    soup = BeautifulSoup(response.text, "html.parser")
    customers = []

    # 各レビューを処理
    for review in soup.find_all("div", class_="fr"):
        customer = {"source_url": url}  # どのURLから取得したか記録

        # 名前と詳細
        name_tag = review.find_previous("p", class_="fl w580 pL25")
        if name_tag:
            customer["name"] = name_tag.find("span", class_="b").text.strip() if name_tag.find("span", class_="b") else "N/A"
            # print(customer["name"])
            customer["details"] = name_tag.find("span", class_="mL5 fs10 fgGray").text.strip() if name_tag.find("span", class_="mL5 fs10 fgGray") else "N/A"
            # print(customer["details"])

        total_rating = review.find_next("li", class_="bdN fgGray b")
        if total_rating:
            # Correct the reference from name_tag to total_rating
            total_rating_span = total_rating.find("span", class_="mL5 mR10 fgPurple4")
            customer["total_rating"] = total_rating_span.text.strip() if total_rating_span else "N/A"
            # print(customer["total_rating"])

        # レビュー日
        date_tag = review.find("p", class_="fs10 fgGray")
        customer["review_date"] = date_tag.text.strip() if date_tag else "N/A"

        # 評価
        ratings = {}
        ratings_section = review.find_next("ul", class_="judgeList cFix")
        if ratings_section:
            for li in ratings_section.find_all("li"):
                label = li.find("span", class_="fgGray")
                value = li.find("span", class_="mL10 fgPurple4 b")
                if label and value:
                    ratings[label.text.strip()] = value.text.strip()
        customer["ratings"] = ratings

        # レビュー内容
        review_text = review.find_next("p", class_="mT10 wwbw")
        customer["review"] = review_text.text.strip() if review_text else "N/A"

        customers.append(customer)

    return pd.DataFrame(customers)


# 収集データのリスト
df_list = []
count = 0  # 何回目のスクレイピングかをカウント

for url in urls:
    if url in scraped_urls:
        print(f"Skipping already scraped: {url}")
        continue  # 既に取得済みならスキップ

    print(f"Scraping: {url}")
    data_df = scrap_data(url, headers)
    time.sleep(5)  # 負荷をかけないようにスリープ

    if not data_df.empty:
        df_list.append(data_df)
        scraped_urls.add(url)  # 取得済みリストに追加
        count += 1

    # 10回ごとに保存
    if count % 10 == 0 and df_list:
        temp_df = pd.concat(df_list, ignore_index=True)
        total_df = pd.concat([existing_df, temp_df], ignore_index=True)
        total_df.to_csv(csv_filename, index=False, encoding="utf-8-sig")
        print(f"Temporary save: {len(total_df)} rows written")
        df_list = []  # メモリ節約のためリストをリセット
        existing_df = total_df  # 既存データを更新

# 残りのデータを保存
if df_list:
    temp_df = pd.concat(df_list, ignore_index=True)
    total_df = pd.concat([existing_df, temp_df], ignore_index=True)
    total_df.to_csv(csv_filename, index=False, encoding="utf-8-sig")
    print(f"Final save: {len(total_df)} rows written")

print("Scraping completed.")