BarngSJ committed
Commit 34d098b · 1 Parent(s): e0dd26e
ew
Browse files:
- app.py +34 -0
- make_csv.py +50 -0
- scrap_data_from_FN.py +184 -0
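The commit itself does not show a dependency file. Judging only from the imports in the three added files, a Space running them would need something like the listing below (package names inferred from the imports, versions left unpinned; not part of this commit):

# requirements.txt (inferred, hypothetical)
streamlit
schedule
pandas
requests
beautifulsoup4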
app.py
ADDED
@@ -0,0 +1,34 @@
import streamlit as st
import schedule
import time
import subprocess
import threading

# Function to execute your scripts
def job():
    st.write("Running scheduled scripts...")
    subprocess.run(["python", "scrap_data_from_FN.py"])
    time.sleep(20)
    subprocess.run(["python", "run_at_morning.py"])
    st.write("Scripts executed successfully!")

# Schedule the job at 04:00 AM daily
schedule.every().day.at("04:00").do(job)

# Background scheduler function
def run_scheduler():
    while True:
        schedule.run_pending()
        time.sleep(60)

# Start the scheduler in a separate thread
threading.Thread(target=run_scheduler, daemon=True).start()

# Streamlit UI
st.title("Automated Script Runner on Hugging Face Spaces")
st.write("This app automatically runs scripts at 04:00 AM.")

if st.button("Run Now"):
    job()
    st.success("Scripts executed manually!")
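One caveat with this pattern: Streamlit re-executes app.py on every interaction and for every new session, so the module-level threading.Thread(...).start() can end up spawning several scheduler threads. A minimal guard, assuming a Streamlit version that provides st.cache_resource (the helper name start_scheduler is illustrative, not part of the committed code), might look like this sketch:

import threading
import time

import schedule
import streamlit as st

@st.cache_resource  # cached once per process, so the thread is only created on the first run
def start_scheduler():
    def run_scheduler():
        while True:
            schedule.run_pending()
            time.sleep(60)
    thread = threading.Thread(target=run_scheduler, daemon=True)
    thread.start()
    return thread

start_scheduler()

Calls to st.write from inside the background thread may also be dropped, since they run outside a Streamlit script context; printing or logging there is a safer default.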
make_csv.py
ADDED
@@ -0,0 +1,50 @@
import ast
import pandas as pd

import datetime

today_format = datetime.datetime.today().strftime("%Y%m%d")

# Load the CSV into a DataFrame
df = pd.read_csv("all_data.csv")
df_name = pd.read_csv("tempo_list.csv")

# Split the 'ratings' column into separate columns
ratings_df = df["ratings"].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else {})
ratings_split = pd.json_normalize(ratings_df)

# Merge the new ratings columns with the original DataFrame
df = df.drop("ratings", axis=1).join(ratings_split)
df = df.dropna(subset=["name"])

# Extract tempo_code from the source_url
df["tempo_code"] = df["source_url"].apply(lambda x: x.split("/")[4] if isinstance(x, str) else "")
df["tempo_code"] = df["tempo_code"].apply(lambda x: x[3:])  # drop the leading "sln" prefix

# Merge with df_name based on tempo_code
df = pd.merge(df, df_name, left_on="tempo_code", right_on="サロンID", how="left")
df["review_date"] = df["review_date"].str.replace("[投稿日] ", "", regex=False)  # literal match, not a regex character class
df["review_date"] = pd.to_datetime(df["review_date"])

# If you need the date and time separately:
df["date"] = df["review_date"].dt.date
df["time"] = df["review_date"].dt.time

df = df.dropna(subset=["review_date"])

df = df[["サロンID", "サロン名", "name", "details", "date", "time", "total_rating", "雰囲気", "接客サービス", "技術・仕上がり", "メニュー・料金", "review"]]
df = df.rename(
    columns={
        "name": "客様",
        "details": "詳細情報",
        "date": "日程",
        "time": "時刻",
        "total_rating": "総合評価",
        "review": "コメント",
    }
)


df = df.drop_duplicates()
# Save the new DataFrame to a new CSV file
df.to_csv(f"{today_format}.csv", index=False)
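For reference, the ratings column written by the scraper is a stringified dict, so the ast.literal_eval plus pd.json_normalize step expands it into one column per rating label. A small self-contained illustration of that step, using made-up rows rather than real data:

import ast
import pandas as pd

# Hypothetical rows in the same shape as all_data.csv
sample = pd.DataFrame({
    "name": ["A", "B"],
    "ratings": ["{'雰囲気': '5', '接客サービス': '4'}", "{'雰囲気': '3'}"],
})

parsed = sample["ratings"].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else {})
expanded = pd.json_normalize(parsed)
print(sample.drop("ratings", axis=1).join(expanded))
# The join yields one column per rating label; labels missing from a row become NaN.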
scrap_data_from_FN.py
ADDED
@@ -0,0 +1,184 @@
# Data manipulation libraries
import os
import pandas as pd

# Time-related library
import time

# Libraries for fetching data from URLs
import requests
from bs4 import BeautifulSoup

# User agent
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"}

# List of salon IDs
tempo_list = [
    "H000139654",
    "H000143665",
    "H000160021",
    "H000209382",
    "H000223934",
    "H000225867",
    "H000229159",
    "H000231759",
    "H000233312",
    "H000237335",
    "H000237561",
    "H000265843",
    "H000273518",
    "H000286411",
    "H000307248",
    "H000307249",
    "H000307251",
    "H000307252",
    "H000307254",
    "H000307256",
    "H000307404",
    "H000316742",
    "H000319805",
    "H000319837",
    "H000348209",
    "H000356610",
    "H000361649",
    "H000368241",
    "H000377123",
    "H000391152",
    "H000396645",
    "H000396756",
    "H000402609",
    "H000402612",
    "H000406857",
    "H000407525",
    "H000410429",
    "H000410434",
    "H000416986",
    "H000419242",
    "H000434472",
    "H000449155",
    "H000449351",
    "H000477350",
    "H000491208",
    "H000494046",
    "H000500991",
    "H000503062",
    "H000511837",
    "H000522696",
    "H000553193",
    "H000585265",
    "H000585268",
    "H000610008",
    "H000628393",
    "H000640388",
    "H000640401",
    "H000649747",
    "H000655543",
    "H000707971",
    "H000715770",
]

# Build the list of URLs to scrape
urls = []
for tempo in tempo_list:
    for j in range(1, 20):  # pages 1 through 19
        urls.append(f"https://beauty.hotpepper.jp/kr/sln{tempo}/review/PN{j}.html")

# Load partially collected data if it exists
csv_filename = "all_data.csv"
if os.path.exists(csv_filename):
    existing_df = pd.read_csv(csv_filename, encoding="utf-8-sig")
    scraped_urls = set(existing_df["source_url"])  # URLs that have already been scraped
    print(f"Loaded existing data: {len(existing_df)} rows")
else:
    existing_df = pd.DataFrame()
    scraped_urls = set()


# Scraping function
def scrap_data(url, headers):
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        print(f"Failed to fetch {url}, status code:", response.status_code)
        return pd.DataFrame()  # return an empty DataFrame

    soup = BeautifulSoup(response.text, "html.parser")
    customers = []

    # Process each review
    for review in soup.find_all("div", class_="fr"):
        customer = {"source_url": url}  # record which URL the review came from

        # Name and details
        name_tag = review.find_previous("p", class_="fl w580 pL25")
        if name_tag:
            customer["name"] = name_tag.find("span", class_="b").text.strip() if name_tag.find("span", class_="b") else "N/A"
            # print(customer["name"])
            customer["details"] = name_tag.find("span", class_="mL5 fs10 fgGray").text.strip() if name_tag.find("span", class_="mL5 fs10 fgGray") else "N/A"
            # print(customer["details"])

        total_rating = review.find_next("li", class_="bdN fgGray b")
        if total_rating:
            # Correct the reference from name_tag to total_rating
            total_rating_span = total_rating.find("span", class_="mL5 mR10 fgPurple4")
            customer["total_rating"] = total_rating_span.text.strip() if total_rating_span else "N/A"
            # print(customer["total_rating"])

        # Review date
        date_tag = review.find("p", class_="fs10 fgGray")
        customer["review_date"] = date_tag.text.strip() if date_tag else "N/A"

        # Ratings
        ratings = {}
        ratings_section = review.find_next("ul", class_="judgeList cFix")
        if ratings_section:
            for li in ratings_section.find_all("li"):
                label = li.find("span", class_="fgGray")
                value = li.find("span", class_="mL10 fgPurple4 b")
                if label and value:
                    ratings[label.text.strip()] = value.text.strip()
        customer["ratings"] = ratings

        # Review text
        review_text = review.find_next("p", class_="mT10 wwbw")
        customer["review"] = review_text.text.strip() if review_text else "N/A"

        customers.append(customer)

    return pd.DataFrame(customers)


# List of collected DataFrames
df_list = []
count = 0  # count how many pages have been scraped this run

for url in urls:
    if url in scraped_urls:
        print(f"Skipping already scraped: {url}")
        continue  # skip URLs that have already been fetched

    print(f"Scraping: {url}")
    data_df = scrap_data(url, headers)
    time.sleep(5)  # sleep so as not to overload the server

    if not data_df.empty:
        df_list.append(data_df)
        scraped_urls.add(url)  # add to the set of fetched URLs
        count += 1

    # Save every 10 pages
    if count % 10 == 0 and df_list:
        temp_df = pd.concat(df_list, ignore_index=True)
        total_df = pd.concat([existing_df, temp_df], ignore_index=True)
        total_df.to_csv(csv_filename, index=False, encoding="utf-8-sig")
        print(f"Temporary save: {len(total_df)} rows written")
        df_list = []  # reset the list to save memory
        existing_df = total_df  # update the existing data

# Save the remaining data
if df_list:
    temp_df = pd.concat(df_list, ignore_index=True)
    total_df = pd.concat([existing_df, temp_df], ignore_index=True)
    total_df.to_csv(csv_filename, index=False, encoding="utf-8-sig")
    print(f"Final save: {len(total_df)} rows written")

print("Scraping completed.")
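Since the script fetches up to 19 pages for 61 salons in one run, transient HTTP errors are likely at some point. One possible hardening, not part of the committed script, is to reuse a requests.Session with automatic retries; the helper name make_session below is illustrative:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def make_session(total_retries=3, backoff=2.0):
    # Session that retries transient HTTP errors with exponential backoff
    session = requests.Session()
    retry = Retry(
        total=total_retries,
        backoff_factor=backoff,
        status_forcelist=[429, 500, 502, 503, 504],
    )
    session.mount("https://", HTTPAdapter(max_retries=retry))
    return session

# Inside scrap_data(), requests.get(url, headers=headers) could then become
# session.get(url, headers=headers, timeout=30) with session = make_session().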