trendyol-review-summarizer / scrape /trendyol_scraper.py
enesmanan's picture
add chrome webdriver docker file
9cbdd01 verified
raw
history blame
4.19 kB
import os
import shutil
import subprocess
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
# Absolute XPath of the elements the scraper treats as individual reviews.
_REVIEW_CARDS_XPATH = (
    '/html/body/div[1]/div[4]/div/div/div/div/div[3]/div/div/div[3]/div[2]/div'
)
# Field locations, relative to a single review element.
_USERNAME_XPATH = './div[1]/div[2]/div[1]'
_COMMENT_XPATH = './div[2]/p'
_DATE_XPATH = './div[1]/div[2]/div[2]'
_FULL_STARS_XPATH = (
    "./div[1]/div[1]/div/div[@class='star-w']"
    "/div[@class='full'][@style='width: 100%; max-width: 100%;']"
)

# Executable names checked on PATH before falling back to apt-get.
_BROWSER_BINARIES = (
    "chromium-browser",
    "chromium",
    "google-chrome",
    "google-chrome-stable",
)
_SCROLL_PAUSE_SECONDS = 3
_MISSING_TEXT = "N/A"


def _ensure_chrome_installed():
    """Install Chromium and ChromeDriver with apt-get if they are not on PATH."""
    driver_found = shutil.which("chromedriver") is not None
    browser_found = any(shutil.which(name) for name in _BROWSER_BINARIES)
    if driver_found and browser_found:
        # Already provisioned (e.g. by the container image): skip the slow,
        # root-requiring package installation.
        return
    if shutil.which("apt-get") is None:
        # Not an apt-based system; let Selenium try to locate a driver itself.
        return
    try:
        # Argument lists instead of a shell string; the install step only runs
        # when the update succeeded, mirroring the original `&&` chaining.
        update = subprocess.run(["apt-get", "update"], check=False)
        if update.returncode == 0:
            subprocess.run(
                [
                    "apt-get",
                    "install",
                    "-y",
                    "chromium-browser",
                    "chromium-chromedriver",
                ],
                check=False,
            )
    except OSError as exc:
        # Best effort: a failed installation must not abort the scrape attempt.
        print(f"Chrome kurulumu başarısız oldu: {exc}")


def _scroll_to_bottom(driver):
    """Scroll down repeatedly until the page height stops growing."""
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # Pause so that content loaded on scroll has time to be added.
        time.sleep(_SCROLL_PAUSE_SECONDS)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height


def _field_text(card, xpath):
    """Return the text of the element at *xpath* under *card*, or "N/A"."""
    try:
        return card.find_element(By.XPATH, xpath).text
    except WebDriverException:
        return _MISSING_TEXT


def _count_full_stars(card):
    """Return the number of fully filled star elements under *card*."""
    try:
        return len(card.find_elements(By.XPATH, _FULL_STARS_XPATH))
    except WebDriverException:
        return 0


def scrape_reviews(url):
    """Scrape the reviews shown on the page at *url* into a DataFrame.

    Opens the page in headless Chrome, dismisses the cookie banner when one
    appears, scrolls until the page stops growing, and then reads the user
    name, comment text, date and full-star count of every review element.

    Args:
        url: Address of the review page to load.

    Returns:
        A pandas DataFrame with the columns "Kullanıcı_id" (1-based position),
        "Kullanıcı Adı", "Yorum", "Tarih" and "Yıldız Sayısı". Text fields that
        cannot be read are "N/A". An empty DataFrame is returned when no review
        element is found or when any unexpected error occurs; the error is
        printed rather than raised.
    """
    # Create data directory if it doesn't exist
    os.makedirs("data", exist_ok=True)

    # Chrome options for Linux/Space environment
    chrome_options = Options()
    for argument in (
        '--headless',
        '--no-sandbox',
        '--disable-dev-shm-usage',
        "--disable-gpu",
        "--window-size=1920,1080",
        '--disable-blink-features=AutomationControlled',
    ):
        chrome_options.add_argument(argument)

    driver = None
    try:
        # Set up Chrome and ChromeDriver on Linux when they are missing.
        _ensure_chrome_installed()

        # No explicit driver path: Selenium resolves ChromeDriver itself.
        service = ChromeService()
        driver = webdriver.Chrome(service=service, options=chrome_options)
        driver.get(url)

        try:
            WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.ID, 'onetrust-accept-btn-handler'))
            ).click()
        except WebDriverException:
            # The cookie banner is optional; carry on without it.
            print("Çerez popup'ı bulunamadı, devam ediliyor...")

        _scroll_to_bottom(driver)

        cards = driver.find_elements(By.XPATH, _REVIEW_CARDS_XPATH)
        data = [
            {
                "Kullanıcı_id": position,
                "Kullanıcı Adı": _field_text(card, _USERNAME_XPATH),
                "Yorum": _field_text(card, _COMMENT_XPATH),
                "Tarih": _field_text(card, _DATE_XPATH),
                "Yıldız Sayısı": _count_full_stars(card),
            }
            for position, card in enumerate(cards, start=1)
        ]
        return pd.DataFrame(data)
    except Exception as e:
        # Top-level boundary: callers get an empty frame instead of an error.
        print(f"Hata oluştu: {e}")
        return pd.DataFrame()
    finally:
        if driver is not None:
            driver.quit()