fix
Browse files- scrape/trendyol_scraper.py +75 -45
scrape/trendyol_scraper.py
CHANGED
@@ -3,108 +3,138 @@ from selenium.webdriver.chrome.service import Service
|
|
3 |
from selenium.webdriver.common.by import By
|
4 |
from selenium.webdriver.support.ui import WebDriverWait
|
5 |
from selenium.webdriver.support import expected_conditions as EC
|
|
|
6 |
import time
|
7 |
import pandas as pd
|
8 |
import os
|
|
|
|
|
|
|
9 |
|
10 |
def comprehensive_scroll(driver):
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
last_height = new_height
|
26 |
|
27 |
def scrape_reviews(url):
|
28 |
"""URL'den yorumları çeken fonksiyon"""
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
|
|
|
|
33 |
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
|
42 |
-
try:
|
43 |
# Linux için ChromeDriver ayarı
|
44 |
-
|
45 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
46 |
|
|
|
47 |
driver.get(url)
|
48 |
|
49 |
# Çerez popup'ını kabul et
|
50 |
-
|
51 |
-
|
52 |
-
|
|
|
|
|
|
|
|
|
53 |
|
|
|
54 |
comprehensive_scroll(driver)
|
55 |
|
|
|
56 |
comment_elements = driver.find_elements(By.XPATH, '/html/body/div[1]/div[4]/div/div/div/div/div[3]/div/div/div[3]/div[2]/div')
|
57 |
total_comments = len(comment_elements)
|
|
|
58 |
|
59 |
data = []
|
60 |
for i in range(1, total_comments + 1):
|
61 |
-
kullanıcı_id = i
|
62 |
try:
|
63 |
username_xpath = f'/html/body/div[1]/div[4]/div/div/div/div/div[3]/div/div/div[3]/div[2]/div[{i}]/div[1]/div[2]/div[1]'
|
64 |
username = driver.find_element(By.XPATH, username_xpath).text
|
65 |
-
except:
|
66 |
username = "N/A"
|
67 |
|
68 |
try:
|
69 |
comment_xpath = f'/html/body/div[1]/div[4]/div/div/div/div/div[3]/div/div/div[3]/div[2]/div[{i}]/div[2]/p'
|
70 |
comment = driver.find_element(By.XPATH, comment_xpath).text
|
71 |
-
except:
|
72 |
comment = "N/A"
|
73 |
|
74 |
try:
|
75 |
date_xpath = f'/html/body/div[1]/div[4]/div/div/div/div/div[3]/div/div/div[3]/div[2]/div[{i}]/div[1]/div[2]/div[2]'
|
76 |
date = driver.find_element(By.XPATH, date_xpath).text
|
77 |
-
except:
|
78 |
date = "N/A"
|
79 |
|
80 |
-
star_xpath_base = f'/html/body/div[1]/div[4]/div/div/div/div/div[3]/div/div/div[3]/div[2]/div[{i}]/div[1]/div[1]/div'
|
81 |
try:
|
|
|
82 |
full_stars = driver.find_elements(By.XPATH, f"{star_xpath_base}/div[@class='star-w']/div[@class='full'][@style='width: 100%; max-width: 100%;']")
|
83 |
star_count = len(full_stars)
|
84 |
-
except:
|
85 |
star_count = 0
|
86 |
|
87 |
data.append({
|
88 |
-
"Kullanıcı_id":
|
89 |
"Kullanıcı Adı": username,
|
90 |
"Yorum": comment,
|
91 |
"Tarih": date,
|
92 |
"Yıldız Sayısı": star_count
|
93 |
})
|
94 |
|
|
|
|
|
|
|
|
|
|
|
95 |
# Geçici dosya olarak kaydet
|
96 |
temp_file = os.path.join(data_directory, 'temp_comments.csv')
|
97 |
-
df = pd.DataFrame(data)
|
98 |
df.to_csv(temp_file, index=False, encoding='utf-8-sig')
|
|
|
99 |
|
100 |
return df
|
101 |
|
102 |
except Exception as e:
|
103 |
-
|
104 |
-
return pd.DataFrame()
|
105 |
|
106 |
finally:
|
107 |
-
driver
|
|
|
|
|
108 |
# Geçici dosyayı sil
|
109 |
-
|
110 |
-
|
|
|
|
|
|
3 |
from selenium.webdriver.common.by import By
|
4 |
from selenium.webdriver.support.ui import WebDriverWait
|
5 |
from selenium.webdriver.support import expected_conditions as EC
|
6 |
+
from selenium.common.exceptions import TimeoutException, NoSuchElementException
|
7 |
import time
|
8 |
import pandas as pd
|
9 |
import os
|
10 |
+
import logging
|
11 |
+
|
12 |
+
logger = logging.getLogger(__name__)
|
13 |
|
14 |
def comprehensive_scroll(driver, pause=3, max_scrolls=None):
    """Scroll to the bottom of the page until no more new content is loaded.

    After every scroll the document height is compared with the height seen
    before the scroll; the loop ends as soon as the height stops changing.

    Args:
        driver: Object exposing ``execute_script(script)`` (a Selenium
            WebDriver in this file).
        pause: Seconds to sleep after each scroll so the page can load more
            content. Defaults to 3.
        max_scrolls: Optional upper bound on the number of scrolls. ``None``
            (the default) scrolls until the height stops growing, which never
            terminates on a page that keeps growing.

    Returns:
        None. Any exception raised while scrolling is logged and swallowed,
        so the caller continues with whatever has been loaded so far.
    """
    try:
        last_height = driver.execute_script("return document.body.scrollHeight")
        scrolls_done = 0
        while max_scrolls is None or scrolls_done < max_scrolls:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(pause)
            scrolls_done += 1

            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                # Height did not change after the scroll: nothing new loaded.
                break
            last_height = new_height
    except Exception as e:
        # Best-effort: a scroll failure must not abort the caller.
        logger.error(f"Scroll sırasında hata: {str(e)}")
|
|
|
|
|
28 |
|
29 |
def scrape_reviews(url):
    """Scrape the reviews found on the page at ``url``.

    Starts a headless Chrome session, accepts the cookie popup if it shows
    up, scrolls the page to the bottom and reads every review element.

    Args:
        url: Address of the page to open.

    Returns:
        A ``pandas.DataFrame`` with one row per review and the columns
        "Kullanıcı_id", "Kullanıcı Adı", "Yorum", "Tarih" and
        "Yıldız Sayısı". An empty DataFrame is returned when any error
        occurs; the error is logged, not raised.
    """
    data_directory = "data"
    temp_file = os.path.join(data_directory, 'temp_comments.csv')
    # Absolute XPath matching every review element on the page.
    reviews_xpath = '/html/body/div[1]/div[4]/div/div/div/div/div[3]/div/div/div[3]/div[2]/div'
    driver = None
    try:
        # Create the data directory if it is missing.
        os.makedirs(data_directory, exist_ok=True)

        # Chrome options
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument("--window-size=1920,1080")

        driver = _create_chrome_driver(chrome_options)

        logger.info(f"URL'ye erişiliyor: {url}")
        driver.get(url)

        # Accept the cookie popup; carry on if it never becomes clickable.
        try:
            WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.ID, 'onetrust-accept-btn-handler'))
            ).click()
            logger.info("Çerez popup'ı kabul edildi")
        except TimeoutException:
            logger.warning("Çerez popup'ı bulunamadı veya tıklanamadı")

        logger.info("Sayfa kaydırılıyor...")
        comprehensive_scroll(driver)

        logger.info("Yorumlar toplanıyor...")
        total_comments = len(driver.find_elements(By.XPATH, reviews_xpath))
        logger.info(f"Toplam {total_comments} yorum bulundu")

        data = []
        # XPath positions are 1-based, so the index doubles as the review id.
        for i in range(1, total_comments + 1):
            review_xpath = f"{reviews_xpath}[{i}]"
            username = _element_text(driver, f"{review_xpath}/div[1]/div[2]/div[1]")
            comment = _element_text(driver, f"{review_xpath}/div[2]/p")
            date = _element_text(driver, f"{review_xpath}/div[1]/div[2]/div[2]")

            # find_elements returns an empty list when nothing matches, so a
            # review without fully filled stars simply counts as 0.
            star_xpath_base = f"{review_xpath}/div[1]/div[1]/div"
            full_stars = driver.find_elements(By.XPATH, f"{star_xpath_base}/div[@class='star-w']/div[@class='full'][@style='width: 100%; max-width: 100%;']")
            star_count = len(full_stars)

            data.append({
                "Kullanıcı_id": i,
                "Kullanıcı Adı": username,
                "Yorum": comment,
                "Tarih": date,
                "Yıldız Sayısı": star_count
            })

            if i % 10 == 0:
                logger.info(f"{i}/{total_comments} yorum toplandı")

        df = pd.DataFrame(data)

        # Save as a temporary file.
        # NOTE(review): this file is removed again in the ``finally`` clause
        # below, so it never outlives the call - confirm it is still needed.
        df.to_csv(temp_file, index=False, encoding='utf-8-sig')
        logger.info(f"Veriler {temp_file} dosyasına kaydedildi")

        return df

    except Exception as e:
        logger.error(f"Veri çekme sırasında hata: {str(e)}")
        return pd.DataFrame()

    finally:
        if driver:
            driver.quit()
            logger.info("Chrome driver kapatıldı")
        # Delete the temporary file.
        if os.path.exists(temp_file):
            os.remove(temp_file)
            logger.info("Geçici dosya silindi")


def _create_chrome_driver(chrome_options):
    """Start Chrome using the first chromedriver location that works.

    Tries ``/usr/local/bin/chromedriver``, then ``/usr/bin/chromedriver`` and
    finally the ``chromedriver`` found on PATH. A failure of the last
    candidate propagates to the caller.
    """
    candidates = ('/usr/local/bin/chromedriver', '/usr/bin/chromedriver', 'chromedriver')
    for path in candidates[:-1]:
        try:
            return webdriver.Chrome(service=Service(path), options=chrome_options)
        except Exception:
            # Not a bare ``except``: Ctrl-C / SystemExit must not be swallowed.
            continue
    return webdriver.Chrome(service=Service(candidates[-1]), options=chrome_options)


def _element_text(driver, xpath, default="N/A"):
    """Return the text of the element at ``xpath``, or ``default`` if it is absent."""
    try:
        return driver.find_element(By.XPATH, xpath).text
    except NoSuchElementException:
        return default
|