scraper fix
Browse files- scrape/trendyol_scraper.py +59 -24
scrape/trendyol_scraper.py
CHANGED
@@ -7,6 +7,7 @@ from selenium.webdriver.chrome.options import Options
|
|
7 |
import time
|
8 |
import pandas as pd
|
9 |
import os
|
|
|
10 |
|
11 |
def scrape_reviews(url):
|
12 |
# Create data directory if it doesn't exist
|
@@ -17,76 +18,110 @@ def scrape_reviews(url):
|
|
17 |
def comprehensive_scroll(driver):
|
18 |
last_height = driver.execute_script("return document.body.scrollHeight")
|
19 |
while True:
|
20 |
-
|
21 |
-
|
|
|
22 |
new_height = driver.execute_script("return document.body.scrollHeight")
|
23 |
if new_height == last_height:
|
24 |
break
|
25 |
last_height = new_height
|
26 |
|
27 |
-
#
|
28 |
chrome_options = Options()
|
29 |
chrome_options.add_argument('--headless')
|
30 |
chrome_options.add_argument('--no-sandbox')
|
31 |
chrome_options.add_argument('--disable-dev-shm-usage')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
32 |
|
33 |
try:
|
34 |
-
# Space'in ChromeDriver path'ini kullan
|
35 |
driver = webdriver.Chrome(options=chrome_options)
|
36 |
|
|
|
|
|
|
|
|
|
|
|
37 |
driver.get(url)
|
|
|
38 |
|
39 |
try:
|
40 |
-
|
|
|
41 |
EC.element_to_be_clickable((By.ID, 'onetrust-accept-btn-handler'))
|
42 |
).click()
|
43 |
-
|
44 |
-
|
45 |
-
|
|
|
|
|
46 |
comprehensive_scroll(driver)
|
47 |
-
|
48 |
-
|
|
|
|
|
|
|
49 |
total_comments = len(comment_elements)
|
50 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
data = []
|
52 |
-
for i in
|
53 |
try:
|
54 |
-
|
55 |
-
username = driver.find_element(By.XPATH, username_xpath).text
|
56 |
except:
|
57 |
username = "N/A"
|
58 |
|
59 |
try:
|
60 |
-
|
61 |
-
comment = driver.find_element(By.XPATH, comment_xpath).text
|
62 |
except:
|
63 |
comment = "N/A"
|
64 |
|
65 |
try:
|
66 |
-
|
67 |
-
date = driver.find_element(By.XPATH, date_xpath).text
|
68 |
except:
|
69 |
date = "N/A"
|
70 |
|
71 |
try:
|
72 |
-
|
73 |
-
full_stars = driver.find_elements(By.XPATH, f"{star_xpath_base}/div[@class='star-w']/div[@class='full'][@style='width: 100%; max-width: 100%;']")
|
74 |
-
star_count = len(full_stars)
|
75 |
except:
|
76 |
-
|
77 |
|
78 |
data.append({
|
79 |
"Kullanıcı_id": i,
|
80 |
"Kullanıcı Adı": username,
|
81 |
"Yorum": comment,
|
82 |
"Tarih": date,
|
83 |
-
"Yıldız Sayısı":
|
84 |
})
|
|
|
|
|
|
|
|
|
85 |
|
86 |
return pd.DataFrame(data)
|
87 |
|
88 |
except Exception as e:
|
89 |
-
print(f"Hata
|
90 |
return pd.DataFrame()
|
91 |
|
92 |
finally:
|
|
|
7 |
import time
|
8 |
import pandas as pd
|
9 |
import os
|
10 |
+
import random
|
11 |
|
12 |
def scrape_reviews(url):
|
13 |
# Create data directory if it doesn't exist
|
|
|
18 |
def comprehensive_scroll(driver, pause_range=(2.0, 4.0), max_rounds=None):
    """Scroll the page until its height stops growing.

    Each round first jumps to a random point in the lower part of the page
    (70-100% of the last known height) and then to the real bottom of the
    document. The second jump is what guarantees the viewport reaches the
    end of the page: a partial scroll on its own can stop short of the
    bottom, in which case content that loads on reaching the end never
    appears and the loop would exit after the first round.

    Args:
        driver: Object exposing ``execute_script(script)``; the scripts used
            here read ``document.body.scrollHeight`` and call
            ``window.scrollTo``.
        pause_range: ``(low, high)`` bounds, in seconds, for the total random
            wait per round. The wait is split evenly between the two scroll
            steps. Defaults to the original 2-4 seconds.
        max_rounds: Optional upper bound on the number of scroll rounds, for
            pages that keep growing indefinitely. ``None`` (the default)
            keeps scrolling until the height is stable.

    Returns:
        None.
    """
    low, high = pause_range
    last_height = driver.execute_script("return document.body.scrollHeight")
    rounds = 0
    while max_rounds is None or rounds < max_rounds:
        rounds += 1

        # Randomised intermediate scroll position and wait.
        partial_y = int(last_height * random.uniform(0.7, 1.0))
        driver.execute_script(f"window.scrollTo(0, {partial_y});")
        time.sleep(random.uniform(low, high) / 2)

        # Always finish the round at the true bottom, then wait before
        # measuring so newly loaded content has time to extend the page.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(random.uniform(low, high) / 2)

        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height
|
28 |
|
29 |
+
# Chrome ayarları
|
30 |
chrome_options = Options()
|
31 |
chrome_options.add_argument('--headless')
|
32 |
chrome_options.add_argument('--no-sandbox')
|
33 |
chrome_options.add_argument('--disable-dev-shm-usage')
|
34 |
+
chrome_options.add_argument('--disable-blink-features=AutomationControlled')
|
35 |
+
chrome_options.add_argument('--disable-extensions')
|
36 |
+
chrome_options.add_argument('--ignore-certificate-errors')
|
37 |
+
chrome_options.add_argument('--disable-gpu')
|
38 |
+
chrome_options.add_argument("--window-size=1920,1080")
|
39 |
+
|
40 |
+
# User agent ekleme
|
41 |
+
user_agents = [
|
42 |
+
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
43 |
+
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
44 |
+
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
|
45 |
+
]
|
46 |
+
chrome_options.add_argument(f'user-agent={random.choice(user_agents)}')
|
47 |
+
|
48 |
+
# Bot tespitini zorlaştırmak için ek ayarlar
|
49 |
+
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
|
50 |
+
chrome_options.add_experimental_option('useAutomationExtension', False)
|
51 |
|
52 |
try:
|
|
|
53 |
driver = webdriver.Chrome(options=chrome_options)
|
54 |
|
55 |
+
# WebDriver özelliğini gizle
|
56 |
+
driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
|
57 |
+
|
58 |
+
# Sayfayı yükle
|
59 |
+
print("Sayfa yükleniyor...")
|
60 |
driver.get(url)
|
61 |
+
time.sleep(random.uniform(3, 5)) # Rastgele bekleme
|
62 |
|
63 |
try:
|
64 |
+
# Çerez popup'ını bekle ve tıkla
|
65 |
+
WebDriverWait(driver, 15).until(
|
66 |
EC.element_to_be_clickable((By.ID, 'onetrust-accept-btn-handler'))
|
67 |
).click()
|
68 |
+
time.sleep(random.uniform(1, 2))
|
69 |
+
except Exception as e:
|
70 |
+
print(f"Çerez popup'ı hatası: {str(e)}")
|
71 |
+
|
72 |
+
print("Yorumlar yükleniyor...")
|
73 |
comprehensive_scroll(driver)
|
74 |
+
|
75 |
+
# Yorumları bulmadan önce kısa bir bekleme
|
76 |
+
time.sleep(random.uniform(2, 3))
|
77 |
+
|
78 |
+
comment_elements = driver.find_elements(By.XPATH, '//div[contains(@class, "comment-cards-item")]')
|
79 |
total_comments = len(comment_elements)
|
80 |
+
|
81 |
+
if total_comments == 0:
|
82 |
+
print("Yorum elementi bulunamadı!")
|
83 |
+
return pd.DataFrame()
|
84 |
+
|
85 |
+
print(f"Toplam {total_comments} yorum bulundu.")
|
86 |
+
|
87 |
data = []
|
88 |
+
for i, element in enumerate(comment_elements, 1):
|
89 |
try:
|
90 |
+
username = element.find_element(By.CLASS_NAME, "user-name").text
|
|
|
91 |
except:
|
92 |
username = "N/A"
|
93 |
|
94 |
try:
|
95 |
+
comment = element.find_element(By.CLASS_NAME, "comment-text").text
|
|
|
96 |
except:
|
97 |
comment = "N/A"
|
98 |
|
99 |
try:
|
100 |
+
date = element.find_element(By.CLASS_NAME, "comment-date").text
|
|
|
101 |
except:
|
102 |
date = "N/A"
|
103 |
|
104 |
try:
|
105 |
+
stars = len(element.find_elements(By.CSS_SELECTOR, "div.full[style='width: 100%; max-width: 100%;']"))
|
|
|
|
|
106 |
except:
|
107 |
+
stars = 0
|
108 |
|
109 |
data.append({
|
110 |
"Kullanıcı_id": i,
|
111 |
"Kullanıcı Adı": username,
|
112 |
"Yorum": comment,
|
113 |
"Tarih": date,
|
114 |
+
"Yıldız Sayısı": stars
|
115 |
})
|
116 |
+
|
117 |
+
# Her 5 yorumda bir rastgele bekleme
|
118 |
+
if i % 5 == 0:
|
119 |
+
time.sleep(random.uniform(0.5, 1))
|
120 |
|
121 |
return pd.DataFrame(data)
|
122 |
|
123 |
except Exception as e:
|
124 |
+
print(f"Hata detayı: {str(e)}")
|
125 |
return pd.DataFrame()
|
126 |
|
127 |
finally:
|