Upload 3 files
Browse files- scrape/trendyol_scraper.py +56 -29
scrape/trendyol_scraper.py
CHANGED
@@ -10,22 +10,27 @@ import os
|
|
10 |
import random
|
11 |
|
12 |
def scrape_reviews(url):
|
13 |
-
|
|
|
14 |
data_directory = "data"
|
15 |
if not os.path.exists(data_directory):
|
16 |
os.makedirs(data_directory)
|
17 |
|
18 |
def comprehensive_scroll(driver):
|
|
|
19 |
last_height = driver.execute_script("return document.body.scrollHeight")
|
|
|
20 |
while True:
|
21 |
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
|
22 |
time.sleep(3)
|
23 |
new_height = driver.execute_script("return document.body.scrollHeight")
|
|
|
|
|
24 |
if new_height == last_height:
|
25 |
break
|
26 |
last_height = new_height
|
|
|
27 |
|
28 |
-
# Chrome ayarları
|
29 |
chrome_options = Options()
|
30 |
chrome_options.add_argument('--headless')
|
31 |
chrome_options.add_argument('--no-sandbox')
|
@@ -36,76 +41,98 @@ def scrape_reviews(url):
|
|
36 |
chrome_options.add_argument("--window-size=1920,1080")
|
37 |
|
38 |
try:
|
39 |
-
|
40 |
service = Service()
|
41 |
driver = webdriver.Chrome(service=service, options=chrome_options)
|
42 |
|
|
|
43 |
driver.get(url)
|
44 |
-
time.sleep(
|
|
|
|
|
|
|
|
|
45 |
|
46 |
try:
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
|
|
|
|
|
|
52 |
|
53 |
comprehensive_scroll(driver)
|
54 |
|
55 |
-
|
56 |
-
|
|
|
|
|
|
|
|
|
|
|
57 |
total_comments = len(comment_elements)
|
58 |
|
59 |
if total_comments == 0:
|
60 |
-
print("
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
61 |
return pd.DataFrame()
|
62 |
|
63 |
-
print(f"Toplam {total_comments} yorum bulundu")
|
64 |
-
|
65 |
data = []
|
66 |
-
for i in
|
67 |
try:
|
68 |
-
|
69 |
-
username = driver.find_element(By.XPATH, username_xpath).text
|
70 |
except:
|
71 |
username = "N/A"
|
|
|
72 |
|
73 |
try:
|
74 |
-
|
75 |
-
comment = driver.find_element(By.XPATH, comment_xpath).text
|
76 |
except:
|
77 |
comment = "N/A"
|
|
|
78 |
|
79 |
try:
|
80 |
-
|
81 |
-
date = driver.find_element(By.XPATH, date_xpath).text
|
82 |
except:
|
83 |
date = "N/A"
|
|
|
84 |
|
85 |
try:
|
86 |
-
|
87 |
-
full_stars = driver.find_elements(By.XPATH, f"{star_xpath_base}/div[@class='star-w']/div[@class='full'][@style='width: 100%; max-width: 100%;']")
|
88 |
-
star_count = len(full_stars)
|
89 |
except:
|
90 |
-
|
|
|
91 |
|
92 |
data.append({
|
93 |
"Kullanıcı_id": i,
|
94 |
"Kullanıcı Adı": username,
|
95 |
"Yorum": comment,
|
96 |
"Tarih": date,
|
97 |
-
"Yıldız Sayısı":
|
98 |
})
|
99 |
|
100 |
if i % 5 == 0:
|
101 |
-
print(f"{i} yorum işlendi
|
102 |
|
|
|
103 |
return pd.DataFrame(data)
|
104 |
|
105 |
except Exception as e:
|
106 |
-
print(f"
|
|
|
|
|
|
|
107 |
return pd.DataFrame()
|
108 |
|
109 |
finally:
|
110 |
if 'driver' in locals():
|
111 |
-
driver.quit()
|
|
|
|
10 |
import random
|
11 |
|
12 |
def scrape_reviews(url):
|
13 |
+
print("Scraping başlatılıyor...")
|
14 |
+
|
15 |
data_directory = "data"
|
16 |
if not os.path.exists(data_directory):
|
17 |
os.makedirs(data_directory)
|
18 |
|
19 |
def comprehensive_scroll(driver):
|
20 |
+
print("Sayfa kaydırma başlıyor...")
|
21 |
last_height = driver.execute_script("return document.body.scrollHeight")
|
22 |
+
scroll_count = 0
|
23 |
while True:
|
24 |
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
|
25 |
time.sleep(3)
|
26 |
new_height = driver.execute_script("return document.body.scrollHeight")
|
27 |
+
scroll_count += 1
|
28 |
+
print(f"Scroll {scroll_count}: {new_height}")
|
29 |
if new_height == last_height:
|
30 |
break
|
31 |
last_height = new_height
|
32 |
+
print("Sayfa kaydırma tamamlandı")
|
33 |
|
|
|
34 |
chrome_options = Options()
|
35 |
chrome_options.add_argument('--headless')
|
36 |
chrome_options.add_argument('--no-sandbox')
|
|
|
41 |
chrome_options.add_argument("--window-size=1920,1080")
|
42 |
|
43 |
try:
|
44 |
+
print("Chrome başlatılıyor...")
|
45 |
service = Service()
|
46 |
driver = webdriver.Chrome(service=service, options=chrome_options)
|
47 |
|
48 |
+
print(f"URL'ye gidiliyor: {url}")
|
49 |
driver.get(url)
|
50 |
+
time.sleep(5) # Sayfa yüklenme süresini artırdık
|
51 |
+
|
52 |
+
# Sayfa kaynağını kontrol et
|
53 |
+
page_source = driver.page_source
|
54 |
+
print(f"Sayfa uzunluğu: {len(page_source)}")
|
55 |
|
56 |
try:
|
57 |
+
print("Çerez popup'ı aranıyor...")
|
58 |
+
cookie_button = WebDriverWait(driver, 10).until(
|
59 |
+
EC.presence_of_element_located((By.ID, "onetrust-accept-btn-handler"))
|
60 |
+
)
|
61 |
+
cookie_button.click()
|
62 |
+
print("Çerez popup'ı kapatıldı")
|
63 |
+
except Exception as e:
|
64 |
+
print(f"Çerez popup'ı işlemi: {str(e)}")
|
65 |
|
66 |
comprehensive_scroll(driver)
|
67 |
|
68 |
+
print("Yorum elementleri aranıyor...")
|
69 |
+
# Önce yorum container'ını bul
|
70 |
+
review_container = driver.find_element(By.CLASS_NAME, "pr-rnr-com-w")
|
71 |
+
print("Yorum container'ı bulundu")
|
72 |
+
|
73 |
+
# Yorum elementlerini bul
|
74 |
+
comment_elements = review_container.find_elements(By.CLASS_NAME, "comment-cards-item")
|
75 |
total_comments = len(comment_elements)
|
76 |
|
77 |
if total_comments == 0:
|
78 |
+
print("Alternatif yorum elementi aranıyor...")
|
79 |
+
comment_elements = driver.find_elements(By.CSS_SELECTOR, "div.comment-cards-item")
|
80 |
+
total_comments = len(comment_elements)
|
81 |
+
|
82 |
+
print(f"Bulunan yorum sayısı: {total_comments}")
|
83 |
+
|
84 |
+
if total_comments == 0:
|
85 |
+
print("Hiç yorum bulunamadı!")
|
86 |
return pd.DataFrame()
|
87 |
|
|
|
|
|
88 |
data = []
|
89 |
+
for i, element in enumerate(comment_elements, 1):
|
90 |
try:
|
91 |
+
username = element.find_element(By.CLASS_NAME, "user-name").text
|
|
|
92 |
except:
|
93 |
username = "N/A"
|
94 |
+
print(f"Kullanıcı adı alınamadı: {i}")
|
95 |
|
96 |
try:
|
97 |
+
comment = element.find_element(By.CLASS_NAME, "comment-text").text
|
|
|
98 |
except:
|
99 |
comment = "N/A"
|
100 |
+
print(f"Yorum metni alınamadı: {i}")
|
101 |
|
102 |
try:
|
103 |
+
date = element.find_element(By.CLASS_NAME, "comment-date").text
|
|
|
104 |
except:
|
105 |
date = "N/A"
|
106 |
+
print(f"Tarih alınamadı: {i}")
|
107 |
|
108 |
try:
|
109 |
+
stars = len(element.find_elements(By.CSS_SELECTOR, "div.full[style='width: 100%; max-width: 100%;']"))
|
|
|
|
|
110 |
except:
|
111 |
+
stars = 0
|
112 |
+
print(f"Yıldız sayısı alınamadı: {i}")
|
113 |
|
114 |
data.append({
|
115 |
"Kullanıcı_id": i,
|
116 |
"Kullanıcı Adı": username,
|
117 |
"Yorum": comment,
|
118 |
"Tarih": date,
|
119 |
+
"Yıldız Sayısı": stars
|
120 |
})
|
121 |
|
122 |
if i % 5 == 0:
|
123 |
+
print(f"{i} yorum işlendi")
|
124 |
|
125 |
+
print("Veri toplama tamamlandı")
|
126 |
return pd.DataFrame(data)
|
127 |
|
128 |
except Exception as e:
|
129 |
+
print(f"Kritik hata: {str(e)}")
|
130 |
+
if 'driver' in locals():
|
131 |
+
print("Son sayfa kaynağı:")
|
132 |
+
print(driver.page_source[:500]) # İlk 500 karakteri göster
|
133 |
return pd.DataFrame()
|
134 |
|
135 |
finally:
|
136 |
if 'driver' in locals():
|
137 |
+
driver.quit()
|
138 |
+
print("Chrome kapatıldı")
|