enesmanan committed on
Commit
26b9192
·
verified ·
1 Parent(s): fb24c70

Upload 3 files

Browse files
Files changed (1) hide show
  1. scrape/trendyol_scraper.py +56 -29
scrape/trendyol_scraper.py CHANGED
@@ -10,22 +10,27 @@ import os
10
  import random
11
 
12
  def scrape_reviews(url):
13
- # Create data directory if it doesn't exist
 
14
  data_directory = "data"
15
  if not os.path.exists(data_directory):
16
  os.makedirs(data_directory)
17
 
18
  def comprehensive_scroll(driver):
 
19
  last_height = driver.execute_script("return document.body.scrollHeight")
 
20
  while True:
21
  driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
22
  time.sleep(3)
23
  new_height = driver.execute_script("return document.body.scrollHeight")
 
 
24
  if new_height == last_height:
25
  break
26
  last_height = new_height
 
27
 
28
- # Chrome ayarları
29
  chrome_options = Options()
30
  chrome_options.add_argument('--headless')
31
  chrome_options.add_argument('--no-sandbox')
@@ -36,76 +41,98 @@ def scrape_reviews(url):
36
  chrome_options.add_argument("--window-size=1920,1080")
37
 
38
  try:
39
- # Linux için ChromeDriver ayarları
40
  service = Service()
41
  driver = webdriver.Chrome(service=service, options=chrome_options)
42
 
 
43
  driver.get(url)
44
- time.sleep(3)
 
 
 
 
45
 
46
  try:
47
- WebDriverWait(driver, 10).until(
48
- EC.element_to_be_clickable((By.ID, 'onetrust-accept-btn-handler'))
49
- ).click()
50
- except:
51
- print("Çerez popup'ı bulunamadı, devam ediliyor...")
 
 
 
52
 
53
  comprehensive_scroll(driver)
54
 
55
- # İlk çalışan xpath'leri kullanalım
56
- comment_elements = driver.find_elements(By.XPATH, '/html/body/div[1]/div[4]/div/div/div/div/div[3]/div/div/div[3]/div[2]/div')
 
 
 
 
 
57
  total_comments = len(comment_elements)
58
 
59
  if total_comments == 0:
60
- print("Yorum elementi bulunamadı!")
 
 
 
 
 
 
 
61
  return pd.DataFrame()
62
 
63
- print(f"Toplam {total_comments} yorum bulundu")
64
-
65
  data = []
66
- for i in range(1, total_comments + 1):
67
  try:
68
- username_xpath = f'/html/body/div[1]/div[4]/div/div/div/div/div[3]/div/div/div[3]/div[2]/div[{i}]/div[1]/div[2]/div[1]'
69
- username = driver.find_element(By.XPATH, username_xpath).text
70
  except:
71
  username = "N/A"
 
72
 
73
  try:
74
- comment_xpath = f'/html/body/div[1]/div[4]/div/div/div/div/div[3]/div/div/div[3]/div[2]/div[{i}]/div[2]/p'
75
- comment = driver.find_element(By.XPATH, comment_xpath).text
76
  except:
77
  comment = "N/A"
 
78
 
79
  try:
80
- date_xpath = f'/html/body/div[1]/div[4]/div/div/div/div/div[3]/div/div/div[3]/div[2]/div[{i}]/div[1]/div[2]/div[2]'
81
- date = driver.find_element(By.XPATH, date_xpath).text
82
  except:
83
  date = "N/A"
 
84
 
85
  try:
86
- star_xpath_base = f'/html/body/div[1]/div[4]/div/div/div/div/div[3]/div/div/div[3]/div[2]/div[{i}]/div[1]/div[1]/div'
87
- full_stars = driver.find_elements(By.XPATH, f"{star_xpath_base}/div[@class='star-w']/div[@class='full'][@style='width: 100%; max-width: 100%;']")
88
- star_count = len(full_stars)
89
  except:
90
- star_count = 0
 
91
 
92
  data.append({
93
  "Kullanıcı_id": i,
94
  "Kullanıcı Adı": username,
95
  "Yorum": comment,
96
  "Tarih": date,
97
- "Yıldız Sayısı": star_count
98
  })
99
 
100
  if i % 5 == 0:
101
- print(f"{i} yorum işlendi...")
102
 
 
103
  return pd.DataFrame(data)
104
 
105
  except Exception as e:
106
- print(f"Hata detayı: {str(e)}")
 
 
 
107
  return pd.DataFrame()
108
 
109
  finally:
110
  if 'driver' in locals():
111
- driver.quit()
 
 
10
  import random
11
 
12
  def scrape_reviews(url):
13
+ print("Scraping başlatılıyor...")
14
+
15
  data_directory = "data"
16
  if not os.path.exists(data_directory):
17
  os.makedirs(data_directory)
18
 
19
  def comprehensive_scroll(driver):
20
+ print("Sayfa kaydırma başlıyor...")
21
  last_height = driver.execute_script("return document.body.scrollHeight")
22
+ scroll_count = 0
23
  while True:
24
  driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
25
  time.sleep(3)
26
  new_height = driver.execute_script("return document.body.scrollHeight")
27
+ scroll_count += 1
28
+ print(f"Scroll {scroll_count}: {new_height}")
29
  if new_height == last_height:
30
  break
31
  last_height = new_height
32
+ print("Sayfa kaydırma tamamlandı")
33
 
 
34
  chrome_options = Options()
35
  chrome_options.add_argument('--headless')
36
  chrome_options.add_argument('--no-sandbox')
 
41
  chrome_options.add_argument("--window-size=1920,1080")
42
 
43
  try:
44
+ print("Chrome başlatılıyor...")
45
  service = Service()
46
  driver = webdriver.Chrome(service=service, options=chrome_options)
47
 
48
+ print(f"URL'ye gidiliyor: {url}")
49
  driver.get(url)
50
+ time.sleep(5) # Sayfa yüklenme süresini artırdık
51
+
52
+ # Sayfa kaynağını kontrol et
53
+ page_source = driver.page_source
54
+ print(f"Sayfa uzunluğu: {len(page_source)}")
55
 
56
  try:
57
+ print("Çerez popup'ı aranıyor...")
58
+ cookie_button = WebDriverWait(driver, 10).until(
59
+ EC.presence_of_element_located((By.ID, "onetrust-accept-btn-handler"))
60
+ )
61
+ cookie_button.click()
62
+ print("Çerez popup'ı kapatıldı")
63
+ except Exception as e:
64
+ print(f"Çerez popup'ı işlemi: {str(e)}")
65
 
66
  comprehensive_scroll(driver)
67
 
68
+ print("Yorum elementleri aranıyor...")
69
+ # Önce yorum container'ını bul
70
+ review_container = driver.find_element(By.CLASS_NAME, "pr-rnr-com-w")
71
+ print("Yorum container'ı bulundu")
72
+
73
+ # Yorum elementlerini bul
74
+ comment_elements = review_container.find_elements(By.CLASS_NAME, "comment-cards-item")
75
  total_comments = len(comment_elements)
76
 
77
  if total_comments == 0:
78
+ print("Alternatif yorum elementi aranıyor...")
79
+ comment_elements = driver.find_elements(By.CSS_SELECTOR, "div.comment-cards-item")
80
+ total_comments = len(comment_elements)
81
+
82
+ print(f"Bulunan yorum sayısı: {total_comments}")
83
+
84
+ if total_comments == 0:
85
+ print("Hiç yorum bulunamadı!")
86
  return pd.DataFrame()
87
 
 
 
88
  data = []
89
+ for i, element in enumerate(comment_elements, 1):
90
  try:
91
+ username = element.find_element(By.CLASS_NAME, "user-name").text
 
92
  except:
93
  username = "N/A"
94
+ print(f"Kullanıcı adı alınamadı: {i}")
95
 
96
  try:
97
+ comment = element.find_element(By.CLASS_NAME, "comment-text").text
 
98
  except:
99
  comment = "N/A"
100
+ print(f"Yorum metni alınamadı: {i}")
101
 
102
  try:
103
+ date = element.find_element(By.CLASS_NAME, "comment-date").text
 
104
  except:
105
  date = "N/A"
106
+ print(f"Tarih alınamadı: {i}")
107
 
108
  try:
109
+ stars = len(element.find_elements(By.CSS_SELECTOR, "div.full[style='width: 100%; max-width: 100%;']"))
 
 
110
  except:
111
+ stars = 0
112
+ print(f"Yıldız sayısı alınamadı: {i}")
113
 
114
  data.append({
115
  "Kullanıcı_id": i,
116
  "Kullanıcı Adı": username,
117
  "Yorum": comment,
118
  "Tarih": date,
119
+ "Yıldız Sayısı": stars
120
  })
121
 
122
  if i % 5 == 0:
123
+ print(f"{i} yorum işlendi")
124
 
125
+ print("Veri toplama tamamlandı")
126
  return pd.DataFrame(data)
127
 
128
  except Exception as e:
129
+ print(f"Kritik hata: {str(e)}")
130
+ if 'driver' in locals():
131
+ print("Son sayfa kaynağı:")
132
+ print(driver.page_source[:500]) # İlk 500 karakteri göster
133
  return pd.DataFrame()
134
 
135
  finally:
136
  if 'driver' in locals():
137
+ driver.quit()
138
+ print("Chrome kapatıldı")