enesmanan committed on
Commit
4c0f60d
·
verified ·
1 Parent(s): 6b48b71

scraper fix

Browse files
Files changed (1) hide show
  1. scrape/trendyol_scraper.py +59 -24
scrape/trendyol_scraper.py CHANGED
@@ -7,6 +7,7 @@ from selenium.webdriver.chrome.options import Options
7
  import time
8
  import pandas as pd
9
  import os
 
10
 
11
  def scrape_reviews(url):
12
  # Create data directory if it doesn't exist
@@ -17,76 +18,110 @@ def scrape_reviews(url):
17
  def comprehensive_scroll(driver):
18
  last_height = driver.execute_script("return document.body.scrollHeight")
19
  while True:
20
- driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
21
- time.sleep(3)
 
22
  new_height = driver.execute_script("return document.body.scrollHeight")
23
  if new_height == last_height:
24
  break
25
  last_height = new_height
26
 
27
- # Space'in kendi Chrome ayarlarını kullan
28
  chrome_options = Options()
29
  chrome_options.add_argument('--headless')
30
  chrome_options.add_argument('--no-sandbox')
31
  chrome_options.add_argument('--disable-dev-shm-usage')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
 
33
  try:
34
- # Space'in ChromeDriver path'ini kullan
35
  driver = webdriver.Chrome(options=chrome_options)
36
 
 
 
 
 
 
37
  driver.get(url)
 
38
 
39
  try:
40
- WebDriverWait(driver, 10).until(
 
41
  EC.element_to_be_clickable((By.ID, 'onetrust-accept-btn-handler'))
42
  ).click()
43
- except:
44
- print("Çerez popup'ı bulunamadı, devam ediliyor...")
45
-
 
 
46
  comprehensive_scroll(driver)
47
-
48
- comment_elements = driver.find_elements(By.XPATH, '/html/body/div[1]/div[4]/div/div/div/div/div[3]/div/div/div[3]/div[2]/div')
 
 
 
49
  total_comments = len(comment_elements)
50
-
 
 
 
 
 
 
51
  data = []
52
- for i in range(1, total_comments + 1):
53
  try:
54
- username_xpath = f'/html/body/div[1]/div[4]/div/div/div/div/div[3]/div/div/div[3]/div[2]/div[{i}]/div[1]/div[2]/div[1]'
55
- username = driver.find_element(By.XPATH, username_xpath).text
56
  except:
57
  username = "N/A"
58
 
59
  try:
60
- comment_xpath = f'/html/body/div[1]/div[4]/div/div/div/div/div[3]/div/div/div[3]/div[2]/div[{i}]/div[2]/p'
61
- comment = driver.find_element(By.XPATH, comment_xpath).text
62
  except:
63
  comment = "N/A"
64
 
65
  try:
66
- date_xpath = f'/html/body/div[1]/div[4]/div/div/div/div/div[3]/div/div/div[3]/div[2]/div[{i}]/div[1]/div[2]/div[2]'
67
- date = driver.find_element(By.XPATH, date_xpath).text
68
  except:
69
  date = "N/A"
70
 
71
  try:
72
- star_xpath_base = f'/html/body/div[1]/div[4]/div/div/div/div/div[3]/div/div/div[3]/div[2]/div[{i}]/div[1]/div[1]/div'
73
- full_stars = driver.find_elements(By.XPATH, f"{star_xpath_base}/div[@class='star-w']/div[@class='full'][@style='width: 100%; max-width: 100%;']")
74
- star_count = len(full_stars)
75
  except:
76
- star_count = 0
77
 
78
  data.append({
79
  "Kullanıcı_id": i,
80
  "Kullanıcı Adı": username,
81
  "Yorum": comment,
82
  "Tarih": date,
83
- "Yıldız Sayısı": star_count
84
  })
 
 
 
 
85
 
86
  return pd.DataFrame(data)
87
 
88
  except Exception as e:
89
- print(f"Hata oluştu: {str(e)}")
90
  return pd.DataFrame()
91
 
92
  finally:
 
7
  import time
8
  import pandas as pd
9
  import os
10
+ import random
11
 
12
  def scrape_reviews(url):
13
  # Create data directory if it doesn't exist
 
18
  def comprehensive_scroll(driver):
19
  last_height = driver.execute_script("return document.body.scrollHeight")
20
  while True:
21
+ # Rastgele scroll
22
+ driver.execute_script(f"window.scrollTo(0, {last_height * random.uniform(0.7, 1.0)});")
23
+ time.sleep(random.uniform(2, 4)) # Rastgele bekleme
24
  new_height = driver.execute_script("return document.body.scrollHeight")
25
  if new_height == last_height:
26
  break
27
  last_height = new_height
28
 
29
+ # Chrome ayarları
30
  chrome_options = Options()
31
  chrome_options.add_argument('--headless')
32
  chrome_options.add_argument('--no-sandbox')
33
  chrome_options.add_argument('--disable-dev-shm-usage')
34
+ chrome_options.add_argument('--disable-blink-features=AutomationControlled')
35
+ chrome_options.add_argument('--disable-extensions')
36
+ chrome_options.add_argument('--ignore-certificate-errors')
37
+ chrome_options.add_argument('--disable-gpu')
38
+ chrome_options.add_argument("--window-size=1920,1080")
39
+
40
+ # User agent ekleme
41
+ user_agents = [
42
+ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
43
+ 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
44
+ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
45
+ ]
46
+ chrome_options.add_argument(f'user-agent={random.choice(user_agents)}')
47
+
48
+ # Bot tespitini zorlaştırmak için ek ayarlar
49
+ chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
50
+ chrome_options.add_experimental_option('useAutomationExtension', False)
51
 
52
  try:
 
53
  driver = webdriver.Chrome(options=chrome_options)
54
 
55
+ # WebDriver özelliğini gizle
56
+ driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
57
+
58
+ # Sayfayı yükle
59
+ print("Sayfa yükleniyor...")
60
  driver.get(url)
61
+ time.sleep(random.uniform(3, 5)) # Rastgele bekleme
62
 
63
  try:
64
+ # Çerez popup'ını bekle ve tıkla
65
+ WebDriverWait(driver, 15).until(
66
  EC.element_to_be_clickable((By.ID, 'onetrust-accept-btn-handler'))
67
  ).click()
68
+ time.sleep(random.uniform(1, 2))
69
+ except Exception as e:
70
+ print(f"Çerez popup'ı hatası: {str(e)}")
71
+
72
+ print("Yorumlar yükleniyor...")
73
  comprehensive_scroll(driver)
74
+
75
+ # Yorumları bulmadan önce kısa bir bekleme
76
+ time.sleep(random.uniform(2, 3))
77
+
78
+ comment_elements = driver.find_elements(By.XPATH, '//div[contains(@class, "comment-cards-item")]')
79
  total_comments = len(comment_elements)
80
+
81
+ if total_comments == 0:
82
+ print("Yorum elementi bulunamadı!")
83
+ return pd.DataFrame()
84
+
85
+ print(f"Toplam {total_comments} yorum bulundu.")
86
+
87
  data = []
88
+ for i, element in enumerate(comment_elements, 1):
89
  try:
90
+ username = element.find_element(By.CLASS_NAME, "user-name").text
 
91
  except:
92
  username = "N/A"
93
 
94
  try:
95
+ comment = element.find_element(By.CLASS_NAME, "comment-text").text
 
96
  except:
97
  comment = "N/A"
98
 
99
  try:
100
+ date = element.find_element(By.CLASS_NAME, "comment-date").text
 
101
  except:
102
  date = "N/A"
103
 
104
  try:
105
+ stars = len(element.find_elements(By.CSS_SELECTOR, "div.full[style='width: 100%; max-width: 100%;']"))
 
 
106
  except:
107
+ stars = 0
108
 
109
  data.append({
110
  "Kullanıcı_id": i,
111
  "Kullanıcı Adı": username,
112
  "Yorum": comment,
113
  "Tarih": date,
114
+ "Yıldız Sayısı": stars
115
  })
116
+
117
+ # Her 5 yorumda bir rastgele bekleme
118
+ if i % 5 == 0:
119
+ time.sleep(random.uniform(0.5, 1))
120
 
121
  return pd.DataFrame(data)
122
 
123
  except Exception as e:
124
+ print(f"Hata detayı: {str(e)}")
125
  return pd.DataFrame()
126
 
127
  finally: