enesmanan committed on
Commit
f6d39c8
·
verified ·
1 Parent(s): c63bed5
Files changed (1) hide show
  1. scrape/trendyol_scraper.py +75 -45
scrape/trendyol_scraper.py CHANGED
@@ -3,108 +3,138 @@ from selenium.webdriver.chrome.service import Service
3
  from selenium.webdriver.common.by import By
4
  from selenium.webdriver.support.ui import WebDriverWait
5
  from selenium.webdriver.support import expected_conditions as EC
 
6
  import time
7
  import pandas as pd
8
  import os
 
 
 
9
 
10
  def comprehensive_scroll(driver):
11
- # Keep scrolling to the bottom until the page height stops changing
12
- last_height = driver.execute_script("return document.body.scrollHeight")
13
- while True:
14
- # Jump to the current bottom of the document
15
- driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
16
- time.sleep(3) # Fixed 3-second pause so newly loaded content can change the page height
17
-
18
- # Re-read the document height after the pause
19
- new_height = driver.execute_script("return document.body.scrollHeight")
20
-
21
- # Height unchanged since the previous pass: nothing new was added, so stop
22
- if new_height == last_height:
23
- break
24
-
25
- last_height = new_height
26
 
27
  def scrape_reviews(url):
28
  """Function that fetches the reviews from the URL"""
29
- # Create the data directory
30
- data_directory = "data"
31
- if not os.path.exists(data_directory):
32
- os.makedirs(data_directory)
 
 
33
 
34
- # Chrome options settings
35
- chrome_options = webdriver.ChromeOptions()
36
- chrome_options.add_argument('--headless')
37
- chrome_options.add_argument('--disable-gpu')
38
- chrome_options.add_argument('--no-sandbox')
39
- chrome_options.add_argument('--disable-dev-shm-usage')
40
- chrome_options.add_argument("--window-size=1920,1080")
41
 
42
- try:
43
  # ChromeDriver setup for Linux
44
- service = Service('chromedriver') # No need to specify a path on Linux
45
- driver = webdriver.Chrome(service=service, options=chrome_options)
 
 
 
 
 
 
 
 
 
 
 
46
 
 
47
  driver.get(url)
48
 
49
  # Accept the cookie popup
50
- WebDriverWait(driver, 10).until(
51
- EC.element_to_be_clickable((By.ID, 'onetrust-accept-btn-handler'))
52
- ).click()
 
 
 
 
53
 
 
54
  comprehensive_scroll(driver)
55
 
 
56
  comment_elements = driver.find_elements(By.XPATH, '/html/body/div[1]/div[4]/div/div/div/div/div[3]/div/div/div[3]/div[2]/div')
57
  total_comments = len(comment_elements)
 
58
 
59
  data = []
60
  for i in range(1, total_comments + 1):
61
- kullanıcı_id = i
62
  try:
63
  username_xpath = f'/html/body/div[1]/div[4]/div/div/div/div/div[3]/div/div/div[3]/div[2]/div[{i}]/div[1]/div[2]/div[1]'
64
  username = driver.find_element(By.XPATH, username_xpath).text
65
- except:
66
  username = "N/A"
67
 
68
  try:
69
  comment_xpath = f'/html/body/div[1]/div[4]/div/div/div/div/div[3]/div/div/div[3]/div[2]/div[{i}]/div[2]/p'
70
  comment = driver.find_element(By.XPATH, comment_xpath).text
71
- except:
72
  comment = "N/A"
73
 
74
  try:
75
  date_xpath = f'/html/body/div[1]/div[4]/div/div/div/div/div[3]/div/div/div[3]/div[2]/div[{i}]/div[1]/div[2]/div[2]'
76
  date = driver.find_element(By.XPATH, date_xpath).text
77
- except:
78
  date = "N/A"
79
 
80
- star_xpath_base = f'/html/body/div[1]/div[4]/div/div/div/div/div[3]/div/div/div[3]/div[2]/div[{i}]/div[1]/div[1]/div'
81
  try:
 
82
  full_stars = driver.find_elements(By.XPATH, f"{star_xpath_base}/div[@class='star-w']/div[@class='full'][@style='width: 100%; max-width: 100%;']")
83
  star_count = len(full_stars)
84
- except:
85
  star_count = 0
86
 
87
  data.append({
88
- "Kullanıcı_id": kullanıcı_id,
89
  "Kullanıcı Adı": username,
90
  "Yorum": comment,
91
  "Tarih": date,
92
  "Yıldız Sayısı": star_count
93
  })
94
 
 
 
 
 
 
95
  # Save as a temporary file
96
  temp_file = os.path.join(data_directory, 'temp_comments.csv')
97
- df = pd.DataFrame(data)
98
  df.to_csv(temp_file, index=False, encoding='utf-8-sig')
 
99
 
100
  return df
101
 
102
  except Exception as e:
103
- print(f"Hata oluştu: {str(e)}")
104
- return pd.DataFrame() # Return an empty DataFrame
105
 
106
  finally:
107
- driver.quit()
 
 
108
  # Delete the temporary file
109
- if os.path.exists(os.path.join(data_directory, 'temp_comments.csv')):
110
- os.remove(os.path.join(data_directory, 'temp_comments.csv'))
 
 
 
3
  from selenium.webdriver.common.by import By
4
  from selenium.webdriver.support.ui import WebDriverWait
5
  from selenium.webdriver.support import expected_conditions as EC
6
+ from selenium.common.exceptions import TimeoutException, NoSuchElementException
7
  import time
8
  import pandas as pd
9
  import os
10
+ import logging
11
+
12
+ logger = logging.getLogger(__name__)
13
 
14
  def comprehensive_scroll(driver):
15
+ """Scroll until no more new content is loaded"""
16
+ try:
17
+ last_height = driver.execute_script("return document.body.scrollHeight")
18
+ while True:
19
+ driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
20
+ time.sleep(3)
21
+
22
+ new_height = driver.execute_script("return document.body.scrollHeight")
23
+ if new_height == last_height:
24
+ break
25
+ last_height = new_height
26
+ except Exception as e:
27
+ logger.error(f"Scroll sırasında hata: {str(e)}")
 
 
28
 
29
  def scrape_reviews(url):
30
  """URL'den yorumları çeken fonksiyon"""
31
+ driver = None
32
+ try:
33
+ # Data directory oluştur
34
+ data_directory = "data"
35
+ if not os.path.exists(data_directory):
36
+ os.makedirs(data_directory)
37
 
38
+ # Chrome options ayarları
39
+ chrome_options = webdriver.ChromeOptions()
40
+ chrome_options.add_argument('--headless')
41
+ chrome_options.add_argument('--disable-gpu')
42
+ chrome_options.add_argument('--no-sandbox')
43
+ chrome_options.add_argument('--disable-dev-shm-usage')
44
+ chrome_options.add_argument("--window-size=1920,1080")
45
 
 
46
  # Linux için ChromeDriver ayarı
47
+ try:
48
+ # Önce /usr/local/bin/chromedriver'ı dene
49
+ service = Service('/usr/local/bin/chromedriver')
50
+ driver = webdriver.Chrome(service=service, options=chrome_options)
51
+ except:
52
+ try:
53
+ # Eğer başarısız olursa /usr/bin/chromedriver'ı dene
54
+ service = Service('/usr/bin/chromedriver')
55
+ driver = webdriver.Chrome(service=service, options=chrome_options)
56
+ except:
57
+ # Son olarak PATH'teki chromedriver'ı dene
58
+ service = Service('chromedriver')
59
+ driver = webdriver.Chrome(service=service, options=chrome_options)
60
 
61
+ logger.info(f"URL'ye erişiliyor: {url}")
62
  driver.get(url)
63
 
64
  # Çerez popup'ını kabul et
65
+ try:
66
+ WebDriverWait(driver, 10).until(
67
+ EC.element_to_be_clickable((By.ID, 'onetrust-accept-btn-handler'))
68
+ ).click()
69
+ logger.info("Çerez popup'ı kabul edildi")
70
+ except TimeoutException:
71
+ logger.warning("Çerez popup'ı bulunamadı veya tıklanamadı")
72
 
73
+ logger.info("Sayfa kaydırılıyor...")
74
  comprehensive_scroll(driver)
75
 
76
+ logger.info("Yorumlar toplanıyor...")
77
  comment_elements = driver.find_elements(By.XPATH, '/html/body/div[1]/div[4]/div/div/div/div/div[3]/div/div/div[3]/div[2]/div')
78
  total_comments = len(comment_elements)
79
+ logger.info(f"Toplam {total_comments} yorum bulundu")
80
 
81
  data = []
82
  for i in range(1, total_comments + 1):
 
83
  try:
84
  username_xpath = f'/html/body/div[1]/div[4]/div/div/div/div/div[3]/div/div/div[3]/div[2]/div[{i}]/div[1]/div[2]/div[1]'
85
  username = driver.find_element(By.XPATH, username_xpath).text
86
+ except NoSuchElementException:
87
  username = "N/A"
88
 
89
  try:
90
  comment_xpath = f'/html/body/div[1]/div[4]/div/div/div/div/div[3]/div/div/div[3]/div[2]/div[{i}]/div[2]/p'
91
  comment = driver.find_element(By.XPATH, comment_xpath).text
92
+ except NoSuchElementException:
93
  comment = "N/A"
94
 
95
  try:
96
  date_xpath = f'/html/body/div[1]/div[4]/div/div/div/div/div[3]/div/div/div[3]/div[2]/div[{i}]/div[1]/div[2]/div[2]'
97
  date = driver.find_element(By.XPATH, date_xpath).text
98
+ except NoSuchElementException:
99
  date = "N/A"
100
 
 
101
  try:
102
+ star_xpath_base = f'/html/body/div[1]/div[4]/div/div/div/div/div[3]/div/div/div[3]/div[2]/div[{i}]/div[1]/div[1]/div'
103
  full_stars = driver.find_elements(By.XPATH, f"{star_xpath_base}/div[@class='star-w']/div[@class='full'][@style='width: 100%; max-width: 100%;']")
104
  star_count = len(full_stars)
105
+ except NoSuchElementException:
106
  star_count = 0
107
 
108
  data.append({
109
+ "Kullanıcı_id": i,
110
  "Kullanıcı Adı": username,
111
  "Yorum": comment,
112
  "Tarih": date,
113
  "Yıldız Sayısı": star_count
114
  })
115
 
116
+ if i % 10 == 0:
117
+ logger.info(f"{i}/{total_comments} yorum toplandı")
118
+
119
+ df = pd.DataFrame(data)
120
+
121
  # Geçici dosya olarak kaydet
122
  temp_file = os.path.join(data_directory, 'temp_comments.csv')
 
123
  df.to_csv(temp_file, index=False, encoding='utf-8-sig')
124
+ logger.info(f"Veriler {temp_file} dosyasına kaydedildi")
125
 
126
  return df
127
 
128
  except Exception as e:
129
+ logger.error(f"Veri çekme sırasında hata: {str(e)}")
130
+ return pd.DataFrame()
131
 
132
  finally:
133
+ if driver:
134
+ driver.quit()
135
+ logger.info("Chrome driver kapatıldı")
136
  # Geçici dosyayı sil
137
+ temp_file = os.path.join("data", 'temp_comments.csv')
138
+ if os.path.exists(temp_file):
139
+ os.remove(temp_file)
140
+ logger.info("Geçici dosya silindi")