enesmanan committed on
Commit
ab36536
·
verified ·
1 Parent(s): 4c0f60d

fix linux driver

Browse files
Files changed (1) hide show
  1. scrape/trendyol_scraper.py +124 -128
scrape/trendyol_scraper.py CHANGED
@@ -1,129 +1,125 @@
1
- from selenium import webdriver
2
- from selenium.webdriver.chrome.service import Service
3
- from selenium.webdriver.common.by import By
4
- from selenium.webdriver.support.ui import WebDriverWait
5
- from selenium.webdriver.support import expected_conditions as EC
6
- from selenium.webdriver.chrome.options import Options
7
- import time
8
- import pandas as pd
9
- import os
10
- import random
11
-
12
def scrape_reviews(url):
    """Scrape the review cards found on the page at ``url``.

    Starts a headless Chrome session, dismisses the cookie banner when it
    shows up, scrolls until the page height stops growing and then reads
    every element whose class contains ``comment-cards-item``.

    Args:
        url: Address of the page to load.

    Returns:
        pandas.DataFrame with one row per review card and the columns
        "Kullanıcı_id", "Kullanıcı Adı", "Yorum", "Tarih" and
        "Yıldız Sayısı". An empty DataFrame is returned when no review card
        is found or when the browser session fails (the error is printed).
    """
    # Make sure the output directory exists; exist_ok avoids the
    # check-then-create race of os.path.exists + os.makedirs.
    data_directory = "data"
    os.makedirs(data_directory, exist_ok=True)

    def comprehensive_scroll(driver):
        """Scroll down in random steps until the page height stops changing."""
        last_height = driver.execute_script("return document.body.scrollHeight")
        while True:
            # Jump to a random point in the last 30% of the page.
            driver.execute_script(f"window.scrollTo(0, {last_height * random.uniform(0.7, 1.0)});")
            time.sleep(random.uniform(2, 4))  # random pause between scrolls
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

    def text_or_na(element, class_name):
        """Return the text of the first child with ``class_name``, or "N/A"."""
        try:
            return element.find_element(By.CLASS_NAME, class_name).text
        except Exception:
            # Best effort: a missing or stale field must not abort the scrape,
            # but Ctrl-C / SystemExit are no longer swallowed.
            return "N/A"

    # Chrome settings
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument('--disable-blink-features=AutomationControlled')
    chrome_options.add_argument('--disable-extensions')
    chrome_options.add_argument('--ignore-certificate-errors')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument("--window-size=1920,1080")

    # Pick one user agent at random for this session
    user_agents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
    ]
    chrome_options.add_argument(f'user-agent={random.choice(user_agents)}')

    # Extra settings that make bot detection harder
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option('useAutomationExtension', False)

    # Bound before the try so the finally clause can test it directly.
    driver = None
    try:
        driver = webdriver.Chrome(options=chrome_options)

        # Hide navigator.webdriver. The script is registered for every new
        # document: a plain execute_script before driver.get() would only
        # patch the initial blank page and be lost on navigation.
        driver.execute_cdp_cmd(
            "Page.addScriptToEvaluateOnNewDocument",
            {"source": "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"},
        )

        # Load the page
        print("Sayfa yükleniyor...")
        driver.get(url)
        time.sleep(random.uniform(3, 5))  # random pause

        try:
            # Wait for the cookie popup and accept it
            WebDriverWait(driver, 15).until(
                EC.element_to_be_clickable((By.ID, 'onetrust-accept-btn-handler'))
            ).click()
            time.sleep(random.uniform(1, 2))
        except Exception as e:
            # The banner is optional; report and carry on.
            print(f"Çerez popup'ı hatası: {str(e)}")

        print("Yorumlar yükleniyor...")
        comprehensive_scroll(driver)

        # Short pause before collecting the review cards
        time.sleep(random.uniform(2, 3))

        comment_elements = driver.find_elements(By.XPATH, '//div[contains(@class, "comment-cards-item")]')
        total_comments = len(comment_elements)

        if total_comments == 0:
            print("Yorum elementi bulunamadı!")
            return pd.DataFrame()

        print(f"Toplam {total_comments} yorum bulundu.")

        data = []
        for i, element in enumerate(comment_elements, 1):
            username = text_or_na(element, "user-name")
            comment = text_or_na(element, "comment-text")
            date = text_or_na(element, "comment-date")

            try:
                # Count the star elements rendered at full width
                # (presumably the filled stars -- verify against the markup).
                stars = len(element.find_elements(By.CSS_SELECTOR, "div.full[style='width: 100%; max-width: 100%;']"))
            except Exception:
                stars = 0

            data.append({
                "Kullanıcı_id": i,
                "Kullanıcı Adı": username,
                "Yorum": comment,
                "Tarih": date,
                "Yıldız Sayısı": stars
            })

            # Random pause after every fifth review
            if i % 5 == 0:
                time.sleep(random.uniform(0.5, 1))

        return pd.DataFrame(data)

    except Exception as e:
        print(f"Hata detayı: {str(e)}")
        return pd.DataFrame()

    finally:
        # Always release the browser, but only if it was actually started.
        if driver is not None:
            driver.quit()
 
1
+ from selenium import webdriver
2
+ from selenium.webdriver.chrome.service import Service
3
+ from selenium.webdriver.common.by import By
4
+ from selenium.webdriver.support.ui import WebDriverWait
5
+ from selenium.webdriver.support import expected_conditions as EC
6
+ from selenium.webdriver.chrome.options import Options
7
+ from webdriver_manager.chrome import ChromeDriverManager
8
+ from webdriver_manager.core.os_manager import ChromeType
9
+ import time
10
+ import pandas as pd
11
+ import os
12
+ import random
13
+
14
def scrape_reviews(url):
    """Scrape the review cards found on the page at ``url``.

    Starts a headless Chromium session (the driver binary is resolved by
    ``ChromeDriverManager``), dismisses the cookie banner when it shows up,
    scrolls until the page height stops growing and then reads every
    ``div.comment-cards-item`` element.

    Args:
        url: Address of the page to load.

    Returns:
        pandas.DataFrame with one row per review card and the columns
        "Kullanıcı_id", "Kullanıcı Adı", "Yorum", "Tarih" and
        "Yıldız Sayısı". An empty DataFrame is returned when no review card
        is found or when the browser session fails (the error is printed).
    """
    # Make sure the output directory exists; exist_ok avoids the
    # check-then-create race of os.path.exists + os.makedirs.
    data_directory = "data"
    os.makedirs(data_directory, exist_ok=True)

    def comprehensive_scroll(driver):
        """Scroll down in random steps until the page height stops changing."""
        last_height = driver.execute_script("return document.body.scrollHeight")
        while True:
            # Jump to a random point in the last 30% of the page.
            driver.execute_script(f"window.scrollTo(0, {last_height * random.uniform(0.7, 1.0)});")
            time.sleep(random.uniform(2, 4))
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

    def text_or_na(element, class_name):
        """Return the text of the first child with ``class_name``, or "N/A"."""
        try:
            return element.find_element(By.CLASS_NAME, class_name).text
        except Exception:
            # Best effort: a missing or stale field must not abort the scrape,
            # but Ctrl-C / SystemExit are no longer swallowed.
            return "N/A"

    # Chrome settings
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument('--disable-blink-features=AutomationControlled')
    chrome_options.add_argument('--disable-extensions')
    chrome_options.add_argument('--ignore-certificate-errors')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument("--window-size=1920,1080")

    # Pick one user agent at random for this session
    user_agents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
    ]
    chrome_options.add_argument(f'user-agent={random.choice(user_agents)}')

    # Bound before the try so the finally clause can test it directly.
    driver = None
    try:
        # Let webdriver-manager download/locate the matching driver binary
        service = Service(ChromeDriverManager(chrome_type=ChromeType.CHROMIUM).install())
        driver = webdriver.Chrome(service=service, options=chrome_options)

        # Hide navigator.webdriver. The script is registered for every new
        # document: a plain execute_script before driver.get() would only
        # patch the initial blank page and be lost on navigation.
        driver.execute_cdp_cmd(
            "Page.addScriptToEvaluateOnNewDocument",
            {"source": "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"},
        )

        print("Sayfa yükleniyor...")
        driver.get(url)
        time.sleep(random.uniform(3, 5))

        try:
            # Wait for the cookie popup and accept it
            WebDriverWait(driver, 15).until(
                EC.element_to_be_clickable((By.ID, 'onetrust-accept-btn-handler'))
            ).click()
            time.sleep(random.uniform(1, 2))
        except Exception as e:
            # The banner is optional; report and carry on.
            print(f"Çerez popup'ı hatası: {str(e)}")

        print("Yorumlar yükleniyor...")
        comprehensive_scroll(driver)
        time.sleep(random.uniform(2, 3))

        # Locate the review cards
        comment_elements = driver.find_elements(By.CSS_SELECTOR, "div.comment-cards-item")
        total_comments = len(comment_elements)

        if total_comments == 0:
            print("Yorum elementi bulunamadı!")
            return pd.DataFrame()

        print(f"Toplam {total_comments} yorum bulundu.")

        data = []
        for i, element in enumerate(comment_elements, 1):
            username = text_or_na(element, "user-name")
            comment = text_or_na(element, "comment-text")
            date = text_or_na(element, "comment-date")

            try:
                # Count the star elements rendered at full width
                # (presumably the filled stars -- verify against the markup).
                stars = len(element.find_elements(By.CSS_SELECTOR, "div.full[style='width: 100%; max-width: 100%;']"))
            except Exception:
                stars = 0

            data.append({
                "Kullanıcı_id": i,
                "Kullanıcı Adı": username,
                "Yorum": comment,
                "Tarih": date,
                "Yıldız Sayısı": stars
            })

            # Random pause after every fifth review
            if i % 5 == 0:
                time.sleep(random.uniform(0.5, 1))

        return pd.DataFrame(data)

    except Exception as e:
        print(f"Hata detayı: {str(e)}")
        return pd.DataFrame()

    finally:
        # Always release the browser, but only if it was actually started.
        if driver is not None:
            driver.quit()