fix
Browse files
- app.py +5 -1
- scrape/trendyol_scraper.py +29 -43
app.py
CHANGED
@@ -324,4 +324,8 @@ def create_interface():
|
|
324 |
|
325 |
if __name__ == "__main__":
|
326 |
interface = create_interface()
|
327 |
-
interface.launch(
|
|
|
|
|
|
|
|
|
|
324 |
|
325 |
if __name__ == "__main__":
|
326 |
interface = create_interface()
|
327 |
+
interface.launch(
|
328 |
+
server_name="0.0.0.0", # Dış bağlantılara izin ver
|
329 |
+
share=True, # Public link oluştur
|
330 |
+
server_port=7860 # Space'in varsayılan portu
|
331 |
+
)
|
scrape/trendyol_scraper.py
CHANGED
@@ -4,8 +4,6 @@ from selenium.webdriver.common.by import By
|
|
4 |
from selenium.webdriver.support.ui import WebDriverWait
|
5 |
from selenium.webdriver.support import expected_conditions as EC
|
6 |
from selenium.webdriver.chrome.options import Options
|
7 |
-
from webdriver_manager.chrome import ChromeDriverManager
|
8 |
-
from webdriver_manager.core.os_manager import ChromeType
|
9 |
import time
|
10 |
import pandas as pd
|
11 |
import os
|
@@ -20,9 +18,8 @@ def scrape_reviews(url):
|
|
20 |
def comprehensive_scroll(driver):
|
21 |
last_height = driver.execute_script("return document.body.scrollHeight")
|
22 |
while True:
|
23 |
-
|
24 |
-
|
25 |
-
time.sleep(random.uniform(2, 4))
|
26 |
new_height = driver.execute_script("return document.body.scrollHeight")
|
27 |
if new_height == last_height:
|
28 |
break
|
@@ -33,86 +30,75 @@ def scrape_reviews(url):
|
|
33 |
chrome_options.add_argument('--headless')
|
34 |
chrome_options.add_argument('--no-sandbox')
|
35 |
chrome_options.add_argument('--disable-dev-shm-usage')
|
36 |
-
chrome_options.add_argument('--disable-blink-features=AutomationControlled')
|
37 |
-
chrome_options.add_argument('--disable-extensions')
|
38 |
-
chrome_options.add_argument('--ignore-certificate-errors')
|
39 |
chrome_options.add_argument('--disable-gpu')
|
|
|
|
|
40 |
chrome_options.add_argument("--window-size=1920,1080")
|
41 |
|
42 |
-
# User agent ekleme
|
43 |
-
user_agents = [
|
44 |
-
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
45 |
-
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
46 |
-
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
|
47 |
-
]
|
48 |
-
chrome_options.add_argument(f'user-agent={random.choice(user_agents)}')
|
49 |
-
|
50 |
try:
|
51 |
-
# ChromeDriver
|
52 |
-
service = Service(
|
53 |
driver = webdriver.Chrome(service=service, options=chrome_options)
|
54 |
|
55 |
-
# WebDriver özelliğini gizle
|
56 |
-
driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
|
57 |
-
|
58 |
-
print("Sayfa yükleniyor...")
|
59 |
driver.get(url)
|
60 |
-
time.sleep(
|
61 |
|
62 |
try:
|
63 |
-
WebDriverWait(driver,
|
64 |
EC.element_to_be_clickable((By.ID, 'onetrust-accept-btn-handler'))
|
65 |
).click()
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
print("Yorumlar yükleniyor...")
|
71 |
comprehensive_scroll(driver)
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
comment_elements = driver.find_elements(By.CSS_SELECTOR, "div.comment-cards-item")
|
76 |
total_comments = len(comment_elements)
|
77 |
|
78 |
if total_comments == 0:
|
79 |
print("Yorum elementi bulunamadı!")
|
80 |
return pd.DataFrame()
|
81 |
|
82 |
-
print(f"Toplam {total_comments} yorum bulundu
|
83 |
|
84 |
data = []
|
85 |
-
for i
|
86 |
try:
|
87 |
-
|
|
|
88 |
except:
|
89 |
username = "N/A"
|
90 |
|
91 |
try:
|
92 |
-
|
|
|
93 |
except:
|
94 |
comment = "N/A"
|
95 |
|
96 |
try:
|
97 |
-
|
|
|
98 |
except:
|
99 |
date = "N/A"
|
100 |
|
101 |
try:
|
102 |
-
|
|
|
|
|
103 |
except:
|
104 |
-
|
105 |
|
106 |
data.append({
|
107 |
"Kullanıcı_id": i,
|
108 |
"Kullanıcı Adı": username,
|
109 |
"Yorum": comment,
|
110 |
"Tarih": date,
|
111 |
-
"Yıldız Sayısı":
|
112 |
})
|
113 |
-
|
114 |
if i % 5 == 0:
|
115 |
-
|
116 |
|
117 |
return pd.DataFrame(data)
|
118 |
|
|
|
4 |
from selenium.webdriver.support.ui import WebDriverWait
|
5 |
from selenium.webdriver.support import expected_conditions as EC
|
6 |
from selenium.webdriver.chrome.options import Options
|
|
|
|
|
7 |
import time
|
8 |
import pandas as pd
|
9 |
import os
|
|
|
18 |
def comprehensive_scroll(driver):
|
19 |
last_height = driver.execute_script("return document.body.scrollHeight")
|
20 |
while True:
|
21 |
+
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
|
22 |
+
time.sleep(3)
|
|
|
23 |
new_height = driver.execute_script("return document.body.scrollHeight")
|
24 |
if new_height == last_height:
|
25 |
break
|
|
|
30 |
chrome_options.add_argument('--headless')
|
31 |
chrome_options.add_argument('--no-sandbox')
|
32 |
chrome_options.add_argument('--disable-dev-shm-usage')
|
|
|
|
|
|
|
33 |
chrome_options.add_argument('--disable-gpu')
|
34 |
+
chrome_options.add_argument('--lang=tr')
|
35 |
+
chrome_options.add_argument('--disable-notifications')
|
36 |
chrome_options.add_argument("--window-size=1920,1080")
|
37 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
38 |
try:
|
39 |
+
# Linux için ChromeDriver ayarları
|
40 |
+
service = Service()
|
41 |
driver = webdriver.Chrome(service=service, options=chrome_options)
|
42 |
|
|
|
|
|
|
|
|
|
43 |
driver.get(url)
|
44 |
+
time.sleep(3)
|
45 |
|
46 |
try:
|
47 |
+
WebDriverWait(driver, 10).until(
|
48 |
EC.element_to_be_clickable((By.ID, 'onetrust-accept-btn-handler'))
|
49 |
).click()
|
50 |
+
except:
|
51 |
+
print("Çerez popup'ı bulunamadı, devam ediliyor...")
|
52 |
+
|
|
|
|
|
53 |
comprehensive_scroll(driver)
|
54 |
+
|
55 |
+
# İlk çalışan xpath'leri kullanalım
|
56 |
+
comment_elements = driver.find_elements(By.XPATH, '/html/body/div[1]/div[4]/div/div/div/div/div[3]/div/div/div[3]/div[2]/div')
|
|
|
57 |
total_comments = len(comment_elements)
|
58 |
|
59 |
if total_comments == 0:
|
60 |
print("Yorum elementi bulunamadı!")
|
61 |
return pd.DataFrame()
|
62 |
|
63 |
+
print(f"Toplam {total_comments} yorum bulundu")
|
64 |
|
65 |
data = []
|
66 |
+
for i in range(1, total_comments + 1):
|
67 |
try:
|
68 |
+
username_xpath = f'/html/body/div[1]/div[4]/div/div/div/div/div[3]/div/div/div[3]/div[2]/div[{i}]/div[1]/div[2]/div[1]'
|
69 |
+
username = driver.find_element(By.XPATH, username_xpath).text
|
70 |
except:
|
71 |
username = "N/A"
|
72 |
|
73 |
try:
|
74 |
+
comment_xpath = f'/html/body/div[1]/div[4]/div/div/div/div/div[3]/div/div/div[3]/div[2]/div[{i}]/div[2]/p'
|
75 |
+
comment = driver.find_element(By.XPATH, comment_xpath).text
|
76 |
except:
|
77 |
comment = "N/A"
|
78 |
|
79 |
try:
|
80 |
+
date_xpath = f'/html/body/div[1]/div[4]/div/div/div/div/div[3]/div/div/div[3]/div[2]/div[{i}]/div[1]/div[2]/div[2]'
|
81 |
+
date = driver.find_element(By.XPATH, date_xpath).text
|
82 |
except:
|
83 |
date = "N/A"
|
84 |
|
85 |
try:
|
86 |
+
star_xpath_base = f'/html/body/div[1]/div[4]/div/div/div/div/div[3]/div/div/div[3]/div[2]/div[{i}]/div[1]/div[1]/div'
|
87 |
+
full_stars = driver.find_elements(By.XPATH, f"{star_xpath_base}/div[@class='star-w']/div[@class='full'][@style='width: 100%; max-width: 100%;']")
|
88 |
+
star_count = len(full_stars)
|
89 |
except:
|
90 |
+
star_count = 0
|
91 |
|
92 |
data.append({
|
93 |
"Kullanıcı_id": i,
|
94 |
"Kullanıcı Adı": username,
|
95 |
"Yorum": comment,
|
96 |
"Tarih": date,
|
97 |
+
"Yıldız Sayısı": star_count
|
98 |
})
|
99 |
+
|
100 |
if i % 5 == 0:
|
101 |
+
print(f"{i} yorum işlendi...")
|
102 |
|
103 |
return pd.DataFrame(data)
|
104 |
|