enesmanan committed on
Commit bd87b2f · verified
1 Parent(s): 74fdc74
Files changed (3)
  1. app.py +317 -0
  2. requirements.txt +12 -0
  3. scrape/trendyol_scraper.py +89 -0
app.py ADDED
@@ -0,0 +1,317 @@
+ import gradio as gr
+ import pandas as pd
+ import plotly.express as px
+ import plotly.graph_objects as go
+ import os
+ import shutil
+ from scrape.trendyol_scraper import scrape_reviews
+ import torch
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
+ import re
+ from tqdm import tqdm
+ import nltk
+ from nltk.corpus import stopwords
+
+ class ReviewAnalysisApp:
+     def __init__(self):
+         self.setup_models()
+         self.setup_stopwords()
+
+     def setup_stopwords(self):
+         """Load the Turkish stopwords"""
+         try:
+             nltk.data.find('corpora/stopwords')
+         except LookupError:
+             nltk.download('stopwords')
+
+         self.turkish_stopwords = set(stopwords.words('turkish'))
+         # Add extra stopwords (logistics/seller terms)
+         self.logistics_seller_words = {
+             'kargo', 'kargocu', 'paket', 'gönderi', 'satıcı', 'mağaza',
+             'sipariş', 'teslimat', 'gönderim', 'kutu', 'paketleme'
+         }
+         self.turkish_stopwords.update(self.logistics_seller_words)
+
+     def setup_models(self):
+         """Load and prepare the models"""
+         # Sentiment model setup
+         self.device = "cuda" if torch.cuda.is_available() else "cpu"
+         print(f"Cihaz: {self.device}")
+
+         model_name = "savasy/bert-base-turkish-sentiment-cased"
+         self.sentiment_tokenizer = AutoTokenizer.from_pretrained(model_name)
+         self.sentiment_model = (
+             AutoModelForSequenceClassification.from_pretrained(model_name)
+             .to(self.device)
+             .to(torch.float32)
+         )
+
+         # Summary model setup
+         print("Trendyol-LLM modeli yükleniyor...")
+         model_id = "Trendyol/Trendyol-LLM-8b-chat-v2.0"
+         self.summary_pipe = pipeline(
+             "text-generation",
+             model=model_id,
+             torch_dtype="auto",
+             device_map='auto',
+         )
+
+         self.terminators = [
+             self.summary_pipe.tokenizer.eos_token_id,
+             self.summary_pipe.tokenizer.convert_tokens_to_ids("<|eot_id|>")
+         ]
+
+         self.sampling_params = {
+             "do_sample": True,
+             "temperature": 0.3,
+             "top_k": 50,
+             "top_p": 0.9,
+             "repetition_penalty": 1.1
+         }
+
+     def preprocess_text(self, text):
+         """Basic text preprocessing"""
+         if isinstance(text, str):
+             # Lowercase
+             text = text.lower()
+             # Remove punctuation and special characters
+             text = re.sub(r'[^\w\s]', '', text)
+             # Remove digits
+             text = re.sub(r'\d+', '', text)
+             # Collapse extra whitespace
+             text = re.sub(r'\s+', ' ', text).strip()
+             # Drop stopwords
+             words = text.split()
+             words = [word for word in words if word not in self.turkish_stopwords]
+             return ' '.join(words)
+         return ''
+
+     def filter_product_reviews(self, df):
+         """Filter out reviews that are not about the product itself"""
+         def is_product_review(text):
+             if not isinstance(text, str):
+                 return False
+             return not any(word in text.lower() for word in self.logistics_seller_words)
+
+         filtered_df = df[df['Yorum'].apply(is_product_review)].copy()
+
+         print(f"\nFiltreleme İstatistikleri:")
+         print(f"Toplam yorum sayısı: {len(df)}")
+         print(f"Ürün yorumu sayısı: {len(filtered_df)}")
+         print(f"Filtrelenen yorum sayısı: {len(df) - len(filtered_df)}")
+         print(f"Filtreleme oranı: {((len(df) - len(filtered_df)) / len(df) * 100):.2f}%")
+
+         return filtered_df
+
+     def predict_sentiment(self, text):
+         """Run sentiment analysis on a single review"""
+         # Clean the text first
+         text = self.preprocess_text(text)
+
+         if not text:
+             return {"label": "nötr", "score": 0.5}
+
+         inputs = self.sentiment_tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
+         inputs = {k: v.to(self.device) for k, v in inputs.items()}
+
+         with torch.no_grad():
+             outputs = self.sentiment_model(**inputs)
+             scores = torch.nn.functional.softmax(outputs.logits, dim=1)
+
+         positive_score = scores[0][1].item()
+         label = "pozitif" if positive_score > 0.5 else "negatif"
+
+         return {"label": label, "score": positive_score}
+
+     def analyze_reviews(self, df):
+         """Analyze all reviews"""
+         print("\nSentiment analizi başlatılıyor...")
+
+         # First filter out reviews that are not about the product
+         df = self.filter_product_reviews(df)
+
+         # Sentiment analysis
+         results = []
+         for text in tqdm(df['Yorum'], desc="Yorumlar analiz ediliyor"):
+             sentiment = self.predict_sentiment(text)
+             results.append(sentiment)
+
+         df['sentiment_score'] = [r['score'] for r in results]
+         df['sentiment_label'] = [r['label'] for r in results]
+
+         return df
+
+     def generate_summary(self, df):
+         """Summarize the reviews"""
+         # Basic statistics
+         avg_rating = df['Yıldız Sayısı'].mean()
+         total_reviews = len(df)
+
+         # Group by sentiment
+         positive_comments = df[df['sentiment_label'] == 'pozitif']['Yorum'].tolist()
+         negative_comments = df[df['sentiment_label'] == 'negatif']['Yorum'].tolist()
+         positive_count = len(positive_comments)
+         negative_count = len(negative_comments)
+
+         # Star distribution
+         star_dist = df['Yıldız Sayısı'].value_counts().sort_index()
+         star_dist_text = "\n".join([f"{star} yıldız: {count} yorum" for star, count in star_dist.items()])
+
+         # Most frequent words (stopwords removed)
+         all_words = []
+         for text in df['Yorum']:
+             cleaned_text = self.preprocess_text(text)
+             if cleaned_text:
+                 all_words.extend(cleaned_text.split())
+
+         from collections import Counter
+         word_freq = Counter(all_words).most_common(10)
+         frequent_words = ", ".join([f"{word} ({count} kez)" for word, count in word_freq])
+
+         # Build the prompt
+         prompt = f"""Bu ürün için yapılan {total_reviews} yorumun detaylı analizi:
+
+ 1. Genel Değerlendirme:
+ - Ortalama puan: {avg_rating:.1f}/5
+ - Toplam yorum sayısı: {total_reviews}
+ - Pozitif yorum sayısı: {positive_count}
+ - Negatif yorum sayısı: {negative_count}
+
+ 2. Yıldız Dağılımı:
+ {star_dist_text}
+
+ 3. En Sık Kullanılan Kelimeler:
+ {frequent_words}
+
+ 4. Örnek Yorumlar:
+ Pozitif yorumlardan:
+ {' | '.join(positive_comments[:3])}
+
+ Negatif yorumlardan:
+ {' | '.join(negative_comments[:3])}
+
+ Yukarıdaki verilere dayanarak:
+ 1. Ürünün genel kalitesi ve kullanıcı memnuniyeti hakkında
+ 2. Ürünün güçlü ve zayıf yönleri hakkında
+ 3. Potansiyel alıcılar için önemli noktalar hakkında
+ kapsamlı bir değerlendirme yazar mısın?
+ """
+
+         # Generate the summary
+         response = self.summary_pipe(
+             prompt,
+             max_new_tokens=800,  # allow a longer summary
+             eos_token_id=self.terminators,
+             **self.sampling_params
+         )[0]['generated_text']
+
+         # Strip the prompt and keep only the generated summary
+         summary = response[len(prompt):].strip()
+
+         # Format the summary
+         formatted_summary = f"""📊 ÜRÜN ANALİZ RAPORU
+
+ ⭐ Ortalama Puan: {avg_rating:.1f}/5
+ 📝 Toplam Yorum: {total_reviews}
+ ✅ Pozitif Yorum: {positive_count}
+ ❌ Negatif Yorum: {negative_count}
+
+ 🔍 DETAYLI ANALİZ:
+ {summary}"""
+
+         return formatted_summary
+
+     def analyze_url(self, url):
+         try:
+             # Clean up any leftover data
+             if os.path.exists("data"):
+                 shutil.rmtree("data")
+
+             # Scrape the reviews
+             df = scrape_reviews(url)
+
+             if df.empty:
+                 return "Yorumlar çekilemedi. Lütfen URL'yi kontrol edin.", None, None, None
+
+             # Run sentiment analysis
+             analyzed_df = self.analyze_reviews(df)
+
+             # Generate the summary
+             summary = self.generate_summary(analyzed_df)
+
+             # Build the plots
+             fig1 = self.create_sentiment_distribution(analyzed_df)
+             fig2 = self.create_rating_distribution(analyzed_df)
+             fig3 = self.create_sentiment_by_rating(analyzed_df)
+
+             return summary, fig1, fig2, fig3
+
+         except Exception as e:
+             return f"Bir hata oluştu: {str(e)}", None, None, None
+
+         finally:
+             # Clean up
+             if os.path.exists("data"):
+                 shutil.rmtree("data")
+
+     def create_sentiment_distribution(self, df):
+         fig = px.pie(df,
+                      names='sentiment_label',
+                      title='Duygu Analizi Dağılımı')
+         return fig
+
+     def create_rating_distribution(self, df):
+         fig = px.bar(df['Yıldız Sayısı'].value_counts().sort_index(),
+                      title='Yıldız Dağılımı')
+         fig.update_layout(xaxis_title='Yıldız Sayısı',
+                           yaxis_title='Yorum Sayısı')
+         return fig
+
+     def create_sentiment_by_rating(self, df):
+         avg_sentiment = df.groupby('Yıldız Sayısı')['sentiment_score'].mean()
+         fig = px.line(avg_sentiment,
+                       title='Yıldız Sayısına Göre Ortalama Sentiment Skoru')
+         fig.update_layout(xaxis_title='Yıldız Sayısı',
+                           yaxis_title='Ortalama Sentiment Skoru')
+         return fig
+
+ def create_interface():
+     app = ReviewAnalysisApp()
+
+     with gr.Blocks(theme=gr.themes.Soft()) as interface:
+         gr.Markdown("# Trendyol Yorum Analizi")
+
+         with gr.Row():
+             url_input = gr.Textbox(
+                 label="Trendyol Ürün Yorumları URL'si",
+                 placeholder="https://www.trendyol.com/..."
+             )
+
+         analyze_btn = gr.Button("Analiz Et")
+
+         with gr.Row():
+             with gr.Column(scale=1):
+                 summary_output = gr.Textbox(
+                     label="Özet",
+                     lines=10
+                 )
+
+             with gr.Column(scale=2):
+                 with gr.Tab("Duygu Analizi"):
+                     sentiment_dist = gr.Plot()
+                 with gr.Tab("Yıldız Dağılımı"):
+                     rating_dist = gr.Plot()
+                 with gr.Tab("Sentiment-Yıldız İlişkisi"):
+                     sentiment_rating = gr.Plot()
+
+         analyze_btn.click(
+             fn=app.analyze_url,
+             inputs=[url_input],
+             outputs=[summary_output, sentiment_dist, rating_dist, sentiment_rating]
+         )
+
+     return interface
+
+ if __name__ == "__main__":
+     interface = create_interface()
+     interface.launch()
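
A quick way to sanity-check the sentiment half of app.py without launching the Gradio app or loading the 8B summarization model is to call the same Hugging Face model directly. This is a minimal sketch, not part of the commit: the example review string is made up, and it assumes the same label convention used in predict_sentiment above (logit index 1 = positive).

    import torch
    from transformers import AutoTokenizer, AutoModelForSequenceClassification

    model_name = "savasy/bert-base-turkish-sentiment-cased"  # same model as app.py
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)

    text = "Ürün beklediğimden kaliteli çıktı, tavsiye ederim."  # made-up example review
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        scores = torch.softmax(model(**inputs).logits, dim=1)
    # index 1 taken as the positive class, mirroring predict_sentiment
    print("pozitif" if scores[0][1].item() > 0.5 else "negatif", scores[0][1].item())
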
requirements.txt ADDED
@@ -0,0 +1,12 @@
+ pandas
+ numpy
+ torch
+ transformers
+ nltk
+ plotly
+ gradio
+ selenium
+ webdriver_manager
+ tqdm
+ regex
+ scikit-learn
scrape/trendyol_scraper.py ADDED
@@ -0,0 +1,89 @@
+ from selenium import webdriver
+ from selenium.webdriver.chrome.service import Service
+ from selenium.webdriver.common.by import By
+ from selenium.webdriver.support.ui import WebDriverWait
+ from selenium.webdriver.support import expected_conditions as EC
+ import time
+ import pandas as pd
+ import os
+
+ def scrape_reviews(url):
+     # Create data directory if it doesn't exist
+     data_directory = "data"
+     if not os.path.exists(data_directory):
+         os.makedirs(data_directory)
+
+     def comprehensive_scroll(driver):
+         # Scroll until the page height stops growing, so all reviews are loaded
+         last_height = driver.execute_script("return document.body.scrollHeight")
+         while True:
+             driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+             time.sleep(3)
+             new_height = driver.execute_script("return document.body.scrollHeight")
+             if new_height == last_height:
+                 break
+             last_height = new_height
+
+     chrome_options = webdriver.ChromeOptions()
+     chrome_options.add_argument('--headless')
+     chrome_options.add_argument('--disable-gpu')
+     chrome_options.add_argument('--no-sandbox')
+     chrome_options.add_argument('--disable-dev-shm-usage')
+     chrome_options.add_argument("--window-size=1920,1080")
+
+     driver = None
+     try:
+         service = Service()  # no explicit driver path needed on Hugging Face Spaces
+         driver = webdriver.Chrome(service=service, options=chrome_options)
+         driver.get(url)
+
+         # Accept the cookie banner before interacting with the page
+         WebDriverWait(driver, 10).until(
+             EC.element_to_be_clickable((By.ID, 'onetrust-accept-btn-handler'))
+         ).click()
+
+         comprehensive_scroll(driver)
+
+         comment_elements = driver.find_elements(By.XPATH, '/html/body/div[1]/div[4]/div/div/div/div/div[3]/div/div/div[3]/div[2]/div')
+         total_comments = len(comment_elements)
+
+         data = []
+         for i in range(1, total_comments + 1):
+             try:
+                 username_xpath = f'/html/body/div[1]/div[4]/div/div/div/div/div[3]/div/div/div[3]/div[2]/div[{i}]/div[1]/div[2]/div[1]'
+                 username = driver.find_element(By.XPATH, username_xpath).text
+             except Exception:
+                 username = "N/A"
+
+             try:
+                 comment_xpath = f'/html/body/div[1]/div[4]/div/div/div/div/div[3]/div/div/div[3]/div[2]/div[{i}]/div[2]/p'
+                 comment = driver.find_element(By.XPATH, comment_xpath).text
+             except Exception:
+                 comment = "N/A"
+
+             try:
+                 date_xpath = f'/html/body/div[1]/div[4]/div/div/div/div/div[3]/div/div/div[3]/div[2]/div[{i}]/div[1]/div[2]/div[2]'
+                 date = driver.find_element(By.XPATH, date_xpath).text
+             except Exception:
+                 date = "N/A"
+
+             try:
+                 star_xpath_base = f'/html/body/div[1]/div[4]/div/div/div/div/div[3]/div/div/div[3]/div[2]/div[{i}]/div[1]/div[1]/div'
+                 full_stars = driver.find_elements(By.XPATH, f"{star_xpath_base}/div[@class='star-w']/div[@class='full'][@style='width: 100%; max-width: 100%;']")
+                 star_count = len(full_stars)
+             except Exception:
+                 star_count = 0
+
+             data.append({
+                 "Kullanıcı_id": i,
+                 "Kullanıcı Adı": username,
+                 "Yorum": comment,
+                 "Tarih": date,
+                 "Yıldız Sayısı": star_count
+             })
+
+         return pd.DataFrame(data)
+
+     except Exception as e:
+         print(f"Hata oluştu: {str(e)}")
+         return pd.DataFrame()
+
+     finally:
+         # Guard against webdriver.Chrome() failing before driver is assigned
+         if driver is not None:
+             driver.quit()
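
The scraper can also be exercised on its own. A minimal usage sketch, not part of the commit: it assumes a Chrome/Chromium binary with a matching chromedriver is available on the machine, the URL is a placeholder for a product's reviews page, and the output filename is arbitrary.

    from scrape.trendyol_scraper import scrape_reviews

    df = scrape_reviews("https://www.trendyol.com/...")  # placeholder reviews-page URL
    if not df.empty:
        print(df[["Yorum", "Yıldız Sayısı"]].head())
        df.to_csv("data/reviews.csv", index=False)  # arbitrary output path; "data" is created by the scraper
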