enesmanan committed on
Commit bd87b2f · verified
1 Parent(s): 74fdc74
Files changed (3)
  1. app.py +317 -0
  2. requirements.txt +12 -0
  3. scrape/trendyol_scraper.py +89 -0
app.py ADDED
@@ -0,0 +1,317 @@
+ import gradio as gr
+ import pandas as pd
+ import plotly.express as px
+ import plotly.graph_objects as go
+ import os
+ import shutil
+ from scrape.trendyol_scraper import scrape_reviews
+ import torch
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
+ import re
+ from tqdm import tqdm
+ import nltk
+ from nltk.corpus import stopwords
+
+ class ReviewAnalysisApp:
+     def __init__(self):
+         self.setup_models()
+         self.setup_stopwords()
+
+     def setup_stopwords(self):
+         """Load the Turkish stopwords"""
+         try:
+             nltk.data.find('corpora/stopwords')
+         except LookupError:
+             nltk.download('stopwords')
+
+         self.turkish_stopwords = set(stopwords.words('turkish'))
+         # Add extra stopwords (logistics/seller terms)
+         self.logistics_seller_words = {
+             'kargo', 'kargocu', 'paket', 'gönderi', 'satıcı', 'mağaza',
+             'sipariş', 'teslimat', 'gönderim', 'kutu', 'paketleme'
+         }
+         self.turkish_stopwords.update(self.logistics_seller_words)
+
+     def setup_models(self):
+         """Load and prepare the models"""
+         # Sentiment model setup
+         self.device = "cuda" if torch.cuda.is_available() else "cpu"
+         print(f"Cihaz: {self.device}")
+
+         model_name = "savasy/bert-base-turkish-sentiment-cased"
+         self.sentiment_tokenizer = AutoTokenizer.from_pretrained(model_name)
+         self.sentiment_model = (
+             AutoModelForSequenceClassification.from_pretrained(model_name)
+             .to(self.device)
+             .to(torch.float32)
+         )
+
+         # Summary model setup
+         print("Trendyol-LLM modeli yükleniyor...")
+         model_id = "Trendyol/Trendyol-LLM-8b-chat-v2.0"
+         self.summary_pipe = pipeline(
+             "text-generation",
+             model=model_id,
+             torch_dtype="auto",
+             device_map='auto',
+         )
+
+         self.terminators = [
+             self.summary_pipe.tokenizer.eos_token_id,
+             self.summary_pipe.tokenizer.convert_tokens_to_ids("<|eot_id|>")
+         ]
+
+         self.sampling_params = {
+             "do_sample": True,
+             "temperature": 0.3,
+             "top_k": 50,
+             "top_p": 0.9,
+             "repetition_penalty": 1.1
+         }
+
+     def preprocess_text(self, text):
+         """Basic text preprocessing"""
+         if isinstance(text, str):
+             # Lowercase
+             text = text.lower()
+             # Remove punctuation and special characters
+             text = re.sub(r'[^\w\s]', '', text)
+             # Remove digits
+             text = re.sub(r'\d+', '', text)
+             # Collapse extra whitespace
+             text = re.sub(r'\s+', ' ', text).strip()
+             # Drop stopwords
+             words = text.split()
+             words = [word for word in words if word not in self.turkish_stopwords]
+             return ' '.join(words)
+         return ''
+
+     def filter_product_reviews(self, df):
+         """Filter out reviews that are not about the product itself"""
+         def is_product_review(text):
+             if not isinstance(text, str):
+                 return False
+             return not any(word in text.lower() for word in self.logistics_seller_words)
+
+         filtered_df = df[df['Yorum'].apply(is_product_review)].copy()
+
+         print(f"\nFiltreleme İstatistikleri:")
+         print(f"Toplam yorum sayısı: {len(df)}")
+         print(f"Ürün yorumu sayısı: {len(filtered_df)}")
+         print(f"Filtrelenen yorum sayısı: {len(df) - len(filtered_df)}")
+         print(f"Filtreleme oranı: {((len(df) - len(filtered_df)) / len(df) * 100):.2f}%")
+
+         return filtered_df
+
+     def predict_sentiment(self, text):
+         """Run sentiment analysis on a single review"""
+         # Clean the text first
+         text = self.preprocess_text(text)
+
+         if not text:
+             return {"label": "nötr", "score": 0.5}
+
+         inputs = self.sentiment_tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
+         inputs = {k: v.to(self.device) for k, v in inputs.items()}
+
+         with torch.no_grad():
+             outputs = self.sentiment_model(**inputs)
+             scores = torch.nn.functional.softmax(outputs.logits, dim=1)
+
+         positive_score = scores[0][1].item()
+         label = "pozitif" if positive_score > 0.5 else "negatif"
+
+         return {"label": label, "score": positive_score}
+
+     def analyze_reviews(self, df):
+         """Analyze all reviews"""
+         print("\nSentiment analizi başlatılıyor...")
+
+         # First filter out reviews that are not about the product
+         df = self.filter_product_reviews(df)
+
+         # Sentiment analysis
+         results = []
+         for text in tqdm(df['Yorum'], desc="Yorumlar analiz ediliyor"):
+             sentiment = self.predict_sentiment(text)
+             results.append(sentiment)
+
+         df['sentiment_score'] = [r['score'] for r in results]
+         df['sentiment_label'] = [r['label'] for r in results]
+
+         return df
+
+     def generate_summary(self, df):
+         """Summarize the reviews"""
+         # Basic statistics
+         avg_rating = df['Yıldız Sayısı'].mean()
+         total_reviews = len(df)
+
+         # Group by sentiment
+         positive_comments = df[df['sentiment_label'] == 'pozitif']['Yorum'].tolist()
+         negative_comments = df[df['sentiment_label'] == 'negatif']['Yorum'].tolist()
+         positive_count = len(positive_comments)
+         negative_count = len(negative_comments)
+
+         # Star distribution
+         star_dist = df['Yıldız Sayısı'].value_counts().sort_index()
+         star_dist_text = "\n".join([f"{star} yıldız: {count} yorum" for star, count in star_dist.items()])
+
+         # Most frequent words (stopwords removed)
+         all_words = []
+         for text in df['Yorum']:
+             cleaned_text = self.preprocess_text(text)
+             if cleaned_text:
+                 all_words.extend(cleaned_text.split())
+
+         from collections import Counter
+         word_freq = Counter(all_words).most_common(10)
+         frequent_words = ", ".join([f"{word} ({count} kez)" for word, count in word_freq])
+
+         # Build the prompt
+         prompt = f"""Bu ürün için yapılan {total_reviews} yorumun detaylı analizi:
+
+ 1. Genel Değerlendirme:
+ - Ortalama puan: {avg_rating:.1f}/5
+ - Toplam yorum sayısı: {total_reviews}
+ - Pozitif yorum sayısı: {positive_count}
+ - Negatif yorum sayısı: {negative_count}
+
+ 2. Yıldız Dağılımı:
+ {star_dist_text}
+
+ 3. En Sık Kullanılan Kelimeler:
+ {frequent_words}
+
+ 4. Örnek Yorumlar:
+ Pozitif yorumlardan:
+ {' | '.join(positive_comments[:3])}
+
+ Negatif yorumlardan:
+ {' | '.join(negative_comments[:3])}
+
+ Yukarıdaki verilere dayanarak:
+ 1. Ürünün genel kalitesi ve kullanıcı memnuniyeti hakkında
+ 2. Ürünün güçlü ve zayıf yönleri hakkında
+ 3. Potansiyel alıcılar için önemli noktalar hakkında
+ kapsamlı bir değerlendirme yazar mısın?
+ """
+
+         # Generate the summary
+         response = self.summary_pipe(
+             prompt,
+             max_new_tokens=800,  # allow a longer summary
+             eos_token_id=self.terminators,
+             **self.sampling_params
+         )[0]['generated_text']
+
+         # Strip the prompt and keep only the generated summary
+         summary = response[len(prompt):].strip()
+
+         # Format the summary
+         formatted_summary = f"""📊 ÜRÜN ANALİZ RAPORU
+
+ ⭐ Ortalama Puan: {avg_rating:.1f}/5
+ 📝 Toplam Yorum: {total_reviews}
+ ✅ Pozitif Yorum: {positive_count}
+ ❌ Negatif Yorum: {negative_count}
+
+ 🔍 DETAYLI ANALİZ:
+ {summary}"""
+
+         return formatted_summary
+
+     def analyze_url(self, url):
+         try:
+             # Clean up any leftover data
+             if os.path.exists("data"):
+                 shutil.rmtree("data")
+
+             # Scrape the reviews
+             df = scrape_reviews(url)
+
+             if df.empty:
+                 return "Yorumlar çekilemedi. Lütfen URL'yi kontrol edin.", None, None, None
+
+             # Run sentiment analysis
+             analyzed_df = self.analyze_reviews(df)
+
+             # Generate the summary
+             summary = self.generate_summary(analyzed_df)
+
+             # Build the plots
+             fig1 = self.create_sentiment_distribution(analyzed_df)
+             fig2 = self.create_rating_distribution(analyzed_df)
+             fig3 = self.create_sentiment_by_rating(analyzed_df)
+
+             return summary, fig1, fig2, fig3
+
+         except Exception as e:
+             return f"Bir hata oluştu: {str(e)}", None, None, None
+
+         finally:
+             # Clean up
+             if os.path.exists("data"):
+                 shutil.rmtree("data")
+
+     def create_sentiment_distribution(self, df):
+         fig = px.pie(df,
+                      names='sentiment_label',
+                      title='Duygu Analizi Dağılımı')
+         return fig
+
+     def create_rating_distribution(self, df):
+         fig = px.bar(df['Yıldız Sayısı'].value_counts().sort_index(),
+                      title='Yıldız Dağılımı')
+         fig.update_layout(xaxis_title='Yıldız Sayısı',
+                           yaxis_title='Yorum Sayısı')
+         return fig
+
+     def create_sentiment_by_rating(self, df):
+         avg_sentiment = df.groupby('Yıldız Sayısı')['sentiment_score'].mean()
+         fig = px.line(avg_sentiment,
+                       title='Yıldız Sayısına Göre Ortalama Sentiment Skoru')
+         fig.update_layout(xaxis_title='Yıldız Sayısı',
+                           yaxis_title='Ortalama Sentiment Skoru')
+         return fig
+
+ def create_interface():
+     app = ReviewAnalysisApp()
+
+     with gr.Blocks(theme=gr.themes.Soft()) as interface:
+         gr.Markdown("# Trendyol Yorum Analizi")
+
+         with gr.Row():
+             url_input = gr.Textbox(
+                 label="Trendyol Ürün Yorumları URL'si",
+                 placeholder="https://www.trendyol.com/..."
+             )
+
+         analyze_btn = gr.Button("Analiz Et")
+
+         with gr.Row():
+             with gr.Column(scale=1):
+                 summary_output = gr.Textbox(
+                     label="Özet",
+                     lines=10
+                 )
+
+             with gr.Column(scale=2):
+                 with gr.Tab("Duygu Analizi"):
+                     sentiment_dist = gr.Plot()
+                 with gr.Tab("Yıldız Dağılımı"):
+                     rating_dist = gr.Plot()
+                 with gr.Tab("Sentiment-Yıldız İlişkisi"):
+                     sentiment_rating = gr.Plot()
+
+         analyze_btn.click(
+             fn=app.analyze_url,
+             inputs=[url_input],
+             outputs=[summary_output, sentiment_dist, rating_dist, sentiment_rating]
+         )
+
+     return interface
+
+ if __name__ == "__main__":
+     interface = create_interface()
+     interface.launch()
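
A quick way to sanity-check the sentiment half of app.py without launching the Gradio app or loading the 8B summarization model is to call the same Hugging Face model directly. This is a minimal sketch, not part of the commit: the example review string is made up, and it assumes the same label convention used in predict_sentiment above (logit index 1 = positive).

    import torch
    from transformers import AutoTokenizer, AutoModelForSequenceClassification

    model_name = "savasy/bert-base-turkish-sentiment-cased"  # same model as app.py
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)

    text = "Ürün beklediğimden kaliteli çıktı, tavsiye ederim."  # made-up example review
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        scores = torch.softmax(model(**inputs).logits, dim=1)
    # index 1 taken as the positive class, mirroring predict_sentiment
    print("pozitif" if scores[0][1].item() > 0.5 else "negatif", scores[0][1].item())
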
requirements.txt ADDED
@@ -0,0 +1,12 @@
+ pandas
+ numpy
+ torch
+ transformers
+ nltk
+ plotly
+ gradio
+ selenium
+ webdriver_manager
+ tqdm
+ regex
+ scikit-learn
scrape/trendyol_scraper.py ADDED
@@ -0,0 +1,89 @@
+ from selenium import webdriver
+ from selenium.webdriver.chrome.service import Service
+ from selenium.webdriver.common.by import By
+ from selenium.webdriver.support.ui import WebDriverWait
+ from selenium.webdriver.support import expected_conditions as EC
+ import time
+ import pandas as pd
+ import os
+
+ def scrape_reviews(url):
+     # Create data directory if it doesn't exist
+     data_directory = "data"
+     if not os.path.exists(data_directory):
+         os.makedirs(data_directory)
+
+     def comprehensive_scroll(driver):
+         # Scroll until the page height stops growing, so all reviews are loaded
+         last_height = driver.execute_script("return document.body.scrollHeight")
+         while True:
+             driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+             time.sleep(3)
+             new_height = driver.execute_script("return document.body.scrollHeight")
+             if new_height == last_height:
+                 break
+             last_height = new_height
+
+     chrome_options = webdriver.ChromeOptions()
+     chrome_options.add_argument('--headless')
+     chrome_options.add_argument('--disable-gpu')
+     chrome_options.add_argument('--no-sandbox')
+     chrome_options.add_argument('--disable-dev-shm-usage')
+     chrome_options.add_argument("--window-size=1920,1080")
+
+     driver = None
+     try:
+         service = Service()  # no explicit driver path needed on Hugging Face Spaces
+         driver = webdriver.Chrome(service=service, options=chrome_options)
+         driver.get(url)
+
+         # Accept the cookie banner before interacting with the page
+         WebDriverWait(driver, 10).until(
+             EC.element_to_be_clickable((By.ID, 'onetrust-accept-btn-handler'))
+         ).click()
+
+         comprehensive_scroll(driver)
+
+         comment_elements = driver.find_elements(By.XPATH, '/html/body/div[1]/div[4]/div/div/div/div/div[3]/div/div/div[3]/div[2]/div')
+         total_comments = len(comment_elements)
+
+         data = []
+         for i in range(1, total_comments + 1):
+             try:
+                 username_xpath = f'/html/body/div[1]/div[4]/div/div/div/div/div[3]/div/div/div[3]/div[2]/div[{i}]/div[1]/div[2]/div[1]'
+                 username = driver.find_element(By.XPATH, username_xpath).text
+             except Exception:
+                 username = "N/A"
+
+             try:
+                 comment_xpath = f'/html/body/div[1]/div[4]/div/div/div/div/div[3]/div/div/div[3]/div[2]/div[{i}]/div[2]/p'
+                 comment = driver.find_element(By.XPATH, comment_xpath).text
+             except Exception:
+                 comment = "N/A"
+
+             try:
+                 date_xpath = f'/html/body/div[1]/div[4]/div/div/div/div/div[3]/div/div/div[3]/div[2]/div[{i}]/div[1]/div[2]/div[2]'
+                 date = driver.find_element(By.XPATH, date_xpath).text
+             except Exception:
+                 date = "N/A"
+
+             try:
+                 star_xpath_base = f'/html/body/div[1]/div[4]/div/div/div/div/div[3]/div/div/div[3]/div[2]/div[{i}]/div[1]/div[1]/div'
+                 full_stars = driver.find_elements(By.XPATH, f"{star_xpath_base}/div[@class='star-w']/div[@class='full'][@style='width: 100%; max-width: 100%;']")
+                 star_count = len(full_stars)
+             except Exception:
+                 star_count = 0
+
+             data.append({
+                 "Kullanıcı_id": i,
+                 "Kullanıcı Adı": username,
+                 "Yorum": comment,
+                 "Tarih": date,
+                 "Yıldız Sayısı": star_count
+             })
+
+         return pd.DataFrame(data)
+
+     except Exception as e:
+         print(f"Hata oluştu: {str(e)}")
+         return pd.DataFrame()
+
+     finally:
+         # Guard against webdriver.Chrome() failing before driver is assigned
+         if driver is not None:
+             driver.quit()
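
The scraper can also be exercised on its own. A minimal usage sketch, not part of the commit: it assumes a Chrome/Chromium binary with a matching chromedriver is available on the machine, the URL is a placeholder for a product's reviews page, and the output filename is arbitrary.

    from scrape.trendyol_scraper import scrape_reviews

    df = scrape_reviews("https://www.trendyol.com/...")  # placeholder reviews-page URL
    if not df.empty:
        print(df[["Yorum", "Yıldız Sayısı"]].head())
        df.to_csv("data/reviews.csv", index=False)  # arbitrary output path; "data" is created by the scraper
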