# Hugging Face Space: enesmanan/trendyol-review-summarizer (status: Running)
# File size: 11,523 bytes

import gradio as gr
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import os
import shutil
from scrape.trendyol_scraper import scrape_reviews
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import re
from tqdm import tqdm
import nltk
from nltk.corpus import stopwords

class ReviewAnalysisApp:
    def __init__(self):
        self.setup_models()
        self.setup_stopwords()
        
    def setup_stopwords(self):
        """Load the Turkish stop-word list and extend it with logistics terms.

        Sets two attributes:
            logistics_seller_words: words about shipping, packaging and the
                seller.
            turkish_stopwords: NLTK's Turkish stop words plus the logistics
                words above.
        """
        # Download the NLTK corpus only when it cannot be found locally.
        try:
            nltk.data.find('corpora/stopwords')
        except LookupError:
            nltk.download('stopwords')

        base_words = set(stopwords.words('turkish'))

        # Extra stop words: shipping / packaging / seller vocabulary.
        extra_words = {
            'kargo', 'kargocu', 'paket', 'gönderi', 'satıcı', 'mağaza',
            'sipariş', 'teslimat', 'gönderim', 'kutu', 'paketleme'
        }
        self.logistics_seller_words = extra_words
        self.turkish_stopwords = base_words | extra_words
        
    def setup_models(self):
        """Load the sentiment classifier and the summarization pipeline.

        Sets ``device``, ``sentiment_tokenizer``, ``sentiment_model``,
        ``summary_pipe``, ``terminators`` (token ids that end generation)
        and ``sampling_params`` (generation keyword arguments).
        """
        # Everything runs on CPU (the app is meant to run on Spaces).
        self.device = "cpu"
        print(f"Cihaz: {self.device}")

        # --- Sentiment model ---
        sentiment_checkpoint = "savasy/bert-base-turkish-sentiment-cased"
        self.sentiment_tokenizer = AutoTokenizer.from_pretrained(sentiment_checkpoint)
        classifier = AutoModelForSequenceClassification.from_pretrained(
            sentiment_checkpoint,
            low_cpu_mem_usage=False,  # set to False for CPU
        )
        classifier = classifier.to(self.device)
        self.sentiment_model = classifier.to(torch.float32)

        # --- Summary model ---
        print("Trendyol-LLM modeli yükleniyor...")
        self.summary_pipe = pipeline(
            "text-generation",
            model="Trendyol/Trendyol-LLM-8b-chat-v2.0",
            torch_dtype=torch.float32,
            device=self.device,  # `device` is used instead of `device_map`
        )

        # Generation stops on either the tokenizer's EOS id or "<|eot_id|>".
        summary_tokenizer = self.summary_pipe.tokenizer
        self.terminators = [
            summary_tokenizer.eos_token_id,
            summary_tokenizer.convert_tokens_to_ids("<|eot_id|>"),
        ]

        self.sampling_params = dict(
            do_sample=True,
            temperature=0.3,
            top_k=50,
            top_p=0.9,
            repetition_penalty=1.1,
        )
        
    def preprocess_text(self, text):
        """Metin ön işleme"""
        if isinstance(text, str):
            # Küçük harfe çevir
            text = text.lower()
            # Özel karakterleri temizle
            text = re.sub(r'[^\w\s]', '', text)
            # Sayıları temizle
            text = re.sub(r'\d+', '', text)
            # Fazla boşlukları temizle
            text = re.sub(r'\s+', ' ', text).strip()
            # Stop words'leri çıkar
            words = text.split()
            words = [word for word in words if word not in self.turkish_stopwords]
            return ' '.join(words)
        return ''
    
    def filter_product_reviews(self, df):
        """Ürün ile ilgili olmayan yorumları filtrele"""
        def is_product_review(text):
            if not isinstance(text, str):
                return False
            return not any(word in text.lower() for word in self.logistics_seller_words)

        filtered_df = df[df['Yorum'].apply(is_product_review)].copy()
        
        print(f"\nFiltreleme İstatistikleri:")
        print(f"Toplam yorum sayısı: {len(df)}")
        print(f"Ürün yorumu sayısı: {len(filtered_df)}")
        print(f"Filtrelenen yorum sayısı: {len(df) - len(filtered_df)}")
        print(f"Filtreleme oranı: {((len(df) - len(filtered_df)) / len(df) * 100):.2f}%")
        
        return filtered_df
    
    def predict_sentiment(self, text):
        """Tek bir yorum için sentiment analizi yap"""
        # Önce metni temizle
        text = self.preprocess_text(text)
        
        if not text:
            return {"label": "nötr", "score": 0.5}
            
        inputs = self.sentiment_tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        
        with torch.no_grad():
            outputs = self.sentiment_model(**inputs)
            scores = torch.nn.functional.softmax(outputs.logits, dim=1)
            
        positive_score = scores[0][1].item()
        label = "pozitif" if positive_score > 0.5 else "negatif"
        
        return {"label": label, "score": positive_score}
    
    def analyze_reviews(self, df):
        """Filter the reviews and attach a sentiment prediction to each one.

        Args:
            df: DataFrame with a ``'Yorum'`` column holding the review text.

        Returns:
            The filtered DataFrame returned by ``filter_product_reviews``
            with two added columns: ``'sentiment_score'`` and
            ``'sentiment_label'``.
        """
        print("\nSentiment analizi başlatılıyor...")

        # Drop reviews that are not about the product itself.
        product_df = self.filter_product_reviews(df)

        # Run the classifier review by review, with a progress bar.
        predictions = [
            self.predict_sentiment(review)
            for review in tqdm(product_df['Yorum'], desc="Yorumlar analiz ediliyor")
        ]

        product_df['sentiment_score'] = [p['score'] for p in predictions]
        product_df['sentiment_label'] = [p['label'] for p in predictions]

        return product_df
    
    def generate_summary(self, df):
        """Yorumları özetle"""
        # Temel istatistikler
        avg_rating = df['Yıldız Sayısı'].mean()
        total_reviews = len(df)
        
        # Sentiment bazlı gruplandırma
        positive_comments = df[df['sentiment_label'] == 'pozitif']['Yorum'].tolist()
        negative_comments = df[df['sentiment_label'] == 'negatif']['Yorum'].tolist()
        positive_count = len(positive_comments)
        negative_count = len(negative_comments)
        
        # Yıldız dağılımı
        star_dist = df['Yıldız Sayısı'].value_counts().sort_index()
        star_dist_text = "\n".join([f"{star} yıldız: {count} yorum" for star, count in star_dist.items()])
        
        # En sık geçen kelimeler (stopwords temizlenmiş)
        all_words = []
        for text in df['Yorum']:
            cleaned_text = self.preprocess_text(text)
            if cleaned_text:
                all_words.extend(cleaned_text.split())
        
        from collections import Counter
        word_freq = Counter(all_words).most_common(10)
        frequent_words = ", ".join([f"{word} ({count} kez)" for word, count in word_freq])
        
        # Prompt hazırlama
        prompt = f"""Bu ürün için yapılan {total_reviews} yorumun detaylı analizi:



1. Genel Değerlendirme:

- Ortalama puan: {avg_rating:.1f}/5

- Toplam yorum sayısı: {total_reviews}

- Pozitif yorum sayısı: {positive_count}

- Negatif yorum sayısı: {negative_count}



2. Yıldız Dağılımı:

{star_dist_text}



3. En Sık Kullanılan Kelimeler:

{frequent_words}



4. Örnek Yorumlar:

Pozitif yorumlardan:

{' | '.join(positive_comments[:3])}



Negatif yorumlardan:

{' | '.join(negative_comments[:3])}



Yukarıdaki verilere dayanarak:

1. Ürünün genel kalitesi ve kullanıcı memnuniyeti hakkında

2. Ürünün güçlü ve zayıf yönleri hakkında

3. Potansiyel alıcılar için önemli noktalar hakkında

kapsamlı bir değerlendirme yazar mısın?

"""
        
        # Özet oluştur
        response = self.summary_pipe(
            prompt,
            max_new_tokens=800,  # Daha uzun özet için
            eos_token_id=self.terminators,
            **self.sampling_params
        )[0]['generated_text']
        
        # Prompt'u çıkar ve sadece özeti döndür
        summary = response[len(prompt):].strip()
        
        # Özeti formatla
        formatted_summary = f"""📊 ÜRÜN ANAL�Z RAPORU



⭐ Ortalama Puan: {avg_rating:.1f}/5

📝 Toplam Yorum: {total_reviews}

✅ Pozitif Yorum: {positive_count}

❌ Negatif Yorum: {negative_count}



🔍 DETAYLI ANALİZ:

{summary}"""
        
        return formatted_summary
        
    def analyze_url(self, url):
        try:
            # Temizlik
            if os.path.exists("data"):
                shutil.rmtree("data")
            
            # Yorumları çek
            df = scrape_reviews(url)
            
            if df.empty:
                return "Yorumlar çekilemedi. Lütfen URL'yi kontrol edin.", None, None, None
            
            # Sentiment analizi yap
            analyzed_df = self.analyze_reviews(df)
            
            # Özet oluştur
            summary = self.generate_summary(analyzed_df)
            
            # Grafikleri oluştur
            fig1 = self.create_sentiment_distribution(analyzed_df)
            fig2 = self.create_rating_distribution(analyzed_df)
            fig3 = self.create_sentiment_by_rating(analyzed_df)
            
            return summary, fig1, fig2, fig3
            
        except Exception as e:
            return f"Bir hata oluştu: {str(e)}", None, None, None
        
        finally:
            # Temizlik
            if os.path.exists("data"):
                shutil.rmtree("data")
    
    def create_sentiment_distribution(self, df):
        """Return a pie chart of the ``'sentiment_label'`` column of ``df``."""
        return px.pie(
            df,
            names='sentiment_label',
            title='Duygu Analizi Dağılımı',
        )
    
    def create_rating_distribution(self, df):
        """Return a bar chart of how many reviews each star rating received.

        The counts come from the ``'Yıldız Sayısı'`` column, ordered by
        rating.
        """
        star_counts = df['Yıldız Sayısı'].value_counts().sort_index()
        chart = px.bar(star_counts, title='Yıldız Dağılımı')
        chart.update_layout(
            xaxis_title='Yıldız Sayısı',
            yaxis_title='Yorum Sayısı',
        )
        return chart
    
    def create_sentiment_by_rating(self, df):
        """Return a line chart of the mean sentiment score per star rating.

        Rows are grouped by ``'Yıldız Sayısı'`` and ``'sentiment_score'`` is
        averaged within each group.
        """
        mean_score_per_star = df.groupby('Yıldız Sayısı')['sentiment_score'].mean()
        chart = px.line(
            mean_score_per_star,
            title='Yıldız Sayısına Göre Ortalama Sentiment Skoru',
        )
        chart.update_layout(
            xaxis_title='Yıldız Sayısı',
            yaxis_title='Ortalama Sentiment Skoru',
        )
        return chart

def create_interface():
    """Build the Gradio UI and wire it to a new ``ReviewAnalysisApp``.

    Layout: a title, a URL text box, an "Analiz Et" button, then a row with
    the summary text box on the left and three plot tabs on the right.
    Clicking the button calls ``ReviewAnalysisApp.analyze_url`` with the URL
    and sends its four return values to the summary box and the three plots.

    Returns:
        The ``gr.Blocks`` interface (not yet launched).
    """
    analyzer = ReviewAnalysisApp()

    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        gr.Markdown("# Trendyol Yorum Analizi")

        # Input row
        with gr.Row():
            url_box = gr.Textbox(
                label="Trendyol Ürün Yorumları URL'si",
                placeholder="https://www.trendyol.com/...",
            )

        run_button = gr.Button("Analiz Et")

        # Output row: summary on the left, charts on the right
        with gr.Row():
            with gr.Column(scale=1):
                summary_box = gr.Textbox(label="Özet", lines=10)

            with gr.Column(scale=2):
                with gr.Tab("Duygu Analizi"):
                    sentiment_plot = gr.Plot()
                with gr.Tab("Yıldız Dağılımı"):
                    rating_plot = gr.Plot()
                with gr.Tab("Sentiment-Yıldız İlişkisi"):
                    sentiment_by_rating_plot = gr.Plot()

        # Output order matches the tuple returned by analyze_url.
        run_button.click(
            fn=analyzer.analyze_url,
            inputs=[url_box],
            outputs=[summary_box, sentiment_plot, rating_plot, sentiment_by_rating_plot],
        )

    return demo

# Script entry point: build the UI and start the Gradio server.
if __name__ == "__main__":
    interface = create_interface()
    interface.launch()