Spaces:

enesmanan
/

trendyol-review-summarizer

Sleeping

App Files Files Community

trendyol-review-summarizer / app.py

enesmanan

fix

d0ee054 verified 6 months ago

raw

history blame

11.5 kB

	import gradio as gr
	import pandas as pd
	import plotly.express as px
	import plotly.graph_objects as go
	import os
	import shutil
	from scrape.trendyol_scraper import scrape_reviews
	import torch
	from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
	import re
	from tqdm import tqdm
	import nltk
	from nltk.corpus import stopwords

	class ReviewAnalysisApp:
	def __init__(self):
	self.setup_models()
	self.setup_stopwords()

	def setup_stopwords(self):
	"""Türkçe stopwords'leri yükle"""
	try:
	nltk.data.find('corpora/stopwords')
	except LookupError:
	nltk.download('stopwords')

	self.turkish_stopwords = set(stopwords.words('turkish'))
	# Ekstra stopwords ekle
	self.logistics_seller_words = {
	'kargo', 'kargocu', 'paket', 'gönderi', 'satıcı', 'mağaza',
	'sipariş', 'teslimat', 'gönderim', 'kutu', 'paketleme'
	}
	self.turkish_stopwords.update(self.logistics_seller_words)

	def setup_models(self):
	"""Modelleri yükle ve hazırla"""
	# Sentiment model setup
	self.device = "cpu" # Spaces'de CPU kullanacağız
	print(f"Cihaz: {self.device}")

	model_name = "savasy/bert-base-turkish-sentiment-cased"
	self.sentiment_tokenizer = AutoTokenizer.from_pretrained(model_name)
	self.sentiment_model = (
	AutoModelForSequenceClassification.from_pretrained(
	model_name,
	low_cpu_mem_usage=False # CPU için False yapıyoruz
	)
	.to(self.device)
	.to(torch.float32)
	)

	# Summary model setup
	print("Trendyol-LLM modeli yükleniyor...")
	model_id = "Trendyol/Trendyol-LLM-8b-chat-v2.0"
	self.summary_pipe = pipeline(
	"text-generation",
	model=model_id,
	torch_dtype=torch.float32,
	device=self.device, # device_map yerine device kullanıyoruz
	)

	self.terminators = [
	self.summary_pipe.tokenizer.eos_token_id,
	self.summary_pipe.tokenizer.convert_tokens_to_ids("<\|eot_id\|>")
	]

	self.sampling_params = {
	"do_sample": True,
	"temperature": 0.3,
	"top_k": 50,
	"top_p": 0.9,
	"repetition_penalty": 1.1
	}

	def preprocess_text(self, text):
	"""Metin ön işleme"""
	if isinstance(text, str):
	# Küçük harfe çevir
	text = text.lower()
	# Özel karakterleri temizle
	text = re.sub(r'[^\w\s]', '', text)
	# Sayıları temizle
	text = re.sub(r'\d+', '', text)
	# Fazla boşlukları temizle
	text = re.sub(r'\s+', ' ', text).strip()
	# Stop words'leri çıkar
	words = text.split()
	words = [word for word in words if word not in self.turkish_stopwords]
	return ' '.join(words)
	return ''

	def filter_product_reviews(self, df):
	"""Ürün ile ilgili olmayan yorumları filtrele"""
	def is_product_review(text):
	if not isinstance(text, str):
	return False
	return not any(word in text.lower() for word in self.logistics_seller_words)

	filtered_df = df[df['Yorum'].apply(is_product_review)].copy()

	print(f"\nFiltreleme İstatistikleri:")
	print(f"Toplam yorum sayısı: {len(df)}")
	print(f"Ürün yorumu sayısı: {len(filtered_df)}")
	print(f"Filtrelenen yorum sayısı: {len(df) - len(filtered_df)}")
	print(f"Filtreleme oranı: {((len(df) - len(filtered_df)) / len(df) * 100):.2f}%")

	return filtered_df

	def predict_sentiment(self, text):
	"""Tek bir yorum için sentiment analizi yap"""
	# Önce metni temizle
	text = self.preprocess_text(text)

	if not text:
	return {"label": "nötr", "score": 0.5}

	inputs = self.sentiment_tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
	inputs = {k: v.to(self.device) for k, v in inputs.items()}

	with torch.no_grad():
	outputs = self.sentiment_model(**inputs)
	scores = torch.nn.functional.softmax(outputs.logits, dim=1)

	positive_score = scores[0][1].item()
	label = "pozitif" if positive_score > 0.5 else "negatif"

	return {"label": label, "score": positive_score}

	def analyze_reviews(self, df):
	"""Tüm yorumları analiz et"""
	print("\nSentiment analizi başlatılıyor...")

	# Önce ürün ile ilgili olmayan yorumları filtrele
	df = self.filter_product_reviews(df)

	# Sentiment analizi
	results = []
	for text in tqdm(df['Yorum'], desc="Yorumlar analiz ediliyor"):
	sentiment = self.predict_sentiment(text)
	results.append(sentiment)

	df['sentiment_score'] = [r['score'] for r in results]
	df['sentiment_label'] = [r['label'] for r in results]

	return df

	def generate_summary(self, df):
	"""Yorumları özetle"""
	# Temel istatistikler
	avg_rating = df['Yıldız Sayısı'].mean()
	total_reviews = len(df)

	# Sentiment bazlı gruplandırma
	positive_comments = df[df['sentiment_label'] == 'pozitif']['Yorum'].tolist()
	negative_comments = df[df['sentiment_label'] == 'negatif']['Yorum'].tolist()
	positive_count = len(positive_comments)
	negative_count = len(negative_comments)

	# Yıldız dağılımı
	star_dist = df['Yıldız Sayısı'].value_counts().sort_index()
	star_dist_text = "\n".join([f"{star} yıldız: {count} yorum" for star, count in star_dist.items()])

	# En sık geçen kelimeler (stopwords temizlenmiş)
	all_words = []
	for text in df['Yorum']:
	cleaned_text = self.preprocess_text(text)
	if cleaned_text:
	all_words.extend(cleaned_text.split())

	from collections import Counter
	word_freq = Counter(all_words).most_common(10)
	frequent_words = ", ".join([f"{word} ({count} kez)" for word, count in word_freq])

	# Prompt hazırlama
	prompt = f"""Bu ürün için yapılan {total_reviews} yorumun detaylı analizi:

	1. Genel Değerlendirme:
	- Ortalama puan: {avg_rating:.1f}/5
	- Toplam yorum sayısı: {total_reviews}
	- Pozitif yorum sayısı: {positive_count}
	- Negatif yorum sayısı: {negative_count}

	2. Yıldız Dağılımı:
	{star_dist_text}

	3. En Sık Kullanılan Kelimeler:
	{frequent_words}

	4. Örnek Yorumlar:
	Pozitif yorumlardan:
	{' \| '.join(positive_comments[:3])}

	Negatif yorumlardan:
	{' \| '.join(negative_comments[:3])}

	Yukarıdaki verilere dayanarak:
	1. Ürünün genel kalitesi ve kullanıcı memnuniyeti hakkında
	2. Ürünün güçlü ve zayıf yönleri hakkında
	3. Potansiyel alıcılar için önemli noktalar hakkında
	kapsamlı bir değerlendirme yazar mısın?
	"""

	# Özet oluştur
	response = self.summary_pipe(
	prompt,
	max_new_tokens=800, # Daha uzun özet için
	eos_token_id=self.terminators,
	**self.sampling_params
	)[0]['generated_text']

	# Prompt'u çıkar ve sadece özeti döndür
	summary = response[len(prompt):].strip()

	# Özeti formatla
	formatted_summary = f"""📊 ÜRÜN ANAL�Z RAPORU

	⭐ Ortalama Puan: {avg_rating:.1f}/5
	📝 Toplam Yorum: {total_reviews}
	✅ Pozitif Yorum: {positive_count}
	❌ Negatif Yorum: {negative_count}

	🔍 DETAYLI ANALİZ:
	{summary}"""

	return formatted_summary

	def analyze_url(self, url):
	try:
	# Temizlik
	if os.path.exists("data"):
	shutil.rmtree("data")

	# Yorumları çek
	df = scrape_reviews(url)

	if df.empty:
	return "Yorumlar çekilemedi. Lütfen URL'yi kontrol edin.", None, None, None

	# Sentiment analizi yap
	analyzed_df = self.analyze_reviews(df)

	# Özet oluştur
	summary = self.generate_summary(analyzed_df)

	# Grafikleri oluştur
	fig1 = self.create_sentiment_distribution(analyzed_df)
	fig2 = self.create_rating_distribution(analyzed_df)
	fig3 = self.create_sentiment_by_rating(analyzed_df)

	return summary, fig1, fig2, fig3

	except Exception as e:
	return f"Bir hata oluştu: {str(e)}", None, None, None

	finally:
	# Temizlik
	if os.path.exists("data"):
	shutil.rmtree("data")

	def create_sentiment_distribution(self, df):
	fig = px.pie(df,
	names='sentiment_label',
	title='Duygu Analizi Dağılımı')
	return fig

	def create_rating_distribution(self, df):
	fig = px.bar(df['Yıldız Sayısı'].value_counts().sort_index(),
	title='Yıldız Dağılımı')
	fig.update_layout(xaxis_title='Yıldız Sayısı',
	yaxis_title='Yorum Sayısı')
	return fig

	def create_sentiment_by_rating(self, df):
	avg_sentiment = df.groupby('Yıldız Sayısı')['sentiment_score'].mean()
	fig = px.line(avg_sentiment,
	title='Yıldız Sayısına Göre Ortalama Sentiment Skoru')
	fig.update_layout(xaxis_title='Yıldız Sayısı',
	yaxis_title='Ortalama Sentiment Skoru')
	return fig

	def create_interface():
	app = ReviewAnalysisApp()

	with gr.Blocks(theme=gr.themes.Soft()) as interface:
	gr.Markdown("# Trendyol Yorum Analizi")

	with gr.Row():
	url_input = gr.Textbox(
	label="Trendyol Ürün Yorumları URL'si",
	placeholder="https://www.trendyol.com/..."
	)

	analyze_btn = gr.Button("Analiz Et")

	with gr.Row():
	with gr.Column(scale=1):
	summary_output = gr.Textbox(
	label="Özet",
	lines=10
	)

	with gr.Column(scale=2):
	with gr.Tab("Duygu Analizi"):
	sentiment_dist = gr.Plot()
	with gr.Tab("Yıldız Dağılımı"):
	rating_dist = gr.Plot()
	with gr.Tab("Sentiment-Yıldız İlişkisi"):
	sentiment_rating = gr.Plot()

	analyze_btn.click(
	fn=app.analyze_url,
	inputs=[url_input],
	outputs=[summary_output, sentiment_dist, rating_dist, sentiment_rating]
	)

	return interface

	if __name__ == "__main__":
	interface = create_interface()
	interface.launch()