topic-prediction

Running on Zero

App Files Files Community

topic-prediction / app.py

openfree

Update app.py

6efc05e verified 5 months ago

raw

history blame

13 kB

	import json
	import gradio as gr
	import spaces
	import wbgtopic
	import plotly.graph_objects as go
	import plotly.express as px
	import numpy as np
	import pandas as pd
	import nltk
	from nltk.tokenize import sent_tokenize, word_tokenize
	from nltk.sentiment import SentimentIntensityAnalyzer
	from sklearn.cluster import KMeans
	import torch

	# Run the topic model on the GPU when torch can see one, otherwise on the CPU.
	device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

	# Initialize WBGDocTopic
	clf = wbgtopic.WBGDocTopic(device=device)

	# Fetch the NLTK resources requested by this app. The download stays
	# best-effort (a failure must not stop the app from starting), but each
	# resource is attempted independently and a failure is reported instead of
	# being silently discarded. Only Exception is caught so Ctrl-C still works.
	for _nltk_resource in ('punkt', 'vader_lexicon'):
	    try:
	        nltk.download(_nltk_resource, quiet=True)
	    except Exception as exc:
	        print(f"Warning: could not download NLTK resource '{_nltk_resource}': {exc}")

	SAMPLE_TEXT = """Your sample text here ..."""

	def safe_process(func):
	    """
	    Decorator that converts any exception raised by ``func`` into a ``None`` result.

	    The wrapped function is called with exactly the positional and keyword
	    arguments it was given. If it raises an ``Exception``, the error is
	    printed together with the function's name and ``None`` is returned, so
	    callers must be prepared for a ``None`` return value.

	    Args:
	        func: the callable to protect.

	    Returns:
	        A wrapper with the same name/docstring as ``func``.
	    """
	    import functools  # local import so this block does not depend on the file header

	    @functools.wraps(func)
	    def wrapper(*args, **kwargs):
	        try:
	            return func(*args, **kwargs)
	        except Exception as e:
	            print(f"Error in {func.__name__}: {str(e)}")
	            return None
	    return wrapper


	################################################################
	# 1) Convert Raw Results into a Consistent Format #
	################################################################

	@safe_process
	def parse_wbg_results(raw_output):
	    """
	    Normalize raw topic output into a flat sequence of topic dicts.

	    Accepted shapes of ``raw_output``:
	      * a list of dicts that already carry a 'label' key, e.g.
	        [{'label': 'Digital Development', 'score_mean': 0.27, 'score_std': 0.0}, ...]
	        -> returned unchanged;
	      * a list whose first element is a dict of {topic_label: numeric_score}, e.g.
	        [{'Innovation and Entrepreneurship': 0.32, 'Digital Development': 0.27}]
	        -> converted to the 'label' / 'score_mean' / 'score_std' form, with
	           'score_std' set to 0.0 because no std is available;
	      * either of the above wrapped in extra list levels, e.g. [ [ {...}, {...} ] ]
	        -> the first element is unwrapped until one of the shapes above is found.

	    Returns:
	        The topic dicts, or [] when the input is empty or has an
	        unrecognized shape.
	    """
	    if not raw_output:
	        return []

	    first_item = raw_output[0]

	    # Peel off wrapper lists: [[{...}, {...}]] -> [{...}, {...}].
	    while isinstance(first_item, (list, tuple)):
	        raw_output = first_item
	        if not raw_output:
	            return []
	        first_item = raw_output[0]

	    # Anything that is not a dict at this point is a shape we do not understand.
	    if not isinstance(first_item, dict):
	        return []

	    # Already in the target format.
	    if 'label' in first_item:
	        return raw_output

	    # Dict of {topic_label: numeric_score} -> one dict per topic.
	    return [
	        {'label': label, 'score_mean': float(val), 'score_std': 0.0}
	        for label, val in first_item.items()
	    ]


	################################################################
	# 2) Section-based Analysis #
	################################################################

	@safe_process
	def analyze_text_sections(text, sentences_per_section=3):
	    """
	    Split ``text`` into sections and run topic suggestion on each one.

	    The text is sentence-tokenized, consecutive sentences are joined into
	    sections, ``clf.suggest_topics`` is called per section and the result is
	    normalized with ``parse_wbg_results``.

	    Args:
	        text: the document to analyze.
	        sentences_per_section: how many consecutive sentences form one
	            section (default 3; values below 1 are treated as 1).

	    Returns:
	        A list with one entry per section, each entry being a list of topic
	        dicts:
	            [
	                [ {'label': '...', 'score_mean': ...}, {...} ],
	                [ {'label': '...', 'score_mean': ...}, {...} ],
	                ...
	            ]
	    """
	    sentences = sent_tokenize(text)
	    # range() rejects a step of 0, so clamp the group size to at least 1.
	    step = max(1, int(sentences_per_section))
	    sections = [' '.join(sentences[i:i + step]) for i in range(0, len(sentences), step)]

	    section_topics = []
	    for section in sections:
	        raw_sec = clf.suggest_topics(section)
	        # parse_wbg_results is wrapped by safe_process and returns None on
	        # failure; store an empty list so every entry stays iterable.
	        section_topics.append(parse_wbg_results(raw_sec) or [])

	    return section_topics


	################################################################
	# 3) Basic Summaries (Correlation, Sentiment, Clusters etc.) #
	################################################################

	@safe_process
	def calculate_topic_correlations(topic_dicts):
	    """
	    Run ``np.corrcoef`` over the 'score_mean' values of ``topic_dicts``.

	    NOTE: every topic contributes a single number, so the scores form one
	    1-D variable and ``np.corrcoef`` yields a single coefficient rather than
	    a topic-by-topic matrix. A real correlation matrix would need several
	    observations per topic (e.g. multiple texts). The coefficient is coerced
	    to a 2-D array so the first return value is always an ndarray with
	    ``ndim == 2``.

	    Args:
	        topic_dicts: list of dicts with 'label' and 'score_mean' keys.

	    Returns:
	        (corr_matrix, labels). With fewer than two topics this is the
	        placeholder pair (np.array([[1.0]]), ["Insufficient topics"]).
	    """
	    if len(topic_dicts) < 2:
	        # Not enough to do correlation
	        return np.array([[1.0]]), ["Insufficient topics"]

	    labels = [d['label'] for d in topic_dicts]
	    scores = [d['score_mean'] for d in topic_dicts]  # single dimension

	    # corrcoef returns a bare scalar for 1-D input; normalize it to shape (1, 1).
	    corr_matrix = np.atleast_2d(np.corrcoef(scores))
	    return corr_matrix, labels


	@safe_process
	def perform_sentiment_analysis(text):
	    """
	    Score every sentence of ``text`` with NLTK's SentimentIntensityAnalyzer.

	    Returns:
	        A pandas DataFrame built from the ``polarity_scores`` dicts, one row
	        per sentence found by ``sent_tokenize``.
	    """
	    analyzer = SentimentIntensityAnalyzer()
	    per_sentence_scores = []
	    for sentence in sent_tokenize(text):
	        per_sentence_scores.append(analyzer.polarity_scores(sentence))
	    return pd.DataFrame(per_sentence_scores)


	@safe_process
	def create_topic_clusters(topic_dicts):
	    """
	    Assign each topic to a KMeans cluster based on its score statistics.

	    Each topic becomes the feature vector [score_mean, score_std], with
	    'score_std' defaulting to 0.0 when the key is absent. With fewer than
	    three topics every topic is put in cluster 0 without running KMeans.

	    Returns:
	        A plain list of cluster indices (one per topic), safe to JSON-encode.
	    """
	    topic_count = len(topic_dicts)
	    if topic_count < 3:
	        # trivial cluster
	        return [0] * topic_count

	    features = np.array(
	        [[topic['score_mean'], topic.get('score_std', 0.0)] for topic in topic_dicts]
	    )
	    row_count = features.shape[0]
	    if row_count < 3:
	        return [0] * row_count

	    model = KMeans(n_clusters=min(3, row_count), random_state=42)
	    assignments = model.fit_predict(features)
	    return assignments.tolist()


	################################################################
	# 4) Charts (Bar, Radar, Correlation Heatmap, etc.) #
	################################################################

	@safe_process
	def create_main_charts(topic_dicts):
	    """
	    Build the bar chart and the radar chart for a list of topic dicts.

	    Only the 'label' and 'score_mean' keys are read; 'score_mean' is
	    multiplied by 100 so both charts show percentages.

	    Returns:
	        (bar_fig, radar_fig). Two empty figures when ``topic_dicts`` is empty.
	    """
	    if not topic_dicts:
	        return go.Figure(), go.Figure()

	    topic_names = [topic['label'] for topic in topic_dicts]
	    percentages = [topic['score_mean'] * 100 for topic in topic_dicts]  # convert to %

	    # Bar chart
	    bar_trace = go.Bar(x=topic_names, y=percentages, marker_color='rgb(55, 83, 109)')
	    bar_fig = go.Figure(data=[bar_trace])
	    bar_layout = {
	        'title': '주제 분석 결과',
	        'xaxis_title': '주제',
	        'yaxis_title': '관련도 (%)',
	        'template': 'plotly_white',
	        'height': 500,
	    }
	    bar_fig.update_layout(**bar_layout)

	    # Radar chart
	    radar_trace = go.Scatterpolar(
	        r=percentages,
	        theta=topic_names,
	        fill='toself',
	        name='주제 분포',
	    )
	    radar_fig = go.Figure()
	    radar_fig.add_trace(radar_trace)
	    radar_layout = {
	        'title': '주제 레이더 차트',
	        'template': 'plotly_white',
	        'height': 500,
	        'polar': dict(radialaxis=dict(visible=True)),
	        'showlegend': False,
	    }
	    radar_fig.update_layout(**radar_layout)

	    return bar_fig, radar_fig


	@safe_process
	def create_correlation_heatmap(corr_matrix, labels):
	    """
	    Render ``corr_matrix`` as a heatmap with ``labels`` on both axes.

	    A 0-dimensional input is first wrapped into a 1x1 array. A 1x1 matrix is
	    treated as "not enough data": the returned figure then only carries an
	    explanatory annotation instead of a heatmap.
	    """
	    # A scalar has shape (); wrap it so the shape test below covers it too.
	    if corr_matrix.ndim == 0:
	        corr_matrix = np.array([[corr_matrix]])

	    heatmap_fig = go.Figure()

	    # Usually means not enough data
	    if corr_matrix.shape == (1, 1):
	        heatmap_fig.add_annotation(text="Not enough topics for correlation", showarrow=False)
	        return heatmap_fig

	    heatmap_fig.add_trace(
	        go.Heatmap(z=corr_matrix, x=labels, y=labels, colorscale='Viridis')
	    )
	    heatmap_fig.update_layout(
	        title='주제 간 상관관계',
	        height=500,
	        template='plotly_white',
	    )
	    return heatmap_fig


	@safe_process
	def create_topic_evolution(section_topics):
	    """
	    Plot how each topic's 'score_mean' changes from section to section.

	    ``section_topics`` holds one list of topic dicts per section. The topics
	    of the first section define which lines are drawn; for every later
	    section the score of the first dict with the same label is used, or 0.0
	    when that label does not occur there.

	    Returns:
	        A figure with one lines+markers trace per topic, or an empty figure
	        when there are no sections or the first section has no topics.
	    """
	    evolution_fig = go.Figure()
	    if not section_topics or not section_topics[0]:
	        return evolution_fig

	    def _score_in(section, wanted_label):
	        # First entry with a matching label wins; missing labels count as 0.0.
	        hit = next((entry for entry in section if entry['label'] == wanted_label), None)
	        return hit['score_mean'] if hit else 0.0

	    section_count = len(section_topics)
	    # The first section's topics serve as the reference set.
	    for reference_topic in section_topics[0]:
	        topic_label = reference_topic['label']
	        scores_per_section = [_score_in(section, topic_label) for section in section_topics]
	        trace = go.Scatter(
	            x=list(range(section_count)),
	            y=scores_per_section,
	            name=topic_label,
	            mode='lines+markers',
	        )
	        evolution_fig.add_trace(trace)

	    evolution_fig.update_layout(
	        title='주제 변화 추이',
	        xaxis_title='섹션',
	        yaxis_title='score_mean',
	        height=500,
	        template='plotly_white',
	    )
	    return evolution_fig


	@safe_process
	def create_confidence_gauge(topic_dicts):
	    """
	    Draw one gauge indicator per topic, laid out side by side in a single row.

	    The data has no dedicated confidence measure, so the displayed value is
	    derived as 100 * (1 - score_std), with 'score_std' defaulting to 0.0
	    when the key is absent.

	    Returns:
	        The gauge figure, or an empty figure when ``topic_dicts`` is empty.
	    """
	    if not topic_dicts:
	        return go.Figure()

	    gauge_fig = go.Figure()
	    for column, topic in enumerate(topic_dicts):
	        spread = topic.get('score_std', 0.0)
	        indicator = go.Indicator(
	            mode="gauge+number",
	            value=100.0 * (1.0 - spread),  # an example confidence definition
	            title={'text': topic['label']},
	            domain={'row': 0, 'column': column},
	        )
	        gauge_fig.add_trace(indicator)

	    # One grid column per topic so every gauge gets its own cell.
	    gauge_fig.update_layout(
	        grid={'rows': 1, 'columns': len(topic_dicts)},
	        height=400,
	        template='plotly_white',
	    )
	    return gauge_fig


	################################################################
	# 5) Putting Everything into `process_all_analysis` #
	################################################################

	@spaces.GPU()
	def process_all_analysis(text):
	    """
	    Run the full analysis pipeline on ``text`` and build every UI output.

	    Steps: suggest topics for the whole text, keep the five highest
	    'score_mean' topics, analyze the text section by section, compute the
	    correlation / sentiment / cluster summaries and build the charts.

	    The helper functions are wrapped by ``safe_process`` and return None
	    when they fail. Each such None is replaced by an empty result here so a
	    single failing helper only blanks its own output instead of aborting
	    the whole analysis.

	    Returns:
	        A 7-tuple: (results dict for the JSON field, bar chart, radar chart,
	        correlation heatmap, topic evolution chart, confidence gauge,
	        placeholder figure). If an unexpected exception escapes, the dict
	        holds the error message and all six figures are empty.
	    """
	    try:
	        # 1) Suggest topics on the entire text
	        raw_results = clf.suggest_topics(text)
	        all_topics = parse_wbg_results(raw_results) or []  # keep full list of dicts

	        # 2) Top 5, sorted by score_mean descending
	        sorted_topics = sorted(all_topics, key=lambda x: x['score_mean'], reverse=True)
	        top_topics = sorted_topics[:5]

	        # 3) Section-based (list of lists)
	        section_topics = analyze_text_sections(text) or []

	        # 4) Extra analyses
	        corr_result = calculate_topic_correlations(all_topics)
	        if corr_result is None:
	            corr_result = (np.array([[1.0]]), ["Insufficient topics"])
	        corr_matrix, corr_labels = corr_result

	        sentiments_df = perform_sentiment_analysis(text)
	        # Explicit None test: a DataFrame has no unambiguous truth value.
	        sentiments = [] if sentiments_df is None else sentiments_df.to_dict(orient="records")

	        clusters = create_topic_clusters(all_topics)
	        if clusters is None:
	            clusters = []

	        # 5) Build charts (top 5 on the bar / radar / gauge charts)
	        main_charts = create_main_charts(top_topics)
	        if main_charts is None:
	            main_charts = (go.Figure(), go.Figure())
	        bar_chart, radar_chart = main_charts

	        heatmap = create_correlation_heatmap(corr_matrix, corr_labels)
	        if heatmap is None:
	            heatmap = go.Figure()

	        evolution_chart = create_topic_evolution(section_topics)
	        if evolution_chart is None:
	            evolution_chart = go.Figure()

	        gauge_chart = create_confidence_gauge(top_topics)
	        if gauge_chart is None:
	            gauge_chart = go.Figure()

	        # 6) Prepare output for the JSON field
	        # Make sure everything is JSON-serializable with string keys
	        results = {
	            "top_topics": top_topics,  # list of dict
	            "clusters": clusters,      # list of ints
	            "sentiments": sentiments,
	        }

	        return (
	            results,          # JSON output
	            bar_chart,        # plot1
	            radar_chart,      # plot2
	            heatmap,          # plot3
	            evolution_chart,  # plot4
	            gauge_chart,      # plot5
	            go.Figure(),      # plot6 (placeholder for sentiment plot, or skip)
	        )

	    except Exception as e:
	        print(f"Analysis error: {str(e)}")
	        empty_fig = go.Figure()
	        return (
	            {"error": str(e), "topics": []},
	            empty_fig,
	            empty_fig,
	            empty_fig,
	            empty_fig,
	            empty_fig,
	            empty_fig,
	        )


	################################################################
	# 6) Gradio UI #
	################################################################

	# Layout: text input and submit button on top, four tabs of plots in the
	# middle, raw JSON results at the bottom.
	with gr.Blocks(title="고급 문서 주제 분석기") as demo:
	    gr.Markdown("## 📊 고급 문서 주제 분석기")

	    # Input area
	    with gr.Row():
	        text_input = gr.Textbox(value=SAMPLE_TEXT, label="분석할 텍스트", lines=8)
	    with gr.Row():
	        submit_btn = gr.Button("분석 시작", variant="primary")

	    # Result plots, grouped by tab
	    with gr.Tabs():
	        with gr.TabItem("주요 분석"):
	            with gr.Row():
	                plot1 = gr.Plot(label="주제 분포")
	                plot2 = gr.Plot(label="레이더 차트")
	        with gr.TabItem("상세 분석"):
	            with gr.Row():
	                plot3 = gr.Plot(label="상관관계 히트맵")
	                plot4 = gr.Plot(label="주제 변화 추이")
	        with gr.TabItem("신뢰도 분석"):
	            plot5 = gr.Plot(label="신뢰도 게이지")
	        with gr.TabItem("감성 분석"):
	            plot6 = gr.Plot(label="감성 분석 결과")

	    # Raw results
	    with gr.Row():
	        output_json = gr.JSON(label="상세 분석 결과")

	    # The order of this list matches the tuple returned by process_all_analysis.
	    analysis_outputs = [output_json, plot1, plot2, plot3, plot4, plot5, plot6]
	    submit_btn.click(
	        fn=process_all_analysis,
	        inputs=[text_input],
	        outputs=analysis_outputs,
	    )

	if __name__ == "__main__":
	    # Script entry point: enable the request queue (max_size=1), then start
	    # the server on 0.0.0.0:7860 without a share link and with debug on.
	    demo.queue(max_size=1)
	    launch_options = {
	        "server_name": "0.0.0.0",
	        "server_port": 7860,
	        "share": False,
	        "debug": True,
	    }
	    demo.launch(**launch_options)