similarity-checker

Sleeping

App Files Files Community

similarity-checker / app.py

ahm14

Create app.py

706fc89 verified 7 months ago

raw

history blame

8.19 kB

	import functools
	import io
	import re

	import nltk
	import streamlit as st
	from docx import Document
	from langdetect import detect
	from transformers import pipeline

	# Make sure the NLTK 'punkt' resource is available, downloading it only when
	# it is missing so that repeated executions of this script do not call the
	# downloader every time.
	# NOTE(review): nothing in the visible part of this file uses an NLTK
	# tokenizer -- confirm whether this resource is still needed.
	try:
	    nltk.data.find('tokenizers/punkt')
	except LookupError:
	    nltk.download('punkt')

	# Tone labels mapped to the lower-case keywords that trigger them.
	# analyze_tone() assigns a label when any of its keywords occurs as a
	# substring of the lower-cased caption.
	# The last six labels repeat the keyword lists of the first six (some with one
	# extra keyword), so a match on one of the first six labels always matches its
	# counterpart as well.
	tone_categories = {
	    # Base labels
	    "Emotional": ["urgent", "violence", "disappearances", "forced", "killing", "crisis"],
	    "Critical": ["corrupt", "oppression", "failure", "repression", "unjust"],
	    "Somber": ["tragedy", "loss", "pain", "sorrow", "mourning", "grief"],
	    "Motivational": ["rise", "resist", "mobilize", "inspire", "courage", "change"],
	    "Informative": ["announcement", "event", "scheduled", "update", "details"],
	    "Positive": ["progress", "unity", "hope", "victory", "solidarity"],
	    # Counterpart labels (same keywords, some with one addition)
	    "Urgent": [
	        "urgent", "violence", "disappearances", "forced", "killing",
	        "concern", "crisis",
	    ],
	    "Harsh": ["corrupt", "oppression", "failure", "repression", "exploit", "unjust"],
	    "Negative": ["tragedy", "loss", "pain", "sorrow", "mourning", "grief"],
	    "Empowering": ["rise", "resist", "mobilize", "inspire", "courage", "change"],
	    "Neutral": [
	        "announcement", "event", "scheduled", "update", "details",
	        "protest on",
	    ],
	    "Hopeful": ["progress", "unity", "hope", "victory", "together", "solidarity"],
	}

	# Frame labels mapped to the lower-case keywords or phrases that trigger them.
	# extract_frames() assigns a frame when any of its keywords occurs as a
	# substring of the lower-cased caption. Several keywords (for example
	# "violence", "rights", "minorities") appear under more than one frame, so a
	# single word can trigger several frames at once.
	frame_categories = {
	    "Human Rights & Justice": [
	        "rights", "law", "justice", "legal", "humanitarian",
	    ],
	    "Political & State Accountability": [
	        "government", "policy", "state", "corruption", "accountability",
	    ],
	    "Gender & Patriarchy": [
	        "gender", "women", "violence", "patriarchy", "equality",
	    ],
	    "Religious Freedom & Persecution": [
	        "religion", "persecution", "minorities", "intolerance", "faith",
	    ],
	    "Grassroots Mobilization": [
	        "activism", "community", "movement", "local", "mobilization",
	    ],
	    "Environmental Crisis & Activism": [
	        "climate", "deforestation", "water", "pollution", "sustainability",
	    ],
	    "Anti-Extremism & Anti-Violence": [
	        "extremism", "violence", "hate speech", "radicalism", "mob attack",
	    ],
	    "Social Inequality & Economic Disparities": [
	        "class privilege", "labor rights", "economic", "discrimination",
	    ],
	    "Activism & Advocacy": [
	        "justice", "rights", "demand", "protest", "march", "campaign",
	        "freedom of speech",
	    ],
	    "Systemic Oppression": [
	        "discrimination", "oppression", "minorities", "marginalized", "exclusion",
	    ],
	    "Intersectionality": [
	        "intersecting", "women", "minorities", "struggles", "multiple oppression",
	    ],
	    "Call to Action": [
	        "join us", "sign petition", "take action", "mobilize", "support movement",
	    ],
	    "Empowerment & Resistance": [
	        "empower", "resist", "challenge", "fight for", "stand up",
	    ],
	    "Climate Justice": [
	        "environment", "climate change", "sustainability", "biodiversity", "pollution",
	    ],
	    "Human Rights Advocacy": [
	        "human rights", "violations", "honor killing", "workplace discrimination",
	        "law reform",
	    ],
	}

	# Detect language
	def detect_language(text):
	    """Return the language code that ``detect`` reports for *text*.

	    Any exception raised by ``detect`` is reported on the page with
	    ``st.write`` and the string ``"unknown"`` is returned instead.
	    """
	    try:
	        language_code = detect(text)
	    except Exception as e:
	        # Best effort: surface the problem in the UI rather than aborting.
	        st.write(f"Error detecting language: {e}")
	        language_code = "unknown"
	    return language_code

	# Tone / hashtag / frame extraction.
	# Tone and frame tagging share one algorithm: keyword lookup first, zero-shot
	# model as a fallback when no keyword matches.
	@functools.lru_cache(maxsize=1)
	def _zero_shot_classifier():
	    """Build the zero-shot classification pipeline once and reuse it.

	    Tone and frame analysis both fall back to the same model, so one cached
	    instance serves both instead of a new pipeline being constructed on
	    every fallback call.
	    """
	    # NOTE(review): the cache lives only as long as this module object. If the
	    # host re-executes the script on each interaction (as Streamlit presumably
	    # does), st.cache_resource may be needed to keep the model across reruns
	    # -- confirm before switching.
	    return pipeline("zero-shot-classification", model="facebook/bart-large-mnli")


	def _label_text(text, categories):
	    """Return the labels of *categories* that apply to *text*.

	    A label applies when any of its keywords occurs as a substring of the
	    lower-cased text; labels are returned in the mapping's own order. When no
	    keyword matches, the two highest-ranked labels from the zero-shot model
	    are returned. Blank text or an empty mapping yields an empty list.
	    """
	    if not text or not text.strip() or not categories:
	        # Nothing to classify: skip both the keyword scan and the model.
	        return []

	    lowered = text.lower()  # computed once instead of once per keyword
	    matched = [
	        label
	        for label, keywords in categories.items()
	        if any(keyword.lower() in lowered for keyword in keywords)
	    ]
	    if matched:
	        return matched

	    model_result = _zero_shot_classifier()(text, candidate_labels=list(categories))
	    return list(model_result["labels"][:2])


	def analyze_tone(text, categories=None):
	    """Return the list of tone labels detected in *text*.

	    Args:
	        text: Caption to analyze.
	        categories: Optional mapping of label -> keyword list. Defaults to
	            the module-level ``tone_categories``.

	    Returns:
	        Matching labels in mapping order, or the model's top two labels when
	        no keyword matches; an empty list for blank text.
	    """
	    return _label_text(text, tone_categories if categories is None else categories)


	def extract_hashtags(text):
	    """Return every ``#`` followed by word characters in *text*, in order."""
	    return re.findall(r"#\w+", text)


	def extract_frames(text, categories=None):
	    """Return the list of frame labels detected in *text*.

	    Args:
	        text: Caption to analyze.
	        categories: Optional mapping of label -> keyword list. Defaults to
	            the module-level ``frame_categories``.

	    Returns:
	        Matching labels in mapping order, or the model's top two labels when
	        no keyword matches; an empty list for blank text.
	    """
	    return _label_text(text, frame_categories if categories is None else categories)

	# Extract captions from DOCX file based on "Post X"
	def extract_captions_from_docx(docx_file):
	    """Group the paragraphs of a DOCX file into captions keyed by post heading.

	    A paragraph whose stripped text starts with ``Post <digits>``
	    (case-insensitive) opens a new post, and the whole paragraph text becomes
	    the key. Later non-blank paragraphs are collected under that key until
	    the next heading. Paragraphs before the first heading are ignored, and a
	    repeated heading text restarts that post's caption.

	    Args:
	        docx_file: Whatever ``Document`` accepts (the app passes the uploaded
	            file object).

	    Returns:
	        Dict mapping heading text to the post's paragraphs joined with single
	        spaces. Posts without any non-blank paragraph are omitted.
	    """
	    post_heading = re.compile(r"Post \d+", re.IGNORECASE)
	    captions = {}
	    current_post = None

	    for para in Document(docx_file).paragraphs:
	        text = para.text.strip()
	        if not text:
	            # Blank paragraphs would only add stray spaces to the caption and
	            # let an empty post slip through the final filter.
	            continue
	        if post_heading.match(text):
	            current_post = text
	            captions[current_post] = []
	        elif current_post is not None:
	            captions[current_post].append(text)

	    return {post: " ".join(lines) for post, lines in captions.items() if lines}

	# Generate a DOCX file in-memory with full captions
	def generate_docx(output_data):
	    """Render the analysis results as a DOCX report held in memory.

	    Args:
	        output_data: Mapping of caption title -> result record with the keys
	            'Full Caption', 'Language', 'Tone of Caption', 'Hashtag Count',
	            'Hashtags' and 'Frames'.

	    Returns:
	        An ``io.BytesIO`` containing the saved document, rewound to position 0.
	    """
	    report = Document()
	    report.add_heading('Activism Message Analysis', 0)

	    position = 0
	    for caption, analysis in output_data.items():
	        position += 1
	        report.add_heading(f"{position}. {caption}", level=1)
	        report.add_paragraph("Full Caption:")
	        report.add_paragraph(analysis['Full Caption'], style="Quote")

	        # One paragraph per summary line, in the order they appear in the report.
	        summary_lines = (
	            f"Language: {analysis['Language']}",
	            f"Tone of Caption: {', '.join(analysis['Tone of Caption'])}",
	            f"Number of Hashtags: {analysis['Hashtag Count']}",
	            f"Hashtags Found: {', '.join(analysis['Hashtags'])}",
	        )
	        for summary_line in summary_lines:
	            report.add_paragraph(summary_line)

	        report.add_heading('Frames:', level=2)
	        for frame_name in analysis['Frames']:
	            report.add_paragraph(frame_name)

	    # Save into a buffer and rewind so the caller can read it from the start.
	    buffer = io.BytesIO()
	    report.save(buffer)
	    buffer.seek(0)
	    return buffer

	# Streamlit app
	def _analyze_caption(text):
	    """Run every analysis step on one caption and return its result record."""
	    language = detect_language(text)
	    tone = analyze_tone(text)
	    hashtags = extract_hashtags(text)
	    frames = extract_frames(text)
	    return {
	        'Full Caption': text,
	        'Language': language,
	        'Tone of Caption': tone,
	        'Hashtags': hashtags,
	        'Hashtag Count': len(hashtags),
	        'Frames': frames,
	    }


	st.title('AI-Powered Activism Message Analyzer with Intersectionality')

	st.write("Enter the text to analyze or upload a DOCX file containing captions:")

	# Text Input
	input_text = st.text_area("Input Text", height=200)

	# File Upload
	uploaded_file = st.file_uploader("Upload a DOCX file", type=["docx"])

	# Results keyed by caption title, in the order they were analyzed
	output_data = {}

	# Whitespace-only input has nothing to analyze, so it is treated as empty.
	if input_text and input_text.strip():
	    output_data["Manual Input"] = _analyze_caption(input_text)
	    st.success("Analysis completed for text input.")

	if uploaded_file:
	    captions = extract_captions_from_docx(uploaded_file)
	    for caption, text in captions.items():
	        output_data[caption] = _analyze_caption(text)

	    if captions:
	        st.success(f"Analysis completed for {len(captions)} posts from the DOCX file.")
	    else:
	        # Do not report success when the file produced nothing to analyze.
	        st.warning('No "Post N" sections with text were found in the DOCX file.')

	# Display results
	if output_data:
	    with st.expander("Generated Output"):
	        st.subheader("Analysis Results")
	        for index, (caption, result) in enumerate(output_data.items(), start=1):
	            st.write(f"### {index}. {caption}")
	            st.write("Full Caption:")
	            st.write(f"> {result['Full Caption']}")
	            st.write(f"Language: {result['Language']}")
	            st.write(f"Tone of Caption: {', '.join(result['Tone of Caption'])}")
	            st.write(f"Number of Hashtags: {result['Hashtag Count']}")
	            st.write(f"Hashtags Found: {', '.join(result['Hashtags'])}")
	            st.write("Frames:")
	            for frame in result['Frames']:
	                st.write(f"- {frame}")

	    # generate_docx() always returns a buffer, so the button needs no extra
	    # check; it is offered only when there is something to report.
	    st.download_button(
	        label="Download Analysis as DOCX",
	        data=generate_docx(output_data),
	        file_name="activism_message_analysis.docx",
	        mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
	    )