Spaces:

wakeupmh
/

ama-autism

Sleeping

App Files Files Community

ama-autism / app.py

wakeupmh

refactor: using falcon

7a11d41 5 months ago

raw

history blame

13.7 kB

	import streamlit as st
	import pandas as pd
	import torch
	import logging
	import os
	from transformers import AutoTokenizer, T5ForConditionalGeneration
	import arxiv
	import requests
	import xml.etree.ElementTree as ET
	import re

	# Logging: emit INFO-level (and more severe) records via the root logger.
	logging.basicConfig(level=logging.INFO)

	# Storage layout: use "/data" when that path exists, otherwise fall back to
	# the current working directory.
	_PERSISTENT_ROOT = "/data"
	DATA_DIR = _PERSISTENT_ROOT if os.path.exists(_PERSISTENT_ROOT) else "."
	DATASET_DIR = os.path.join(DATA_DIR, "rag_dataset")
	DATASET_PATH = os.path.join(DATASET_DIR, "dataset")

	# Hugging Face checkpoint identifiers for the tokenizer and the model.
	TOKENIZER_MODEL = "google/flan-t5-small"
	SUMMARIZATION_MODEL = "Falconsai/text_summarization"
	# Alternative checkpoint kept for reference:
	# SUMMARIZATION_MODEL = "rhaymison/t5-portuguese-small-summarization"

	@st.cache_resource
	def _load_model_and_tokenizer():
	    """Load the tokenizer and the summarization model, cached by Streamlit.

	    Exceptions are deliberately not handled here: they propagate to the
	    caller so that only a successful load produces a cached return value.

	    Returns:
	        A ``(model, tokenizer)`` tuple.
	    """
	    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_MODEL)
	    model = T5ForConditionalGeneration.from_pretrained(
	        SUMMARIZATION_MODEL,
	        device_map={"": "cpu"},  # Force CPU
	        torch_dtype=torch.float32,
	    )
	    return model, tokenizer


	def load_local_model():
	    """Load the local Hugging Face model and tokenizer.

	    The error handling lives outside the cached loader. Previously the
	    ``try``/``except`` was inside the ``st.cache_resource`` function, so a
	    failed load returned ``(None, None)`` and that failure was what got
	    cached.

	    Returns:
	        ``(model, tokenizer)`` on success, or ``(None, None)`` after
	        reporting the error through ``st.error`` when loading fails.
	    """
	    try:
	        return _load_model_and_tokenizer()
	    except Exception as e:
	        logging.exception("Error loading model")
	        st.error(f"Error loading model: {str(e)}")
	        return None, None

	# Quote characters normalised to ASCII before any stripping happens.
	# The first four entries are curly quotes whose UTF-8 bytes were decoded as
	# Windows-1252 (mojibake); the rest are the genuine typographic quotes.
	# Longer sequences come first so the bare two-character prefix only acts as
	# a fallback.
	_QUOTE_FIXES = (
	    ("\u00e2\u20ac\u2122", "'"),
	    ("\u00e2\u20ac\u0153", '"'),
	    ("\u00e2\u20ac\u009d", '"'),
	    ("\u00e2\u20ac", '"'),
	    ("\u2018", "'"),
	    ("\u2019", "'"),
	    ("\u201c", '"'),
	    ("\u201d", '"'),
	)


	def clean_text(text):
	    """Clean and normalize text content.

	    Steps, in order:
	      1. Map mis-decoded and typographic quotes to ASCII quotes. This must
	         run first: the character filter below replaces symbols such as the
	         euro sign with spaces, after which the sequences no longer match.
	      2. Replace every character that is not a word character, whitespace
	         or one of ``. , ; : ( ) - ' "`` with a space.
	      3. Collapse whitespace runs to a single space.
	      4. Drop any remaining non-ASCII characters.
	      5. Collapse spaces again, since step 4 can leave two spaces adjacent.

	    Args:
	        text: The string to clean; ``None`` and ``""`` are accepted.

	    Returns:
	        The cleaned, stripped ASCII string, or ``""`` for falsy input.
	    """
	    if not text:
	        return ""

	    for broken, replacement in _QUOTE_FIXES:
	        text = text.replace(broken, replacement)

	    # Remove special characters and normalize spaces
	    text = re.sub(r'[^\w\s.,;:()\-\'"]', ' ', text)
	    text = re.sub(r'\s+', ' ', text)

	    # Remove any remaining non-ASCII characters (e.g. accented letters)
	    text = ''.join(char for char in text if ord(char) < 128)

	    # Removing characters may have left consecutive spaces behind
	    text = re.sub(r' {2,}', ' ', text)

	    return text.strip()

	def format_paper(title, abstract):
	    """Render a paper's title and abstract as one text block.

	    Both fields are passed through ``clean_text``. An abstract longer than
	    1000 characters is cut to its first 997 characters followed by "...".

	    Args:
	        title: Raw paper title.
	        abstract: Raw paper abstract.

	    Returns:
	        A string with a "Title:" line, an "Abstract:" line and a closing
	        "---" separator.
	    """
	    heading = clean_text(title)
	    body = clean_text(abstract)

	    # Keep the abstract within 1000 characters, ellipsis included.
	    max_chars = 1000
	    if len(body) > max_chars:
	        body = f"{body[:max_chars - 3]}..."

	    return f"""Title: {heading}

	Abstract: {body}

	---"""

	def fetch_arxiv_papers(query, max_results=5):
	    """Fetch autism-related papers from arXiv.

	    The search is restricted to the ``q-bio`` category and always requires
	    "autism" in the title or abstract, combined with the user's query as a
	    quoted phrase.

	    Args:
	        query: Free-text search phrase entered by the user.
	        max_results: Maximum number of results requested from arXiv.

	    Returns:
	        A list of dicts with the keys ``title``, ``abstract``, ``url``,
	        ``published`` (``YYYY-MM-DD``) and ``relevance_score`` (1 when the
	        title mentions autism/ASD, otherwise 0.5). If the request fails, the
	        error is shown with ``st.error`` and the papers collected so far
	        (possibly none) are returned, mirroring ``fetch_pubmed_papers``.
	    """
	    client = arxiv.Client()

	    # A double quote inside the query would terminate the quoted phrase in
	    # the search expression below, so replace it with a space.
	    phrase = query.replace('"', ' ').strip()

	    # Always include autism in the search query
	    search_query = (
	        f'(ti:autism OR abs:autism) AND (ti:"{phrase}" OR abs:"{phrase}") '
	        'AND cat:q-bio'
	    )

	    # Search arXiv
	    search = arxiv.Search(
	        query=search_query,
	        max_results=max_results,
	        sort_by=arxiv.SortCriterion.Relevance,
	    )

	    papers = []
	    try:
	        for result in client.results(search):
	            title_text = result.title.lower()
	            summary_text = result.summary.lower()
	            in_title = 'autism' in title_text or 'asd' in title_text
	            in_summary = 'autism' in summary_text or 'asd' in summary_text

	            # Only include papers that mention autism in title or abstract
	            if not (in_title or in_summary):
	                continue

	            papers.append({
	                'title': result.title,
	                'abstract': result.summary,
	                'url': result.pdf_url,
	                'published': result.published.strftime("%Y-%m-%d"),
	                # Same rule as the PubMed fetcher: a title hit on either
	                # keyword ranks higher than an abstract-only hit.
	                'relevance_score': 1 if in_title else 0.5,
	            })
	    except Exception as e:
	        # Results are fetched lazily while iterating, so failures surface
	        # inside the loop; report them instead of crashing the app.
	        st.error(f"Error fetching arXiv papers: {str(e)}")

	    return papers

	def _element_text(element):
	    """Return all text inside *element*, nested tags included, or ""."""
	    if element is None:
	        return ""
	    return "".join(element.itertext()).strip()


	def fetch_pubmed_papers(query, max_results=5):
	    """Fetch autism-related papers from PubMed via the NCBI E-utilities.

	    Runs an ``esearch`` request to obtain matching PubMed IDs, then an
	    ``efetch`` request to download the article records as XML.

	    Args:
	        query: Free-text search phrase entered by the user.
	        max_results: Maximum number of IDs requested from ``esearch``.

	    Returns:
	        A list of dicts with the keys ``title``, ``abstract``, ``url``,
	        ``published`` (publication year or ``'Unknown'``) and
	        ``relevance_score`` (1 when the title mentions autism/ASD, otherwise
	        0.5). On any failure the error is shown with ``st.error`` and the
	        papers collected so far (possibly none) are returned.
	    """
	    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
	    request_timeout = 10  # seconds; without it a stalled request blocks the app

	    # Always include autism in the search term
	    search_term = f"(autism[Title/Abstract] OR ASD[Title/Abstract]) AND ({query}[Title/Abstract])"

	    # Search for papers
	    search_url = f"{base_url}/esearch.fcgi"
	    search_params = {
	        'db': 'pubmed',
	        'term': search_term,
	        'retmax': max_results,
	        'sort': 'relevance',
	        'retmode': 'xml'
	    }

	    papers = []
	    try:
	        # Get paper IDs
	        response = requests.get(search_url, params=search_params, timeout=request_timeout)
	        response.raise_for_status()
	        root = ET.fromstring(response.content)
	        id_list = [id_elem.text for id_elem in root.findall('.//Id') if id_elem.text]

	        if not id_list:
	            return papers

	        # Fetch paper details
	        fetch_url = f"{base_url}/efetch.fcgi"
	        fetch_params = {
	            'db': 'pubmed',
	            'id': ','.join(id_list),
	            'retmode': 'xml'
	        }

	        response = requests.get(fetch_url, params=fetch_params, timeout=request_timeout)
	        response.raise_for_status()
	        articles = ET.fromstring(response.content)

	        for article in articles.findall('.//PubmedArticle'):
	            # itertext() is used instead of .text because .text is None when
	            # an element starts with inline markup, which previously raised
	            # AttributeError and aborted the whole batch.
	            title = _element_text(article.find('.//ArticleTitle'))
	            # An abstract may consist of several AbstractText sections.
	            sections = (
	                _element_text(section)
	                for section in article.findall('.//Abstract/AbstractText')
	            )
	            abstract = " ".join(section for section in sections if section)
	            pmid = _element_text(article.find('.//PMID'))
	            year = _element_text(article.find('.//PubDate/Year'))

	            # A record without title, abstract or PMID cannot be shown/linked
	            if not (title and abstract and pmid):
	                continue

	            title_text = title.lower()
	            abstract_text = abstract.lower()
	            in_title = 'autism' in title_text or 'asd' in title_text
	            in_abstract = 'autism' in abstract_text or 'asd' in abstract_text

	            # Only include papers that mention autism
	            if in_title or in_abstract:
	                papers.append({
	                    'title': title,
	                    'abstract': abstract,
	                    'url': f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/",
	                    'published': year or 'Unknown',
	                    'relevance_score': 1 if in_title else 0.5
	                })

	    except Exception as e:
	        st.error(f"Error fetching PubMed papers: {str(e)}")

	    return papers

	def search_research_papers(query):
	    """Query arXiv and PubMed and merge the results into one DataFrame.

	    Papers with an empty abstract, or whose cleaned title and abstract
	    mention neither "autism" nor "asd", are discarded. The remaining rows
	    are ordered by ``relevance_score``, highest first.

	    Args:
	        query: Free-text search phrase entered by the user.

	    Returns:
	        A ``pandas.DataFrame`` with the columns ``title``, ``text``, ``url``,
	        ``published`` and ``relevance_score``. When nothing qualifies, a
	        warning is shown and an empty frame with those columns is returned.
	    """
	    candidates = fetch_arxiv_papers(query) + fetch_pubmed_papers(query)
	    keywords = ('autism', 'asd')

	    rows = []
	    for entry in candidates:
	        raw_abstract = entry['abstract']
	        # Skip entries whose abstract is missing or only whitespace.
	        if not raw_abstract or not raw_abstract.strip():
	            continue

	        # Normalise both fields before matching and formatting.
	        clean_title = clean_text(entry['title'])
	        clean_abstract = clean_text(raw_abstract)

	        # Keep only papers that are actually about autism.
	        searchable = (clean_title.lower(), clean_abstract.lower())
	        if not any(word in field for field in searchable for word in keywords):
	            continue

	        rows.append({
	            'title': clean_title,
	            'text': format_paper(clean_title, clean_abstract),
	            'url': entry['url'],
	            'published': entry['published'],
	            'relevance_score': entry.get('relevance_score', 0.5)
	        })

	    # Highest relevance first; the sort is stable, so ties keep fetch order.
	    rows.sort(key=lambda row: row['relevance_score'], reverse=True)
	    df = pd.DataFrame(rows)

	    if not df.empty:
	        return df

	    st.warning("No autism-related papers found. Please try a different search term.")
	    return pd.DataFrame(columns=['title', 'text', 'url', 'published', 'relevance_score'])

	def generate_answer(question, context, max_length=512):
	    """Generate a comprehensive answer using the local model.

	    The cleaned question and context are embedded in a fixed instruction
	    template, tokenized (truncated to 1024 tokens) and passed to
	    ``model.generate``. The decoded output is cleaned with ``clean_text``
	    and a line break is inserted after each sentence.

	    Args:
	        question: The user's question.
	        context: Text of the source papers to answer from.
	        max_length: ``max_length`` forwarded to ``model.generate``.

	    Returns:
	        The formatted answer; a generic fallback message when the context is
	        empty or the generated text is shorter than 100 characters; or an
	        "Error: ..." string when the model cannot be loaded or generation
	        raises.
	    """
	    # Shown when there is nothing to summarize or the output is too short.
	    fallback_answer = """Autism Spectrum Disorder (ASD) is a complex neurodevelopmental condition. Unfortunately, the provided papers don't contain specific information about this aspect of autism.

	To get research-based information, try asking more specific questions about:
	- Genetics and environmental factors
	- Early intervention
	- Treatments and therapies
	- Neurological development

	This will allow us to provide accurate information supported by recent scientific research."""

	    model, tokenizer = load_local_model()

	    if model is None or tokenizer is None:
	        return "Error: Could not load the model. Please try again later."

	    # Clean and format the context
	    clean_context = clean_text(context)
	    clean_question = clean_text(question)

	    # Without any source material the model would only see the instruction
	    # template below, so skip generation and return the fallback directly.
	    if not clean_context:
	        return fallback_answer

	    # Build the model input: question and sources first, instructions after
	    input_text = f"""Context
	Input Question: {clean_question}
	Source Materials: {clean_context}
	Primary Objective
	Generate a comprehensive yet accessible summary of autism research that bridges the gap between academic knowledge and public understanding. The response should be evidence-based while remaining engaging and practical for general readers.
	Content Structure
	1. Opening Overview

	Begin with a concise, jargon-free definition of autism
	Frame the topic within everyday experiences
	Establish relevance to the reader's understanding

	2. Key Concepts Breakdown

	Transform complex research findings into digestible information
	Structure information in a logical progression
	Connect each point to real-world scenarios

	3. Research Integration
	Present research findings using this framework:

	Main finding: [Clear statement of what was discovered]
	Real-world meaning: [Practical implications]
	Context: [How this fits into broader understanding]

	4. Examples and Applications
	Include:

	Concrete, relatable scenarios
	Day-to-day situations
	Practical implications for families and individuals

	Writing Guidelines
	Language Requirements

	Target reading level: 8th grade
	Sentence length: Maximum 20 words
	Paragraph length: 2-4 sentences
	Technical terms: Must include plain language explanation in parentheses

	Tone and Style

	Empathetic and respectful
	Solution-focused approach
	Balanced perspective
	Inclusive language

	Formatting Specifications

	Use headers for major sections
	Include white space between concepts
	Implement bullet points for lists
	Bold key terms with immediate explanations

	Research Citation Format
	When referencing studies, follow this pattern:
	"Research from [Institution] shows [finding in simple terms]. This means [practical interpretation]."
	Quality Checks
	Before finalizing, ensure the summary:

	Answers the original question directly
	Maintains scientific accuracy while being accessible
	Provides actionable insights
	Respects neurodiversity perspectives
	Balances depth with clarity

	Response Framework

	Introduction (2-3 sentences)

	Core definition
	Relevance statement


	Main Body (3-4 key points)

	Evidence-based insights
	Practical examples
	Real-world applications


	Conclusion (2-3 sentences)

	Summary of key takeaways
	Actionable next steps or implications



	Engagement Elements

	Include thought-provoking questions
	Provide relatable scenarios
	Connect to common experiences
	Offer practical applications

	Modified Output Analysis
	The response should be evaluated against these criteria:

	Clarity: Is the information immediately understandable?
	Accuracy: Does it reflect the research correctly?
	Relevance: Does it address the specific question?
	Practicality: Are the insights actionable?
	Engagement: Does it maintain reader interest?

	Special Considerations

	Acknowledge spectrum nature of autism
	Respect diverse perspectives
	Focus on strengths and challenges
	Avoid deficit-based language
	Include support-oriented information

	Remember to adapt the depth and complexity based on the specific question while maintaining accessibility and scientific accuracy."""

	    try:
	        # Input beyond 1024 tokens is truncated, so with long source material
	        # the tail of the instruction template is what gets dropped.
	        inputs = tokenizer(
	            input_text,
	            return_tensors="pt",
	            max_length=1024,
	            truncation=True,
	            padding=True,
	        )

	        with torch.inference_mode():
	            outputs = model.generate(
	                **inputs,
	                max_length=max_length,
	                # Never request a minimum longer than the allowed maximum
	                min_length=min(200, max_length),
	                num_beams=3,  # Reduced for more variety
	                length_penalty=1.2,  # Better balance between conciseness and detail
	                temperature=0.8,  # Raised slightly for more fluent text
	                repetition_penalty=1.2,
	                early_stopping=True,
	                no_repeat_ngram_size=2,  # Keeps variation in the text
	                do_sample=True,
	                top_k=30,  # Reduced for more coherent answers
	                top_p=0.9,  # Balances diversity and precision
	            )

	        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
	        response = clean_text(response)

	        # If response is too short or empty, provide a general overview
	        if len(response.strip()) < 100:
	            return fallback_answer

	        # Format the response for better readability.
	        # NOTE: clean_text() replaces "•" with a space, so the bullet
	        # replacement is currently a no-op; it is kept as a safeguard.
	        formatted_response = response.replace(". ", ".\n").replace("• ", "\n• ")

	        return formatted_response

	    except Exception as e:
	        logging.exception("Error generating response")
	        st.error(f"Error generating response: {str(e)}")
	        return "Error: Could not generate response. Please try again with a different question."

	# Streamlit App
	#
	# Page flow: show the title and intro, read the question, then search for
	# papers, build a context from the best matches, generate an answer and
	# render the answer together with its sources.
	st.title("🧩 AMA Autism")

	st.write("""
	This app searches through scientific papers to answer your questions about autism.
	For best results, be specific in your questions.
	""")

	query = st.text_input("Please ask me anything about autism ✨")

	if query:
	    with st.status("Searching for answers..."):
	        # Announce the search before starting it; previously this message
	        # was only written after the search had already finished.
	        st.write("Searching for data in PubMed and arXiv...")
	        df = search_research_papers(query)
	        st.write(f"Found {len(df)} relevant papers!")

	        # Get relevant context: the first 1000 characters of each of the
	        # three top-ranked papers
	        context = "\n".join(text[:1000] for text in df['text'].head(3))

	        # Generate answer
	        st.write("Generating answer...")
	        answer = generate_answer(query, context)

	    # Render the sources and the answer outside the status container
	    with st.expander("View source papers"):
	        for _, paper in df.iterrows():
	            st.markdown(f"- [{paper['title']}]({paper['url']}) ({paper['published']})")
	    st.success("Answer found!")
	    st.markdown(answer)