Spaces:
Sleeping
Sleeping
fix: search in results
Browse files- app.py +36 -30
- faiss_index/index.py +12 -6
app.py
CHANGED
@@ -40,29 +40,35 @@ def load_dataset(query):
|
|
40 |
search_query = query
|
41 |
|
42 |
papers = idx.fetch_arxiv_papers(search_query, max_results=25)
|
43 |
-
|
44 |
-
|
45 |
-
|
|
|
46 |
|
47 |
-
|
48 |
|
49 |
# Load and convert to pandas for easier handling
|
50 |
dataset = load_from_disk(DATASET_PATH)
|
51 |
df = pd.DataFrame({
|
52 |
'title': dataset['title'],
|
53 |
-
'text': dataset['text']
|
|
|
|
|
54 |
})
|
55 |
return df
|
56 |
|
57 |
def generate_answer(question, context, max_length=150):
|
58 |
tokenizer, model = load_models()
|
59 |
|
60 |
-
# Improve prompt to
|
61 |
-
prompt = f"""Based on scientific research about autism,
|
|
|
62 |
If the context doesn't contain relevant information about autism, respond with 'I cannot find specific information about this topic in the autism research papers.'
|
63 |
|
64 |
Question: {question}
|
65 |
-
Context: {context}
|
|
|
|
|
66 |
|
67 |
# Optimize input processing
|
68 |
inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
|
@@ -99,26 +105,26 @@ if query:
|
|
99 |
with st.status("Searching for answers..."):
|
100 |
# Load dataset
|
101 |
df = load_dataset(query)
|
|
|
|
|
|
|
|
|
102 |
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
st.write(f"**Summary:** {row['text'][:200]}...")
|
122 |
-
st.write("---")
|
123 |
-
else:
|
124 |
-
st.warning("I couldn't find a specific answer in the research papers. Try rephrasing your question.")
|
|
|
40 |
search_query = query
|
41 |
|
42 |
papers = idx.fetch_arxiv_papers(search_query, max_results=25)
|
43 |
+
|
44 |
+
if not papers:
|
45 |
+
st.warning("No relevant papers found. Please try rephrasing your question.")
|
46 |
+
return pd.DataFrame(columns=['title', 'text', 'url', 'published'])
|
47 |
|
48 |
+
idx.build_faiss_index(papers, dataset_dir=DATASET_DIR)
|
49 |
|
50 |
# Load and convert to pandas for easier handling
|
51 |
dataset = load_from_disk(DATASET_PATH)
|
52 |
df = pd.DataFrame({
|
53 |
'title': dataset['title'],
|
54 |
+
'text': dataset['text'],
|
55 |
+
'url': [p['url'] for p in papers],
|
56 |
+
'published': [p['published'] for p in papers]
|
57 |
})
|
58 |
return df
|
59 |
|
60 |
def generate_answer(question, context, max_length=150):
|
61 |
tokenizer, model = load_models()
|
62 |
|
63 |
+
# Improve prompt to generate concise, summarized answers
|
64 |
+
prompt = f"""Based on scientific research about autism, provide a brief, clear summary answering the following question.
|
65 |
+
Focus only on the most important findings and be concise.
|
66 |
If the context doesn't contain relevant information about autism, respond with 'I cannot find specific information about this topic in the autism research papers.'
|
67 |
|
68 |
Question: {question}
|
69 |
+
Context: {context}
|
70 |
+
|
71 |
+
Provide a concise summary:"""
|
72 |
|
73 |
# Optimize input processing
|
74 |
inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
|
|
|
105 |
with st.status("Searching for answers...") as status:
|
106 |
# Load dataset
|
107 |
df = load_dataset(query)
|
108 |
+
# Get relevant context
|
109 |
+
context = "\n".join([
|
110 |
+
f"{text[:1000]}" for text in df['text'].head(3)
|
111 |
+
])
|
112 |
|
113 |
+
# Generate answer
|
114 |
+
answer = generate_answer(query, context)
|
115 |
+
status.update(
|
116 |
+
label="Search complete!", state="complete", expanded=False
|
117 |
+
)
|
118 |
+
if answer and not answer.isspace():
|
119 |
+
st.success("Answer found!")
|
120 |
+
st.write(answer)
|
121 |
+
|
122 |
+
st.write("### Sources used:")
|
123 |
+
for _, row in df.head(3).iterrows():
|
124 |
+
st.markdown(f"**[{row['title']}]({row['url']})** ({row['published']})")
|
125 |
+
st.write(f"**Summary:** {row['text'][:200]}...")
|
126 |
+
st.write("---")
|
127 |
+
else:
|
128 |
+
st.warning("I couldn't find a specific answer in the research papers. Try rephrasing your question.")
|
129 |
+
if df.empty:
|
130 |
+
st.warning("I couldn't find any relevant research papers about this topic. Please try rephrasing your question or ask something else about autism.")
|
|
|
|
|
|
|
|
faiss_index/index.py
CHANGED
@@ -22,8 +22,12 @@ def fetch_arxiv_papers(query, max_results=10):
|
|
22 |
query = query.replace('and', '').strip() # Remove 'and' as it's treated as AND operator
|
23 |
terms = [term.strip() for term in query.split() if term.strip()]
|
24 |
|
25 |
-
#
|
26 |
-
|
|
|
|
|
|
|
|
|
27 |
search_query = f'({search_query}) AND (cat:q-bio* OR cat:med*)'
|
28 |
|
29 |
logging.info(f"Searching arXiv with query: {search_query}")
|
@@ -39,18 +43,20 @@ def fetch_arxiv_papers(query, max_results=10):
|
|
39 |
papers = []
|
40 |
|
41 |
for i, result in enumerate(results):
|
42 |
-
#
|
43 |
text = (result.title + " " + result.summary).lower()
|
44 |
-
if
|
45 |
papers.append({
|
46 |
"id": str(i),
|
47 |
"text": result.summary,
|
48 |
-
"title": result.title
|
|
|
|
|
49 |
})
|
50 |
if len(papers) >= max_results:
|
51 |
break
|
52 |
|
53 |
-
logging.info(f"Found {len(papers)} relevant papers from arXiv")
|
54 |
return papers
|
55 |
except Exception as e:
|
56 |
logging.error(f"Error fetching papers from arXiv: {str(e)}")
|
|
|
22 |
query = ' '.join(w for w in query.split() if w.lower() != 'and')  # Remove standalone 'and' as it's treated as AND operator
|
23 |
terms = [term.strip() for term in query.split() if term.strip()]
|
24 |
|
25 |
+
# Always include autism in the search
|
26 |
+
if 'autism' not in [t.lower() for t in terms]:
|
27 |
+
terms.insert(0, 'autism')
|
28 |
+
|
29 |
+
# Create search query with required autism term
|
30 |
+
term_clauses = " OR ".join('abs:"{0}" OR ti:"{0}"'.format(term) for term in terms if term.lower() != "autism")
search_query = f'(abs:"autism" OR ti:"autism") AND ({term_clauses})'
|
31 |
search_query = f'({search_query}) AND (cat:q-bio* OR cat:med*)'
|
32 |
|
33 |
logging.info(f"Searching arXiv with query: {search_query}")
|
|
|
43 |
papers = []
|
44 |
|
45 |
for i, result in enumerate(results):
|
46 |
+
# Only include papers that mention autism
|
47 |
text = (result.title + " " + result.summary).lower()
|
48 |
+
if 'autism' in text:
|
49 |
papers.append({
|
50 |
"id": str(i),
|
51 |
"text": result.summary,
|
52 |
+
"title": result.title,
|
53 |
+
"url": result.entry_id, # Add the paper URL
|
54 |
+
"published": result.published.strftime("%Y-%m-%d") # Add publication date
|
55 |
})
|
56 |
if len(papers) >= max_results:
|
57 |
break
|
58 |
|
59 |
+
logging.info(f"Found {len(papers)} relevant papers about autism from arXiv")
|
60 |
return papers
|
61 |
except Exception as e:
|
62 |
logging.error(f"Error fetching papers from arXiv: {str(e)}")
|