wakeupmh committed on
Commit
62b3157
·
1 Parent(s): cc41495

fix: search in results

Browse files
Files changed (2) hide show
  1. app.py +36 -30
  2. faiss_index/index.py +12 -6
app.py CHANGED
@@ -40,29 +40,35 @@ def load_dataset(query):
40
  search_query = query
41
 
42
  papers = idx.fetch_arxiv_papers(search_query, max_results=25)
43
- if not papers:
44
- st.warning("No relevant papers found. Please try rephrasing your question.")
45
- return pd.DataFrame(columns=['title', 'text'])
 
46
 
47
- idx.build_faiss_index(papers, dataset_dir=DATASET_DIR)
48
 
49
  # Load and convert to pandas for easier handling
50
  dataset = load_from_disk(DATASET_PATH)
51
  df = pd.DataFrame({
52
  'title': dataset['title'],
53
- 'text': dataset['text']
 
 
54
  })
55
  return df
56
 
57
  def generate_answer(question, context, max_length=150):
58
  tokenizer, model = load_models()
59
 
60
- # Improve prompt to focus on autism-related information
61
- prompt = f"""Based on scientific research about autism, answer the following question.
 
62
  If the context doesn't contain relevant information about autism, respond with 'I cannot find specific information about this topic in the autism research papers.'
63
 
64
  Question: {question}
65
- Context: {context}"""
 
 
66
 
67
  # Optimize input processing
68
  inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
@@ -99,26 +105,26 @@ if query:
99
  with st.status("Searching for answers..."):
100
  # Load dataset
101
  df = load_dataset(query)
 
 
 
 
102
 
103
- if df.empty:
104
- st.warning("I couldn't find any relevant research papers about this topic. Please try rephrasing your question or ask something else about autism.")
105
- else:
106
- # Get relevant context
107
- context = "\n".join([
108
- f"{text[:1000]}" for text in df['text'].head(3)
109
- ])
110
-
111
- # Generate answer
112
- answer = generate_answer(query, context)
113
-
114
- if answer and not answer.isspace():
115
- st.success("Answer found!")
116
- st.write(answer)
117
-
118
- st.write("### Sources used:")
119
- for _, row in df.head(3).iterrows():
120
- st.write(f"**Title:** {row['title']}")
121
- st.write(f"**Summary:** {row['text'][:200]}...")
122
- st.write("---")
123
- else:
124
- st.warning("I couldn't find a specific answer in the research papers. Try rephrasing your question.")
 
40
  search_query = query
41
 
42
  papers = idx.fetch_arxiv_papers(search_query, max_results=25)
43
+
44
+ if not papers:
45
+ st.warning("No relevant papers found. Please try rephrasing your question.")
46
+ return pd.DataFrame(columns=['title', 'text', 'url', 'published'])
47
 
48
+ idx.build_faiss_index(papers, dataset_dir=DATASET_DIR)
49
 
50
  # Load and convert to pandas for easier handling
51
  dataset = load_from_disk(DATASET_PATH)
52
  df = pd.DataFrame({
53
  'title': dataset['title'],
54
+ 'text': dataset['text'],
55
+ 'url': [p['url'] for p in papers],
56
+ 'published': [p['published'] for p in papers]
57
  })
58
  return df
59
 
60
  def generate_answer(question, context, max_length=150):
61
  tokenizer, model = load_models()
62
 
63
+ # Improve prompt to generate concise, summarized answers
64
+ prompt = f"""Based on scientific research about autism, provide a brief, clear summary answering the following question.
65
+ Focus only on the most important findings and be concise.
66
  If the context doesn't contain relevant information about autism, respond with 'I cannot find specific information about this topic in the autism research papers.'
67
 
68
  Question: {question}
69
+ Context: {context}
70
+
71
+ Provide a concise summary:"""
72
 
73
  # Optimize input processing
74
  inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
 
105
  with st.status("Searching for answers..."):
106
  # Load dataset
107
  df = load_dataset(query)
108
+ # Get relevant context
109
+ context = "\n".join([
110
+ f"{text[:1000]}" for text in df['text'].head(3)
111
+ ])
112
 
113
+ # Generate answer
114
+ answer = generate_answer(query, context)
115
+ status.update(
116
+ label="Search complete!", state="complete", expanded=False
117
+ )
118
+ if answer and not answer.isspace():
119
+ st.success("Answer found!")
120
+ st.write(answer)
121
+
122
+ st.write("### Sources used:")
123
+ for _, row in df.head(3).iterrows():
124
+ st.markdown(f"**[{row['title']}]({row['url']})** ({row['published']})")
125
+ st.write(f"**Summary:** {row['text'][:200]}...")
126
+ st.write("---")
127
+ else:
128
+ st.warning("I couldn't find a specific answer in the research papers. Try rephrasing your question.")
129
+ if df.empty:
130
+ st.warning("I couldn't find any relevant research papers about this topic. Please try rephrasing your question or ask something else about autism.")
 
 
 
 
faiss_index/index.py CHANGED
@@ -22,8 +22,12 @@ def fetch_arxiv_papers(query, max_results=10):
22
  query = query.replace('and', '').strip() # Remove 'and' as it's treated as AND operator
23
  terms = [term.strip() for term in query.split() if term.strip()]
24
 
25
- # Create a more flexible search query
26
- search_query = ' OR '.join([f'abs:"{term}" OR ti:"{term}"' for term in terms])
 
 
 
 
27
  search_query = f'({search_query}) AND (cat:q-bio* OR cat:med*)'
28
 
29
  logging.info(f"Searching arXiv with query: {search_query}")
@@ -39,18 +43,20 @@ def fetch_arxiv_papers(query, max_results=10):
39
  papers = []
40
 
41
  for i, result in enumerate(results):
42
- # Include paper if it contains any of the search terms
43
  text = (result.title + " " + result.summary).lower()
44
- if any(term.lower() in text for term in terms):
45
  papers.append({
46
  "id": str(i),
47
  "text": result.summary,
48
- "title": result.title
 
 
49
  })
50
  if len(papers) >= max_results:
51
  break
52
 
53
- logging.info(f"Found {len(papers)} relevant papers from arXiv")
54
  return papers
55
  except Exception as e:
56
  logging.error(f"Error fetching papers from arXiv: {str(e)}")
 
22
  query = query.replace('and', '').strip() # Remove 'and' as it's treated as AND operator
23
  terms = [term.strip() for term in query.split() if term.strip()]
24
 
25
+ # Always include autism in the search
26
+ if 'autism' not in [t.lower() for t in terms]:
27
+ terms.insert(0, 'autism')
28
+
29
+ # Create search query with required autism term
30
+ search_query = f'(abs:"autism" OR ti:"autism") AND ({" OR ".join([f'abs:"{term}" OR ti:"{term}"' for term in terms if term.lower() != "autism"])})'
31
  search_query = f'({search_query}) AND (cat:q-bio* OR cat:med*)'
32
 
33
  logging.info(f"Searching arXiv with query: {search_query}")
 
43
  papers = []
44
 
45
  for i, result in enumerate(results):
46
+ # Only include papers that mention autism
47
  text = (result.title + " " + result.summary).lower()
48
+ if 'autism' in text:
49
  papers.append({
50
  "id": str(i),
51
  "text": result.summary,
52
+ "title": result.title,
53
+ "url": result.entry_id, # Add the paper URL
54
+ "published": result.published.strftime("%Y-%m-%d") # Add publication date
55
  })
56
  if len(papers) >= max_results:
57
  break
58
 
59
+ logging.info(f"Found {len(papers)} relevant papers about autism from arXiv")
60
  return papers
61
  except Exception as e:
62
  logging.error(f"Error fetching papers from arXiv: {str(e)}")