wakeupmh committed on
Commit
54a5022
·
1 Parent(s): cc0b0d6

fix: embeddings

Browse files
Files changed (2) hide show
  1. app.py +27 -20
  2. faiss_index/index.py +12 -0
app.py CHANGED
@@ -34,8 +34,12 @@ def load_dataset(query):
34
  with st.spinner("Searching autism research papers..."):
35
  import faiss_index.index as idx
36
  # Make the query more specific to autism and b12
37
- search_query = f"autism {query} AND (cat:q-bio.NC OR cat:q-bio.QM OR cat:q-bio.GN OR cat:q-bio.CB OR cat:q-bio.MN)"
38
  papers = idx.fetch_arxiv_papers(search_query, max_results=25)
 
 
 
 
39
  idx.build_faiss_index(papers, dataset_dir=DATASET_DIR)
40
 
41
  # Load and convert to pandas for easier handling
@@ -91,22 +95,25 @@ if query:
91
  # Load dataset
92
  df = load_dataset(query)
93
 
94
- # Get relevant context
95
- context = "\n".join([
96
- f"{text[:1000]}" for text in df['text'].head(3)
97
- ])
98
-
99
- # Generate answer
100
- answer = generate_answer(query, context)
101
-
102
- if answer and not answer.isspace():
103
- st.success("Answer found!")
104
- st.write(answer)
105
-
106
- st.write("### Sources Used:")
107
- for _, row in df.head(3).iterrows():
108
- st.write(f"**Title:** {row['title']}")
109
- st.write(f"**Summary:** {row['text'][:200]}...")
110
- st.write("---")
111
- else:
112
- st.warning("I couldn't find a specific answer in the research papers. Try rephrasing your question.")
 
 
 
 
34
  with st.spinner("Searching autism research papers..."):
35
  import faiss_index.index as idx
36
  # Make the query more specific to autism and b12
37
+ search_query = f"{query} AND (cat:q-bio.NC OR cat:q-bio.QM OR cat:q-bio.GN OR cat:q-bio.CB OR cat:q-bio.MN)"
38
  papers = idx.fetch_arxiv_papers(search_query, max_results=25)
39
+ if not papers:
40
+ st.warning("No relevant papers found. Please try rephrasing your question.")
41
+ return pd.DataFrame(columns=['title', 'text'])
42
+
43
  idx.build_faiss_index(papers, dataset_dir=DATASET_DIR)
44
 
45
  # Load and convert to pandas for easier handling
 
95
  # Load dataset
96
  df = load_dataset(query)
97
 
98
+ if df.empty:
99
+ st.warning("I couldn't find any relevant research papers about this topic. Please try rephrasing your question or ask something else about autism.")
100
+ else:
101
+ # Get relevant context
102
+ context = "\n".join([
103
+ f"{text[:1000]}" for text in df['text'].head(3)
104
+ ])
105
+
106
+ # Generate answer
107
+ answer = generate_answer(query, context)
108
+
109
+ if answer and not answer.isspace():
110
+ st.success("Answer found!")
111
+ st.write(answer)
112
+
113
+ st.write("### Sources used:")
114
+ for _, row in df.head(3).iterrows():
115
+ st.write(f"**Title:** {row['title']}")
116
+ st.write(f"**Summary:** {row['text'][:200]}...")
117
+ st.write("---")
118
+ else:
119
+ st.warning("I couldn't find a specific answer in the research papers. Try rephrasing your question.")
faiss_index/index.py CHANGED
@@ -49,6 +49,18 @@ def fetch_arxiv_papers(query, max_results=10):
49
 
50
  def build_faiss_index(papers, dataset_dir=DATASET_DIR):
51
  """Build and save dataset with FAISS index for RAG"""
 
 
 
 
 
 
 
 
 
 
 
 
52
  # Initialize smaller DPR encoder
53
  ctx_encoder = DPRContextEncoder.from_pretrained(
54
  "facebook/dpr-ctx_encoder-single-nq-base",
 
49
 
50
  def build_faiss_index(papers, dataset_dir=DATASET_DIR):
51
  """Build and save dataset with FAISS index for RAG"""
52
+ if not papers:
53
+ logging.warning("No papers found. Creating empty dataset.")
54
+ # Create an empty dataset with the expected structure
55
+ dataset = Dataset.from_dict({
56
+ "text": [],
57
+ "embeddings": [],
58
+ "title": []
59
+ })
60
+ os.makedirs(dataset_dir, exist_ok=True)
61
+ dataset.save_to_disk(os.path.join(dataset_dir, "dataset"))
62
+ return dataset_dir
63
+
64
  # Initialize smaller DPR encoder
65
  ctx_encoder = DPRContextEncoder.from_pretrained(
66
  "facebook/dpr-ctx_encoder-single-nq-base",