wakeupmh committed on
Commit
0452175
·
1 Parent(s): 884d2bd

fix: run in hf

Browse files
Files changed (2) hide show
  1. app.py +15 -3
  2. faiss_index/index.py +5 -7
app.py CHANGED
@@ -9,6 +9,12 @@ import logging
9
  # Configure logging
10
  logging.basicConfig(level=logging.INFO)
11
 
 
 
 
 
 
 
12
  # Cache models and dataset
13
  @st.cache_resource # Cache models in memory
14
  def load_models():
@@ -16,8 +22,8 @@ def load_models():
16
  retriever = RagRetriever.from_pretrained(
17
  "facebook/rag-sequence-nq",
18
  index_name="custom",
19
- passages_path="rag_dataset/dataset",
20
- index_path="rag_dataset/embeddings.faiss"
21
  )
22
  model = RagSequenceForGeneration.from_pretrained(
23
  "facebook/rag-sequence-nq",
@@ -28,7 +34,13 @@ def load_models():
28
 
29
  @st.cache_data # Cache dataset on disk
30
  def load_dataset():
31
- return load_from_disk("rag_dataset/dataset")
 
 
 
 
 
 
32
 
33
  # RAG Pipeline
34
  def rag_pipeline(query, dataset, index):
 
9
  # Configure logging
10
  logging.basicConfig(level=logging.INFO)
11
 
12
+ # Define data paths
13
+ DATA_DIR = "/data" if os.path.exists("/data") else "."
14
+ DATASET_DIR = os.path.join(DATA_DIR, "rag_dataset")
15
+ DATASET_PATH = os.path.join(DATASET_DIR, "dataset")
16
+ INDEX_PATH = os.path.join(DATASET_DIR, "embeddings.faiss")
17
+
18
  # Cache models and dataset
19
  @st.cache_resource # Cache models in memory
20
  def load_models():
 
22
  retriever = RagRetriever.from_pretrained(
23
  "facebook/rag-sequence-nq",
24
  index_name="custom",
25
+ passages_path=DATASET_PATH,
26
+ index_path=INDEX_PATH
27
  )
28
  model = RagSequenceForGeneration.from_pretrained(
29
  "facebook/rag-sequence-nq",
 
34
 
35
  @st.cache_data # Cache dataset on disk
36
  def load_dataset():
37
+ # Create initial dataset if it doesn't exist
38
+ if not os.path.exists(DATASET_PATH):
39
+ with st.spinner("Building initial dataset from autism research papers..."):
40
+ import faiss_index.index as idx
41
+ papers = idx.fetch_arxiv_papers("autism research", max_results=100)
42
+ idx.build_faiss_index(papers, dataset_dir=DATASET_DIR)
43
+ return load_from_disk(DATASET_PATH)
44
 
45
  # RAG Pipeline
46
  def rag_pipeline(query, dataset, index):
faiss_index/index.py CHANGED
@@ -10,6 +10,10 @@ import logging
10
  # Configure logging
11
  logging.basicConfig(level=logging.INFO)
12
 
 
 
 
 
13
  def fetch_arxiv_papers(query, max_results=10):
14
  """Fetch papers from arXiv and format them for RAG"""
15
  client = arxiv.Client()
@@ -23,7 +27,7 @@ def fetch_arxiv_papers(query, max_results=10):
23
  logging.info(f"Fetched {len(papers)} papers from arXiv")
24
  return papers
25
 
26
- def build_faiss_index(papers, dataset_dir="rag_dataset"):
27
  """Build and save dataset with FAISS index for RAG"""
28
  # Initialize DPR encoder
29
  ctx_encoder = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
@@ -66,9 +70,3 @@ def build_faiss_index(papers, dataset_dir="rag_dataset"):
66
  logging.info(f"Saved dataset to {dataset_path}")
67
  logging.info(f"Saved index to {index_path}")
68
  return dataset_dir
69
-
70
- # Example usage
71
- if __name__ == "__main__":
72
- query = "autism research"
73
- papers = fetch_arxiv_papers(query, max_results=100)
74
- build_faiss_index(papers)
 
10
  # Configure logging
11
  logging.basicConfig(level=logging.INFO)
12
 
13
+ # Define data paths
14
+ DATA_DIR = os.getenv("DATA_DIR", "/data" if os.path.exists("/data") else ".")
15
+ DATASET_DIR = os.path.join(DATA_DIR, "rag_dataset")
16
+
17
  def fetch_arxiv_papers(query, max_results=10):
18
  """Fetch papers from arXiv and format them for RAG"""
19
  client = arxiv.Client()
 
27
  logging.info(f"Fetched {len(papers)} papers from arXiv")
28
  return papers
29
 
30
+ def build_faiss_index(papers, dataset_dir=DATASET_DIR):
31
  """Build and save dataset with FAISS index for RAG"""
32
  # Initialize DPR encoder
33
  ctx_encoder = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
 
70
  logging.info(f"Saved dataset to {dataset_path}")
71
  logging.info(f"Saved index to {index_path}")
72
  return dataset_dir