wakeupmh committed
Commit 975c327 · 1 Parent(s): 50dc0c2

fix: write

Files changed (2):
  1. app.py +27 -14
  2. faiss_index/index.py +8 -5
app.py CHANGED
@@ -6,6 +6,7 @@ from datasets import load_from_disk
 import torch
 import logging
 import warnings
+from pathlib import Path
 
 # Configure logging
 logging.basicConfig(level=logging.WARNING)
@@ -18,14 +19,23 @@ st.title("🧩 AMA Austim")
 query = st.text_input("Please ask me anything about autism ✨")
 
 @st.cache_resource
-def load_rag_components(model_name="facebook/rag-sequence-nq"):
+def load_rag_components(_dataset_path=None, _index_path=None):
     """Load and cache RAG components to avoid reloading."""
+    model_name = "facebook/rag-sequence-nq"
     tokenizer = RagTokenizer.from_pretrained(model_name)
-    retriever = RagRetriever.from_pretrained(
-        model_name,
-        index_name="custom",
-        use_dummy_dataset=True  # We'll configure the dataset later
-    )
+
+    retriever_config = {
+        "index_name": "custom",
+        "use_dummy_dataset": True
+    }
+
+    if _dataset_path and _index_path:
+        retriever_config.update({
+            "passages_path": _dataset_path,
+            "index_path": _index_path
+        })
+
+    retriever = RagRetriever.from_pretrained(model_name, **retriever_config)
     model = RagSequenceForGeneration.from_pretrained(model_name)
     return tokenizer, retriever, model
 
@@ -37,17 +47,20 @@ def load_rag_dataset(dataset_dir="rag_dataset"):
     initial_papers = faiss_index_index.fetch_arxiv_papers("autism research", max_results=100)
     dataset_dir = faiss_index_index.build_faiss_index(initial_papers, dataset_dir)
 
+    dataset_path = os.path.join(dataset_dir, "dataset")
+    index_path = os.path.join(dataset_dir, "embeddings.faiss")
+
     # Load the dataset and index
-    dataset = load_from_disk(os.path.join(dataset_dir, "dataset"))
-    index = faiss.read_index(os.path.join(dataset_dir, "embeddings.faiss"))
+    dataset = load_from_disk(dataset_path)
+    index = faiss.read_index(index_path)
 
-    return dataset, index
+    return dataset, index, dataset_path, index_path
 
 # RAG Pipeline
-def rag_pipeline(query, dataset, index):
+def rag_pipeline(query, dataset, index, dataset_path, index_path):
     try:
-        # Load cached components
-        tokenizer, retriever, model = load_rag_components()
+        # Load cached components with paths
+        tokenizer, retriever, model = load_rag_components(dataset_path, index_path)
 
         # Configure retriever with our dataset
         retriever.index.dataset = dataset
@@ -76,9 +89,9 @@ def rag_pipeline(query, dataset, index):
 if query:
     with st.status("Looking for data in the best sources...", expanded=True) as status:
         st.write("Still looking... this may take a while as we look at some prestigious papers...")
-        dataset, index = load_rag_dataset()
+        dataset, index, dataset_path, index_path = load_rag_dataset()
         st.write("Found the best sources!")
-        answer = rag_pipeline(query, dataset, index)
+        answer = rag_pipeline(query, dataset, index, dataset_path, index_path)
         st.write("Now answering your question...")
         status.update(
             label="Searching complete!",
faiss_index/index.py CHANGED
@@ -20,11 +20,12 @@ def fetch_arxiv_papers(query, max_results=10):
 
 # Build and save dataset with FAISS index
 def build_faiss_index(papers, dataset_dir="rag_dataset"):
-    # Create dataset
+    # Create dataset with required columns for RAG
     dataset = Dataset.from_dict({
         "id": [p["id"] for p in papers],
+        "text": [p["text"] for p in papers],  # RAG expects 'text' field
         "title": [p["title"] for p in papers],
-        "text": [p["text"] for p in papers],
+        "embeddings": None,  # Will be filled later
     })
 
     # Initialize DPR context encoder (same as used by RAG)
@@ -45,9 +46,6 @@ def build_faiss_index(papers, dataset_dir="rag_dataset"):
 
     embeddings = np.vstack(embeddings)
 
-    # Add embeddings to dataset
-    dataset = dataset.add_column("embeddings", [emb.tolist() for emb in embeddings])
-
     # Create FAISS index
     dimension = embeddings.shape[1]  # Should be 768 for DPR
     index = faiss.IndexFlatL2(dimension)
@@ -55,7 +53,12 @@ def build_faiss_index(papers, dataset_dir="rag_dataset"):
 
     # Save dataset and index
     os.makedirs(dataset_dir, exist_ok=True)
+
+    # Save dataset with embeddings
+    dataset = dataset.add_column("embeddings", [emb.tolist() for emb in embeddings])
     dataset.save_to_disk(os.path.join(dataset_dir, "dataset"))
+
+    # Save FAISS index
     faiss.write_index(index, os.path.join(dataset_dir, "embeddings.faiss"))
 
     return dataset_dir
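
For completeness, a small smoke test (hypothetical, not part of this commit) for checking that the artifacts written by build_faiss_index load back the way app.py expects; it assumes the default rag_dataset directory and the 768-dimensional DPR context embeddings noted above.

# Hypothetical smoke test for the saved artifacts; paths match the defaults above.
import os
import faiss
from datasets import load_from_disk

dataset_dir = "rag_dataset"
dataset = load_from_disk(os.path.join(dataset_dir, "dataset"))
index = faiss.read_index(os.path.join(dataset_dir, "embeddings.faiss"))

assert "embeddings" in dataset.column_names  # column added before save_to_disk
assert index.ntotal == len(dataset)          # one vector per paper
assert index.d == 768                        # DPR context embeddings are 768-d
print(f"{index.ntotal} passages indexed, dimension {index.d}")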