import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
import arxiv
from datasets import Dataset
import os


# Fetch arXiv papers
def fetch_arxiv_papers(query, max_results=10):
    client = arxiv.Client()
    search = arxiv.Search(
        query=query,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.SubmittedDate,
    )
    results = list(client.results(search))
    papers = [
        {"title": result.title, "text": result.summary, "id": str(i)}
        for i, result in enumerate(results)
    ]
    return papers


# Build and save dataset with FAISS index
def build_faiss_index(papers, dataset_dir="rag_dataset"):
    # Create dataset
    dataset = Dataset.from_dict({
        "id": [p["id"] for p in papers],
        "title": [p["title"] for p in papers],
        "text": [p["text"] for p in papers],
    })

    # Create embeddings
    embedder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
    embeddings = embedder.encode(dataset["text"], show_progress_bar=True)

    # Add embeddings to dataset
    dataset = dataset.add_column("embeddings", [emb.tolist() for emb in embeddings])

    # Create FAISS index
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings.astype(np.float32))

    # Save dataset and index
    os.makedirs(dataset_dir, exist_ok=True)
    dataset.save_to_disk(os.path.join(dataset_dir, "dataset"))
    faiss.write_index(index, os.path.join(dataset_dir, "embeddings.faiss"))

    return dataset_dir


# Example usage
if __name__ == "__main__":
    query = "quantum computing"
    papers = fetch_arxiv_papers(query)
    build_faiss_index(papers)
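
# A minimal retrieval sketch (an addition, not part of the original script):
# it loads the dataset and index saved by build_faiss_index above and embeds
# the query with the same model so distances are comparable. The helper name
# `search_index` is hypothetical.
def search_index(query, dataset_dir="rag_dataset", k=3):
    from datasets import load_from_disk

    dataset = load_from_disk(os.path.join(dataset_dir, "dataset"))
    index = faiss.read_index(os.path.join(dataset_dir, "embeddings.faiss"))
    embedder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
    # FAISS expects a 2D float32 array; encode([query]) returns shape (1, dim)
    query_vec = embedder.encode([query]).astype(np.float32)
    distances, indices = index.search(query_vec, k)
    # Map FAISS row ids back to dataset rows; lower L2 distance = closer match
    return [
        (dataset[int(i)]["title"], float(d))
        for i, d in zip(indices[0], distances[0])
    ]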