"""Build a small RAG corpus: fetch recent arXiv abstracts, embed them with a
SentenceTransformer model, and persist a Hugging Face dataset together with a
FAISS L2 index for similarity search."""
import os

import numpy as np

import arxiv
import faiss
from datasets import Dataset
from sentence_transformers import SentenceTransformer
# Fetch arXiv papers
def fetch_arxiv_papers(query, max_results=10):
    """Query arXiv and return the newest matching papers.

    Args:
        query: arXiv search query string.
        max_results: maximum number of papers to retrieve.

    Returns:
        A list of dicts with "title", "text" (the abstract) and "id" keys,
        where "id" is the paper's 0-based position in the result list,
        rendered as a string.
    """
    client = arxiv.Client()
    search = arxiv.Search(
        query=query,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.SubmittedDate,
    )
    papers = []
    # Iterate the result stream directly; position doubles as the record id.
    for position, entry in enumerate(client.results(search)):
        papers.append({
            "title": entry.title,
            "text": entry.summary,
            "id": str(position),
        })
    return papers
# Build and save dataset with FAISS index
def build_faiss_index(papers, dataset_dir="rag_dataset",
                      model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Embed paper abstracts, build a FAISS L2 index, and persist both.

    Args:
        papers: list of dicts with "id", "title" and "text" keys
            (as produced by fetch_arxiv_papers).
        dataset_dir: directory where the dataset and index are written.
        model_name: SentenceTransformer model used for the embeddings.
            New optional parameter; the default preserves prior behavior.

    Returns:
        The directory path the artifacts were saved to.

    Raises:
        ValueError: if `papers` is empty — FAISS needs at least one vector
            to determine the index dimensionality (previously this crashed
            with an opaque shape error).
    """
    if not papers:
        raise ValueError("papers must be non-empty to build a FAISS index")
    # Create dataset
    dataset = Dataset.from_dict({
        "id": [p["id"] for p in papers],
        "title": [p["title"] for p in papers],
        "text": [p["text"] for p in papers],
    })
    # Create embeddings; convert to float32 once up front — FAISS only
    # accepts float32, and this avoids a second copy at index.add time.
    embedder = SentenceTransformer(model_name)
    embeddings = np.asarray(
        embedder.encode(dataset["text"], show_progress_bar=True),
        dtype=np.float32,
    )
    # Add embeddings to dataset (as plain lists so they serialize cleanly)
    dataset = dataset.add_column("embeddings", [emb.tolist() for emb in embeddings])
    # Create FAISS index (exact L2 search)
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    # Save dataset and index side by side
    os.makedirs(dataset_dir, exist_ok=True)
    dataset.save_to_disk(os.path.join(dataset_dir, "dataset"))
    faiss.write_index(index, os.path.join(dataset_dir, "embeddings.faiss"))
    return dataset_dir
# Example usage: index the latest quantum-computing abstracts.
if __name__ == "__main__":
    search_term = "quantum computing"
    fetched = fetch_arxiv_papers(search_term)
    build_faiss_index(fetched)