"""RAG pipeline for PDF analysis: extract text with PyPDF2, index it in a
FAISS vector store, and answer questions via a HuggingFace Hub LLM."""
import os

# Third-party dependencies (see requirements: python-dotenv, transformers,
# PyPDF2, langchain, faiss, sentence-transformers).
from dotenv import load_dotenv
from transformers import pipeline
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFaceHub

# Load environment variables (e.g. HUGGINGFACEHUB_API_TOKEN) from a .env file.
load_dotenv()
def extract_text_from_pdf(pdf_file):
    """Extract the text of every page of a PDF.

    Args:
        pdf_file: Path (or file-like object) accepted by ``PyPDF2.PdfReader``.

    Returns:
        str: The concatenated text of all pages. Pages for which
        ``extract_text()`` returns ``None`` (e.g. scanned images)
        contribute an empty string instead of raising.
    """
    reader = PdfReader(pdf_file)
    # join() avoids quadratic string concatenation on large documents.
    return "".join(page.extract_text() or "" for page in reader.pages)
def create_vector_store(text, embeddings_model="sentence-transformers/all-MiniLM-L6-v2"):
    """Build a FAISS vector store over chunks of the input text.

    Args:
        text: Raw document text to index.
        embeddings_model: HuggingFace sentence-embedding model name used to
            embed each chunk.

    Returns:
        FAISS: A vector store containing the embedded text chunks.
    """
    # Overlapping 1000-char chunks preserve context across chunk boundaries.
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    texts = text_splitter.split_text(text)
    embeddings = HuggingFaceEmbeddings(model_name=embeddings_model)
    return FAISS.from_texts(texts, embeddings)
def create_qa_pipeline(vector_store, llm_model="EleutherAI/gpt-neo-2.7B"):
    """Create a retrieval-based question-answering chain.

    Args:
        vector_store: A FAISS vector store used as the retriever backend.
        llm_model: HuggingFace Hub repo id of the language model to query.

    Returns:
        RetrievalQA: A chain that retrieves relevant chunks and asks the LLM.

    Raises:
        ValueError: If the ``HUGGINGFACEHUB_API_TOKEN`` environment variable
            is not set (expected to come from the .env file).
    """
    # The HuggingFace Hub API token must be provided via the environment.
    huggingfacehub_api_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
    if huggingfacehub_api_token is None:
        raise ValueError("HuggingFace Hub API token is missing! Please set the 'HUGGINGFACEHUB_API_TOKEN' in your .env file.")

    retriever = vector_store.as_retriever()

    # Remote-hosted LLM; "text-generation" matches causal models like GPT-Neo.
    llm = HuggingFaceHub(
        repo_id=llm_model,
        huggingfacehub_api_token=huggingfacehub_api_token,
        task="text-generation",
    )
    return RetrievalQA.from_chain_type(llm, retriever=retriever)
def process_pdf_and_answer(pdf_path):
    """Run the full pipeline on one PDF and return the LLM's answer.

    Steps: extract text, index it in FAISS, build the QA chain, then ask a
    fixed extraction prompt.

    Args:
        pdf_path: Path to the PDF file to analyze.

    Returns:
        The QA chain's answer string for the built-in prompt.
    """
    text = extract_text_from_pdf(pdf_path)
    vector_store = create_vector_store(text)
    qa_pipeline = create_qa_pipeline(vector_store)
    # Fixed prompt: summarize/extract rather than answering a user question.
    answer = qa_pipeline.run("Extract key information from the PDF.")
    return answer
if __name__ == "__main__":
    import argparse

    # CLI entry point: python script.py --pdf path/to/file.pdf
    parser = argparse.ArgumentParser(description="RAG Pipeline for PDF analysis")
    parser.add_argument("--pdf", type=str, required=True, help="Path to the PDF file")
    args = parser.parse_args()

    answer = process_pdf_and_answer(args.pdf)
    print(f"Answer: {answer}")