import os
from dotenv import load_dotenv
from transformers import pipeline
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFaceHub
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF as one string.

    Args:
        pdf_file: The PDF source handed directly to ``PdfReader``.

    Returns:
        The extracted text of all pages, concatenated in page order with no
        separator inserted between pages. A page whose ``extract_text()``
        result is falsy (``None`` or ``""``) contributes an empty string.
    """
    pdf = PdfReader(pdf_file)
    # ``or ""`` keeps the join safe when a page yields no text at all.
    return "".join(page.extract_text() or "" for page in pdf.pages)
def create_vector_store(text, embeddings_model="sentence-transformers/all-MiniLM-L6-v2"):
    """Create a FAISS vector store from the input text.

    The text is split with ``CharacterTextSplitter`` (``chunk_size=1000``,
    ``chunk_overlap=200``), and the resulting chunks are indexed through
    ``FAISS.from_texts`` using ``HuggingFaceEmbeddings``.

    Args:
        text: The raw text to index.
        embeddings_model: Model name passed to ``HuggingFaceEmbeddings`` as
            ``model_name``.

    Returns:
        The vector store object returned by ``FAISS.from_texts``.

    Raises:
        ValueError: If ``text`` is empty or whitespace-only, or if splitting
            it produces no chunks.
    """
    # Fail fast, before any splitter or embedding object is created: a PDF
    # with no extractable text (e.g. scanned images) yields an empty string,
    # and there is nothing meaningful to index in that case.
    if not text or not text.strip():
        raise ValueError("Cannot build a vector store: the input text is empty.")

    splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = splitter.split_text(text)
    # NOTE(review): FAISS.from_texts is expected to fail with an unhelpful
    # error on an empty list -- confirm against the installed langchain.
    if not chunks:
        raise ValueError("Cannot build a vector store: text splitting produced no chunks.")

    embeddings = HuggingFaceEmbeddings(model_name=embeddings_model)
    return FAISS.from_texts(chunks, embeddings)
def create_qa_pipeline(vector_store, llm_model="EleutherAI/gpt-neo-2.7B"):
    """Create a retrieval-based question-answering pipeline.

    Args:
        vector_store: Object exposing ``as_retriever()``; its result is used
            as the retriever of the QA chain.
        llm_model: Hugging Face Hub repository id of the language model,
            passed to ``HuggingFaceHub`` as ``repo_id``.

    Returns:
        The chain returned by ``RetrievalQA.from_chain_type``.

    Raises:
        ValueError: If the ``HUGGINGFACEHUB_API_TOKEN`` environment variable
            is unset or empty.
    """
    huggingfacehub_api_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
    # An empty value is as unusable as a missing one, so reject both before
    # any retriever or LLM object is built.
    if not huggingfacehub_api_token:
        raise ValueError("HuggingFace Hub API token is missing! Please set the 'HUGGINGFACEHUB_API_TOKEN' in your .env file.")

    retriever = vector_store.as_retriever()
    # Pass the token explicitly so the model is configured with exactly the
    # value validated above.
    llm = HuggingFaceHub(
        repo_id=llm_model,
        huggingfacehub_api_token=huggingfacehub_api_token,
    )
    return RetrievalQA.from_chain_type(llm, retriever=retriever)
def process_pdf_and_answer(pdf_path, question="Extract key information from the PDF."):
    """Run the whole pipeline on one PDF and return the answer to a question.

    Steps: extract the PDF text, build a vector store from it, create the
    QA pipeline on top of that store, then run ``question`` through it.

    Args:
        pdf_path: Path to the PDF, forwarded to ``extract_text_from_pdf``.
        question: Query passed to the QA pipeline's ``run`` method. Defaults
            to the generic key-information prompt, so existing callers that
            pass only ``pdf_path`` behave as before.

    Returns:
        Whatever the QA pipeline's ``run`` call returns for ``question``.
    """
    text = extract_text_from_pdf(pdf_path)
    vector_store = create_vector_store(text)
    qa_pipeline = create_qa_pipeline(vector_store)
    return qa_pipeline.run(question)
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="RAG Pipeline for PDF analysis")
    parser.add_argument("--pdf", type=str, required=True, help="Path to the PDF file")
    args = parser.parse_args()

    # create_qa_pipeline reads HUGGINGFACEHUB_API_TOKEN from the environment
    # and its error message points the user at a .env file, so that file has
    # to be loaded into the environment before the pipeline runs.
    load_dotenv()

    pdf_path = args.pdf
    answer = process_pdf_and_answer(pdf_path)
    print(f"Answer: {answer}")