# Source metadata (extraction artifact): 2,937 bytes, revision 6a020f1
import os
from dotenv import load_dotenv
from transformers import pipeline
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFaceHub
# Load environment variables from a local .env file into os.environ
# (supplies HUGGINGFACEHUB_API_TOKEN read later in create_qa_pipeline).
load_dotenv()
def extract_text_from_pdf(pdf_file):
    """Extract all text from a PDF file.

    Args:
        pdf_file: Path or file-like object accepted by ``PyPDF2.PdfReader``.

    Returns:
        str: The concatenated text of every page. Pages with no extractable
        text (e.g. image-only pages, where ``extract_text()`` returns None)
        contribute an empty string.
    """
    reader = PdfReader(pdf_file)
    # str.join avoids the quadratic cost of repeated "text += ..." concatenation;
    # "or ''" guards against extract_text() returning None for a page.
    return "".join(page.extract_text() or "" for page in reader.pages)
def create_vector_store(text, embeddings_model="sentence-transformers/all-MiniLM-L6-v2"):
    """Build a FAISS vector store over overlapping chunks of *text*.

    Args:
        text: Raw document text to index.
        embeddings_model: Name of the HuggingFace sentence-embedding model.

    Returns:
        A FAISS vector store containing the embedded text chunks.
    """
    # 1000-char chunks with 200-char overlap keep context across boundaries.
    splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = splitter.split_text(text)
    embedder = HuggingFaceEmbeddings(model_name=embeddings_model)
    return FAISS.from_texts(chunks, embedder)
def create_qa_pipeline(vector_store, llm_model="EleutherAI/gpt-neo-2.7B"):
    """Assemble a retrieval-augmented QA chain over *vector_store*.

    Args:
        vector_store: FAISS vector store used as the retrieval backend.
        llm_model: HuggingFace Hub repo id of the generator model.

    Returns:
        A ``RetrievalQA`` chain wired to the store's retriever and the
        hub-hosted LLM.

    Raises:
        ValueError: If HUGGINGFACEHUB_API_TOKEN is not set in the environment.
    """
    # Fail fast with a clear message instead of letting the hub client error out.
    token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
    if token is None:
        raise ValueError("HuggingFace Hub API token is missing! Please set the 'HUGGINGFACEHUB_API_TOKEN' in your .env file.")
    hub_llm = HuggingFaceHub(
        repo_id=llm_model,                  # model repository on the Hub
        huggingfacehub_api_token=token,
        task="text-generation",             # language-model generation task
    )
    return RetrievalQA.from_chain_type(hub_llm, retriever=vector_store.as_retriever())
def process_pdf_and_answer(pdf_path):
    """Run the full RAG pipeline on a PDF and return the model's answer.

    Args:
        pdf_path: Filesystem path to the PDF document.

    Returns:
        The LLM's response to a fixed key-information-extraction prompt.
    """
    # Pipeline: PDF text -> chunked FAISS index -> retrieval QA chain.
    vector_store = create_vector_store(extract_text_from_pdf(pdf_path))
    qa_chain = create_qa_pipeline(vector_store)
    # A fixed prompt is used rather than interactive questioning.
    return qa_chain.run("Extract key information from the PDF.")
if __name__ == "__main__":
    import argparse

    # CLI entry point: one required --pdf argument naming the input document.
    cli = argparse.ArgumentParser(description="RAG Pipeline for PDF analysis")
    cli.add_argument("--pdf", type=str, required=True, help="Path to the PDF file")
    parsed = cli.parse_args()

    result = process_pdf_and_answer(parsed.pdf)
    print(f"Answer: {result}")
|