# legal_document_summarization/rag_pipeline.py
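"""RAG pipeline for legal document summarization.

Extracts text from a PDF, indexes it in a FAISS vector store using
sentence-transformer embeddings, and runs a retrieval-based QA chain
backed by a Hugging Face Hub model to pull out key information.
"""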
import os

from dotenv import load_dotenv
from PyPDF2 import PdfReader

# NOTE: On LangChain >= 0.1 these classes have moved to langchain_community
# (e.g. langchain_community.vectorstores.FAISS); the legacy paths below
# still work on older releases, which this script assumes.
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFaceHub
# Load environment variables from .env file
load_dotenv()
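
# The .env file is expected to define the Hugging Face token read in
# create_qa_pipeline(), e.g. (placeholder value):
#   HUGGINGFACEHUB_API_TOKEN=hf_xxxxxxxxxxxxxxxx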


def extract_text_from_pdf(pdf_file):
    """Extracts text from a PDF file."""
    reader = PdfReader(pdf_file)
    text = ""
    for page in reader.pages:
        # extract_text() can return None for image-only pages; fall back to "".
        text += page.extract_text() or ""
    return text


def create_vector_store(text, embeddings_model="sentence-transformers/all-MiniLM-L6-v2"):
    """Creates a FAISS vector store from the input text."""
    # Split into ~1000-character chunks with a 200-character overlap so that
    # context is preserved across chunk boundaries.
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    texts = text_splitter.split_text(text)
    embeddings = HuggingFaceEmbeddings(model_name=embeddings_model)
    return FAISS.from_texts(texts, embeddings)


def create_qa_pipeline(vector_store, llm_model="EleutherAI/gpt-neo-2.7B"):
    """Creates a retrieval-based question-answering pipeline."""
    # Read the Hugging Face API token from the environment.
    huggingfacehub_api_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
    if huggingfacehub_api_token is None:
        raise ValueError(
            "HuggingFace Hub API token is missing! Please set "
            "'HUGGINGFACEHUB_API_TOKEN' in your .env file."
        )
    retriever = vector_store.as_retriever()
    # Initialize the Hugging Face Hub LLM with the API token.
    llm = HuggingFaceHub(
        repo_id=llm_model,
        huggingfacehub_api_token=huggingfacehub_api_token,
        task="text-generation",  # gpt-neo-2.7B is a causal language model
    )
    # Uses the default "stuff" chain type: retrieved chunks are stuffed
    # into a single prompt for the LLM.
    return RetrievalQA.from_chain_type(llm, retriever=retriever)


def process_pdf_and_answer(pdf_path):
    """Processes a PDF and returns key information extracted from it."""
    # Extract text from the PDF.
    text = extract_text_from_pdf(pdf_path)
    # Index the text in a FAISS vector store.
    vector_store = create_vector_store(text)
    # Build the retrieval QA pipeline.
    qa_pipeline = create_qa_pipeline(vector_store)
    # Rather than taking a user question, run a fixed prompt that asks the
    # model to surface the document's key information.
    answer = qa_pipeline.run("Extract key information from the PDF.")
    return answer


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="RAG Pipeline for PDF analysis")
    parser.add_argument("--pdf", type=str, required=True, help="Path to the PDF file")
    args = parser.parse_args()

    # Process the PDF and print the result.
    answer = process_pdf_and_answer(args.pdf)
    print(f"Answer: {answer}")