import os
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFaceHub
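
# Dependencies implied by the imports above (an illustrative, unpinned list):
#     pip install python-dotenv PyPDF2 langchain faiss-cpu sentence-transformers huggingface_hub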

# Load environment variables from .env file
load_dotenv()
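# The .env file is expected to define the Hugging Face token, e.g.
# (placeholder value):
#     HUGGINGFACEHUB_API_TOKEN=hf_xxxxxxxxxxxx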

def extract_text_from_pdf(pdf_file):
    """Extracts text from a PDF file."""
    reader = PdfReader(pdf_file)
    text = ""
    for page in reader.pages:
        text += page.extract_text() or ""
    return text
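
# Example usage (the file name is hypothetical):
#     raw_text = extract_text_from_pdf("report.pdf")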

def create_vector_store(text, embeddings_model="sentence-transformers/all-MiniLM-L6-v2"):
    """Creates a FAISS vector store from the input text."""
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    texts = text_splitter.split_text(text)
    embeddings = HuggingFaceEmbeddings(model_name=embeddings_model)
    return FAISS.from_texts(texts, embeddings)
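
# Example usage -- retrieve the chunks most similar to a query
# (the query text is illustrative):
#     store = create_vector_store(raw_text)
#     hits = store.similarity_search("payment terms", k=4)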

def create_qa_pipeline(vector_store, llm_model="EleutherAI/gpt-neo-2.7B"):
    """Creates a Retrieval-based Question-Answering pipeline."""
    
    # Get the Hugging Face API token from the environment variable
    huggingfacehub_api_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
    
    if huggingfacehub_api_token is None:
        raise ValueError("HuggingFace Hub API token is missing! Please set the 'HUGGINGFACEHUB_API_TOKEN' in your .env file.")
    
    retriever = vector_store.as_retriever()

    # Initialize Hugging Face LLM with the API token
    llm = HuggingFaceHub(
        repo_id=llm_model,  # e.g. "EleutherAI/gpt-neo-2.7B"
        huggingfacehub_api_token=huggingfacehub_api_token,
        task="text-generation",  # task type for causal language models
    )
    
    return RetrievalQA.from_chain_type(llm, retriever=retriever)
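
# Example usage (requires HUGGINGFACEHUB_API_TOKEN to be set;
# the question is illustrative):
#     qa = create_qa_pipeline(store)
#     print(qa.run("What is the document about?"))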

def process_pdf_and_answer(pdf_path):
    """Runs the full pipeline: extracts text, builds a vector store, and queries it."""
    
    # Extract text from the PDF
    text = extract_text_from_pdf(pdf_path)

    # Create a FAISS vector store
    vector_store = create_vector_store(text)

    # Create a QA pipeline
    qa_pipeline = create_qa_pipeline(vector_store)

    # Query the QA chain with a generic extraction prompt; adjust the
    # prompt to target a summary or specific fields instead.
    answer = qa_pipeline.run("Extract key information from the PDF.")
    return answer

if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description="RAG Pipeline for PDF analysis")
    parser.add_argument("--pdf", type=str, required=True, help="Path to the PDF file")
    args = parser.parse_args()

    pdf_path = args.pdf

    # Process the PDF and get results
    answer = process_pdf_and_answer(pdf_path)
    print(f"Answer: {answer}")