import os
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFaceHub
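
# Dependencies implied by the imports above (an illustrative, unpinned list):
#     pip install python-dotenv PyPDF2 langchain faiss-cpu sentence-transformers huggingface_hub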

# Load environment variables from .env file
load_dotenv()
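# The .env file is expected to define the Hugging Face token, e.g.
# (placeholder value):
#     HUGGINGFACEHUB_API_TOKEN=hf_xxxxxxxxxxxx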

def extract_text_from_pdf(pdf_file):
    """Extracts text from a PDF file."""
    reader = PdfReader(pdf_file)
    text = ""
    for page in reader.pages:
        text += page.extract_text() or ""
    return text
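
# Example usage (the file name is hypothetical):
#     raw_text = extract_text_from_pdf("report.pdf")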

def create_vector_store(text, embeddings_model="sentence-transformers/all-MiniLM-L6-v2"):
    """Creates a FAISS vector store from the input text."""
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    texts = text_splitter.split_text(text)
    embeddings = HuggingFaceEmbeddings(model_name=embeddings_model)
    return FAISS.from_texts(texts, embeddings)
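
# Example usage -- retrieve the chunks most similar to a query
# (the query text is illustrative):
#     store = create_vector_store(raw_text)
#     hits = store.similarity_search("payment terms", k=4)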

def create_qa_pipeline(vector_store, llm_model="EleutherAI/gpt-neo-2.7B"):
    """Creates a Retrieval-based Question-Answering pipeline."""
    
    # Get the Hugging Face API token from the environment variable
    huggingfacehub_api_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
    
    if huggingfacehub_api_token is None:
        raise ValueError("HuggingFace Hub API token is missing! Please set the 'HUGGINGFACEHUB_API_TOKEN' in your .env file.")
    
    retriever = vector_store.as_retriever()

    # Initialize Hugging Face LLM with the API token
    llm = HuggingFaceHub(
        repo_id=llm_model,  # e.g. "EleutherAI/gpt-neo-2.7B"
        huggingfacehub_api_token=huggingfacehub_api_token,
        task="text-generation",  # task type for causal language models
    )
    
    return RetrievalQA.from_chain_type(llm, retriever=retriever)
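
# Example usage (requires HUGGINGFACEHUB_API_TOKEN to be set;
# the question is illustrative):
#     qa = create_qa_pipeline(store)
#     print(qa.run("What is the document about?"))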

def process_pdf_and_answer(pdf_path):
    """Runs the full pipeline: extracts text, builds a vector store, and queries it."""
    
    # Extract text from the PDF
    text = extract_text_from_pdf(pdf_path)

    # Create a FAISS vector store
    vector_store = create_vector_store(text)

    # Create a QA pipeline
    qa_pipeline = create_qa_pipeline(vector_store)

    # Query the QA chain with a generic extraction prompt; adjust the
    # prompt to target a summary or specific fields instead.
    answer = qa_pipeline.run("Extract key information from the PDF.")
    return answer

if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description="RAG Pipeline for PDF analysis")
    parser.add_argument("--pdf", type=str, required=True, help="Path to the PDF file")
    args = parser.parse_args()

    pdf_path = args.pdf

    # Process the PDF and get results
    answer = process_pdf_and_answer(pdf_path)
    print(f"Answer: {answer}")