# legal_document_summarization/rag_pipeline.py
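"""RAG pipeline for legal document summarization.

Extracts text from a PDF, indexes it in a FAISS vector store using
sentence-transformer embeddings, and runs a retrieval-based QA chain
backed by a Hugging Face Hub model to pull out key information.
"""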
import os

from dotenv import load_dotenv
from PyPDF2 import PdfReader

# NOTE: On LangChain >= 0.1 these classes have moved to langchain_community
# (e.g. langchain_community.vectorstores.FAISS); the legacy paths below
# still work on older releases, which this script assumes.
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFaceHub
# Load environment variables from .env file
load_dotenv()
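
# The .env file is expected to define the Hugging Face token read in
# create_qa_pipeline(), e.g. (placeholder value):
#   HUGGINGFACEHUB_API_TOKEN=hf_xxxxxxxxxxxxxxxx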


def extract_text_from_pdf(pdf_file):
    """Extracts text from a PDF file."""
    reader = PdfReader(pdf_file)
    text = ""
    for page in reader.pages:
        # extract_text() can return None for image-only pages; fall back to "".
        text += page.extract_text() or ""
    return text


def create_vector_store(text, embeddings_model="sentence-transformers/all-MiniLM-L6-v2"):
    """Creates a FAISS vector store from the input text."""
    # Split into ~1000-character chunks with a 200-character overlap so that
    # context is preserved across chunk boundaries.
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    texts = text_splitter.split_text(text)
    embeddings = HuggingFaceEmbeddings(model_name=embeddings_model)
    return FAISS.from_texts(texts, embeddings)


def create_qa_pipeline(vector_store, llm_model="EleutherAI/gpt-neo-2.7B"):
    """Creates a retrieval-based question-answering pipeline."""
    # Read the Hugging Face API token from the environment.
    huggingfacehub_api_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
    if huggingfacehub_api_token is None:
        raise ValueError(
            "HuggingFace Hub API token is missing! Please set "
            "'HUGGINGFACEHUB_API_TOKEN' in your .env file."
        )
    retriever = vector_store.as_retriever()
    # Initialize the Hugging Face Hub LLM with the API token.
    llm = HuggingFaceHub(
        repo_id=llm_model,
        huggingfacehub_api_token=huggingfacehub_api_token,
        task="text-generation",  # gpt-neo-2.7B is a causal language model
    )
    # Uses the default "stuff" chain type: retrieved chunks are stuffed
    # into a single prompt for the LLM.
    return RetrievalQA.from_chain_type(llm, retriever=retriever)


def process_pdf_and_answer(pdf_path):
    """Processes a PDF and returns key information extracted from it."""
    # Extract text from the PDF.
    text = extract_text_from_pdf(pdf_path)
    # Index the text in a FAISS vector store.
    vector_store = create_vector_store(text)
    # Build the retrieval QA pipeline.
    qa_pipeline = create_qa_pipeline(vector_store)
    # Rather than taking a user question, run a fixed prompt that asks the
    # model to surface the document's key information.
    answer = qa_pipeline.run("Extract key information from the PDF.")
    return answer


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="RAG Pipeline for PDF analysis")
    parser.add_argument("--pdf", type=str, required=True, help="Path to the PDF file")
    args = parser.parse_args()

    # Process the PDF and print the result.
    answer = process_pdf_and_answer(args.pdf)
    print(f"Answer: {answer}")