abhinavyadav11's picture
Upload 2 files
4fe3a6c verified
raw
history blame
2.29 kB
import re
from io import BytesIO
from typing import Tuple, List
import pickle
from langchain.docstore.document import Document
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.faiss import FAISS
from pypdf import PdfReader
import faiss
def parse_pdf(file: BytesIO, filename: str) -> Tuple[List[str], str]:
    """Extract and normalise the text of every page in a PDF.

    Args:
        file: Binary stream handed to ``PdfReader``.
        filename: Name associated with the PDF; returned unchanged.

    Returns:
        A ``(pages, filename)`` tuple where ``pages`` holds one cleaned
        string per page, in the order ``PdfReader`` yields them.
    """

    def _tidy(raw: str) -> str:
        # Re-join words that were hyphenated across a line break.
        joined = re.sub(r"(\w+)-\n(\w+)", r"\1\2", raw)
        # Turn lone newlines into spaces, leaving paragraph gaps alone.
        flowed = re.sub(r"(?<!\n\s)\n(?!\s\n)", " ", joined.strip())
        # Collapse newline/whitespace/newline runs into one blank line.
        return re.sub(r"\n\s*\n", "\n\n", flowed)

    reader = PdfReader(file)
    pages = [_tidy(page.extract_text()) for page in reader.pages]
    return pages, filename
def text_to_docs(text: List[str], filename: str) -> List[Document]:
    """Split page texts into chunked ``Document`` objects with metadata.

    Args:
        text: Page texts in page order. A single string is treated as a
            one-page document.
        filename: Stored under the ``"filename"`` metadata key of every
            chunk.

    Returns:
        One ``Document`` per chunk. Each carries the metadata keys
        ``"page"`` (1-based page number), ``"chunk"`` (0-based index
        within the page), ``"source"`` (``"<page>-<chunk>"``) and
        ``"filename"``.
    """
    if isinstance(text, str):
        text = [text]

    # Built once: the configuration is identical for every page.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=4000,
        separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
        chunk_overlap=0,
    )

    doc_chunks = []
    for page_number, page_text in enumerate(text, start=1):
        chunks = text_splitter.split_text(page_text)
        for chunk_index, chunk in enumerate(chunks):
            # The page number comes from the enumeration itself, not from a
            # previously built Document, so no loop variable is shadowed.
            doc_chunks.append(
                Document(
                    page_content=chunk,
                    metadata={
                        "page": page_number,
                        "chunk": chunk_index,
                        "source": f"{page_number}-{chunk_index}",
                        "filename": filename,
                    },
                )
            )
    return doc_chunks
def docs_to_index(docs, huggingface_model_name):
    """Embed ``docs`` with a Hugging Face model and build a FAISS index.

    Args:
        docs: Documents passed to ``FAISS.from_documents``.
        huggingface_model_name: Name of the embedding model to load. When
            empty or ``None``, ``"all-MiniLM-L6-v2"`` is used.

    Returns:
        The vector store returned by ``FAISS.from_documents``.
    """
    # Honour the caller's choice of model; previously this argument was
    # ignored and the model name was hard-coded.
    model_name = huggingface_model_name or "all-MiniLM-L6-v2"
    embedding_model = HuggingFaceEmbeddings(model_name=model_name)
    return FAISS.from_documents(docs, embedding_model)
def get_index_for_pdf(pdf_files, pdf_names, huggingface_model_name):
    """Build one index covering every supplied PDF.

    Args:
        pdf_files: Raw PDF contents; each item is wrapped in ``BytesIO``
            before parsing.
        pdf_names: Names paired positionally with ``pdf_files``.
        huggingface_model_name: Forwarded to ``docs_to_index``.

    Returns:
        Whatever ``docs_to_index`` returns for the combined documents.
    """
    all_docs = []
    for raw_pdf, name in zip(pdf_files, pdf_names):
        pages, source_name = parse_pdf(BytesIO(raw_pdf), name)
        all_docs.extend(text_to_docs(pages, source_name))
    return docs_to_index(all_docs, huggingface_model_name)