import re
from io import BytesIO
from typing import List, Tuple

from langchain.docstore.document import Document
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.faiss import FAISS
from pypdf import PdfReader


def parse_pdf(file: BytesIO, filename: str) -> Tuple[List[str], str]:
    """Extract and clean the text of each page of a PDF."""
    pdf = PdfReader(file)
    output = []
    for page in pdf.pages:
        text = page.extract_text()
        # Merge words hyphenated across line breaks
        text = re.sub(r"(\w+)-\n(\w+)", r"\1\2", text)
        # Replace single newlines inside sentences with spaces
        text = re.sub(r"(?<!\n\s)\n(?!\s\n)", " ", text.strip())
        # Collapse runs of blank lines into a single paragraph break
        text = re.sub(r"\n\s*\n", "\n\n", text)
        output.append(text)
    return output, filename


def text_to_docs(text: List[str], filename: str) -> List[Document]:
    """Convert a list of page texts into chunked Documents with metadata."""
    if isinstance(text, str):
        text = [text]
    page_docs = [Document(page_content=page) for page in text]
    for i, doc in enumerate(page_docs):
        doc.metadata["page"] = i + 1

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=4000,
        separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
        chunk_overlap=0,
    )
    doc_chunks = []
    for doc in page_docs:
        chunks = text_splitter.split_text(doc.page_content)
        for i, chunk in enumerate(chunks):
            chunk_doc = Document(
                page_content=chunk,
                metadata={"page": doc.metadata["page"], "chunk": i},
            )
            chunk_doc.metadata["source"] = (
                f"{chunk_doc.metadata['page']}-{chunk_doc.metadata['chunk']}"
            )
            chunk_doc.metadata["filename"] = filename  # Add filename to metadata
            doc_chunks.append(chunk_doc)
    return doc_chunks


def docs_to_index(docs: List[Document], huggingface_model_name: str) -> FAISS:
    """Embed the documents with a Hugging Face model and build a FAISS index."""
    embedding_model = HuggingFaceEmbeddings(model_name=huggingface_model_name)
    index = FAISS.from_documents(docs, embedding_model)
    return index


def get_index_for_pdf(
    pdf_files: List[bytes], pdf_names: List[str], huggingface_model_name: str
) -> FAISS:
    """Parse, chunk, and index a batch of PDFs into a single FAISS index."""
    documents = []
    for pdf_file, pdf_name in zip(pdf_files, pdf_names):
        text, filename = parse_pdf(BytesIO(pdf_file), pdf_name)
        documents.extend(text_to_docs(text, filename))
    return docs_to_index(documents, huggingface_model_name)
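

# ---------------------------------------------------------------------------
# Example usage: a minimal sketch, assuming a local file "sample.pdf" exists
# and sentence-transformers is installed so the embedding model can load.
# The model name "all-MiniLM-L6-v2" is an illustrative choice, not required
# by the module itself.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    with open("sample.pdf", "rb") as f:
        pdf_bytes = f.read()

    index = get_index_for_pdf(
        pdf_files=[pdf_bytes],
        pdf_names=["sample.pdf"],
        huggingface_model_name="all-MiniLM-L6-v2",
    )

    # Retrieve the chunks most similar to a query; each hit carries the
    # page-chunk "source" metadata set in text_to_docs.
    for doc in index.similarity_search("What is this document about?", k=3):
        print(doc.metadata["source"], doc.page_content[:80])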