Spaces:
Runtime error
Runtime error
import os | |
from dotenv import load_dotenv | |
load_dotenv() | |
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY') | |
import chainlit as cl | |
import pymupdf | |
import tiktoken | |
from langchain_core.documents.base import Document | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
# def tiktoken_len(text): | |
# tokens = tiktoken.encoding_for_model("gpt-4o").encode( | |
# text, | |
# ) | |
# return len(tokens) | |
def _select_page_range(toc, page_count):
    """Pick the 1-based, inclusive page span of the document body.

    Args:
        toc: Table-of-contents entries as ``[level, title, page]`` triples.
        page_count: Total number of pages in the document.

    Returns:
        A ``(start_page, end_page)`` tuple. ``start_page`` is the page after
        the last "List of Figures" entry and ``end_page`` is the page of the
        last "References"/"Bibliography" entry. When a marker is missing the
        span falls back to the first / last page of the document instead of
        leaving the bound undefined.
    """
    start_page = 1
    end_page = page_count

    # The "List of Figures" page is the last front-matter page to skip.
    for _level, title, page in toc:
        if title == "List of Figures":
            print(f"{title} on page {page}")
            start_page = page + 1

    # The references/bibliography page is the last page to include.
    for _level, title, page in toc:
        if ("References" in title) or ("Bibliography" in title):
            print(f"{title} on page {page}")
            end_page = page

    return start_page, end_page


def _extract_text(doc, page_indexes, clip):
    """Concatenate the text found inside ``clip`` on each 0-based page index."""
    return "".join(
        doc.load_page(index).get_text(clip=clip) for index in page_indexes
    )


# NOTE(review): no `@cl.on_chat_start` decorator is visible on this function
# in this view; confirm the handler is registered with Chainlit.
async def on_chat_start():
    """Ask the user for a PDF, extract its body text and split it into chunks.

    Steps:
        1. Prompt for a PDF upload; stop with a message if nothing arrives.
        2. Use the PDF's table of contents to select the pages between the
           "List of Figures" and the "References"/"Bibliography" entries.
        3. Extract the text of those pages inside a clipping rectangle.
        4. Split the text with a recursive character splitter and wrap each
           chunk in a ``Document``.

    Progress is reported to the user through two chat messages.
    """
    files = await cl.AskFileMessage(
        content="Upload a file to proceed",
        accept=["application/pdf"],
        max_size_mb=50,
        timeout=180,
    ).send()

    # Guard against a missing upload (e.g. the prompt timing out) so we do
    # not index into ``None``.
    if not files:
        await cl.Message(
            content="No file was uploaded before the timeout. "
            "Restart the chat to try again."
        ).send()
        return

    file = files[0]

    # Clipping rectangle used to exclude headers and footers.
    # Presumably sized for US Letter pages (612 pt wide) -- TODO confirm.
    rect = pymupdf.Rect(0.0, 100.0, 612.0, 650.0)

    # The context manager closes the PDF once the text has been extracted.
    with pymupdf.Document(file.path) as doc:
        start_page, end_page = _select_page_range(doc.get_toc(), doc.page_count)
        print(
            f"Extraction should start on page {start_page} "
            f"and end on page {end_page}"
        )

        # TOC pages are 1-based while page indexes are 0-based; clamp the
        # span to the pages that actually exist.
        page_indexes = range(
            max(start_page - 1, 0),
            min(end_page, doc.page_count),
        )
        extracted_text = _extract_text(doc, page_indexes, rect)

    msg = cl.Message(
        content=(
            f"Processing selected file: `{file.name}`...\n"
            f"Extraction beginning on page {start_page} "
            f"and ending on page {end_page}.\n"
            f"Using a clipping rectangle to exclude headers and footers "
            f"({rect}).\n"
            # Report the number of pages really read; the inclusive span
            # is one more than ``end_page - start_page``.
            f"Processed {len(page_indexes)} pages of PDF document.\n"
            f"Length of extracted text string is {len(extracted_text)}\n"
        )
    )
    await msg.send()

    chunk_size = 2000
    chunk_overlap = 200
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    text_chunks = text_splitter.split_text(extracted_text)
    document = [Document(page_content=chunk) for chunk in text_chunks]

    msg = cl.Message(
        content=(
            "Splitting the text with a recursive character splitter.\n"
            f"Set chunk size at {chunk_size} and overlap at {chunk_overlap}.\n"
            f"Number of resulting chunks: {len(text_chunks)}.\n"
            "Document created from chunks to get stored in vector database.\n"
            f"Length of the document: {len(document)} "
            "(should be same as number of chunks).\n"
        )
    )
    await msg.send()