Spaces:

Mdean77
/

Informed_Consent

Runtime error

App Files Files Community

Informed_Consent / app.py

Mdean77

my files

1f49ee0 10 months ago

raw

history blame

2.89 kB

	import os
	from dotenv import load_dotenv

	# Load variables from a .env file into the process environment, then read
	# the OpenAI key from it (None when the variable is not set).
	load_dotenv()
	OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

	import chainlit as cl
	import pymupdf
	import tiktoken
	from langchain_core.documents.base import Document
	from langchain.text_splitter import RecursiveCharacterTextSplitter

	# def tiktoken_len(text):
	# tokens = tiktoken.encoding_for_model("gpt-4o").encode(
	# text,
	# )
	# return len(tokens)

	@cl.on_chat_start
	async def on_chat_start():
	    """Ask the user for a PDF, extract its body text, and split it into chunks.

	    Steps:
	      1. Prompt for a single PDF upload.
	      2. Use the PDF's table of contents to choose the page range: start on
	         the page after the "List of Figures" entry and stop on the page of
	         the "References"/"Bibliography" entry. TOC page numbers are treated
	         as 1-based. When an entry is missing, fall back to the first / last
	         page of the document instead of failing.
	      3. Extract text from each page in range, clipped to a rectangle that
	         is meant to leave out headers and footers.
	      4. Split the text with a recursive character splitter and wrap each
	         chunk in a ``Document``.

	    Progress is reported to the user through two chat messages.
	    """
	    files = await cl.AskFileMessage(
	        content="Upload a file to proceed",
	        accept=["application/pdf"],
	        max_size_mb=50,
	        timeout=180,
	    ).send()

	    # The prompt can come back empty (e.g. the user let it time out); there
	    # is nothing to process in that case.
	    if not files:
	        await cl.Message(
	            content="No file was uploaded. Restart the chat to try again."
	        ).send()
	        return

	    file = files[0]

	    # need a rect that will exclude headers and footers
	    rect = pymupdf.Rect(0.0, 100.0, 612.0, 650.0)

	    # Open the PDF as a context manager so it is closed once the text is out.
	    with pymupdf.Document(file.path) as doc:
	        # Defaults cover PDFs whose TOC lacks the marker entries.
	        start_page = 1
	        end_page = doc.page_count

	        for _, title, page in doc.get_toc():
	            # The List Of Figures page is the last page to skip.
	            if title == "List of Figures":
	                print(f"{title} on page {page}")
	                start_page = page + 1
	            # The references page is the last page to include.
	            if ("References" in title) or ("Bibliography" in title):
	                print(f"{title} on page {page}")
	                end_page = page

	        print(f"Extraction should start on page {start_page} and end on page {end_page}")

	        # Convert the 1-based inclusive range to 0-based indices and clamp it
	        # to the pages that actually exist.
	        first_index = max(start_page - 1, 0)
	        last_index = min(end_page, doc.page_count)
	        page_texts = [
	            doc.load_page(index).get_text(clip=rect)
	            for index in range(first_index, last_index)
	        ]

	    # create the final text
	    extracted_text = "".join(page_texts)
	    # Report the number of pages really read (the range is inclusive).
	    pages_processed = len(page_texts)

	    msg = cl.Message(
	        content=f"""Processing selected file: `{file.name}`...
	Extraction beginning on page {start_page} and ending on page {end_page}.
	Using a clipping rectangle to exclude headers and footers ({rect}).
	Processed {pages_processed} pages of PDF document.
	Length of extracted text string is {len(extracted_text)}
	"""
	    )
	    await msg.send()

	    chunk_size = 2000
	    chunk_overlap = 200

	    # Sizes are measured in characters (the splitter's default length function).
	    text_splitter = RecursiveCharacterTextSplitter(
	        chunk_size=chunk_size,
	        chunk_overlap=chunk_overlap,
	    )

	    text_chunks = text_splitter.split_text(extracted_text)
	    document = [Document(page_content=chunk) for chunk in text_chunks]

	    msg = cl.Message(
	        content=f"""Splitting the text with a recursive character splitter.
	Set chunk size at {chunk_size} and overlap at {chunk_overlap}.
	Number of resulting chunks: {len(text_chunks)}.
	Document created from chunks to get stored in vector database.
	Length of the document: {len(document)} (should be same as number of chunks).
	"""
	    )

	    await msg.send()