import os

from dotenv import load_dotenv

load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

import chainlit as cl
import pymupdf
import tiktoken
from langchain_core.documents.base import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from getVectorstore import getVectorstore
from qdrant_client.http import models as rest
from langchain.prompts import ChatPromptTemplate
import prompts
from defaults import default_llm
from operator import itemgetter
from langchain.schema.output_parser import StrOutputParser
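
# The text splitter below carries a commented-out token-based length_function.
# A minimal sketch of that helper, assuming the cl100k_base encoding (the
# exact encoding is an assumption, not stated in the original):
def tiktoken_len(text: str) -> int:
    # Measure chunk length in tokens rather than characters, so chunk sizes
    # track model context limits instead of raw string length.
    return len(tiktoken.get_encoding("cl100k_base").encode(text))
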
@cl.on_chat_start
async def on_chat_start():
    files = await cl.AskFileMessage(
        content="Upload a file to proceed",
        accept=["application/pdf"],
        max_size_mb=50,
        timeout=180,
    ).send()
    # AskFileMessage returns None if the user never responds.
    if not files:
        return
    file = files[0]
    doc = pymupdf.Document(file.path)
    toc = doc.get_toc()
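    # get_toc() returns one [level, title, page] entry per outline item,
    # with 1-based page numbers, e.g. [1, "List of Figures", 9].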
    # Find the "List of Figures" entry: that is the last front-matter page
    # to skip. Default to page 1 if no better start location is found.
    start_page = 1
    for _, title, page in toc:
        if title == "List of Figures":
            print(f"{title} on page {page}")
            start_page = page + 1

    # Find the last page to include; default is the last page of the document.
    end_page = len(doc)
    for _, title, page in toc:
        if ("References" in title) or ("Bibliography" in title):
            print(f"{title} on page {page}")
            end_page = page

    print(f"Extraction should start on page {start_page} and end on page {end_page}")
    # Clip rectangle to exclude headers and footers.
    rect = pymupdf.Rect(0.0, 100.0, 612.0, 650.0)
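    # PyMuPDF coordinates are in points; a US-Letter page is 612 x 792 pt.
    # Keeping x in [0, 612] and y in [100, 650] drops roughly the top 100 pt
    # (header) and bottom 142 pt (footer) of each page.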
    # Build the final text from the selected page range.
    extracted_text = ""
    for page in doc.pages():
        # page.number is 0-based, while the TOC page numbers are 1-based.
        if page.number in range(start_page - 1, end_page):
            extracted_text += page.get_text(clip=rect)
    msg = cl.Message(
        content=f"""Processing selected file: `{file.name}`...
Extraction beginning on page {start_page} and ending on page {end_page}.
Using a clipping rectangle to exclude headers and footers ({rect}).
Processed {end_page - start_page + 1} pages of the PDF document.
Length of extracted text string: {len(extracted_text)} characters.
"""
    )
    await msg.send()
    chunk_size = 2000
    chunk_overlap = 200
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        # length_function=tiktoken_len,  # optional: measure length in tokens
    )
    text_chunks = text_splitter.split_text(extracted_text)
    documents = [Document(page_content=chunk) for chunk in text_chunks]
    msg = cl.Message(
        content=f"""Splitting the text with a recursive character splitter.
Chunk size set to {chunk_size} and overlap to {chunk_overlap}.
Number of resulting chunks: {len(text_chunks)}.
Documents created from the chunks for storage in the vector database.
Number of documents: {len(documents)} (should match the number of chunks).
"""
    )
    await msg.send()
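    # getVectorstore is a local helper module (not shown in this section).
    # A plausible sketch, assuming OpenAI embeddings, an in-memory Qdrant
    # collection, and that it stamps each document's metadata with the
    # "document_title" key the retriever filter below relies on (all of
    # these are assumptions):
    #
    #     from langchain_community.vectorstores import Qdrant
    #     from langchain_openai import OpenAIEmbeddings
    #
    #     def getVectorstore(documents, path):
    #         for d in documents:
    #             d.metadata["document_title"] = os.path.basename(path)
    #         return Qdrant.from_documents(
    #             documents,
    #             OpenAIEmbeddings(),
    #             location=":memory:",
    #             collection_name="documents",
    #         )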
    qdrant_vectorstore = getVectorstore(documents, file.path)
    # Restrict retrieval to chunks from these (hardcoded) document titles.
    document_titles = ["protocol.pdf", "consent.pdf"]
    protocol_retriever = qdrant_vectorstore.as_retriever(
        search_kwargs={
            "filter": rest.Filter(
                must=[
                    rest.FieldCondition(
                        key="metadata.document_title",
                        match=rest.MatchAny(any=document_titles),
                    )
                ]
            ),
            "k": 15,
        }
    )
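    # The MatchAny filter returns only points whose metadata.document_title
    # is one of the listed titles; k=15 raises the retrieved-chunk count
    # above LangChain's default of 4.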
    # Create the prompt.
    rag_prompt = ChatPromptTemplate.from_template(prompts.rag_prompt_template)
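    # rag_prompt_template lives in the local prompts module (not shown).
    # A minimal sketch, assuming plain {context}/{question} placeholders to
    # match the chain's input keys below:
    #
    #     rag_prompt_template = """Answer the question using only the
    #     provided context. If the answer is not in the context, say so.
    #
    #     Context: {context}
    #     Question: {question}"""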
    llm = default_llm
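    # default_llm comes from the local defaults module (not shown); a typical
    # definition would be something like ChatOpenAI(model="gpt-4o-mini",
    # temperature=0), which is an assumption here.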
    rag_chain = (
        {
            "context": itemgetter("question") | protocol_retriever,
            "question": itemgetter("question"),
        }
        | rag_prompt
        | llm
        | StrOutputParser()
    )