import os

from dotenv import load_dotenv

load_dotenv()
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

from datetime import date
from operator import itemgetter

import chainlit as cl
import pymupdf
import tiktoken
from langchain_core.documents.base import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
from qdrant_client.http import models as rest

import prompts
from defaults import default_llm
from getVectorstore import getVectorstore
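
# Optional token-based length function for the text splitter below. Not wired in
# by default (see the commented-out length_function argument); shown here as a
# sketch using tiktoken's cl100k_base encoding.
def tiktoken_len(text: str) -> int:
    return len(tiktoken.get_encoding("cl100k_base").encode(text))
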
@cl.on_chat_start
async def on_chat_start():
    files = await cl.AskFileMessage(
        content="Upload a file to proceed",
        accept=["application/pdf"],
        max_size_mb=50,
        timeout=180,
    ).send()
    file = files[0]
    print(f"filename is {file.name}")

    doc = pymupdf.Document(file.path)
    toc = doc.get_toc()
    # Find the "List of Figures" entry: it marks the last front-matter page to skip.
    # Default start page is 1 if no better starting location is found.
    start_page = 1
    for _, title, page in toc:
        if title == "List of Figures":
            print(f"{title} on page {page}")
            start_page = page + 1
    # Find the last page to include; default is the last page of the document.
    end_page = len(doc)
    for _, title, page in toc:
        if ("References" in title) or ("Bibliography" in title):
            print(f"{title} on page {page}")
            end_page = page
    print(f"Extraction should start on page {start_page} and end on page {end_page}")
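
    # Note: page.number in pymupdf is 0-based, while get_toc() reports 1-based
    # page numbers, hence the start_page - 1 offset in the extraction loop below.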
    # Clipping rectangle that excludes headers and footers
    # (x0, y0, x1, y1 in points; 612 pt is the full width of a US Letter page).
    rect = pymupdf.Rect(0.0, 100.0, 612.0, 650.0)
    # Capture the first three pages (title material) in full, then the body pages
    # clipped to the rectangle.
    extracted_text = ""
    for page in doc.pages():
        if page.number in [0, 1, 2]:
            extracted_text += page.get_text()
        elif page.number in range(start_page - 1, end_page):
            extracted_text += page.get_text(clip=rect)

    msg = cl.Message(
        content=f"""Processing selected file: `{file.name}`...
Extraction beginning on page {start_page} and ending on page {end_page}.
Using a clipping rectangle to exclude headers and footers ({rect}).
Processed {end_page - start_page + 1} body pages of the PDF document.
Length of extracted text string is {len(extracted_text)}.
"""
    )
    await msg.send()

    chunk_size = 2000
    chunk_overlap = 200
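    # RecursiveCharacterTextSplitter tries the larger separators first (blank
    # lines, then newlines, then spaces) before falling back to single characters;
    # the overlap keeps shared context between neighboring chunks.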
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        # length_function=tiktoken_len,
    )
    text_chunks = text_splitter.split_text(extracted_text)
    documents = [Document(page_content=chunk) for chunk in text_chunks]

    msg = cl.Message(
        content=f"""Splitting the text with a recursive character splitter.
Set chunk size at {chunk_size} and overlap at {chunk_overlap}.
Number of resulting chunks: {len(text_chunks)}.
Documents created from the chunks will be stored in the vector database.
Number of documents: {len(documents)} (should be the same as the number of chunks).
"""
    )
    await msg.send()
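
    # Embed the chunks into a Qdrant vector store (via the local getVectorstore
    # helper) and build a retriever restricted to chunks tagged with this
    # uploaded file's title, returning the top 15 matches per query.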
    qdrant_vectorstore = getVectorstore(documents, file.name)
    protocol_retriever = qdrant_vectorstore.as_retriever(
        search_kwargs={
            "filter": rest.Filter(
                must=[
                    rest.FieldCondition(
                        key="metadata.document_title",
                        match=rest.MatchAny(any=[file.name]),
                    )
                ]
            ),
            "k": 15,
        }
    )
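
    # Build the RAG chain: the question is routed to the retriever to gather
    # context, both are filled into the prompt, and the LLM output is parsed
    # into a plain string.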
    rag_prompt = ChatPromptTemplate.from_template(prompts.rag_prompt_template)
    llm = default_llm
    rag_chain = (
        {"context": itemgetter("question") | protocol_retriever, "question": itemgetter("question")}
        | rag_prompt
        | llm
        | StrOutputParser()
    )

    # Heading fields for the top of the ICF document, pulled from the protocol.
    protocol_title = rag_chain.invoke(
        {"question": "What is the exact title of this protocol? Only return the title itself without any other description."}
    )
    principal_investigator = rag_chain.invoke(
        {"question": "What is the name of the principal investigator of the study? Only return the name itself without any other description."}
    )
    support = rag_chain.invoke(
        {"question": "What agency is funding the study? Only return the name of the agency without any other description."}
    )
    version_date = date.today().strftime("%B %d, %Y")

    msg = cl.Message(
        content=f"""
**Study Title:** {protocol_title}
**Principal Investigator:** {principal_investigator}
**Version Date:** {version_date}
**Source of Support:** {support}
---
"""
    )
    await msg.send()