File size: 5,305 Bytes
1f49ee0
 
 
 
 
 
 
 
 
 
 
2edf2fb
 
 
 
 
 
 
 
 
 
 
 
 
1f49ee0
 
 
 
 
 
 
 
 
 
 
 
5e4b78a
1f49ee0
 
 
 
004e22c
 
1f49ee0
 
 
 
 
004e22c
1f49ee0
004e22c
 
1f49ee0
 
 
 
 
004e22c
1f49ee0
 
 
 
 
 
5e4b78a
1f49ee0
5e4b78a
 
1f49ee0
5e4b78a
 
 
1f49ee0
 
5e4b78a
 
1f49ee0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2edf2fb
 
5e4b78a
2edf2fb
5e4b78a
 
2edf2fb
 
 
 
 
5e4b78a
2edf2fb
 
5e4b78a
2edf2fb
 
 
5e4b78a
2edf2fb
 
5e4b78a
2edf2fb
5e4b78a
 
2edf2fb
 
 
 
 
 
 
 
 
5e4b78a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
import os
from dotenv import load_dotenv

# Load environment variables from a local .env file before any downstream
# module (e.g. the default LLM in `defaults`) reads them.
load_dotenv()
# May be None if the .env file is missing or the key is unset.
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

import chainlit as cl
import pymupdf
import tiktoken
from langchain_core.documents.base import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
import getVectorstore
# NOTE(review): this re-binds the name `getVectorstore` from the module to
# the function of the same name — the module object is shadowed from here on.
from getVectorstore import getVectorstore
from qdrant_client.http import models as rest
from langchain.prompts import ChatPromptTemplate
import prompts
# NOTE(review): imported but the code below accesses prompts.rag_prompt_template
# via the module instead; kept for compatibility.
from prompts import rag_prompt_template
from defaults import default_llm
from operator import itemgetter
from langchain.schema.output_parser import StrOutputParser






def _find_page_bounds(doc):
    """Return a 1-based inclusive (start_page, end_page) chosen from the TOC.

    Starts just after the "List of Figures" entry (default: page 1 when no
    such entry exists) and ends at the last "References"/"Bibliography"
    entry (default: last page of the document). When a title appears more
    than once in the TOC, the last occurrence wins — matching the original
    no-break loops.
    """
    toc = doc.get_toc()

    # Front matter ends at the List of Figures; skip everything up to it.
    start_page = 1
    for _, title, page in toc:
        if title == "List of Figures":
            print(f"{title} on page {page}")
            start_page = page + 1

    # Body ends where the references/bibliography section begins.
    end_page = len(doc)
    for _, title, page in toc:
        if ("References" in title) or ("Bibliography" in title):
            print(f"{title} on page {page}")
            end_page = page

    return start_page, end_page


def _extract_text(doc, start_page, end_page, rect):
    """Concatenate text from the first three pages plus the body page range.

    Pages 0-2 (title/front pages) are captured in full; body pages
    (``start_page``..``end_page``, 1-based) are clipped to ``rect`` to drop
    headers and footers. A page is never extracted twice: the title pages
    take precedence over the body range.
    """
    body_pages = range(start_page - 1, end_page)  # convert 1-based to 0-based
    parts = []
    for page in doc.pages():
        if page.number in (0, 1, 2):
            parts.append(page.get_text())
        elif page.number in body_pages:
            parts.append(page.get_text(clip=rect))
    # join once instead of repeated += (avoids quadratic string building)
    return "".join(parts)


@cl.on_chat_start
async def on_chat_start():
    """Chainlit entry point: ingest an uploaded protocol PDF and answer
    heading questions over it via a RAG chain.

    Workflow:
      1. Ask the user for a PDF (up to 50 MB, 180 s timeout).
      2. Choose the page range to extract from the PDF's table of contents.
      3. Extract text, clipping out headers and footers.
      4. Chunk the text, store it in a Qdrant vector store, and build a
         retriever filtered to this document's title.
      5. Query the chain for title / PI / funding agency and render an
         ICF-style heading message.
    """
    files = await cl.AskFileMessage(
        content="Upload a file to proceed",
        accept=["application/pdf"],
        max_size_mb=50,
        timeout=180,
    ).send()

    # AskFileMessage resolves to None on timeout; bail out instead of
    # crashing on files[0].
    if not files:
        return

    file = files[0]
    print(f"filename is {file.name}")

    doc = pymupdf.Document(file.path)
    start_page, end_page = _find_page_bounds(doc)
    print(f"Extraction should start on page {start_page} and end on page {end_page}")

    # Clipping rectangle that excludes headers and footers.
    # NOTE(review): hard-coded for a US-letter page width (612 pt) — confirm
    # before relying on it for other page sizes.
    rect = pymupdf.Rect(0.0, 100.0, 612.0, 650.0)

    extracted_text = _extract_text(doc, start_page, end_page, rect)

    # Fix: the inclusive body range start_page..end_page spans
    # end_page - start_page + 1 pages (original message undercounted by one).
    msg = cl.Message(
        content=f"""Processing selected file: `{file.name}`...
        Extraction beginning on page {start_page} and ending on page {end_page}.
        Using a clipping rectangle to exclude headers and footers ({rect}).
        Processed {end_page - start_page + 1} pages of PDF document.
        Length of extracted text string is {len(extracted_text)}
        """
    )
    await msg.send()

    chunk_size = 2000
    chunk_overlap = 200

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )

    text_chunks = text_splitter.split_text(extracted_text)
    document = [Document(page_content=chunk) for chunk in text_chunks]

    msg = cl.Message(
        content=f"""Splitting the text with a recursive character splitter.
        Set chunk size at {chunk_size} and overlap at {chunk_overlap}.
        Number of resulting chunks: {len(text_chunks)}.
        Document created from chunks to get stored in vector database.
        Length of the document: {len(document)} (should be same as number of chunks).
        """
    )
    await msg.send()

    qdrant_vectorstore = getVectorstore(document, file.name)

    # Retriever restricted to chunks whose metadata ties them to this upload.
    # (The original also built an unfiltered k=15 retriever first and
    # immediately overwrote it — that dead assignment is removed.)
    protocol_retriever = qdrant_vectorstore.as_retriever(
        search_kwargs={
            'filter': rest.Filter(
                must=[
                    rest.FieldCondition(
                        key="metadata.document_title",
                        match=rest.MatchAny(any=[file.name]),
                    )
                ]
            ),
            'k': 15,
        }
    )

    # Create prompt and assemble the RAG chain:
    # question -> retrieve context -> prompt -> LLM -> plain string.
    rag_prompt = ChatPromptTemplate.from_template(prompts.rag_prompt_template)
    llm = default_llm

    rag_chain = (
        {"context": itemgetter("question") | protocol_retriever, "question": itemgetter("question")}
        | rag_prompt | llm | StrOutputParser()
    )

    from datetime import date

    # Heading fields for the top of the ICF document, answered by the chain.
    protocol_title = rag_chain.invoke({"question": "What is the exact title of this protocol?  Only return the title itself without any other description."})
    principal_investigator = rag_chain.invoke({"question":"What is the name of the principal investigator of the study?  Only return the name itself without any other description."})
    support = rag_chain.invoke({"question":"What agency is funding the study?  Only return the name of the agency without any other description."})
    version_date = date.today().strftime("%B %d, %Y")

    msg = cl.Message(
        content=f""" 
        **Study Title:** {protocol_title}
        **Principal Investigator:** {principal_investigator}
        **Version Date:** {version_date}
        **Source of Support:** {support}
        ---
        """
    )
    await msg.send()