"""Core modules for the interview QA / summarization app."""
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter, NLTKTextSplitter, CharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import Docx2txtLoader
from langchain import hub
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_community.vectorstores import Chroma
import os
import gradio as gr
from typing import List
from pydantic import BaseModel
from langchain_core.prompts import ChatPromptTemplate
from unstructured.partition.pdf import partition_pdf
import uuid
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.storage import InMemoryStore
from langchain_community.document_loaders import UnstructuredPDFLoader
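
# Assumes the OpenAI API key is exported in the environment as OpenAI_APIKEY,
# the variable name this script reads below.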



# The vectorstore to use to index the child chunks
vectorstore = Chroma(
    collection_name="rag_app",
    embedding_function=OpenAIEmbeddings(api_key=os.environ['OpenAI_APIKEY']),
)

# The storage layer for the parent documents
store = InMemoryStore()
id_key = "doc_id"

# The retriever (empty to start)
retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    docstore=store,
    id_key=id_key,
)
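# At query time the retriever matches the question against the summary embeddings in
# Chroma, then returns the full parent chunks stored in the docstore under the same doc_id.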

def split_text(doc: List[Document], split_mode: str = 'tiktoken',
               chunk_size: int = 1000, chunk_overlap: int = 5):
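    """Split LangChain Documents into chunks using the chosen splitter."""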
    # Split by separator and merge by character count
    if split_mode == "character":
        # Create a CharacterTextSplitter object
        text_splitter = CharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )
    # Recursively split until below the chunk size limit
    elif split_mode == "recursive_character":
        # Create a RecursiveCharacterTextSplitter object
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )
    elif split_mode == "nltk":
        # Create a NLTKTextSplitter object
        text_splitter = NLTKTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )
    elif split_mode == "tiktoken":
        # Create a CharacterTextSplitter object
        text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,)
    else:
        raise ValueError(f"Unknown split mode: {split_mode}")
    documents = text_splitter.split_documents(doc)
    return documents

def format_docs(docs):
    # Parent objects may be Documents (.page_content) from .docx files or PDF Elements (.text)
    return "\n\n".join(d.page_content if hasattr(d, "page_content") else d.text for d in docs)

class Element(BaseModel):
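    """Categorized chunk from a partitioned PDF; type is "table" or "text"."""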
    type: str
    text: str

def save_documents_to_faiss(documents, faiss_save_path: str = None, save_faiss: bool = False):
    # Alternative persistence path (currently unused): embed the chunks and
    # store them in a FAISS index instead of the multi-vector retriever.
    embeddings = OpenAIEmbeddings(openai_api_key=os.environ['OpenAI_APIKEY'])
    faiss_db = FAISS.from_documents(documents, embeddings)
    if save_faiss:
        faiss_db.save_local(faiss_save_path)
    return faiss_db

def save_documents(texts, text_summaries, tables, table_summaries):
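    """Index chunk summaries in the vectorstore and keep the full parent chunks in the docstore."""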

    # Add texts
    doc_ids = [str(uuid.uuid4()) for _ in texts]
    summary_texts = [
        Document(page_content=s, metadata={id_key: doc_ids[i]})
        for i, s in enumerate(text_summaries)
    ]
    retriever.vectorstore.add_documents(summary_texts)
    retriever.docstore.mset(list(zip(doc_ids, texts)))

    # Add tables
    table_ids = [str(uuid.uuid4()) for _ in tables]
    summary_tables = [
        Document(page_content=s, metadata={id_key: table_ids[i]})
        for i, s in enumerate(table_summaries)
    ]
    retriever.vectorstore.add_documents(summary_tables)
    retriever.docstore.mset(list(zip(table_ids, tables)))


def doc_processing(files: List[str]):
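    """Load the uploaded files, split them into text/table chunks, and summarize each chunk."""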
    docs = []
    tables = []
    for file in files:
        if file.name.endswith(".pdf"):
    # Identify file type and process accordingly
            raw_pdf_elements = partition_pdf(
            filename=file,
            extract_images_in_pdf=False,
            infer_table_structure=True,
            chunking_strategy="by_title",
            max_characters=4000,
            new_after_n_chars=3800,
            combine_text_under_n_chars=2000,
            image_output_dir_path='/tmp',  # Change this to your desired path
            )
            categorized_elements = []
            for element in raw_pdf_elements:
                if "unstructured.documents.elements.Table" in str(type(element)):
                    categorized_elements.append(Element(type="table", text=str(element)))
                elif "unstructured.documents.elements.CompositeElement" in str(type(element)):
                    categorized_elements.append(Element(type="text", text=str(element)))
            
            # Extract text and table elements
            text_elements = [e for e in categorized_elements if e.type == "text"]
            table_elements = [e for e in categorized_elements if e.type == "table"]
            docs.extend(text_elements)
            tables.extend(table_elements)  
        elif file.name.endswith(".docx"):
            # Process DOCX file using LangChain Docx2txtLoader
            loader = Docx2txtLoader(file)
            data = loader.load()
            docs.extend(data)

    # Prompt
    prompt_text = """You are an assistant tasked with summarizing tables and text. 
    Give a concise summary of the table or text. Table or text chunk: {element} """
    prompt = ChatPromptTemplate.from_template(prompt_text)

    # Summary chain
    model = ChatOpenAI(temperature=0, model="gpt-4")
    summarize_chain = {"element": lambda x: x} | prompt | model | StrOutputParser()

    # Summarize the raw text of each chunk (Elements carry .text, Documents .page_content)
    table_summaries = summarize_chain.batch([t.text for t in tables], {"max_concurrency": 5})
    text_summaries = summarize_chain.batch([d.text if hasattr(d, "text") else d.page_content for d in docs], {"max_concurrency": 5})

    return docs, tables, text_summaries, table_summaries

    # Embedding and storage of the returned chunks happens in save_documents()


def wrap_all(files: List[str], input_prompt: str):
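    """End-to-end pipeline: ingest the uploaded files, then answer the question over them."""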
    
    docs, tables, text_summaries, table_summaries = doc_processing(files)
    save_documents(docs, text_summaries, tables, table_summaries)
    # Prompt template
    template = """You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question.
                 If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise. Please cite the text that you are using to base your arguments when it is possible. 

                Question: {question} 

                Context: {context} 

                Answer:
                    """
    prompt = ChatPromptTemplate.from_template(template)
    # Load the prompt template and the language model
    #prompt = hub.pull("rlm/rag-prompt")
    llm = ChatOpenAI(model_name="gpt-4o", openai_api_key=os.environ['OpenAI_APIKEY'], temperature=0)
    
    # Create the RAG chain
    rag_chain = (
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | prompt
        | llm
        | StrOutputParser()
    )
    
    # Invoke the chain with the input prompt
    return rag_chain.invoke(input_prompt)


# Define the Gradio interface
iface = gr.Interface(
    fn=wrap_all,
    inputs=[
        gr.File(type="filepath", label="Interview files (.docx or .pdf)", file_count='multiple'),
        gr.Textbox(label="Enter your inquiry"),
    ],
    outputs="text",
    title="Interviews: QA and summarization",
    description="Upload one or more .docx or .pdf interview files, then ask a question or request a summary.")
iface.launch()