"""Core modules"""
import os
import uuid
from typing import List

import gradio as gr
from pydantic import BaseModel
from unstructured.partition.pdf import partition_pdf

from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter, NLTKTextSplitter, CharacterTextSplitter
from langchain.vectorstores.faiss import FAISS
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.storage import InMemoryStore
from langchain import hub
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import Docx2txtLoader, UnstructuredPDFLoader
# The vectorstore to use to index the child chunks
vectorstore = Chroma(
    collection_name="rag_app",
    embedding_function=OpenAIEmbeddings(api_key=os.environ['OpenAI_APIKEY']),
)
# The storage layer for the parent documents
store = InMemoryStore()
id_key = "doc_id"
# The retriever (empty to start)
retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    docstore=store,
    id_key=id_key,
)
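# Design note: the MultiVectorRetriever searches the Chroma index of child
# summaries and then follows the shared `doc_id` metadata key to return the
# full parent chunks kept in the InMemoryStore (both are populated by
# save_documents below).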
def split_text(documents: List[Document], split_mode: str = 'tiktoken',
               chunk_size: int = 1000, chunk_overlap: int = 5):
    """Split documents into chunks with the chosen splitter."""
    # Split by separator and merge by character count
    if split_mode == "character":
        # Create a CharacterTextSplitter object
        text_splitter = CharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )
    # Recursively split until below the chunk size limit
    elif split_mode == "recursive_character":
        # Create a RecursiveCharacterTextSplitter object
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )
    elif split_mode == "nltk":
        # Create an NLTKTextSplitter object
        text_splitter = NLTKTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )
    elif split_mode == "tiktoken":
        # Split by token count using the tiktoken encoder
        text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )
    else:
        raise ValueError("Please specify a valid split mode.")
    return text_splitter.split_documents(documents)
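# Example (hypothetical usage, not called by the app itself): split a loaded
# .docx transcript into ~1000-token chunks before indexing.
#   docs = Docx2txtLoader("interview.docx").load()
#   chunks = split_text(docs, split_mode="tiktoken", chunk_size=1000, chunk_overlap=5)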
def format_docs(docs):
    # The docstore may hold Document objects, Element objects, or plain strings,
    # so fall back from .page_content to .text to str().
    return "\n\n".join(
        getattr(d, "page_content", None) or getattr(d, "text", None) or str(d)
        for d in docs
    )


class Element(BaseModel):
    type: str
    text: str
def save_documents_faiss(documents, faiss_save_path: str = None, save_faiss: bool = False):
    # Alternative store: build a FAISS index from the documents (not used by the app below)
    embeddings = OpenAIEmbeddings(openai_api_key=os.environ['OpenAI_APIKEY'])
    faiss_db = FAISS.from_documents(documents, embeddings)
    if save_faiss:
        faiss_db.save_local(faiss_save_path)
    return faiss_db
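# Example (hypothetical usage): persist and later reload the FAISS index.
# Depending on the installed LangChain version, load_local may also require
# allow_dangerous_deserialization=True.
#   db = save_documents_faiss(chunks, faiss_save_path="faiss_index", save_faiss=True)
#   db = FAISS.load_local("faiss_index", OpenAIEmbeddings(api_key=os.environ['OpenAI_APIKEY']))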
def save_documents(texts, text_summaries, tables, table_summaries):
    # Add texts
    doc_ids = [str(uuid.uuid4()) for _ in texts]
    summary_texts = [
        Document(page_content=s, metadata={id_key: doc_ids[i]})
        for i, s in enumerate(text_summaries)
    ]
    retriever.vectorstore.add_documents(summary_texts)
    retriever.docstore.mset(list(zip(doc_ids, texts)))
    # Add tables
    table_ids = [str(uuid.uuid4()) for _ in tables]
    summary_tables = [
        Document(page_content=s, metadata={id_key: table_ids[i]})
        for i, s in enumerate(table_summaries)
    ]
    retriever.vectorstore.add_documents(summary_tables)
    retriever.docstore.mset(list(zip(table_ids, tables)))
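# Once summaries and parent chunks are stored, a similarity search against the
# retriever returns the full parent content, e.g. (hypothetical query):
#   retriever.get_relevant_documents("What does the interviewee say about X?")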
def doc_processing(files: List[str]):
    docs = []
    tables = []
    for file in files:
        # With gr.File(type="filepath"), each upload arrives as a path string
        file_path = file if isinstance(file, str) else file.name
        if file_path.endswith(".pdf"):
            # Identify file type and process accordingly
            raw_pdf_elements = partition_pdf(
                filename=file_path,
                extract_images_in_pdf=False,
                infer_table_structure=True,
                chunking_strategy="by_title",
                max_characters=4000,
                new_after_n_chars=3800,
                combine_text_under_n_chars=2000,
                image_output_dir_path='/tmp',  # Change this to your desired path
            )
            categorized_elements = []
            for element in raw_pdf_elements:
                if "unstructured.documents.elements.Table" in str(type(element)):
                    categorized_elements.append(Element(type="table", text=str(element)))
                elif "unstructured.documents.elements.CompositeElement" in str(type(element)):
                    categorized_elements.append(Element(type="text", text=str(element)))
            # Extract text and table elements
            text_elements = [e for e in categorized_elements if e.type == "text"]
            table_elements = [e for e in categorized_elements if e.type == "table"]
            docs.extend(text_elements)
            tables.extend(table_elements)
        elif file_path.endswith(".docx"):
            # Process DOCX files with LangChain's Docx2txtLoader
            loader = Docx2txtLoader(file_path)
            data = loader.load()
            docs.extend(data)
    # Prompt
    prompt_text = """You are an assistant tasked with summarizing tables and text.
    Give a concise summary of the table or text. Table or text chunk: {element} """
    prompt = ChatPromptTemplate.from_template(prompt_text)
    # Summary chain
    model = ChatOpenAI(temperature=0, model="gpt-4", openai_api_key=os.environ['OpenAI_APIKEY'])
    summarize_chain = {"element": lambda x: x} | prompt | model | StrOutputParser()
    table_summaries = summarize_chain.batch(tables, {"max_concurrency": 5})
    text_summaries = summarize_chain.batch(docs, {"max_concurrency": 5})
    return docs, tables, text_summaries, table_summaries
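# Note: doc_processing returns (docs, tables, text_summaries, table_summaries);
# wrap_all below unpacks this explicitly before handing it to save_documents.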
def wrap_all(files: List[str], input_prompt: str):
    # Process the uploads, summarize them, and index the results
    docs, tables, text_summaries, table_summaries = doc_processing(files)
    save_documents(docs, text_summaries, tables, table_summaries)
    # Prompt template
    template = """You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question.
    If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise. When possible, cite the text on which you base your answer.
    Question: {question}
    Context: {context}
    Answer:
    """
    prompt = ChatPromptTemplate.from_template(template)
    # Load the prompt template and the language model
    # prompt = hub.pull("rlm/rag-prompt")
    llm = ChatOpenAI(model_name="gpt-4o", openai_api_key=os.environ['OpenAI_APIKEY'], temperature=0)
    # Create the RAG chain
    rag_chain = (
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | prompt
        | llm
        | StrOutputParser()
    )
    # Invoke the chain with the input prompt
    return rag_chain.invoke(input_prompt)
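# Example (hypothetical call outside the UI): answer a question about an uploaded transcript.
#   answer = wrap_all(["interview.docx"], "What are the main themes of the interview?")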
# Define the Gradio interface
iface = gr.Interface(
    fn=wrap_all,
    inputs=[
        gr.File(type="filepath", label=".pdf or .docx file of the interview", file_count='multiple'),
        gr.Textbox(label="Enter your inquiry"),
    ],
    outputs="text",
    title="Interviews: QA and summarization",
    description="Upload a .pdf or .docx file with the interview and enter the question you have or ask for a summarization.",
)
iface.launch()