"""Core modules for interview QA and summarization with RAG."""
import os
from typing import List, Optional

import gradio as gr
from langchain import hub
from langchain.docstore.document import Document
from langchain.text_splitter import (
    CharacterTextSplitter,
    NLTKTextSplitter,
    RecursiveCharacterTextSplitter,
)
from langchain.vectorstores.faiss import FAISS
from langchain_community.document_loaders import Docx2txtLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings


def doc_to_embeddings(
    docs: List[Document],
    split_mode: str = "tiktoken",
    chunk_size: int = 1000,
    chunk_overlap: int = 5,
    faiss_save_path: Optional[str] = None,
    save_faiss: bool = False,
) -> FAISS:
    """Split the loaded documents, embed the chunks, and index them in FAISS."""
    # Split on a single separator, then merge pieces up to the character limit
    if split_mode == "character":
        text_splitter = CharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )
    # Recursively split on a hierarchy of separators until every chunk
    # is below the size limit
    elif split_mode == "recursive_character":
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )
    # Split on sentence boundaries using NLTK
    elif split_mode == "nltk":
        text_splitter = NLTKTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )
    # Measure chunk size in tiktoken tokens rather than characters
    elif split_mode == "tiktoken":
        text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )
    else:
        raise ValueError(f"Unknown split_mode: {split_mode!r}")

    documents = text_splitter.split_documents(docs)
    embeddings = OpenAIEmbeddings()  # reads OPENAI_API_KEY from the environment
    faiss_db = FAISS.from_documents(documents, embeddings)
    if save_faiss:
        if faiss_save_path is None:
            raise ValueError("faiss_save_path is required when save_faiss=True.")
        faiss_db.save_local(faiss_save_path)
    return faiss_db


def format_docs(docs):
    """Concatenate retrieved documents into a single context string."""
    return "\n\n".join(doc.page_content for doc in docs)


def wrap_all(file, input_prompt: str):
    """Load a .docx interview, build a FAISS retriever, and answer the inquiry."""
    loader = Docx2txtLoader(file)
    data = loader.load()
    db = doc_to_embeddings(data)
    retriever = db.as_retriever()
    prompt = hub.pull("rlm/rag-prompt")
    llm = ChatOpenAI(
        model_name="gpt-4",
        openai_api_key=os.environ["OpenAI_APIKEY"],
        temperature=0,
    )
    rag_chain = (
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | prompt
        | llm
        | StrOutputParser()
    )
    return rag_chain.invoke(input_prompt)


# Define the Gradio interface.
# type="filepath" hands wrap_all the path of the uploaded file, which
# Docx2txtLoader expects; type="binary" would pass raw bytes with no .name.
iface = gr.Interface(
    fn=wrap_all,
    inputs=[
        gr.File(type="filepath", label=".docx file of the interview"),
        gr.Textbox(label="Enter your inquiry"),
    ],
    outputs="text",
    title="Interviews: QA and summarization",
    description=(
        "Upload a .docx file with the interview and enter the question "
        "you have or ask for a summarization."
    ),
)
iface.launch()
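
# A minimal sketch (not exercised by the app above) of reloading an index
# previously persisted with save_faiss=True. "faiss_index" is a hypothetical
# path, and recent LangChain versions require explicitly opting in to pickle
# deserialization when loading a local index:
#
#     db = FAISS.load_local(
#         "faiss_index",
#         OpenAIEmbeddings(),
#         allow_dangerous_deserialization=True,
#     )
#     retriever = db.as_retriever()
#
# Reloading this way skips the splitting and embedding cost on repeat runs
# against the same document.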