Spaces:

Skier8402
/

mistral-PDF-chat

Running

App Files Files Community

Skier8402 committed on Jul 23, 2024

Commit

5bea413

verified ·

1 Parent(s): ea3db3e

Update app.py

Browse files

Files changed (1) hide show

app.py +75 -112

app.py CHANGED Viewed

@@ -15,131 +15,94 @@ from langchain.vectorstores import FAISS
 from langchain.chat_models import ChatOpenAI
 from langchain.memory import ConversationBufferMemory
 from langchain.chains import ConversationalRetrievalChain
 from htmlTemplates import css, bot_template, user_template
 from langchain.llms import HuggingFaceHub
 def get_pdf_text(pdf_docs):
-    """
-    Extract text from a list of PDF documents.
-    Parameters
-    ----------
-    pdf_docs : list
-        List of PDF documents to extract text from.
-    Returns
-    -------
-    str
-        Extracted text from all the PDF documents.
-    """
     text = ""
     for pdf in pdf_docs:
-        pdf_reader = PdfReader(pdf)
-        for page in pdf_reader.pages:
-            text += page.extract_text()
     return text
 def get_text_chunks(text):
-    """
-    Split the input text into chunks.
-    Parameters
-    ----------
-    text : str
-        The input text to be split.
-    Returns
-    -------
-    list
-        List of text chunks.
-    """
     text_splitter = CharacterTextSplitter(
         separator="\n", chunk_size=1500, chunk_overlap=300, length_function=len
     )
-    chunks = text_splitter.split_text(text)
     return chunks
 def get_vectorstore(text_chunks):
-    """
-    Generate a vector store from a list of text chunks using HuggingFace BgeEmbeddings.
-    Parameters
-    ----------
-    text_chunks : list
-        List of text chunks to be embedded.
-    Returns
-    -------
-    FAISS
-        A FAISS vector store containing the embeddings of the text chunks.
-    """
     model = "BAAI/bge-base-en-v1.5"
     encode_kwargs = {
         "normalize_embeddings": True
-    }  # set True to compute cosine similarity
-    embeddings = HuggingFaceBgeEmbeddings(
-        model_name=model, encode_kwargs=encode_kwargs, model_kwargs={"device": "cpu"}
-    )
-    vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
     return vectorstore
 def get_conversation_chain(vectorstore):
-    """
-    Create a conversational retrieval chain using a vector store and a language model.
-    Parameters
-    ----------
-    vectorstore : FAISS
-        A FAISS vector store containing the embeddings of the text chunks.
-    Returns
-    -------
-    ConversationalRetrievalChain
-        A conversational retrieval chain for generating responses.
-    """
-    llm = HuggingFaceHub(
-        repo_id="mistralai/Mistral-7B-v0.3",
-        model_kwargs={"temperature": 0.5, "max_length": 4000},
-    )
-    # llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-0613")
-    memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
-    conversation_chain = ConversationalRetrievalChain.from_llm(
-        llm=llm, retriever=vectorstore.as_retriever(), memory=memory
-    )
     return conversation_chain
 def handle_userinput(user_question):
-    """
-    Handle user input and generate a response using the conversational retrieval chain.
-    Parameters
-    ----------
-    user_question : str
-        The user's question.
-    """
-    response = st.session_state.conversation({"question": user_question})
-    st.session_state.chat_history = response["chat_history"]
-    for i, message in enumerate(st.session_state.chat_history):
-        if i % 2 == 0:
-            st.write("//_^ User: " + message.content)
-        else:
-            st.write("🤖 ChatBot: " + message.content)
 def main():
-    """
-    Putting it all together.
-    """
     st.set_page_config(
         page_title="Chat with a Bot that tries to answer questions about multiple PDFs",
         page_icon=":books:",
@@ -150,15 +113,13 @@ def main():
     st.write(css, unsafe_allow_html=True)
-    # set huggingface hub token in st.text_input widget
-    # then hide the input
     huggingface_token = st.text_input("Enter your HuggingFace Hub token", type="password")
     #openai_api_key = st.text_input("Enter your OpenAI API key", type="password")
-    # set this key as an environment variable
-    os.environ["HUGGINGFACEHUB_API_TOKEN"] = huggingface_token
-    #os.environ["OPENAI_API_KEY"] = openai_api_key
     if "conversation" not in st.session_state:
         st.session_state.conversation = None
@@ -177,18 +138,20 @@ def main():
         )
         if st.button("Process"):
             with st.spinner("Processing"):
-                # get pdf text
-                raw_text = get_pdf_text(pdf_docs)
-                # get the text chunks
-                text_chunks = get_text_chunks(raw_text)
-                # create vector store
-                vectorstore = get_vectorstore(text_chunks)
-                # create conversation chain
-                st.session_state.conversation = get_conversation_chain(vectorstore)
 if __name__ == "__main__":
-    main()

 from langchain.chat_models import ChatOpenAI
 from langchain.memory import ConversationBufferMemory
 from langchain.chains import ConversationalRetrievalChain
+from langchain.schema import BaseOutputParser, OutputParserException
 from htmlTemplates import css, bot_template, user_template
 from langchain.llms import HuggingFaceHub
class ReferenceOutputParser(BaseOutputParser):
    """
    Split an LLM completion into an answer and a list of references.

    The completion is expected to contain the literal marker
    ``References:``; the text before it is the answer and every non-empty
    line after it is one reference.
    """

    def parse(self, text: str) -> dict:
        """
        Parse a completion into ``{"result": ..., "references": [...]}``.

        Parameters
        ----------
        text : str
            Raw completion text produced by the language model.

        Returns
        -------
        dict
            ``"result"`` holds the stripped answer text, ``"references"``
            the stripped, non-empty lines following the marker.

        Raises
        ------
        OutputParserException
            If ``text`` does not contain the ``References:`` marker.
        """
        # partition() splits on the first marker only, so a completion that
        # repeats "References:" is still parsed instead of failing with a
        # tuple-unpacking error.
        result, marker, references = text.partition("References:")
        if not marker:
            raise OutputParserException(f"Could not parse output: {text}") from None
        return {
            "result": result.strip(),
            "references": [ref.strip() for ref in references.split("\n") if ref.strip()],
        }
 def get_pdf_text(pdf_docs):
     """
     Extract text from a list of PDF documents.

     Parameters
     ----------
     pdf_docs : list
         PDF documents (anything ``PdfReader`` accepts) to extract text from.

     Returns
     -------
     str
         Text of all pages that could be read, one page per line group,
         joined with newlines. Empty string when nothing was extracted.
     """
     page_texts = []
     for pdf in pdf_docs:
         try:
             pdf_reader = PdfReader(pdf)
             for page in pdf_reader.pages:
                 # Guard against a falsy result so a page without text
                 # cannot break the concatenation.
                 page_texts.append(page.extract_text() or "")
         except Exception as e:
             # Best effort: report the failure and keep whatever was
             # extracted so far, then continue with the next document.
             st.error(f"Error extracting text from PDF: {e}")
     # Separate pages with the same "\n" the text splitter cuts on, so the
     # last word of one page is not fused with the first word of the next.
     return "\n".join(page_texts)
 def get_text_chunks(text):
     """
     Split the input text into overlapping chunks.

     Parameters
     ----------
     text : str
         The input text to be split.

     Returns
     -------
     list
         List of text chunks; an empty list if splitting failed (the error
         is reported through ``st.error``).
     """
     splitter = CharacterTextSplitter(
         separator="\n",
         chunk_size=1500,
         chunk_overlap=300,
         length_function=len,
     )
     try:
         return splitter.split_text(text)
     except Exception as err:
         st.error(f"Error splitting text into chunks: {err}")
         return []
 def get_vectorstore(text_chunks):
     """
     Generate a vector store from a list of text chunks using HuggingFace BgeEmbeddings.

     Parameters
     ----------
     text_chunks : list
         List of text chunks to be embedded.

     Returns
     -------
     FAISS or None
         A FAISS vector store built from the chunks, or ``None`` when there
         are no chunks or the store could not be created (the problem is
         reported through ``st.error``).
     """
     # Nothing to index: fail early with a clear message instead of loading
     # the embedding model and letting the index construction fail.
     if not text_chunks:
         st.error("Error creating vector store: no text chunks to embed")
         return None
     model = "BAAI/bge-base-en-v1.5"
     encode_kwargs = {
         "normalize_embeddings": True
     }  # set True to compute cosine similarity
     try:
         embeddings = HuggingFaceBgeEmbeddings(
             model_name=model, encode_kwargs=encode_kwargs, model_kwargs={"device": "cpu"}
         )
         vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
     except Exception as e:
         st.error(f"Error creating vector store: {e}")
         vectorstore = None
     return vectorstore
 def get_conversation_chain(vectorstore):
     """
     Create a conversational retrieval chain using a vector store and a language model.

     Parameters
     ----------
     vectorstore : FAISS or None
         Vector store whose retriever supplies context to the chain;
         ``None`` when no store could be built.

     Returns
     -------
     ConversationalRetrievalChain or None
         The chain, or ``None`` when ``vectorstore`` is ``None`` or the
         chain could not be created (the error is reported via ``st.error``).
     """
     if vectorstore is None:
         return None
     try:
         llm = HuggingFaceHub(
             repo_id="mistralai/Mistral-7B-v0.3",
             model_kwargs={"temperature": 0.5, "max_length": 4000},
         )
         memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
         # No output_parser keyword here: the chain does not declare such a
         # field, so passing one makes construction fail and leaves the app
         # without a conversation. References are parsed in handle_userinput.
         conversation_chain = ConversationalRetrievalChain.from_llm(
             llm=llm, retriever=vectorstore.as_retriever(), memory=memory
         )
     except Exception as e:
         st.error(f"Error creating conversation chain: {e}")
         conversation_chain = None
     return conversation_chain
 def handle_userinput(user_question):
     """
     Handle user input and generate a response using the conversational retrieval chain.

     Parameters
     ----------
     user_question : str
         The user's question.

     Notes
     -----
     Writes the question, the answer and any references to the page and
     stores the updated history in ``st.session_state.chat_history``.
     Errors are reported through ``st.error`` rather than raised.
     """
     if st.session_state.conversation is None:
         st.error("Please process the PDF files before asking a question.")
         return
     try:
         response = st.session_state.conversation({"question": user_question})
         st.session_state.chat_history = response["chat_history"]
         # The chain returns its text under "answer"; split it into answer
         # and references here, falling back to the raw answer when the
         # model did not emit a parsable references section.
         answer = response["answer"]
         try:
             parsed = ReferenceOutputParser().parse(answer)
         except OutputParserException:
             parsed = {"result": answer.strip(), "references": []}
         st.write("//_^ User: " + user_question)
         st.write("🤖 ChatBot: " + parsed["result"])
         # Only show the heading when there is something to list under it.
         if parsed["references"]:
             st.write("References:")
             for ref in parsed["references"]:
                 st.write("- " + ref)
     except Exception as e:
         st.error(f"Error handling user input: {e}")
 def main():
     st.set_page_config(
         page_title="Chat with a Bot that tries to answer questions about multiple PDFs",
         page_icon=":books:",
     st.write(css, unsafe_allow_html=True)
     huggingface_token = st.text_input("Enter your HuggingFace Hub token", type="password")
     #openai_api_key = st.text_input("Enter your OpenAI API key", type="password")
+    if huggingface_token:
+        os.environ["HUGGINGFACEHUB_API_TOKEN"] = huggingface_token
+    #if openai_api_key:
+    #    os.environ["OPENAI_API_KEY"] = openai_api_key
     if "conversation" not in st.session_state:
         st.session_state.conversation = None
         )
         if st.button("Process"):
             with st.spinner("Processing"):
+                try:
+                    # get pdf text
+                    raw_text = get_pdf_text(pdf_docs)
+                    # get the text chunks
+                    text_chunks = get_text_chunks(raw_text)
+                    # create vector store
+                    vectorstore = get_vectorstore(text_chunks)
+                    # create conversation chain
+                    st.session_state.conversation = get_conversation_chain(vectorstore)
+                except Exception as e:
+                    st.error(f"Error processing PDF files: {e}")
 if __name__ == "__main__":
+    main()