Update app.py
app.py CHANGED
@@ -35,7 +35,12 @@ def get_page_urls(url):
         links.append(url)
     return set(links)
 
-
+@st.cache(allow_output_mutation=True)
+def process_pdf(file):
+    # Reads PDF from bytes, processes it, and returns extracted text
+    doc = fitz.open(stream=file)
+    texts = [page.get_text() for page in doc]
+    return '\n'.join(texts)
 
 def get_url_content(url):
     response = requests.get(url)
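The new process_pdf helper reads the uploaded file as a byte stream with PyMuPDF and joins the per-page text. Worth noting: @st.cache(allow_output_mutation=True) is deprecated in recent Streamlit releases in favor of st.cache_data, though it still works. For reference, a minimal standalone sketch of the same extraction, assuming PyMuPDF is installed and a local sample.pdf exists (the filename is illustrative):

import fitz  # PyMuPDF

# Read the file as raw bytes, the same shape Streamlit's uploader yields
with open("sample.pdf", "rb") as f:
    data = f.read()

# An explicit filetype hint spares PyMuPDF from sniffing the format
doc = fitz.open(stream=data, filetype="pdf")
text = '\n'.join(page.get_text() for page in doc)
print(text[:200])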
@@ -48,17 +53,16 @@ def get_url_content(url):
         return (url, ''.join([text for page in doc for text in page.get_text()]))
     else:
         soup = BeautifulSoup(response.content, 'html.parser')
-
-        # Content containers. Here wordpress specific container css class name
-        # used. This will be different for each website.
         content = soup.find_all('div', class_='wpb_content_element')
         text = [c.get_text().strip() for c in content if c.get_text().strip() != '']
         text = [line for item in text for line in item.split('\n') if line.strip() != '']
 
-        # Post processing to exclude footer content.
-
-
-
+        # Post processing to exclude footer content, only if 'ARTS ON:' is present.
+        try:
+            arts_on_index = text.index('ARTS ON:')
+            return (url, '\n'.join(text[:arts_on_index]))
+        except ValueError:
+            return (url, '\n'.join(text))  # If 'ARTS ON:' not found, return full text
 
 
 @st.cache_resource
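The rewritten post-processing no longer assumes every page carries the 'ARTS ON:' footer marker: list.index raises ValueError when the marker is absent, and the new try/except falls back to the full text instead of raising. A tiny illustration of the guard in isolation (the sample lines are made up):

lines = ['body text', 'more body text']  # no footer marker on this page
try:
    cut = lines.index('ARTS ON:')        # footer marker used by this site
    content = lines[:cut]                # keep only what precedes the footer
except ValueError:
    content = lines                      # marker absent: keep the full text
print(content)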
@@ -121,12 +125,9 @@ def create_chain(_retriever):
     return qa_chain
 
 
-# Set the webpage title
-st.set_page_config(
-    page_title="Your own AI-Chat!"
-)
 
-#
+# Set the webpage title
+st.set_page_config(page_title="Your own AI-Chat!")
 st.header("Your own AI-Chat!")
 
 # This sets the LLM's personality.
@@ -136,20 +137,27 @@ st.header("Your own AI-Chat!")
 #     label="System Prompt",
 #     value="You are a helpful AI assistant who answers questions in short sentences.",
 #     key="system_prompt")
-
-
-
-
-
-
-if
-
-
-
+# Choose input method
+
+input_type = st.radio("Choose an input method:", ['URL', 'Upload PDF'])
+
+if input_type == 'URL':
+    base_url = st.text_input("Enter the site URL here:", key="base_url")
+    if base_url:
+        urls = get_page_urls(base_url)
+        retriever = get_retriever(urls)
+elif input_type == 'Upload PDF':
+    uploaded_file = st.file_uploader("Upload your PDF here:", type="pdf")
+    if uploaded_file:
+        pdf_text = process_pdf(uploaded_file)
+        # Assume we process the PDF text into a format that can be used by your LLM
+        urls = [pdf_text]  # This should be adjusted to match your system's needs
+        retriever = get_retriever(urls)
 
 # We store the conversation in the session state.
 # This will be used to render the chat conversation.
-# We initialize it with the first message we want to be greeted with
+# We initialize it with the first message we want to be greeted with
+
 if "messages" not in st.session_state:
     st.session_state.messages = [
         {"role": "assistant", "content": "How may I help you today?"}
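One caveat with the new branching: retriever is assigned only inside the two input branches, so the "if retriever:" check in the final hunk raises a NameError on first load, before any URL is entered or PDF uploaded. A defensive default above the st.radio call would avoid that; a one-line sketch, not part of this commit:

retriever = None  # hypothetical default so the later `if retriever:` is safe before any input arrives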
@@ -164,34 +172,36 @@ if st.session_state.base_url != "":
     with st.chat_message(message["role"]):
         st.markdown(message["content"])
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+if retriever:
+    # We initialize the quantized LLM from a local path.
+    # Currently most parameters are fixed but we can make them
+    # configurable.
+    llm_chain = create_chain(retriever)
+
+    # We take questions/instructions from the chat input to pass to the LLM
+    if user_prompt := st.chat_input("Your message here", key="user_input"):
+
+        # Add our input to the session state
+        st.session_state.messages.append(
+            {"role": "user", "content": user_prompt}
+        )
+
+        # Add our input to the chat window
+        with st.chat_message("user"):
+            st.markdown(user_prompt)
+
+        # Pass our input to the llm chain and capture the final responses.
+        # It is worth noting that the Stream Handler is already receiving the
+        # streaming response as the llm is generating. We get our response
+        # here once the llm has finished generating the complete response.
+        response = llm_chain.run(user_prompt)
+
+        # Add the response to the session state
+        st.session_state.messages.append(
+            {"role": "assistant", "content": response}
+        )
+
+        # Add the response to the chat window
+        with st.chat_message("assistant"):
+            st.markdown(response)
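The chat loop itself (walrus assignment on st.chat_input, history kept in st.session_state, rendering through st.chat_message) can be exercised without the quantized LLM by stubbing the chain. A self-contained sketch runnable with "streamlit run", where EchoChain is a hypothetical stand-in for the real qa_chain:

import streamlit as st

class EchoChain:
    # Hypothetical stand-in for the real qa_chain; echoes the prompt back
    def run(self, prompt: str) -> str:
        return f"echo: {prompt}"

llm_chain = EchoChain()

if "messages" not in st.session_state:
    st.session_state.messages = [
        {"role": "assistant", "content": "How may I help you today?"}
    ]

# Render the conversation so far
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

# Take input, append it to the history, and respond through the stub chain
if user_prompt := st.chat_input("Your message here"):
    st.session_state.messages.append({"role": "user", "content": user_prompt})
    with st.chat_message("user"):
        st.markdown(user_prompt)

    response = llm_chain.run(user_prompt)
    st.session_state.messages.append({"role": "assistant", "content": response})
    with st.chat_message("assistant"):
        st.markdown(response)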