Spaces:

Mattral
/

RAG-bot

Sleeping

Mattral committed on May 10, 2024

Commit

452bd06

verified ·

1 Parent(s): 74bf15b

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -37,11 +37,12 @@ def get_page_urls(url):
 @st.cache(allow_output_mutation=True)
 def process_pdf(file):
-    # Reads PDF from bytes, processes it, and returns extracted text
-    doc = fitz.open(stream=file)
     texts = [page.get_text() for page in doc]
     return '\n'.join(texts)
 def get_url_content(url):
     response = requests.get(url)
     if url.endswith('.pdf'):
@@ -49,7 +50,7 @@ def get_url_content(url):
         file = open('pdf.pdf', 'wb')
         file.write(pdf.read())
         file.close()
-        doc = fitz.open('pdf.pdf')
         return (url, ''.join([text for page in doc for text in page.get_text()]))
     else:
         soup = BeautifulSoup(response.content, 'html.parser')
@@ -152,7 +153,8 @@ elif input_type == 'Upload PDF':
         pdf_text = process_pdf(uploaded_file)
         # Assume we process the PDF text into a format that can be used by your LLM
         urls = [pdf_text]  # This should be adjusted to match your system's needs
-        retriever = get_retriever(urls)
     # We store the conversation in the session state.
     # This will be used to render the chat conversation.

 @st.cache(allow_output_mutation=True)
 def process_pdf(file):
+    # file is expected to be a BytesIO object directly from the file uploader
+    doc = fitz.open("pdf", file.read())  # "pdf" indicates file format is PDF, reading the BytesIO stream
     texts = [page.get_text() for page in doc]
     return '\n'.join(texts)
 def get_url_content(url):
     response = requests.get(url)
     if url.endswith('.pdf'):
         file = open('pdf.pdf', 'wb')
         file.write(pdf.read())
         file.close()
+        doc = fitz.open(stream=pdf, filetype="pdf")
         return (url, ''.join([text for page in doc for text in page.get_text()]))
     else:
         soup = BeautifulSoup(response.content, 'html.parser')
         pdf_text = process_pdf(uploaded_file)
         # Assume we process the PDF text into a format that can be used by your LLM
         urls = [pdf_text]  # This should be adjusted to match your system's needs
+        retriever = get_retriever(urls)  # Make sure your retriever can handle raw text if not, adapt it.
+        llm_chain = create_chain(retriever)
     # We store the conversation in the session state.
     # This will be used to render the chat conversation.