Mattral committed on
Commit
452bd06
·
verified ·
1 Parent(s): 74bf15b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +6 -4
app.py CHANGED
@@ -37,11 +37,12 @@ def get_page_urls(url):
37
 
38
  @st.cache(allow_output_mutation=True)
39
  def process_pdf(file):
40
- # Reads PDF from bytes, processes it, and returns extracted text
41
- doc = fitz.open(stream=file)
42
  texts = [page.get_text() for page in doc]
43
  return '\n'.join(texts)
44
 
 
45
  def get_url_content(url):
46
  response = requests.get(url)
47
  if url.endswith('.pdf'):
@@ -49,7 +50,7 @@ def get_url_content(url):
49
  file = open('pdf.pdf', 'wb')
50
  file.write(pdf.read())
51
  file.close()
52
- doc = fitz.open('pdf.pdf')
53
  return (url, ''.join([text for page in doc for text in page.get_text()]))
54
  else:
55
  soup = BeautifulSoup(response.content, 'html.parser')
@@ -152,7 +153,8 @@ elif input_type == 'Upload PDF':
152
  pdf_text = process_pdf(uploaded_file)
153
  # Assume we process the PDF text into a format that can be used by your LLM
154
  urls = [pdf_text] # This should be adjusted to match your system's needs
155
- retriever = get_retriever(urls)
 
156
 
157
  # We store the conversation in the session state.
158
  # This will be used to render the chat conversation.
 
37
 
38
  @st.cache(allow_output_mutation=True)
39
  def process_pdf(file):
40
+ # file is expected to be a BytesIO object directly from the file uploader
41
+ doc = fitz.open("pdf", file.read()) # "pdf" indicates file format is PDF, reading the BytesIO stream
42
  texts = [page.get_text() for page in doc]
43
  return '\n'.join(texts)
44
 
45
+
46
  def get_url_content(url):
47
  response = requests.get(url)
48
  if url.endswith('.pdf'):
 
50
  file = open('pdf.pdf', 'wb')
51
  file.write(pdf.read())
52
  file.close()
53
+ doc = fitz.open(stream=pdf, filetype="pdf")
54
  return (url, ''.join([text for page in doc for text in page.get_text()]))
55
  else:
56
  soup = BeautifulSoup(response.content, 'html.parser')
 
153
  pdf_text = process_pdf(uploaded_file)
154
  # Assume we process the PDF text into a format that can be used by your LLM
155
  urls = [pdf_text] # This should be adjusted to match your system's needs
156
+ retriever = get_retriever(urls) # Make sure your retriever can handle raw text; if not, adapt it.
157
+ llm_chain = create_chain(retriever)
158
 
159
  # We store the conversation in the session state.
160
  # This will be used to render the chat conversation.