Update app.py
Browse files
app.py
CHANGED
@@ -37,11 +37,12 @@ def get_page_urls(url):
|
|
37 |
|
38 |
@st.cache(allow_output_mutation=True)
|
39 |
def process_pdf(file):
|
40 |
-
#
|
41 |
-
doc = fitz.open(
|
42 |
texts = [page.get_text() for page in doc]
|
43 |
return '\n'.join(texts)
|
44 |
|
|
|
45 |
def get_url_content(url):
|
46 |
response = requests.get(url)
|
47 |
if url.endswith('.pdf'):
|
@@ -49,7 +50,7 @@ def get_url_content(url):
|
|
49 |
file = open('pdf.pdf', 'wb')
|
50 |
file.write(pdf.read())
|
51 |
file.close()
|
52 |
-
doc = fitz.open(
|
53 |
return (url, ''.join([text for page in doc for text in page.get_text()]))
|
54 |
else:
|
55 |
soup = BeautifulSoup(response.content, 'html.parser')
|
@@ -152,7 +153,8 @@ elif input_type == 'Upload PDF':
|
|
152 |
pdf_text = process_pdf(uploaded_file)
|
153 |
# Assume we process the PDF text into a format that can be used by your LLM
|
154 |
urls = [pdf_text] # This should be adjusted to match your system's needs
|
155 |
-
retriever = get_retriever(urls)
|
|
|
156 |
|
157 |
# We store the conversation in the session state.
|
158 |
# This will be used to render the chat conversation.
|
|
|
37 |
|
38 |
@st.cache(allow_output_mutation=True)
|
39 |
def process_pdf(file):
|
40 |
+
# file is expected to be a BytesIO object directly from the file uploader
|
41 |
+
doc = fitz.open("pdf", file.read()) # "pdf" is the file-type hint; file.read() supplies the bytes from the BytesIO stream
|
42 |
texts = [page.get_text() for page in doc]
|
43 |
return '\n'.join(texts)
|
44 |
|
45 |
+
|
46 |
def get_url_content(url):
|
47 |
response = requests.get(url)
|
48 |
if url.endswith('.pdf'):
|
|
|
50 |
file = open('pdf.pdf', 'wb')
|
51 |
file.write(pdf.read())
|
52 |
file.close()
|
53 |
+
doc = fitz.open(stream=pdf, filetype="pdf")
|
54 |
return (url, ''.join([text for page in doc for text in page.get_text()]))
|
55 |
else:
|
56 |
soup = BeautifulSoup(response.content, 'html.parser')
|
|
|
153 |
pdf_text = process_pdf(uploaded_file)
|
154 |
# Assume we process the PDF text into a format that can be used by your LLM
|
155 |
urls = [pdf_text] # This should be adjusted to match your system's needs
|
156 |
+
retriever = get_retriever(urls) # Make sure your retriever can handle raw text; if not, adapt it.
|
157 |
+
llm_chain = create_chain(retriever)
|
158 |
|
159 |
# We store the conversation in the session state.
|
160 |
# This will be used to render the chat conversation.
|