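Update app.py: stop writing the downloaded PDF to a local pdf.pdf file and reword the footer-exclusion comments in get_url_content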
Browse files
app.py
CHANGED
@@ -47,24 +47,19 @@ def get_url_content(url):
|
|
47 |
response = requests.get(url)
|
48 |
if url.endswith('.pdf'):
|
49 |
pdf = io.BytesIO(response.content)
|
50 |
-
file = open('pdf.pdf', 'wb')
|
51 |
-
file.write(pdf.read())
|
52 |
-
file.close()
|
53 |
doc = fitz.open(stream=pdf, filetype="pdf")
|
54 |
-
return (url, ''.join(
|
55 |
else:
|
56 |
soup = BeautifulSoup(response.content, 'html.parser')
|
57 |
content = soup.find_all('div', class_='wpb_content_element')
|
58 |
text = [c.get_text().strip() for c in content if c.get_text().strip() != '']
|
59 |
text = [line for item in text for line in item.split('\n') if line.strip() != '']
|
60 |
-
|
61 |
-
# Post processing to exclude footer content, only if 'ARTS ON:' is present.
|
62 |
try:
|
63 |
arts_on_index = text.index('ARTS ON:')
|
64 |
return (url, '\n'.join(text[:arts_on_index]))
|
65 |
except ValueError:
|
66 |
-
return (url, '\n'.join(text)) #
|
67 |
-
|
68 |
|
69 |
@st.cache_resource
|
70 |
def get_retriever(urls):
|
|
|
47 |
response = requests.get(url)
|
48 |
if url.endswith('.pdf'):
|
49 |
pdf = io.BytesIO(response.content)
|
|
|
|
|
|
|
50 |
doc = fitz.open(stream=pdf, filetype="pdf")
|
51 |
+
return (url, ''.join(page.get_text() for page in doc))
|
52 |
else:
|
53 |
soup = BeautifulSoup(response.content, 'html.parser')
|
54 |
content = soup.find_all('div', class_='wpb_content_element')
|
55 |
text = [c.get_text().strip() for c in content if c.get_text().strip() != '']
|
56 |
text = [line for item in text for line in item.split('\n') if line.strip() != '']
|
57 |
+
# Exclude footer content
|
|
|
58 |
try:
|
59 |
arts_on_index = text.index('ARTS ON:')
|
60 |
return (url, '\n'.join(text[:arts_on_index]))
|
61 |
except ValueError:
|
62 |
+
return (url, '\n'.join(text)) # Return full text if specific marker not found
|
|
|
63 |
|
64 |
@st.cache_resource
|
65 |
def get_retriever(urls):
|