Update app.py
Browse files
app.py
CHANGED
@@ -50,17 +50,17 @@ def get_url_content(url):
|
|
50 |
return (url, ''.join([text for page in doc for text in page.get_text()]))
|
51 |
else:
|
52 |
soup = BeautifulSoup(response.content, 'html.parser')
|
53 |
-
|
54 |
-
# Content containers. Here wordpress specific container css class name
|
55 |
-
# used. This will be different for each website.
|
56 |
content = soup.find_all('div', class_='wpb_content_element')
|
57 |
text = [c.get_text().strip() for c in content if c.get_text().strip() != '']
|
58 |
text = [line for item in text for line in item.split('\n') if line.strip() != '']
|
59 |
|
60 |
-
#
|
61 |
-
|
62 |
-
|
63 |
-
|
|
|
|
|
|
|
64 |
|
65 |
|
66 |
@st.cache_resource
|
|
|
50 |
return (url, ''.join([text for page in doc for text in page.get_text()]))
|
51 |
else:
|
52 |
soup = BeautifulSoup(response.content, 'html.parser')
|
|
|
|
|
|
|
53 |
content = soup.find_all('div', class_='wpb_content_element')
|
54 |
text = [c.get_text().strip() for c in content if c.get_text().strip() != '']
|
55 |
text = [line for item in text for line in item.split('\n') if line.strip() != '']
|
56 |
|
57 |
+
# Check if 'ARTS ON:' exists in the list
|
58 |
+
try:
|
59 |
+
arts_on = text.index('ARTS ON:')
|
60 |
+
return (url, '\n'.join(text[:arts_on]))
|
61 |
+
except ValueError:
|
62 |
+
# If 'ARTS ON:' is not found, return what was found without truncating
|
63 |
+
return (url, '\n'.join(text))
|
64 |
|
65 |
|
66 |
@st.cache_resource
|