Mattral committed on
Commit
da85442
·
verified ·
1 Parent(s): c7a122e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +7 -7
app.py CHANGED
@@ -50,17 +50,17 @@ def get_url_content(url):
50
  return (url, ''.join([text for page in doc for text in page.get_text()]))
51
  else:
52
  soup = BeautifulSoup(response.content, 'html.parser')
53
-
54
- # Content containers. Here wordpress specific container css class name
55
- # used. This will be different for each website.
56
  content = soup.find_all('div', class_='wpb_content_element')
57
  text = [c.get_text().strip() for c in content if c.get_text().strip() != '']
58
  text = [line for item in text for line in item.split('\n') if line.strip() != '']
59
 
60
- # Post processing to exclude footer content.
61
- # This will be different for each website.
62
- arts_on = text.index('ARTS ON:')
63
- return (url, '\n'.join(text[:arts_on]))
 
 
 
64
 
65
 
66
  @st.cache_resource
 
50
  return (url, ''.join([text for page in doc for text in page.get_text()]))
51
  else:
52
  soup = BeautifulSoup(response.content, 'html.parser')
 
 
 
53
  content = soup.find_all('div', class_='wpb_content_element')
54
  text = [c.get_text().strip() for c in content if c.get_text().strip() != '']
55
  text = [line for item in text for line in item.split('\n') if line.strip() != '']
56
 
57
+ # Check if 'ARTS ON:' exists in the list
58
+ try:
59
+ arts_on = text.index('ARTS ON:')
60
+ return (url, '\n'.join(text[:arts_on]))
61
+ except ValueError:
62
+ # If 'ARTS ON:' is not found, return what was found without truncating
63
+ return (url, '\n'.join(text))
64
 
65
 
66
  @st.cache_resource