Mattral committed on
Commit
3a411d7
·
verified ·
1 Parent(s): 4fcf54f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +3 -8
app.py CHANGED
@@ -47,24 +47,19 @@ def get_url_content(url):
47
  response = requests.get(url)
48
  if url.endswith('.pdf'):
49
  pdf = io.BytesIO(response.content)
50
- file = open('pdf.pdf', 'wb')
51
- file.write(pdf.read())
52
- file.close()
53
  doc = fitz.open(stream=pdf, filetype="pdf")
54
- return (url, ''.join([text for page in doc for text in page.get_text()]))
55
  else:
56
  soup = BeautifulSoup(response.content, 'html.parser')
57
  content = soup.find_all('div', class_='wpb_content_element')
58
  text = [c.get_text().strip() for c in content if c.get_text().strip() != '']
59
  text = [line for item in text for line in item.split('\n') if line.strip() != '']
60
-
61
- # Post processing to exclude footer content, only if 'ARTS ON:' is present.
62
  try:
63
  arts_on_index = text.index('ARTS ON:')
64
  return (url, '\n'.join(text[:arts_on_index]))
65
  except ValueError:
66
- return (url, '\n'.join(text)) # If 'ARTS ON:' not found, return full text
67
-
68
 
69
  @st.cache_resource
70
  def get_retriever(urls):
 
47
  response = requests.get(url)
48
  if url.endswith('.pdf'):
49
  pdf = io.BytesIO(response.content)
 
 
 
50
  doc = fitz.open(stream=pdf, filetype="pdf")
51
+ return (url, ''.join(page.get_text() for page in doc))
52
  else:
53
  soup = BeautifulSoup(response.content, 'html.parser')
54
  content = soup.find_all('div', class_='wpb_content_element')
55
  text = [c.get_text().strip() for c in content if c.get_text().strip() != '']
56
  text = [line for item in text for line in item.split('\n') if line.strip() != '']
57
+ # Exclude footer content
 
58
  try:
59
  arts_on_index = text.index('ARTS ON:')
60
  return (url, '\n'.join(text[:arts_on_index]))
61
  except ValueError:
62
+ return (url, '\n'.join(text)) # Return full text if specific marker not found
 
63
 
64
  @st.cache_resource
65
  def get_retriever(urls):