Mattral committed on
Commit
d7252e0
·
verified ·
1 Parent(s): 435e461

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +4 -11
app.py CHANGED
@@ -47,26 +47,19 @@ def get_url_content(url):
47
  file.write(pdf.read())
48
  file.close()
49
  doc = fitz.open('pdf.pdf')
50
- return (url, ''.join([text for page in doc for text in page.get_text()]))
51
  else:
52
  soup = BeautifulSoup(response.content, 'html.parser')
53
  content = soup.find_all('div', class_='wpb_content_element')
54
- text = [c.get_text().strip() for c in content if c.get_text().strip() != '']
55
- text = [line for item in text for line in item.split('\n') if line.strip() != '']
56
 
57
- # Check if 'ARTS ON:' exists in the list
58
- try:
59
- arts_on = text.index('ARTS ON:')
60
- return (url, '\n'.join(text[:arts_on]))
61
- except ValueError:
62
- # If 'ARTS ON:' is not found, return what was found without truncating
63
- return (url, '\n'.join(text))
64
 
65
 
66
  @st.cache_resource
67
  def get_retriever(urls):
68
  all_content = [get_url_content(url) for url in urls]
69
- documents = [Document(page_content=doc, metadata={'url': url}) for (url, doc) in all_content]
70
 
71
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=200)
72
  docs = text_splitter.split_documents(documents)
 
47
  file.write(pdf.read())
48
  file.close()
49
  doc = fitz.open('pdf.pdf')
50
+ return (url, ''.join([text for page in doc for text in page.get_text("text")]))
51
  else:
52
  soup = BeautifulSoup(response.content, 'html.parser')
53
  content = soup.find_all('div', class_='wpb_content_element')
54
+ text = ' '.join([c.get_text().strip() for c in content if c.get_text().strip() != ''])
55
+ return (url, text)
56
 
 
 
 
 
 
 
 
57
 
58
 
59
  @st.cache_resource
60
  def get_retriever(urls):
61
  all_content = [get_url_content(url) for url in urls]
62
+ documents = [Document(text=doc, metadata={'url': url}) for (url, doc) in all_content]
63
 
64
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=200)
65
  docs = text_splitter.split_documents(documents)