Update app.py
Browse files
app.py
CHANGED
@@ -47,26 +47,19 @@ def get_url_content(url):
|
|
47 |
file.write(pdf.read())
|
48 |
file.close()
|
49 |
doc = fitz.open('pdf.pdf')
|
50 |
-
return (url, ''.join([text for page in doc for text in page.get_text()]))
|
51 |
else:
|
52 |
soup = BeautifulSoup(response.content, 'html.parser')
|
53 |
content = soup.find_all('div', class_='wpb_content_element')
|
54 |
-
text = [c.get_text().strip() for c in content if c.get_text().strip() != '']
|
55 |
-
|
56 |
|
57 |
-
# Check if 'ARTS ON:' exists in the list
|
58 |
-
try:
|
59 |
-
arts_on = text.index('ARTS ON:')
|
60 |
-
return (url, '\n'.join(text[:arts_on]))
|
61 |
-
except ValueError:
|
62 |
-
# If 'ARTS ON:' is not found, return what was found without truncating
|
63 |
-
return (url, '\n'.join(text))
|
64 |
|
65 |
|
66 |
@st.cache_resource
|
67 |
def get_retriever(urls):
|
68 |
all_content = [get_url_content(url) for url in urls]
|
69 |
-
documents = [Document(
|
70 |
|
71 |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=200)
|
72 |
docs = text_splitter.split_documents(documents)
|
|
|
47 |
file.write(pdf.read())
|
48 |
file.close()
|
49 |
doc = fitz.open('pdf.pdf')
|
50 |
+
return (url, ''.join([text for page in doc for text in page.get_text("text")]))
|
51 |
else:
|
52 |
soup = BeautifulSoup(response.content, 'html.parser')
|
53 |
content = soup.find_all('div', class_='wpb_content_element')
|
54 |
+
text = ' '.join([c.get_text().strip() for c in content if c.get_text().strip() != ''])
|
55 |
+
return (url, text)
|
56 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
57 |
|
58 |
|
59 |
@st.cache_resource
|
60 |
def get_retriever(urls):
|
61 |
all_content = [get_url_content(url) for url in urls]
|
62 |
+
documents = [Document(text=doc, metadata={'url': url}) for (url, doc) in all_content]
|
63 |
|
64 |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=200)
|
65 |
docs = text_splitter.split_documents(documents)
|