Mattral committed on
Commit
737f9f2
·
verified ·
1 Parent(s): 0d01a2d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +17 -12
app.py CHANGED
@@ -1,7 +1,7 @@
1
  import streamlit as st
2
  from bs4 import BeautifulSoup
3
  import io
4
- import fitz
5
  import requests
6
  from langchain.llms import LlamaCpp
7
  from langchain.callbacks.base import BaseCallbackHandler
@@ -42,20 +42,25 @@ def get_page_urls(url):
42
 
43
 
44
 
 
 
 
 
 
 
 
 
 
 
45
  def get_url_content(url):
46
  response = requests.get(url)
47
- if url.endswith('.pdf'):
48
- pdf = io.BytesIO(response.content)
49
- doc = fitz.open(stream=pdf, filetype="pdf")
50
- text = ''.join([page.get_text("text") for page in doc])
51
- return Document(text=text, tags={'url': url})
52
  else:
53
- soup = BeautifulSoup(response.content, 'html.parser')
54
- content = soup.find_all('div', class_='wpb_content_element')
55
- text = ' '.join([c.get_text().strip() for c in content if c.get_text().strip() != ''])
56
- return Document(text=text, tags={'url': url})
57
-
58
-
59
 
60
 
61
 
 
1
  import streamlit as st
2
  from bs4 import BeautifulSoup
3
  import io
4
+ import fitz # PyMuPDF
5
  import requests
6
  from langchain.llms import LlamaCpp
7
  from langchain.callbacks.base import BaseCallbackHandler
 
42
 
43
 
44
 
45
def get_pdf_content(url, response):
    """Return the plain text of every page of a fetched PDF, concatenated.

    Args:
        url: Address the PDF was fetched from. Not used by the extraction
            itself; kept so the signature mirrors ``get_html_content``.
        response: Fetched HTTP response whose ``content`` attribute holds
            the raw PDF bytes.

    Returns:
        The text of all pages joined together with no separator.
    """
    pdf = io.BytesIO(response.content)
    # Open the document as a context manager so the PyMuPDF handle is
    # released as soon as the text is extracted, even if a page fails.
    with fitz.open(stream=pdf, filetype="pdf") as doc:
        return ''.join(page.get_text("text") for page in doc)
49
+
50
def get_html_content(url, response):
    """Return the text of the page's ``wpb_content_element`` divs.

    Args:
        url: Address the page was fetched from. Not used by the extraction
            itself; kept so the signature mirrors ``get_pdf_content``.
        response: Fetched HTTP response whose ``content`` attribute holds
            the raw HTML.

    Returns:
        The stripped text of each matching ``<div>``, skipping the ones that
        are empty after stripping, joined with single spaces.
    """
    parsed = BeautifulSoup(response.content, 'html.parser')
    blocks = parsed.find_all('div', class_='wpb_content_element')
    # Strip each element's text once, then drop the ones left empty.
    stripped = (block.get_text().strip() for block in blocks)
    return ' '.join(piece for piece in stripped if piece)
54
+
55
def get_url_content(url, timeout=30):
    """Fetch *url* and wrap its extracted text in a ``Document``.

    Args:
        url: Address of the HTML page or PDF file to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled server.

    Returns:
        A ``Document`` whose ``text`` is the extracted content and whose
        ``tags`` record the source URL under the ``'url'`` key.

    Raises:
        ValueError: If the server answers with an unsuccessful status
            (``response.ok`` is false).
        requests.RequestException: Propagated unchanged on connection
            errors or when *timeout* expires.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import urlparse

    response = requests.get(url, timeout=timeout)
    if not response.ok:
        raise ValueError(
            f"Failed to fetch URL content: {url} (HTTP {response.status_code})"
        )

    # Check the suffix case-insensitively, on both the raw URL and its path,
    # so "report.PDF" and "report.pdf?download=1" are also handled as PDFs.
    is_pdf = (
        url.lower().endswith('.pdf')
        or urlparse(url).path.lower().endswith('.pdf')
    )
    extract = get_pdf_content if is_pdf else get_html_content
    return Document(text=extract(url, response), tags={'url': url})
 
 
 
 
 
64
 
65
 
66