Mattral committed on
Commit
a257e25
·
verified ·
1 Parent(s): 758446e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +24 -21
app.py CHANGED
@@ -42,38 +42,41 @@ def get_page_urls(url):
42
 
43
 
44
 
45
- def get_pdf_content(url, response):
46
- pdf = io.BytesIO(response.content)
47
- doc = fitz.open(stream=pdf, filetype="pdf")
48
- return ''.join(page.get_text("text") for page in doc)
49
-
50
- def get_html_content(url, response):
51
- soup = BeautifulSoup(response.content, 'html.parser')
52
- content = soup.find_all('div', class_='wpb_content_element')
53
- return ' '.join(c.get_text().strip() for c in content if c.get_text().strip())
54
-
55
  def get_url_content(url):
56
- response = requests.get(url)
57
- response.raise_for_status()
58
- if response.ok:
59
  if url.endswith('.pdf'):
60
- return Document(text=get_pdf_content(url, response), tags={'url': url})
 
 
61
  else:
62
- return Document(text=get_html_content(url, response), tags={'url': url})
63
- else:
64
- st.error(f"Failed to process URL content: {e}")
65
- return None
66
 
 
 
 
 
 
67
 
68
  @st.cache_resource
69
  def get_retriever(urls):
70
- documents = DocumentArray([doc for url in urls if (doc := get_url_content(url)) is not None])
71
-
 
 
 
 
72
  model = SentenceTransformer('all-MiniLM-L6-v2')
73
- documents.embeddings = model.encode([doc.text for doc in documents], show_progress_bar=True)
 
 
74
 
75
  return documents
76
 
 
77
  @st.cache_resource
78
  def create_chain(_retriever):
79
  # A stream handler to direct streaming output on the chat screen.
 
42
 
43
 
44
 
 
 
 
 
 
 
 
 
 
 
45
  def get_url_content(url):
46
+ try:
47
+ response = requests.get(url)
48
+ response.raise_for_status()
49
  if url.endswith('.pdf'):
50
+ pdf = io.BytesIO(response.content)
51
+ doc = fitz.open(stream=pdf, filetype="pdf")
52
+ text = ''.join([page.get_text("text") for page in doc])
53
  else:
54
+ soup = BeautifulSoup(response.content, 'html.parser')
55
+ content = soup.find_all('div', class_='wpb_content_element')
56
+ text = ' '.join([c.get_text().strip() for c in content if c.get_text().strip() != ''])
 
57
 
58
+ # Create a single document with metadata
59
+ return DocumentArray([{'text': text, 'tags': {'url': url}}])
60
+ except Exception as e:
61
+ st.error(f"Failed to process URL content: {e}")
62
+ return DocumentArray()
63
 
64
  @st.cache_resource
65
  def get_retriever(urls):
66
+ documents = DocumentArray()
67
+ for url in urls:
68
+ content = get_url_content(url)
69
+ if content:
70
+ documents.extend(content)
71
+
72
  model = SentenceTransformer('all-MiniLM-L6-v2')
73
+ embeddings = model.encode([doc.text for doc in documents], show_progress_bar=True)
74
+ for doc, emb in zip(documents, embeddings):
75
+ doc.embedding = emb
76
 
77
  return documents
78
 
79
+
80
  @st.cache_resource
81
  def create_chain(_retriever):
82
  # A stream handler to direct streaming output on the chat screen.