Mattral committed
Commit 0d01a2d · verified · 1 Parent(s): 349e450

Update app.py

Files changed (1)
  app.py: +18 -16
app.py CHANGED
@@ -11,7 +11,9 @@ from langchain.embeddings import HuggingFaceEmbeddings
 from langchain.memory import ConversationBufferMemory
 from langchain.chains import ConversationalRetrievalChain
 from langchain.text_splitter import RecursiveCharacterTextSplitter
-
+from docarray import Document
+from docarray import DocumentArray
+from sentence_transformers import SentenceTransformer
 
 # StreamHandler to intercept streaming output from the LLM.
 # This makes it appear that the Language Model is "typing"
@@ -39,36 +41,36 @@ def get_page_urls(url):
     return set()
 
 
+
 def get_url_content(url):
     response = requests.get(url)
     if url.endswith('.pdf'):
         pdf = io.BytesIO(response.content)
-        file = open('pdf.pdf', 'wb')
-        file.write(pdf.read())
-        file.close()
-        doc = fitz.open('pdf.pdf')
-        return (url, ''.join([text for page in doc for text in page.get_text("text")]))
+        doc = fitz.open(stream=pdf, filetype="pdf")
+        text = ''.join([page.get_text("text") for page in doc])
+        return Document(text=text, tags={'url': url})
     else:
         soup = BeautifulSoup(response.content, 'html.parser')
         content = soup.find_all('div', class_='wpb_content_element')
         text = ' '.join([c.get_text().strip() for c in content if c.get_text().strip() != ''])
-        return (url, text)
+        return Document(text=text, tags={'url': url})
+
+
 
 
 
 @st.cache_resource
 def get_retriever(urls):
-    all_content = [get_url_content(url) for url in urls]
-    documents = [Document(text=doc, metadata={'url': url}) for (url, doc) in all_content]
-
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=200)
-    docs = text_splitter.split_documents(documents)
+    documents = DocumentArray([get_url_content(url) for url in urls])
 
-    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
+    # Load the model and encode document texts directly
+    model = SentenceTransformer('all-MiniLM-L6-v2')
+    documents.embeddings = model.encode(documents.texts, show_progress_bar=True)
 
-    db = DocArrayInMemorySearch.from_documents(docs, embeddings)
-    retriever = db.as_retriever(search_type="mmr", search_kwargs={"k": 5, "fetch_k": 10})
-    return retriever
+    # Applying HNSW for efficient similarity search
+    documents.embeddings.faiss.ann['HNSW'] = {'nlinks': 16}
+
+    return documents
 
 
 @st.cache_resource

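For reference, the PDF branch above now parses the downloaded bytes entirely in memory instead of writing a temporary pdf.pdf and reopening it. A minimal standalone sketch of the same PyMuPDF technique; the helper name and placeholder URL are illustrative, not part of the commit:

import io

import requests
import fitz  # PyMuPDF

def pdf_text_from_url(url):
    # Download the PDF and parse it straight from memory;
    # fitz.open accepts an in-memory stream when filetype is given,
    # so no temporary file ever touches disk.
    response = requests.get(url)
    doc = fitz.open(stream=io.BytesIO(response.content), filetype="pdf")
    return ''.join(page.get_text("text") for page in doc)

# Illustrative usage (placeholder URL):
# print(pdf_text_from_url('https://example.com/paper.pdf')[:200])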
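Since get_retriever now returns the DocumentArray itself rather than a LangChain retriever, downstream lookups would go through docarray's own matching. A minimal sketch of that retrieval step, assuming docarray v1 (where DocumentArray exposes .texts, .embeddings, and .match()); the sample corpus and query are invented, and a plain in-memory DocumentArray performs exact rather than approximate search:

from docarray import Document, DocumentArray
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')

# Stand-in corpus; in app.py these Documents come from get_url_content(url).
docs = DocumentArray([
    Document(text='PyMuPDF extracts text from PDF pages.',
             tags={'url': 'https://example.com/a.pdf'}),
    Document(text='BeautifulSoup pulls text out of HTML markup.',
             tags={'url': 'https://example.com/b'}),
])
docs.embeddings = model.encode(docs.texts)  # one 384-dim row per Document

# Embed the query with the same model, then rank the corpus by cosine
# distance; results land in query[0].matches, best match first.
query = DocumentArray([Document(text='How do I read a PDF?')])
query.embeddings = model.encode(query.texts)
query.match(docs, metric='cosine', limit=2)

for m in query[0].matches:
    print(m.scores['cosine'].value, m.tags['url'])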