Update app.py
app.py
CHANGED
@@ -11,7 +11,9 @@ from langchain.embeddings import HuggingFaceEmbeddings
 from langchain.memory import ConversationBufferMemory
 from langchain.chains import ConversationalRetrievalChain
 from langchain.text_splitter import RecursiveCharacterTextSplitter
-
+from docarray import Document
+from docarray import DocumentArray
+from sentence_transformers import SentenceTransformer
 
 # StreamHandler to intercept streaming output from the LLM.
 # This makes it appear that the Language Model is "typing"
@@ -39,36 +41,36 @@ def get_page_urls(url):
     return set()
 
 
+
 def get_url_content(url):
     response = requests.get(url)
     if url.endswith('.pdf'):
         pdf = io.BytesIO(response.content)
-
-
-
-        doc = fitz.open('pdf.pdf')
-        return (url, ''.join([text for page in doc for text in page.get_text("text")]))
+        doc = fitz.open(stream=pdf, filetype="pdf")
+        text = ''.join([page.get_text("text") for page in doc])
+        return Document(text=text, tags={'url': url})
     else:
         soup = BeautifulSoup(response.content, 'html.parser')
         content = soup.find_all('div', class_='wpb_content_element')
         text = ' '.join([c.get_text().strip() for c in content if c.get_text().strip() != ''])
-        return (
+        return Document(text=text, tags={'url': url})
+
+
 
 
 
 @st.cache_resource
 def get_retriever(urls):
-
-    documents = [Document(text=doc, metadata={'url': url}) for (url, doc) in all_content]
-
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=200)
-    docs = text_splitter.split_documents(documents)
+    documents = DocumentArray([get_url_content(url) for url in urls])
 
-
+    # Load the model and encode document texts directly
+    model = SentenceTransformer('all-MiniLM-L6-v2')
+    documents.embeddings = model.encode(documents.texts, show_progress_bar=True)
 
-
-
-
+    # Applying HNSW for efficient similarity search
+    documents.embeddings.faiss.ann['HNSW'] = {'nlinks': 16}
+
+    return documents
 
 
 @st.cache_resource
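Note that the new get_retriever no longer uses the RecursiveCharacterTextSplitter, so every crawled page or PDF is embedded as one long Document. If splitting is still wanted, a plain character-window helper could be applied before building the DocumentArray. This is a hypothetical sketch, not part of this commit; the 1500/200 values simply mirror the chunk_size and chunk_overlap the old splitter used.

def chunk_text(text, size=1500, overlap=200):
    # Simple stand-in for the removed RecursiveCharacterTextSplitter:
    # overlapping character windows of `size` characters.
    chunks, start = [], 0
    while start < len(text):
        chunks.append(text[start:start + size])
        start += size - overlap
    return chunks

def get_chunked_documents(url):
    # Split one page into several Documents that all keep the source URL tag.
    page = get_url_content(url)
    return [Document(text=chunk, tags={'url': url}) for chunk in chunk_text(page.text)]

Inside get_retriever, the DocumentArray line would then become:

    documents = DocumentArray([doc for url in urls for doc in get_chunked_documents(url)])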
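Since get_retriever now returns a DocumentArray with embeddings from sentence-transformers rather than a LangChain retriever, a rough sketch of how it could be queried might look like the following. This assumes docarray v1's in-memory match API and a made-up query string, and reuses the imports from app.py above; it is not part of the commit.

retriever = get_retriever(urls)
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode a (hypothetical) user question with the same model used for the documents
queries = DocumentArray([Document(text='What programmes does the school offer?')])
queries.embeddings = model.encode(queries.texts)

# In-memory nearest-neighbour match against the cached DocumentArray
queries.match(retriever, metric='cosine', limit=3)
for match in queries[0].matches:
    print(match.tags['url'], match.scores['cosine'].value)  # source URL and cosine distance

Because get_retriever is cached with @st.cache_resource, the crawl-and-encode step runs only once; each user query only needs its own encoding.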
|