Update app.py
Browse files
app.py
CHANGED
@@ -64,13 +64,15 @@ def get_url_content(url):
|
|
64 |
@st.cache_resource
def get_retriever(urls):
    """Build an in-memory MMR retriever over the content fetched from *urls*.

    Cached by Streamlit (``st.cache_resource``) so the vector index is built
    once per distinct set of URLs rather than on every rerun.

    Args:
        urls: Iterable of URL strings to fetch and index.

    Returns:
        A retriever backed by ``DocArrayInMemorySearch`` using MMR search.
    """
    # NOTE(review): get_url_content appears to return (url, text) pairs,
    # since the result is unpacked as (url, doc) below — confirm at its def.
    fetched = [get_url_content(url) for url in urls]
    documents = [
        Document(page_content=text, metadata={'url': source})
        for source, text in fetched
    ]

    # Chunk pages into overlapping pieces sized for the embedding model.
    splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=200)
    chunks = splitter.split_documents(documents)

    embedder = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

    index = DocArrayInMemorySearch.from_documents(chunks, embedder)
    # MMR trades pure relevance for diversity: 5 results drawn from the top 10.
    return index.as_retriever(search_type="mmr", search_kwargs={"k": 5, "fetch_k": 10})
|
@st.cache_resource
def get_retriever(urls):
    """Build an in-memory MMR retriever over the content fetched from *urls*.

    Cached by Streamlit (``st.cache_resource``) so the vector index is built
    once per distinct set of URLs rather than on every rerun.

    Args:
        urls: Iterable of URL strings to fetch and index.

    Returns:
        A retriever backed by ``DocArrayInMemorySearch`` using MMR search.
    """
    # Function-scope import keeps this fix self-contained; the debug prints it
    # replaces would dump entire page payloads to the server console.
    import logging
    logger = logging.getLogger(__name__)

    all_content = [get_url_content(url) for url in urls]
    # Log summaries instead of print(all_content): full page bodies flood the
    # log and may contain content that shouldn't be written to stdout.
    logger.debug("Fetched content for %d URLs", len(all_content))

    # NOTE(review): the unpacking implies get_url_content returns (url, text)
    # pairs — confirm at its definition.
    documents = [Document(page_content=doc, metadata={'url': url}) for (url, doc) in all_content]
    logger.debug("Built %d Document objects", len(documents))

    # Chunk pages into overlapping pieces sized for the embedding model.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=200)
    docs = text_splitter.split_documents(documents)
    logger.debug("Split into %d chunks", len(docs))

    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

    db = DocArrayInMemorySearch.from_documents(docs, embeddings)
    # MMR trades pure relevance for diversity: 5 results drawn from the top 10.
    retriever = db.as_retriever(search_type="mmr", search_kwargs={"k": 5, "fetch_k": 10})
    return retriever