Spaces:
Runtime error
Runtime error
| from qdrant_client import QdrantClient | |
| from qdrant_client.http import models as rest | |
| from langchain_qdrant import QdrantVectorStore | |
| import hashlib | |
| import defaults | |
| embedding_model = defaults.default_embedding_model | |
| qdrant_url = defaults.default_url | |
| """ | |
| This code creates a hash for every chunk and checks to see if that chunk already exists in the | |
| vector database. We only want one collection in Qdrant, but want to make sure that if a user | |
| selects a document that has already been embedded and stored, it does not get stored again. We | |
| also add metadata for the document title, so that we can make our retriever focus on documents of | |
| interest. For example, after some usage, the application might have 20 documents for the user to | |
| select from. We want the retriever to be exactly right for the documents that they selected. | |
| This could also be useful if different versions of documents are in existence. We would not want to | |
| recreate a large vectorstore. But the user could select the most recent version. | |
| """ | |
| def get_document_hash(doc_content): | |
| """Generate a unique hash for the document content.""" | |
| return hashlib.md5(doc_content.encode()).hexdigest() | |
| def getVectorstore(document, file_path): | |
| # Add a unique hash to your documents | |
| for doc in document: | |
| doc.metadata['content_hash'] = get_document_hash(doc.page_content) | |
| # Add the document title | |
| for doc in document: | |
| doc.metadata['document_title'] = file_path.split('/')[-1] | |
| client = QdrantClient( url=qdrant_url) | |
| # client = QdrantClient(":memory:") | |
| # If the collection exists, then we need to check to see if our document is already | |
| # present, in which case we would not want to store it again. | |
| if client.collection_exists("protocol_collection"): | |
| print("Collection exists") | |
| qdrant_vectorstore = QdrantVectorStore.from_existing_collection( | |
| embedding=embedding_model, | |
| collection_name="protocol_collection", | |
| url=qdrant_url | |
| # location = ":memory:" | |
| ) | |
| # Check for existing documents and only add new ones | |
| existing_hashes = set() | |
| new_docs = [] | |
| # Get all existing hashes | |
| scroll_filter = rest.Filter( | |
| should=[ | |
| rest.FieldCondition( | |
| key="metadata.content_hash", | |
| match=rest.MatchValue(value=doc.metadata['content_hash']) | |
| ) for doc in document | |
| ] | |
| ) | |
| scroll_results = client.scroll( | |
| collection_name="protocol_collection", | |
| scroll_filter=scroll_filter, | |
| limit=len(document) # Adjust this if you have a large number of documents | |
| ) | |
| existing_hashes = set(point.payload.get('metadata', {}).get('content_hash') for point in scroll_results[0]) | |
| for doc in document: | |
| if doc.metadata['content_hash'] not in existing_hashes: | |
| new_docs.append(doc) | |
| if new_docs: | |
| qdrant_vectorstore.add_documents(new_docs) | |
| print(f"Added {len(new_docs)} new documents") | |
| print(f"Skipped {len(existing_hashes)} existing documents") | |
| else: | |
| print("Collection does not exist") #So we go ahead and just add the documents | |
| qdrant_vectorstore = QdrantVectorStore.from_documents( | |
| documents=document, | |
| embedding=embedding_model, | |
| collection_name="protocol_collection", | |
| # location = ":memory:" | |
| url=qdrant_url | |
| ) | |
| return qdrant_vectorstore |