'''This module contains utility functions for the project.'''
import mmh3 | |
from haystack import Document | |
def get_unique_docs(dataset):
    '''Build one haystack Document per distinct ``context_id`` in dataset.

    Entries are visited in iteration order. The first entry seen for a
    given ``context_id`` produces a Document; later entries with the same
    id are skipped. Entries whose ``context`` is None are skipped as well
    and do not mark their ``context_id`` as seen, so a later entry with
    the same id and a real context is still picked up.

    Args:
        dataset: iterable of dictionaries. ``context`` is read from every
            entry; ``context_id`` (must be hashable) is read when the
            context is not None; ``context_title`` is read only for
            entries that produce a Document.

    Returns:
        docs: list of haystack.Document, in first-seen order. Each has
            ``content`` set to the entry's context and ``meta`` holding
            ``title`` and ``context_id``.

    Raises:
        KeyError: if an entry lacks a key that is read for it.
    '''
    seen_ids = set()
    docs = []
    for entry in dataset:
        context = entry["context"]
        if context is None:
            # Nothing to index; leave the id unseen on purpose.
            continue
        context_id = entry["context_id"]
        if context_id in seen_ids:
            continue
        # Mark the id before building the Document, as the first entry wins.
        seen_ids.add(context_id)
        docs.append(
            Document(
                content=context,
                meta={'title': entry["context_title"], 'context_id': context_id},
            )
        )
    return docs