File size: 653 Bytes
4ce2e5d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
'''This module contains utility functions for the project'''
import mmh3
from haystack import Document


def get_unique_docs(dataset):
    '''Build one haystack Document per distinct context id in the dataset

    Entries are visited in order. An entry whose "context" is None is
    skipped; otherwise only the first entry seen for each "context_id"
    is converted, so the result keeps first-occurrence order.

    Args:
    dataset: iterable of dict-like entries providing the keys "context",
        "context_id" and "context_title"

    Returns:
    docs: list of haystack.Document, each with the context as its content
        and the title and context id stored in its meta
    '''
    seen_ids = set()
    collected = []
    for entry in dataset:
        text = entry["context"]
        if text is None:
            # Nothing to index; the context id is deliberately not read here.
            continue
        context_id = entry["context_id"]
        if context_id in seen_ids:
            continue
        # Record the id before building the Document, so the id counts as
        # seen even if reading the title below raises.
        seen_ids.add(context_id)
        metadata = {'title': entry["context_title"], 'context_id': context_id}
        collected.append(Document(content=text, meta=metadata))
    return collected