File size: 857 Bytes
4ce2e5d
 
 
 
 
8416f29
4ce2e5d
 
 
 
 
 
 
 
 
 
 
 
8416f29
 
 
 
 
 
 
 
 
4ce2e5d
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
'''This module contains utility functions for the project'''
import mmh3
from haystack import Document


def get_unique_docs(dataset, unique_docs: set, *, source: str = 'QASports',
                    category: str = 'basketball') -> "list[Document]":
    '''Build haystack Documents for dataset rows whose context is not yet seen.

    A row is skipped when its "context" is None or when its "context_id" is
    already present in unique_docs. Every accepted row's "context_id" is
    added to unique_docs, so the same set can be passed to successive calls
    to deduplicate across several datasets.

    Args:
    dataset: iterable of dictionaries with the keys "context", "context_id",
        "context_title" and "url"
    unique_docs: set of context ids already processed; mutated in place
    source: value stored under the 'source' key of each document's meta
    category: value stored under the 'category' key of each document's meta

    Returns:
    docs: list of haystack.Document, one per newly seen context, in the
        order the rows were encountered
    '''
    docs = []
    for row in dataset:
        # Check "context" first so rows without a context never touch the
        # "context_id" lookup (same evaluation order as a short-circuit and).
        if row["context"] is None:
            continue
        context_id = row["context_id"]
        if context_id in unique_docs:
            continue
        # Record the id before building the document so later duplicates
        # in the same dataset are filtered out.
        unique_docs.add(context_id)
        docs.append(
            Document(
                content=row["context"],
                meta={
                    'title': row["context_title"],
                    'context_id': context_id,
                    'url': row["url"],
                    'source': source,
                    'category': category,
                },
            )
        )
    return docs