File size: 857 Bytes
4ce2e5d 8416f29 4ce2e5d 8416f29 4ce2e5d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 |
'''This module contains utility functions for the project'''
import mmh3
from haystack import Document
def get_unique_docs(dataset, unique_docs: set, *, source: str = 'QASports',
                    category: str = 'basketball') -> list:
    '''Get unique documents from dataset

    Records whose "context" is None, or whose "context_id" is already in
    unique_docs, are skipped. The context_id of every record that is turned
    into a Document is added to unique_docs, so the same set can be passed to
    successive calls to de-duplicate across several datasets.

    Args:
        dataset: iterable of dictionaries providing the keys "context",
            "context_id", "context_title" and "url"
        unique_docs: set of context ids already processed; updated in place
        source: value stored under the 'source' meta key of each Document
        category: value stored under the 'category' meta key of each Document
    Returns:
        docs: list of haystack.Document, in the order met in dataset
    '''
    docs = []
    for doc in dataset:
        # Check the context first: records without a context are skipped
        # before their "context_id" is ever read.
        if doc["context"] is None:
            continue
        context_id = doc["context_id"]
        if context_id in unique_docs:
            continue
        # Register the id before building the Document so later records in
        # this same dataset with the same id are dropped as duplicates.
        unique_docs.add(context_id)
        docs.append(
            Document(
                content=doc["context"],
                meta={
                    'title': doc["context_title"],
                    'context_id': context_id,
                    'url': doc["url"],
                    'source': source,
                    'category': category,
                },
            )
        )
    return docs
|