File size: 509 Bytes
e539b70
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
from .sourcer import search_web
import pandas as pd
import os

# Directory holding the dataset files. The path is relative, so it is
# resolved against the process's current working directory.
root_dir = 'data/datasets'
# Contents of pira_simplified.csv, read once when this module is imported;
# gen_corpus reads the `text` column of this frame.
pira_df = pd.read_csv(os.path.join(root_dir, 'pira_simplified.csv'))

def gen_corpus(query: str, pira: bool=True, ONU: bool=True, web: bool=True)->list:
    """Build a corpus for ``query`` from the enabled sources.

    Args:
        query: String forwarded to ``search_web`` when ``web`` is enabled.
        pira: Include every entry of the ``text`` column of the
            module-level ``pira_df`` DataFrame.
        ONU: Reserved for a PDF-based source. It is not implemented yet, so
            enabling it currently contributes nothing to the corpus.
        web: Include the items returned by ``search_web(query)``
            (presumably a list of text snippets -- confirm against the
            ``sourcer`` module).

    Returns:
        A new list holding the ``pira_df`` texts first (if enabled),
        followed by the web results (if enabled).

    Raises:
        ValueError: If ``pira``, ``ONU`` and ``web`` are all false, since no
            source would be left to build the corpus from.
    """
    # Fail loudly instead of silently handing back an empty corpus when the
    # caller has disabled every source.
    if not (pira or ONU or web):
        raise ValueError(
            "gen_corpus requires at least one enabled source "
            "(pira, ONU or web)"
        )

    corpus = []
    if pira:
        corpus.extend(pira_df["text"].to_list())
    # TODO: implement PDFs -- the ONU flag is accepted but adds nothing yet.
    if web:
        corpus.extend(search_web(query))

    return corpus