# autosumm/corpora/corpora.py
# Last commit: "Add current system" (e539b70) by mhsvieira
from .sourcer import search_web
import pandas as pd
import os
# Directory holding the bundled datasets. The path is relative, so it is
# resolved against the current working directory of the process.
root_dir = 'data/datasets'

# The simplified Pira dataset is read eagerly, once, when this module is
# imported; gen_corpus() reuses the resulting DataFrame on every call.
_pira_csv_path = os.path.join(root_dir, 'pira_simplified.csv')
pira_df = pd.read_csv(_pira_csv_path)
def gen_corpus(query: str, pira: bool = True, ONU: bool = True, web: bool = True) -> list:
    """Assemble a corpus by concatenating the enabled document sources.

    Args:
        query: Search string passed to ``search_web`` when ``web`` is true.
        pira: When true, add every value of the ``text`` column of the
            module-level ``pira_df`` DataFrame.
        ONU: Placeholder for a PDF-based source. It is not implemented yet,
            so enabling it currently contributes nothing to the corpus.
        web: When true, add the items returned by ``search_web(query)``.

    Returns:
        A new list holding the Pira entries (if enabled) followed by the web
        results (if enabled). The list is empty when no enabled source
        contributes anything.

    Warns:
        UserWarning: If ``pira``, ``ONU`` and ``web`` are all false. An empty
            list is still returned so existing callers keep working.
    """
    if not (pira or ONU or web):
        # Previously a silent no-op; surface the misuse without changing the
        # return value that callers already receive.
        import warnings

        warnings.warn(
            "gen_corpus called with no source enabled "
            "(pira, ONU and web are all False); returning an empty corpus.",
            stacklevel=2,
        )
        return []

    corpus = []
    if pira:
        corpus.extend(pira_df["text"].to_list())
    # TODO: implement PDFs (the ONU flag adds nothing until this is done)
    if web:
        corpus.extend(search_web(query))
    return corpus