Spaces:
Sleeping
Sleeping
File size: 9,071 Bytes
0d0a4e0 86604a2 0d0a4e0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 |
from datetime import datetime
from process_data import prepare_dataset
from datasets import load_dataset
from collect_data_wikidata_ent import collect_wikidata_entity
from collect_data_wikidata_prop import collect_wikidata
from collect_data_wikinews import collect_wikinews
from collect_data_wikiquote import collect_wikiquote
from collect_data_es import collect_es
from collect_data_gr import collect_gr
def normalize_outcome(o):
    """Map a free-text Greek outcome string onto one of four canonical labels.

    Matching is case-insensitive and accent-insensitive, so upper-case text
    (which lower-cases without a tonos, e.g. 'ΔΙΑΤΗΡΗΣΗ') and decomposed
    Unicode input are recognised as well as the usual accented spelling.

    Args:
        o: The raw outcome text. Non-string values (e.g. ``None``) are
            treated as carrying no recognisable outcome.

    Returns:
        'Διαγραφή' (delete), 'Διατήρηση' (keep), 'συγχώνευση' (merge), or
        'Δεν υπάρχει συναίνεση' (no consensus) when nothing matches.
    """
    import unicodedata  # local import: keeps the block self-contained

    no_consensus = 'Δεν υπάρχει συναίνεση'
    if not isinstance(o, str):
        return no_consensus

    # Decompose accented letters into base letter + combining mark, then drop
    # the marks so that 'διατήρη', 'διατηρη' and 'ΔΙΑΤΗΡΗ' all compare equal.
    decomposed = unicodedata.normalize('NFD', o.casefold())
    folded = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))

    # Order matters: a deletion keyword wins over keep, which wins over merge.
    if 'διαγρ' in folded:
        return 'Διαγραφή'
    if 'διατηρη' in folded or 'παραμονη' in folded:
        return 'Διατήρηση'
    if 'συγχων' in folded:
        # The merge label is intentionally left lower-case, as in the original.
        return 'συγχώνευση'
    return no_consensus
def _years_from_dates(start_date, end_date):
    """Return the year(s) of the given "%Y-%m-%d" dates, or None without a start date.

    Yields ``[start_year, end_year]`` when both dates are given and
    ``[start_year]`` when only ``start_date` is given.
    """
    if not start_date:
        return None
    first_year = datetime.strptime(start_date, "%Y-%m-%d").year
    if end_date:
        return [first_year, datetime.strptime(end_date, "%Y-%m-%d").year]
    return [first_year]


def collect(mode, start_date=None, end_date=None, url=None, title=None, output_path=None,
            platform=None, lang=None, date=None, years=None):
    """Dispatch a data-collection request to the collector for a platform/language.

    Args:
        mode: One of 'date_range', 'date', 'title', 'url' or 'wide_2023'.
            'date' and 'date_range' are handled as 'year' mode for every
            platform except English Wikipedia, which receives ``mode`` as is.
        start_date: Date string in "%Y-%m-%d" format. Outside English
            Wikipedia only its year component is used.
        end_date: Optional date string in "%Y-%m-%d" format; together with
            ``start_date`` it forms a two-year ``[start, end]`` pair.
        url: Discussion URL, required by the 'url' mode of the platforms
            that support it.
        title: Title to look up in 'title' mode.
        output_path: Forwarded to ``prepare_dataset`` (English Wikipedia only).
        platform: 'wikipedia', 'wikidata_entity', 'wikidata_property',
            'wikinews' or 'wikiquote'. ``None`` selects English Wikipedia
            when ``lang`` is ``None`` or 'en'.
        lang: Wikipedia language: 'en', 'es' or 'gr'. ``None`` means 'en'.
        date: Date for Spanish Wikipedia 'year' mode (dd/mm/yyyy according to
            the error message raised when it is missing).
        years: Must be empty for wikidata 'title' mode; must be a
            single-element list for Greek Wikipedia 'title' mode.

    Returns:
        Whatever the selected collector returns. ``None`` when English
        Wikipedia is requested with a mode it does not support ('url').

    Raises:
        ValueError: For an unknown mode, platform or language, or when the
            arguments required by the selected mode are missing.
    """
    if mode not in ['date_range', 'date', 'title', 'url', 'wide_2023']:
        raise ValueError("Invalid mode. Choose from ['date_range', 'date', 'title','url','wide_2023']")

    if mode == 'wide_2023':
        dataset = load_dataset('hsuvaskakoty/wide_analysis')
        print('Dataset loaded successfully as huggingface dataset')
        print('The dataset has the following columns:', dataset.column_names)
        return dataset

    # Date-based modes collapse to 'year' for the non-English collectors.
    underlying_mode = 'year' if mode in ('date', 'date_range') else mode

    # English Wikipedia is the default; accept it when either selector is
    # omitted, not only when both are omitted or both are spelled out.
    if platform in (None, 'wikipedia') and lang in (None, 'en'):
        if mode in ['date_range', 'date', 'title']:
            return prepare_dataset(
                mode=mode,
                start_date=start_date,
                end_date=end_date,
                url=url,
                title=title,
                output_path=output_path
            )
        print("Invalid input. Choose from ['date_range', 'date', 'title','wide_2023']")
        return None

    if platform == 'wikidata_entity':
        if underlying_mode == 'title':
            if not title or (years and len(years) > 0):
                raise ValueError("For 'title' mode in wikidata entity, 'title' must be provided and 'years' must be empty.")
            return collect_wikidata_entity(mode='title', title=title, years=[])
        if underlying_mode == 'year':
            span = _years_from_dates(start_date, end_date)
            if span is None:
                raise ValueError("For 'year' mode in wikidata entity, start_date (and optionally end_date) is required.")
            # NOTE(review): a single year is passed as a bare int here but as a
            # one-element list for Greek Wikipedia -- kept as in the original;
            # confirm against the collector's signature.
            return collect_wikidata_entity(mode='year', years=span if len(span) == 2 else span[0])
        if underlying_mode == 'url':
            if not url:
                raise ValueError("For 'url' mode in wikidata entity, 'url' must be provided.")
            return collect_wikidata_entity(mode='url', url=url)
        raise ValueError("Invalid mode for wikidata entity. Use 'title' or 'year'.")

    if platform == 'wikidata_property':
        if underlying_mode == 'title':
            if not title or (years and len(years) > 0):
                raise ValueError("For 'title' mode in wikidata property, 'title' must be provided and 'years' must be empty.")
            return collect_wikidata(mode='title', title=title, years=[])
        if underlying_mode == 'url':
            if not url:
                raise ValueError("For 'url' mode in wikidata property, 'url' must be provided.")
            return collect_wikidata(mode='url', title='', url=url, years=[])
        if underlying_mode == 'year':
            span = _years_from_dates(start_date, end_date)
            if span is None:
                raise ValueError("For 'year' mode in wikidata property, start_date (and optionally end_date) is required.")
            # Single year goes through as a bare int, as in the original.
            return collect_wikidata(mode='year', years=span if len(span) == 2 else span[0])
        raise ValueError("Invalid mode for wikidata property. Use 'title' or 'year'.")

    if platform == 'wikinews':
        if underlying_mode == 'title':
            if not title:
                raise ValueError("For 'title' mode in wikinews, 'title' is required.")
            return collect_wikinews(mode='title', title=title)
        if underlying_mode == 'url':
            if not url:
                raise ValueError("For 'url' mode in wikinews, 'url' is required.")
            return collect_wikinews(mode='url', url=url)
        if underlying_mode == 'year':
            span = _years_from_dates(start_date, end_date)
            if span is None:
                raise ValueError("For 'year' mode in wikinews, start_date (and optionally end_date) is required.")
            # The wikinews collector takes the keyword `year`, not `years`.
            return collect_wikinews(mode='year', year=span if len(span) == 2 else span[0])
        raise ValueError("Invalid mode for wikinews. Use 'title' or 'year' or 'url'.")

    if platform == 'wikiquote':
        if underlying_mode not in ['title', 'url']:
            raise ValueError("Wikiquote collection currently only supports 'title' or 'url' mode.")
        if underlying_mode == 'title':
            # A missing title is sent to the collector as the literal 'all'.
            return collect_wikiquote(mode='title', title=title if title else 'all')
        if not url:
            raise ValueError("For 'url' mode in wikiquote, 'url' must be provided.")
        return collect_wikiquote(mode='url', url=url)

    if platform == 'wikipedia':
        if lang == 'es':
            if underlying_mode == 'title':
                if not title or date:
                    raise ValueError("For 'title' mode in spanish wikipedia, 'title' must be provided and 'date' must be empty.")
                return collect_es(mode='title', title=title, date='')
            if underlying_mode == 'year':
                if not date:
                    raise ValueError("For 'year' mode in spanish wikipedia, 'date' parameter (dd/mm/yyyy) is required.")
                return collect_es(mode='year', title='', date=date)
            raise ValueError("Invalid mode for spanish wikipedia. Use 'title' or 'year'.")

        if lang == 'gr':
            if underlying_mode == 'title':
                if not title or not years or len(years) != 1:
                    raise ValueError("For 'title' mode in greek wikipedia, 'title' and a single-element list years=['mm/yyyy'] are required.")
                return collect_gr(mode='title', title=title, years=years)
            if underlying_mode == 'year':
                span = _years_from_dates(start_date, end_date)
                if span is None:
                    raise ValueError("For 'year' mode in greek wikipedia, start_date (and optionally end_date) is required.")
                # Greek always receives a list, even for a single year.
                return collect_gr(mode='year', title='', years=span)
            raise ValueError("Invalid mode for greek wikipedia. Use 'title' or 'year'.")

        raise ValueError("Invalid lang for wikipedia. Use 'en', 'es', or 'gr'.")

    raise ValueError("Invalid platform. Use 'wikipedia', 'wikidata_entity', 'wikidata_property', 'wikinews', or 'wikiquote'.")