File size: 9,071 Bytes
0d0a4e0
 
86604a2
0d0a4e0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180

from datetime import datetime
from process_data import prepare_dataset
from datasets import load_dataset

from collect_data_wikidata_ent import collect_wikidata_entity
from collect_data_wikidata_prop import collect_wikidata
from collect_data_wikinews import collect_wikinews
from collect_data_wikiquote import collect_wikiquote
from collect_data_es import collect_es
from collect_data_gr import collect_gr

def normalize_outcome(o):
    """Collapse a free-text Greek outcome string into one of four labels.

    Matching is a case-insensitive substring test, evaluated in a fixed
    priority order: deletion, then keep, then merge. The first rule that
    matches wins; if none match, the "no consensus" label is returned.

    Args:
        o: Outcome text (a ``str``).

    Returns:
        One of ``'Διαγραφή'`` (delete), ``'Διατήρηση'`` (keep),
        ``'συγχώνευση'`` (merge) or ``'Δεν υπάρχει συναίνεση'``
        (no consensus).
    """
    text = o.lower()
    # (substrings that trigger the rule, label to return) in priority order.
    rules = (
        (('διαγρ',), 'Διαγραφή'),
        (('διατήρη', 'παραμονή'), 'Διατήρηση'),
        (('συγχών',), 'συγχώνευση'),
    )
    for stems, label in rules:
        if any(stem in text for stem in stems):
            return label
    return 'Δεν υπάρχει συναίνεση'

def _years_from_dates(start_date, end_date, label):
    """Extract the calendar year(s) from ``YYYY-MM-DD`` date strings.

    Args:
        start_date: Required date string in ``%Y-%m-%d`` format.
        end_date: Optional date string in ``%Y-%m-%d`` format.
        label: Human-readable platform name used in the error message.

    Returns:
        ``(first_year, last_year)`` where ``last_year`` is ``None`` when
        ``end_date`` was not supplied.

    Raises:
        ValueError: If ``start_date`` is missing, or if either date does not
            match ``%Y-%m-%d`` (raised by ``datetime.strptime``).
    """
    if not start_date:
        raise ValueError(
            "For 'year' mode in {}, start_date (and optionally end_date) is required.".format(label)
        )
    first_year = datetime.strptime(start_date, "%Y-%m-%d").year
    last_year = datetime.strptime(end_date, "%Y-%m-%d").year if end_date else None
    return first_year, last_year


def collect(mode, start_date=None, end_date=None, url=None, title=None, output_path=None,
            platform=None, lang=None, date=None, years=None):
    """Dispatch a data-collection request to the matching platform collector.

    The public ``mode`` is validated and then reduced to an internal mode:
    ``'date'`` and ``'date_range'`` become ``'year'``; ``'title'`` and
    ``'url'`` are kept as they are. ``'wide_2023'`` short-circuits and loads
    the ``hsuvaskakoty/wide_analysis`` dataset via ``load_dataset``.

    Routing by ``platform`` / ``lang``:

    * both ``None``, or ``'wikipedia'`` + ``'en'``: ``prepare_dataset`` is
      called with the original ``mode``. ``'url'`` is not handled on this
      path; a message is printed and ``None`` is returned.
    * ``'wikidata_entity'`` / ``'wikidata_property'``: title, url or year.
    * ``'wikinews'``: title, url or year.
    * ``'wikiquote'``: title (defaults to ``'all'`` when empty) or url.
    * ``'wikipedia'`` + ``'es'``: title, or year (which needs ``date``).
    * ``'wikipedia'`` + ``'gr'``: title (needs a one-element ``years``) or
      year.

    Args:
        mode: One of ``'date_range'``, ``'date'``, ``'title'``, ``'url'``,
            ``'wide_2023'``.
        start_date: ``YYYY-MM-DD`` string; the year is extracted for the
            year-based collectors. Passed through unchanged to
            ``prepare_dataset`` on the English Wikipedia path.
        end_date: Optional ``YYYY-MM-DD`` string, handled like ``start_date``.
        url: Discussion URL for ``'url'`` mode.
        title: Page title for ``'title'`` mode.
        output_path: Forwarded to ``prepare_dataset`` only.
        platform: Target platform name, or ``None`` for the default path.
        lang: Wikipedia language code (``'en'``, ``'es'``, ``'gr'``).
        date: Date string required by Spanish Wikipedia year mode (the error
            text describes it as ``dd/mm/yyyy``); must be empty in its title
            mode.
        years: Must be empty for Wikidata title mode; must hold exactly one
            element for Greek Wikipedia title mode (the error text describes
            it as ``['mm/yyyy']``).

    Returns:
        Whatever the selected collector returns, or ``None`` for the
        unsupported English Wikipedia ``'url'`` request.

    Raises:
        ValueError: On an unknown mode, platform or language, on missing or
            conflicting arguments for the selected route, or on a date that
            does not match ``%Y-%m-%d``.
    """
    if mode not in ['date_range', 'date', 'title', 'url', 'wide_2023']:
        raise ValueError("Invalid mode. Choose from ['date_range', 'date', 'title','url','wide_2023']")

    if mode == 'wide_2023':
        dataset = load_dataset('hsuvaskakoty/wide_analysis')
        print('Dataset loaded successfully as huggingface dataset')
        print('The dataset has the following columns:', dataset.column_names)
        return dataset

    # From here on mode is one of date / date_range / title / url, so
    # underlying_mode is always exactly one of 'year', 'title' or 'url'.
    underlying_mode = 'year' if mode in ('date', 'date_range') else mode

    if (platform is None and lang is None) or (platform == 'wikipedia' and lang == 'en'):
        if mode == 'url':
            # Kept as print-and-return-None (not an exception) so existing
            # callers that check for None keep working.
            print("Invalid input. Choose from ['date_range', 'date', 'title','wide_2023']")
            return None
        return prepare_dataset(
            mode=mode,
            start_date=start_date,
            end_date=end_date,
            url=url,
            title=title,
            output_path=output_path
        )

    if platform == 'wikidata_entity':
        if underlying_mode == 'title':
            if not title or years:
                raise ValueError("For 'title' mode in wikidata entity, 'title' must be provided and 'years' must be empty.")
            return collect_wikidata_entity(mode='title', title=title, years=[])
        if underlying_mode == 'url':
            if not url:
                raise ValueError("For 'url' mode in wikidata entity, 'url' must be provided.")
            return collect_wikidata_entity(mode='url', url=url)
        # Only 'year' remains. A range is passed as a two-element list, a
        # single year as a bare int, as this collector was always called.
        first_year, last_year = _years_from_dates(start_date, end_date, 'wikidata entity')
        if last_year is not None:
            return collect_wikidata_entity(mode='year', years=[first_year, last_year])
        return collect_wikidata_entity(mode='year', years=first_year)

    if platform == 'wikidata_property':
        if underlying_mode == 'title':
            if not title or years:
                raise ValueError("For 'title' mode in wikidata property, 'title' must be provided and 'years' must be empty.")
            return collect_wikidata(mode='title', title=title, years=[])
        if underlying_mode == 'url':
            if not url:
                raise ValueError("For 'url' mode in wikidata property, 'url' must be provided.")
            return collect_wikidata(mode='url', title='', url=url, years=[])
        first_year, last_year = _years_from_dates(start_date, end_date, 'wikidata property')
        if last_year is not None:
            return collect_wikidata(mode='year', years=[first_year, last_year])
        return collect_wikidata(mode='year', years=first_year)

    if platform == 'wikinews':
        if underlying_mode == 'title':
            if not title:
                raise ValueError("For 'title' mode in wikinews, 'title' is required.")
            return collect_wikinews(mode='title', title=title)
        if underlying_mode == 'url':
            if not url:
                raise ValueError("For 'url' mode in wikinews, 'url' is required.")
            return collect_wikinews(mode='url', url=url)
        # Note the keyword is `year` (singular) for this collector.
        first_year, last_year = _years_from_dates(start_date, end_date, 'wikinews')
        if last_year is not None:
            return collect_wikinews(mode='year', year=[first_year, last_year])
        return collect_wikinews(mode='year', year=first_year)

    if platform == 'wikiquote':
        if underlying_mode == 'title':
            # An empty title means "collect everything".
            return collect_wikiquote(mode='title', title=title if title else 'all')
        if underlying_mode == 'url':
            if not url:
                raise ValueError("For 'url' mode in wikiquote, 'url' must be provided.")
            return collect_wikiquote(mode='url', url=url)
        raise ValueError("Wikiquote collection currently only supports 'title' or 'url' mode.")

    if platform == 'wikipedia':
        if lang == 'es':
            if underlying_mode == 'title':
                if not title or date:
                    raise ValueError("For 'title' mode in spanish wikipedia, 'title' must be provided and 'date' must be empty.")
                return collect_es(mode='title', title=title, date='')
            if underlying_mode == 'year':
                # This route reads `date`, not start_date/end_date.
                if not date:
                    raise ValueError("For 'year' mode in spanish wikipedia, 'date' parameter (dd/mm/yyyy) is required.")
                return collect_es(mode='year', title='', date=date)
            raise ValueError("Invalid mode for spanish wikipedia. Use 'title' or 'year'.")

        if lang == 'gr':
            if underlying_mode == 'title':
                if not title or not years or len(years) != 1:
                    raise ValueError("For 'title' mode in greek wikipedia, 'title' and a single-element list years=['mm/yyyy'] are required.")
                return collect_gr(mode='title', title=title, years=years)
            if underlying_mode == 'year':
                first_year, last_year = _years_from_dates(start_date, end_date, 'greek wikipedia')
                # Unlike the wikidata/wikinews routes, a single year is
                # still wrapped in a list here.
                if last_year is not None:
                    return collect_gr(mode='year', title='', years=[first_year, last_year])
                return collect_gr(mode='year', title='', years=[first_year])
            raise ValueError("Invalid mode for greek wikipedia. Use 'title' or 'year'.")

        raise ValueError("Invalid lang for wikipedia. Use 'en', 'es', or 'gr'.")

    raise ValueError("Invalid platform. Use 'wikipedia', 'wikidata_entity', 'wikidata_property', 'wikinews', or 'wikiquote'.")