hsuvaskakoty committed on
Commit
86604a2
·
verified ·
1 Parent(s): 27be2d9
Files changed (2) hide show
  1. data_collect.py +1 -1
  2. process_data.py +140 -0
data_collect.py CHANGED
@@ -1,6 +1,6 @@
1
 
2
  from datetime import datetime
3
- from wide_analysis.data.process_data import prepare_dataset
4
  from datasets import load_dataset
5
 
6
  from collect_data_wikidata_ent import collect_wikidata_entity
 
1
 
2
  from datetime import datetime
3
+ from process_data import prepare_dataset
4
  from datasets import load_dataset
5
 
6
  from collect_data_wikidata_ent import collect_wikidata_entity
process_data.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from datetime import datetime
3
+ from wide_analysis.data.collect_data import collect_deletion_discussions, process_data
4
+
5
+
6
# Lookup table from a raw, lower-cased outcome string to the canonical label
# used by prepare_dataset. Keys are matched exactly by `Series.map`, so the
# odd-looking ones (typos, trailing punctuation, embedded "\n \n" runs) are
# kept verbatim; any raw label that is absent here maps to NaN.
label_mapping = {
    "soft delete": "delete",
    "keep": "keep",
    "delete": "delete",
    "merge": "merge",
    "no consensus": "no consensus",
    "userfy for beanie": "userfy",
    "redirect": "redirect",
    "speedy keep": "speedy keep",
    "moved to\n \n wp:draft\n \n space": "draftify",
    "keep.": "keep",
    "draftify": "draftify",
    "speedy delete": "speedy delete",
    "speedily deleted": "speedy delete",
    "speedy close": "speedy close",
    "snow keep": "keep",
    "nomination withdrawn": "withdrawn",
    "speedy keep, nomination withdrawn": "speedy keep",
    "procedural close": "no consensus",
    "move to draft": "draftify",
    "draftified": "draftify",
    "withdrawn": "withdrawn",
    "snow delete": "delete",
    "redirect to\n \n colorado college tigers football, 1882–1909#1882": "redirect",
    "userfy": "userfy",
    "withdrawn by nominator": "withdrawn",
    "will nominated individually for a fair discussion": "withdrawn",
    "keep/redirect:": "keep",
    "nominator withdrew": "withdrawn",
    "delete and redirect": "delete",
    "merge and redirect": "merge",
    "speedily deleted under\n \n wp:a7": "speedy delete",
    "delete.": "delete",
    "speedy delete as\n \n wp:g11": "speedy delete",
    "snow\n \n keep": "keep",
    "closed": "no consensus",
    "move to\n \n bids for the 2034 winter olympics": "draftify",
    "withdraw": "withdrawn",
    "trainwreck": "no consensus",
    "procedural keep": "keep",
    "article deleted by its original creator": "delete",
    "keep/nomination withdrawn": "keep",
    "withdrawn by nominator.": "withdrawn",
    "keep and revert": "keep",
    "wp:g5ed\n \n because it's been created by a lta\n \n liamb2011\n \n (\n \n talk\n \n\n ·\n \n\n contribs\n \n )": "delete",
    "speedy deleted": "speedy delete",
    "mixed outcome": "no consensus",
    "rename": "rename",
    "speedy keep (sock nom, no delete votes)": "speedy keep",
    "merge one; no consensus for rest": "merge",
    "speedy keep.": "speedy keep",
    "restore to disambig": "redirect",
    "speedily deleted under g3": "speedy delete",
    "this is the only consensus i can discern from this discussion as few editors commented on the other articles. no penalty for future afds on this other articles.": "no consensus",
    "draftify then create redirect from this page title.": "draftify",
    "keep/withdaw": "keep",
    "speedy delete g12": "speedy delete",
    "convert to disambiguation page": "redirect",
    "convert to dab": "redirect",
    "withdrawn to draftify": "draftify",
    "a snow keep": "keep",
    "speedy keep per\n \n wp:csk\n \n #3: no accurate deletion rationale has been provided": "speedy keep",
    "userify": "userfy",
    "keep\n \n leinster chess leagues": "keep",
    "draftify.": "draftify",
    "move to project space and redirect": "redirect",
    "moot": "no consensus",
    "draftified by creator.": "draftify",
    "no consensus to delete; consensus to rename to\n \n next tasmanian state election": "no consensus",
    "nom withdrawn": "withdrawn",
    "move to\n \n glenwood south": "draftify",
    "withdrawn; speedy keep": "speedy keep",
    "restore dab": "redirect",
    "speedy deleted via g5": "speedy delete",
    "deleted g11": "speedy delete",
    "drafify": "draftify",
    "no action.": "no consensus",
    "reinstate previous redirect.": "redirect",
    "perform a\n \n wp:mergeprop\n \n instead": "merge",
    "transwiki": "no consensus",
    "duplicate afd": "no consensus",
    "redirected": "redirect",
    "already deleted": "delete",
    "speedy": "speedy delete",
}
91
+
92
+
93
+
94
def prepare_dataset(mode='date', start_date=None, end_date=None, url=None, title=None, output_path=None):
    """Collect deletion discussions and return them as a cleaned DataFrame.

    Args:
        mode: One of 'date_range', 'date' or 'title'.
        start_date: Date string in 'YYYY-MM-DD' form. Required for
            'date_range' and 'date'; also required for 'title' when `url`
            ends with '/' (it is then used to build the full URL).
        end_date: Date string in 'YYYY-MM-DD' form; required for 'date_range'.
        url: Required for 'title'. When it ends with '/', the date (formatted
            as '%Y_%B_%d') and '#<title>' are appended to it.
        title: Required for 'title'; used as the URL fragment.
        output_path: If truthy, the cleaned frame is also written there as CSV
            (without the index).

    Returns:
        The frame produced by the collector. When it is non-empty it is
        reduced to the date/title/url/discussion/label/confirmation columns,
        labels are lower-cased and mapped through `label_mapping`, duplicate
        (title, discussion) rows are dropped and rows with any missing value
        (including labels not present in `label_mapping`) are removed. An
        empty frame is returned untouched.

    Raises:
        ValueError: If `mode` is unknown or an argument required by the
            selected mode is missing.
    """
    date_format = '%Y-%m-%d'

    if mode == 'date_range':
        if not (start_date and end_date):
            raise ValueError("start_date and end_date must be provided for mode 'date_range'")
        range_start = datetime.strptime(start_date, date_format)
        range_end = datetime.strptime(end_date, date_format)
        df = collect_deletion_discussions(range_start, range_end)

    elif mode == 'date':
        if not start_date:
            raise ValueError("start_date must be provided for mode 'date'")
        day = datetime.strptime(start_date, date_format)
        # A single day is collected as a range whose two ends coincide.
        df = collect_deletion_discussions(day, day)

    elif mode == 'title':
        if not (url and title):
            raise ValueError("url and title must be provided for mode 'title'")
        if url.endswith('/'):
            # The URL is only a prefix here, so the date is needed to complete it.
            # Fail with the same exception type as the other modes instead of
            # the TypeError that strptime(None, ...) would raise.
            if not start_date:
                raise ValueError("start_date must be provided for mode 'title' when url ends with '/'")
            start_date = datetime.strptime(start_date, date_format).strftime('%Y_%B_%d')
            url = f'{url}{start_date}#{title}'
        # Always fetch, so `df` is bound whether or not the URL needed completing.
        df = process_data(url, start_date)

    else:
        raise ValueError("Invalid mode. Choose from ['date_range', 'date', 'title']")

    if df.empty:
        # Nothing to clean; hand the empty frame back unchanged.
        return df

    if mode == 'title':
        # NOTE(review): unlike the date modes, 'text_url' is not renamed to
        # 'url' here; kept as-is because callers may rely on it -- confirm.
        df = df[['date', 'title', 'text_url', 'discussion_cleaned', 'label', 'confirmation']]
        df = df.rename(columns={'discussion_cleaned': 'discussion'})
    else:
        df = df[['log_date', 'title', 'text_url', 'discussion_cleaned', 'proper_label', 'confirmation']]
        df = df.rename(columns={
            'log_date': 'date',
            'text_url': 'url',
            'discussion_cleaned': 'discussion',
            'proper_label': 'label',
        })

    # Labels missing from label_mapping become NaN and are removed by dropna().
    df['label'] = df['label'].str.lower().map(label_mapping)
    df = df.drop_duplicates(subset=['title', 'discussion'])
    df = df.dropna()

    if output_path:
        df.to_csv(output_path, index=False)
    return df