Spaces:
Sleeping
Sleeping
File size: 5,931 Bytes
86604a2 2e68d8f 86604a2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 |
import pandas as pd
from datetime import datetime
from collect_data import collect_deletion_discussions, process_data
# Lookup table from raw discussion-outcome strings to a small normalized label
# set ('delete', 'keep', 'merge', 'redirect', 'draftify', 'userfy', 'withdrawn',
# 'no consensus', 'rename', 'speedy keep', 'speedy delete', 'speedy close').
#
# prepare_dataset() lowercases each label before looking it up here, so every
# key is lowercase. A label with no entry maps to NaN and its row is removed by
# the dropna() that follows the lookup.
#
# Matching is exact: trailing punctuation, typos ('drafify', 'keep/withdaw')
# and one-off closing statements each need their own key.
# NOTE(review): several keys embed newline/space runs ('\n \n ...'), presumably
# left over from scraped markup -- confirm they still match what the collector
# emits before reformatting or "cleaning" any key.
label_mapping = {
'soft delete': 'delete',
'keep': 'keep',
'delete': 'delete',
'merge': 'merge',
'no consensus': 'no consensus',
'userfy for beanie': 'userfy',
'redirect': 'redirect',
'speedy keep': 'speedy keep',
'moved to\n \n wp:draft\n \n space': 'draftify',
'keep.': 'keep',
'draftify': 'draftify',
'speedy delete': 'speedy delete',
'speedily deleted': 'speedy delete',
'speedy close': 'speedy close',
'snow keep': 'keep',
'nomination withdrawn': 'withdrawn',
'speedy keep, nomination withdrawn': 'speedy keep',
'procedural close': 'no consensus',
'move to draft': 'draftify',
'draftified': 'draftify',
'withdrawn': 'withdrawn',
'snow delete': 'delete',
'redirect to\n \n colorado college tigers football, 1882–1909#1882': 'redirect',
'userfy': 'userfy',
'withdrawn by nominator': 'withdrawn',
'will nominated individually for a fair discussion': 'withdrawn',
'keep/redirect:': 'keep',
'nominator withdrew': 'withdrawn',
'delete and redirect': 'delete',
'merge and redirect': 'merge',
'speedily deleted under\n \n wp:a7': 'speedy delete',
'delete.': 'delete',
'speedy delete as\n \n wp:g11': 'speedy delete',
'snow\n \n keep': 'keep',
'closed': 'no consensus',
'move to\n \n bids for the 2034 winter olympics': 'draftify',
'withdraw': 'withdrawn',
'trainwreck': 'no consensus',
'procedural keep': 'keep',
'article deleted by its original creator': 'delete',
'keep/nomination withdrawn': 'keep',
'withdrawn by nominator.': 'withdrawn',
'keep and revert': 'keep',
"wp:g5ed\n \n because it's been created by a lta\n \n liamb2011\n \n (\n \n talk\n \n\n ·\n \n\n contribs\n \n )": 'delete',
'speedy deleted': 'speedy delete',
'mixed outcome': 'no consensus',
'rename': 'rename',
'speedy keep (sock nom, no delete votes)': 'speedy keep',
'merge one; no consensus for rest': 'merge',
'speedy keep.': 'speedy keep',
'restore to disambig': 'redirect',
'speedily deleted under g3': 'speedy delete',
'this is the only consensus i can discern from this discussion as few editors commented on the other articles. no penalty for future afds on this other articles.': 'no consensus',
'draftify then create redirect from this page title.': 'draftify',
'keep/withdaw': 'keep',
'speedy delete g12': 'speedy delete',
'convert to disambiguation page': 'redirect',
'convert to dab': 'redirect',
'withdrawn to draftify': 'draftify',
'a snow keep': 'keep',
'speedy keep per\n \n wp:csk\n \n #3: no accurate deletion rationale has been provided': 'speedy keep',
'userify': 'userfy',
'keep\n \n leinster chess leagues': 'keep',
'draftify.': 'draftify',
'move to project space and redirect': 'redirect',
'moot': 'no consensus',
'draftified by creator.': 'draftify',
'no consensus to delete; consensus to rename to\n \n next tasmanian state election': 'no consensus',
'nom withdrawn': 'withdrawn',
'move to\n \n glenwood south': 'draftify',
'withdrawn; speedy keep': 'speedy keep',
'restore dab': 'redirect',
'speedy deleted via g5': 'speedy delete',
'deleted g11': 'speedy delete',
'drafify': 'draftify',
'no action.': 'no consensus',
'reinstate previous redirect.': 'redirect',
'perform a\n \n wp:mergeprop\n \n instead': 'merge',
'transwiki': 'no consensus',
'duplicate afd': 'no consensus',
'redirected': 'redirect',
'already deleted': 'delete',
'speedy': 'speedy delete'
}
def prepare_dataset(mode='date', start_date=None, end_date=None, url=None, title=None, output_path=None):
    """Collect deletion discussions and return them as a cleaned DataFrame.

    Args:
        mode: One of ``'date_range'``, ``'date'`` or ``'title'``.
            ``'date_range'`` collects every discussion between ``start_date``
            and ``end_date``; ``'date'`` collects a single day (``start_date``
            is used as both bounds); ``'title'`` processes one discussion
            identified by ``url`` and ``title``.
        start_date: Date string in ``YYYY-MM-DD`` form. Required for
            ``'date_range'`` and ``'date'``, and for ``'title'`` when ``url``
            ends with ``'/'``.
        end_date: Date string in ``YYYY-MM-DD`` form. Required for
            ``'date_range'``.
        url: Discussion URL for ``'title'`` mode. When it ends with ``'/'`` it
            is treated as a prefix and completed to
            ``{url}{start_date as %Y_%B_%d}#{title}``.
        title: Discussion title for ``'title'`` mode.
        output_path: If truthy, the cleaned (non-empty) frame is also written
            to this path as CSV without the index.

    Returns:
        The cleaned DataFrame. For the date modes its columns are ``date``,
        ``title``, ``url``, ``discussion``, ``label``, ``confirmation``; for
        ``'title'`` mode the URL column keeps the name ``text_url``. Labels are
        lowercased and normalized through ``label_mapping``; duplicate
        ``(title, discussion)`` rows and rows containing any missing value
        (including labels with no mapping) are dropped. An empty collected
        frame is returned untouched and nothing is written.

    Raises:
        ValueError: If ``mode`` is unknown, a required argument for the chosen
            mode is missing, or a date string does not match ``YYYY-MM-DD``.
    """
    date_format = '%Y-%m-%d'

    if mode == 'date_range':
        if not (start_date and end_date):
            raise ValueError("start_date and end_date must be provided for mode 'date_range'")
        range_start = datetime.strptime(start_date, date_format)
        range_end = datetime.strptime(end_date, date_format)
        df = collect_deletion_discussions(range_start, range_end)
    elif mode == 'date':
        if not start_date:
            raise ValueError("start_date must be provided for mode 'date'")
        day = datetime.strptime(start_date, date_format)
        df = collect_deletion_discussions(day, day)
    elif mode == 'title':
        if not (url and title):
            raise ValueError("url and title must be provided for mode 'title'")
        if url.endswith('/'):
            # A trailing slash means `url` is only a prefix; the log date is
            # needed to complete it. Fail with a clear message instead of the
            # TypeError that strptime(None, ...) would raise.
            if not start_date:
                raise ValueError(
                    "start_date must be provided for mode 'title' when url ends with '/'"
                )
            start_date = datetime.strptime(start_date, date_format).strftime('%Y_%B_%d')
            url = f'{url}{start_date}#{title}'
        # With a complete URL, start_date is forwarded exactly as the caller
        # supplied it (possibly None).
        df = process_data(url, start_date)
    else:
        raise ValueError("Invalid mode. Choose from ['date_range', 'date', 'title']")

    # Nothing collected: hand the empty frame back as-is, without writing a CSV.
    if df.empty:
        return df

    if mode in ('date', 'date_range'):
        df = df[['log_date', 'title', 'text_url', 'discussion_cleaned', 'proper_label', 'confirmation']]
        df = df.rename(columns={
            'log_date': 'date',
            'text_url': 'url',
            'discussion_cleaned': 'discussion',
            'proper_label': 'label',
        })
    else:
        # NOTE(review): unlike the date modes, 'text_url' is not renamed to
        # 'url' here -- behaviour kept; confirm whether that is intended.
        df = df[['date', 'title', 'text_url', 'discussion_cleaned', 'label', 'confirmation']]
        df = df.rename(columns={'discussion_cleaned': 'discussion'})

    # Labels with no entry in label_mapping become NaN and are removed by the
    # dropna() below together with any other incomplete rows.
    df['label'] = df['label'].str.lower().map(label_mapping)
    df = df.drop_duplicates(subset=['title', 'discussion'])
    df = df.dropna()

    if output_path:
        df.to_csv(output_path, index=False)
    return df
|