import requests
from bs4 import BeautifulSoup
import pandas as pd
import pysbd
import re
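
# Scrapes the English Wikinews deletion-request archives
# (https://en.wikinews.org/wiki/Wikinews:Deletion_requests/Archives) and the live
# Wikinews:Deletion_requests page, extracting each discussion's title, URL, text,
# and closing outcome, and normalizing raw outcome phrases into a small label set
# (delete, speedy delete, keep, redirect, merge, no_consensus, comment, withdrawn).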


def get_soup(url):
    """Fetch a URL and return its parsed HTML."""
    response = requests.get(url)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')


def html_to_plaintext(html_content):
    """Convert discussion HTML to plain text."""
    soup = BeautifulSoup(html_content, 'html.parser')
    for tag in soup.find_all(['p', 'li', 'dd', 'dl']):
        tag.insert_before('\n')
        tag.insert_after('\n')
    for br in soup.find_all('br'):
        br.replace_with('\n')

    text = soup.get_text(separator=' ', strip=True)
    text = '\n'.join([line.strip() for line in text.splitlines() if line.strip() != ''])
    return text


def extract_fallback_discussion(html_content):
    """Fallback extraction of discussion text from the uncleaned HTML."""
    soup = BeautifulSoup(html_content, 'html.parser')
    discussion_parts = []
    for element in soup.find_all(['p', 'li', 'dd', 'ol'], recursive=False):
        discussion_parts.append(element.get_text(separator=' ', strip=True))
    return ' '.join(discussion_parts).strip()


def process_html_to_plaintext(df):
    """Clean the raw discussion HTML; fall back to the uncleaned HTML when nothing survives."""
    if df.empty:
        return df
    df['discussion_cleaned'] = df['discussion'].apply(html_to_plaintext)
    for index, row in df.iterrows():
        if not row['discussion_cleaned'].strip():
            df.at[index, 'discussion_cleaned'] = extract_fallback_discussion(row['discussion_uncleaned'])
    return df


def extract_outcome_from_div(div):
    """Extract the closing outcome (e.g. 'delete', 'keep') from a deletion-discussion div."""
    try:
        result_phrase = div.find(string=re.compile(r'The result was to'))
        if result_phrase:
            result = result_phrase.find_next('b')
            if result:
                outcome_text = result.text.strip()
                if outcome_text.lower() == "please do not modify it":
                    return extract_following_sentence(div) or 'unknown'
                elif validate_outcome(outcome_text) != 'unknown':
                    return outcome_text

        li_outcome = div.find('li')
        if li_outcome and li_outcome.find('b'):
            outcome_text = li_outcome.find('b').text.strip()
            if outcome_text.lower() == "please do not modify it":
                return extract_following_sentence(div) or 'unknown'
            elif validate_outcome(outcome_text) != 'unknown':
                return outcome_text

        dl_outcome = div.find('dl')
        if dl_outcome and dl_outcome.find('b'):
            outcome_text = dl_outcome.find('b').text.strip()
            if outcome_text.lower() == "please do not modify it":
                return extract_following_sentence(div) or 'unknown'
            elif validate_outcome(outcome_text) != 'unknown':
                return outcome_text

        outcome_italic = div.find('dd')
        if outcome_italic and outcome_italic.find('i'):
            outcome_text = outcome_italic.find('i').get_text(strip=True)
            if outcome_text.lower() == "please do not modify it":
                return extract_following_sentence(div) or 'unknown'
            elif validate_outcome(outcome_text) != 'unknown':
                return outcome_text

        return extract_following_sentence(div) or 'unknown'

    except Exception as e:
        print(f"Error extracting outcome: {e}")
        return 'unknown'


def extract_following_sentence(div):
    """Return the first sentence that follows a closing phrase such as 'No further edits...'."""
    try:
        phrases = [
            "No further edits should be made to this discussion",
            "Please do not add any more comments and votes to this request",
        ]

        for phrase in phrases:
            phrase_location = div.find(string=re.compile(phrase))
            if phrase_location:
                following_text = ""
                for sibling in phrase_location.find_all_next(string=True):
                    if "Please do not modify it" in sibling:
                        continue
                    following_text += sibling.strip() + " "
                    if "." in sibling:
                        break
                sentence = following_text.split('.')[0].strip()
                if validate_outcome(sentence) != 'unknown':
                    return sentence

        return None

    except Exception as e:
        print(f"Error extracting following sentence: {e}")
        return None


def validate_outcome(outcome_text):
    """Map a raw outcome phrase to a canonical label; return 'unknown' for unmapped phrases."""
    label_mapping = {
        'delete': [
            'delete', 'delete ... unanimous', 'deleted', 'deleted as abandoned',
            'speedy delete', 'Delete', 'delete as redundant to existing template',
            'delete as unlikely to be used', 'delete but no prejudice against recreation when needed',
            'delete after Ottawahitech chose not to provide a rationale',
            'Delete, with no objection to recreation when needed.', 'Deleted',
            'delete the Cigarette redirect and keep the NHS redirect.', 'Delete all articles',
            'Tentatively sending through the usual abandonment process',
            'This was completed already.'
        ],
        'speedy delete': [
            'speedy delete', 'speedy deleted', 'speedy deleted test page', 'Speedy-deleted',
            'Speedy deleted', 'Speedy-deleted, no meaningful content',
            'Speeded as "old prep"',
            'Speedied as "old prep" -- Pi zero ( talk ) 23:42, 10 February 2020 (UTC) [ reply ] __DTELLIPSISBUTTON__{"threadItem":{"timestamp":"2020-02-10T23:42:00'
        ],
        'keep': [
            'keep',
            'Do not undelete. The content should be kept by the author off-wiki, and can be included as a part of another story that is current',
            'Personal details have been redacted and hidden from public view together with a NOINDEX flag',
        ],
        'redirect': [
            'soft redirect'
        ],
        'merge': [
            'convert near-clone of mainspace article to use {{topic cat}}; apply {{correction}} to mainspace article'
        ],
        'no_consensus': [
            'No consensus to delete. However, there clearly is a consensus that if we are to have this template, we aren\'t to use it in its present form.',
            'no consensus', 'No consensus',
            "At this time, it's unclear if there's a consensus to keep but abundantly clear there isn't one to delete."
        ],
        'comment': [
            'Remove', 'SVT', 'withdraw the deletion request',
            'On consideration, speedied as unused and lacking fair-use rationale',
            'Moved to userspace', 'Withdrawn to allow interview re-focus',
            'More userspace drafts This is the second batch of a large number of draft articles in userspace',
            'This was completed already ',
            'Do not undelete. The content should be kept by the author off-wiki, and can be included as a part of another story that is current',
        ],
        'withdrawn': [
            'Withdrawn to allow interview re-focus',
        ]
    }

    # Later entries override earlier ones when the same phrase appears under two labels.
    outcome_to_label = {outcome.lower(): label for label, outcomes in label_mapping.items() for outcome in outcomes}
    return outcome_to_label.get(outcome_text.lower(), 'unknown')
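
# Example: validate_outcome('Speedy deleted') maps to 'speedy delete', while an
# unlisted phrase (e.g. the hypothetical 'relisted') falls back to 'unknown'.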


def update_unknown_outcomes(df):
    """Resolve 'unknown' outcomes by checking whether the article still exists on Wikinews."""
    base_url = "https://en.wikinews.org/w/index.php?title="

    for i in df.index:
        if df.at[i, 'outcome'] == 'unknown':
            title = df.at[i, 'title'].replace(" ", "_")
            url = f"{base_url}{title}&action=edit&redlink=1"
            print(f"Checking page: {url}")

            try:
                response = requests.get(url)
                if response.status_code == 200:
                    page_soup = BeautifulSoup(response.content, 'html.parser')

                    # The edit page shows a deletion-log warning box when the article was deleted.
                    warning_div = page_soup.find('div', class_='cdx-message cdx-message--block cdx-message--warning mw-warning-with-logexcerpt')
                    if warning_div:
                        df.at[i, 'outcome'] = 'delete'
                    else:
                        df.at[i, 'outcome'] = 'keep'
                else:
                    print(f"Failed to retrieve page: {url}")

            except Exception as e:
                print(f"Error accessing {url}: {e}")

    return df


def collect_wikinews_deletions(years=None):
    """Scrape the deletion-request archives into a DataFrame of titles, URLs, discussions, and outcomes."""
    base_url = 'https://en.wikinews.org/wiki/Wikinews:Deletion_requests/Archives'
    response = requests.get(base_url)
    if response.status_code != 200:
        print("Failed to retrieve the archive page.")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    titles = []
    text_urls = []
    outcomes = []
    deletion_discussions = []
    discussion_uncleaned = []
    year_links = []
    for a in soup.select('a[href^="/wiki/Wikinews:Deletion_requests/Archives/"]'):
        year_text = re.findall(r'\d{4}', a.get_text())
        if year_text:
            year_links.append((year_text[0], a['href']))
    if years:
        if len(years) == 1:
            start_year = end_year = years[0]
        elif len(years) == 2:
            start_year, end_year = min(years), max(years)
        else:
            print("Invalid years input. Provide one or two years.")
            return None
        year_links = [(year, link) for year, link in year_links if start_year <= int(year) <= end_year]
    for year, year_link in year_links:
        year_url = 'https://en.wikinews.org' + year_link
        print(f"Processing year: {year_url}")
        year_soup = get_soup(year_url)
        discussion_divs = year_soup.find_all('div', class_=lambda x: x and 'boilerplate metadata' in x)

        for div in discussion_divs:
            title_tag = div.find(['h2', 'h3'])
            if title_tag:
                link_tag = title_tag.find('a', title=True)
                if link_tag:
                    title = link_tag.get_text(strip=True)
                    titles.append(title)
                    text_url = year_url + '#' + link_tag['title'].replace(' ', '_')
                    text_urls.append(text_url)
                else:
                    titles.append(title_tag.get_text(strip=True))
                    text_urls.append(year_url)
            else:
                dl_tag = div.find('dl')
                if dl_tag and dl_tag.find('b'):
                    titles.append(dl_tag.find('b').get_text(strip=True))
                else:
                    titles.append('No title found')
                text_urls.append(year_url)
            deletion_discussions.append(div.prettify())
            discussion_uncleaned.append(div.prettify())
            outcome = extract_outcome_from_div(div)
            outcomes.append(outcome)

    df = pd.DataFrame({
        'title': titles,
        'url': text_urls,
        'outcome': outcomes,
        'discussion': deletion_discussions,
        'discussion_uncleaned': discussion_uncleaned
    })

    df = process_html_to_plaintext(df)
    for i in df.index:
        if df.at[i, 'outcome'] in ('Please do not modify it', 'Please do not modify it.'):
            df.at[i, 'outcome'] = extract_following_sentence(BeautifulSoup(df.at[i, 'discussion_uncleaned'], 'html.parser')) or 'unknown'
    df['outcome'] = df['outcome'].apply(lambda x: validate_outcome(x) if x else 'unknown')
    df = update_unknown_outcomes(df)
    return df


def collect_wikinews(years=None):
    # NOTE: this definition is shadowed by the mode-based collect_wikinews() further below.
    df = collect_wikinews_deletions(years=years)
    if df is None:
        print('Error collecting Wikinews deletions.')
        return None
    return df


def html_to_plaintext(html_content):
    # Redefines the earlier html_to_plaintext(), additionally treating <ul> as a block element.
    soup = BeautifulSoup(html_content, 'html.parser')
    for tag in soup.find_all(['p', 'li', 'dd', 'dl', 'ul']):
        tag.insert_before('\n')
        tag.insert_after('\n')
    for br in soup.find_all('br'):
        br.replace_with('\n')
    text = soup.get_text(separator=' ', strip=True)
    text = '\n'.join([line.strip() for line in text.splitlines() if line.strip() != ''])
    return text


def split_text_into_sentences(text):
    """Segment text into sentences with pysbd and re-join them with single spaces."""
    seg = pysbd.Segmenter(language="en", clean=False)
    sentences = seg.segment(text)
    return ' '.join(sentences)


def clean_discussion_tag(tag):
    """Strip spans, images, links, and nested divs, then return the remaining text."""
    for unwanted in tag.find_all(['span', 'img', 'a', 'div'], recursive=True):
        unwanted.decompose()
    return tag.get_text(separator=' ', strip=True)


def extract_outcome_from_text_elements(elements):
    """Return the first bolded consensus keyword found in the given elements."""
    consensus_keywords = [
        'Deleted', 'Delete', 'delete', 'deleted',
        'kept', 'keep', 'Keep', 'Kept',
        'merge', 'Merge', 'Not done', 'No consensus', 'no consensus', 'Done'
    ]
    for el in elements:
        b_tags = el.find_all('b')
        for b in b_tags:
            if b.text.strip() in consensus_keywords:
                return b.text.strip()
    return ''


def extract_discussion_section(soup, title):
    """Extract a single deletion discussion's HTML, outcome label, and cleaned text."""
    try:
        h3_id = title.replace(" ", "_")
        h3_tag = soup.find('h3', {'id': h3_id})

        if not h3_tag:
            print(f"h3 tag with id '{h3_id}' not found.")
            return '', '', ''

        heading_div = h3_tag.parent

        if not heading_div:
            print("Parent div not found.")
            return '', '', ''

        # Collect everything between this heading and the next h3-level heading.
        next_heading_div = heading_div.find_next_sibling('div', class_='mw-heading mw-heading3')
        discussion_nodes = []
        for sibling in heading_div.next_siblings:
            if sibling == next_heading_div:
                break
            discussion_nodes.append(sibling)

        discussion_tags = []
        for node in discussion_nodes:
            if getattr(node, 'name', None) in ['p', 'ul', 'dl']:
                discussion_tags.append(node)

        if not discussion_tags:
            return '', '', ''

        label = extract_outcome_from_text_elements(discussion_tags)

        discussion_html_parts = [str(tag) for tag in discussion_tags]
        cleaned_parts = []
        for tag in discussion_tags:
            text = clean_discussion_tag(tag)
            if text:
                cleaned_parts.append(text)

        cleaned_discussion = ' '.join(cleaned_parts)
        discussion_html = '\n'.join(discussion_html_parts)
        return discussion_html, label, cleaned_discussion

    except Exception as e:
        print(f"Error processing title '{title}': {e}")
        import traceback
        traceback.print_exc()
        return '', '', ''


def extract_div_from_title(title):
    """Fetch the live Wikinews:Deletion_requests page and return one discussion as a DataFrame row."""
    base_url = 'https://en.wikinews.org/wiki/Wikinews:Deletion_requests'
    t = title.replace(' ', '_')
    url = base_url + '#' + t

    response = requests.get(url)
    if response.status_code != 200:
        return pd.DataFrame(columns=['title', 'text_url', 'discussion_url', 'discussion_cleaned', 'label'])

    soup = BeautifulSoup(response.content, 'html.parser')
    discussion_html, label, cleaned_discussion = extract_discussion_section(soup, title)

    text_url = base_url
    discussion_url = text_url + '#' + title.replace(' ', '_')

    df = pd.DataFrame([[title, text_url, discussion_url, cleaned_discussion, label]],
                      columns=['title', 'text_url', 'discussion_url', 'discussion_cleaned', 'label'])

    if label:
        df['label'] = df['label'].replace({
            'Deleted': 'delete', 'Delete': 'delete', 'delete': 'delete', 'deleted': 'delete',
            'kept': 'keep', 'keep': 'keep', 'Keep': 'keep', 'Kept': 'keep',
            'merge': 'merge', 'Merge': 'merge', 'Not done': 'no_consensus',
            'No consensus': 'no_consensus', 'no consensus': 'no_consensus', 'Done': 'delete'
        })

    df['discussion_cleaned'] = df['discussion_cleaned'].apply(split_text_into_sentences)
    df = df.rename(columns={'discussion_cleaned': 'discussion'})
    return df


def collect_wikinews(mode, title=None, url='', year=None):
    """Collect Wikinews deletion discussions by 'title', 'url', or 'year'."""
    if mode == 'title':
        if not title:
            raise ValueError("Title is required for 'title' mode.")
        return extract_div_from_title(title)

    elif mode == 'url':
        if not url:
            raise ValueError("URL is required for 'url' mode.")
        if 'Archives' in url.split('/')[-2]:
            year = int(url.split('/')[-1].split('#')[0])
            print(f"Year extracted from URL: {year}")
            df = collect_wikinews_deletions(years=[year])

            df = df[df['title'] == url.split('#')[-1].replace('_', ' ')]
            if df.empty:
                return pd.DataFrame(columns=['title', 'text_url', 'discussion_url', 'discussion_cleaned', 'label'])
            df = df[['title', 'url', 'discussion_cleaned', 'outcome']]
            df = df.rename(columns={'discussion_cleaned': 'discussion'})
            return df

        title = url.split('#')[-1].replace('_', ' ')
        print(f"Title extracted from URL: {title}")
        return extract_div_from_title(title)

    elif mode == 'year':
        if not year:
            raise ValueError("Year or year range is required for 'year' mode.")
        return collect_wikinews_deletions(years=year)

    else:
        raise ValueError("Invalid mode. Please specify 'title', 'year', or 'url'.")