ai-deadlines / .github /scripts /update_conferences.py
nielsr's picture
nielsr HF Staff
Add rankings as separate field
5c4b399
raw
history blame
9.16 kB
import yaml
import requests
from datetime import datetime
from typing import Dict, List, Any
def fetch_conference_files() -> List[Dict[str, Any]]:
    """Fetch all conference YAML files from the ccfddl repository.

    Lists the ``conference/AI`` directory through the GitHub contents API,
    downloads every entry whose name ends in ``.yml`` and parses it with
    ``yaml.safe_load``.

    Returns:
        The first item of every downloaded file whose parsed content is a
        non-empty list.

    Raises:
        requests.RequestException: If the directory listing request fails
            (including a non-2xx status) or a download hits a network error.
        ValueError: If the directory listing is not a JSON array.
    """
    api_url = "https://api.github.com/repos/ccfddl/ccf-deadlines/contents/conference/AI"
    # requests waits forever when no timeout is given; bound every call.
    timeout_seconds = 30

    # First get the directory listing from the GitHub API
    response = requests.get(api_url, timeout=timeout_seconds)
    response.raise_for_status()
    files = response.json()
    if not isinstance(files, list):
        # Error payloads are JSON objects; iterating one would only yield keys.
        raise ValueError(f"Unexpected response from {api_url}: {files!r}")

    conferences = []
    for entry in files:
        name = entry.get('name', '')
        download_url = entry.get('download_url')
        if not name.endswith('.yml') or not download_url:
            continue

        file_response = requests.get(download_url, timeout=timeout_seconds)
        if not file_response.ok:
            # Best effort: one unavailable file should not abort the whole run.
            print(f"Warning: Could not download {name} (HTTP {file_response.status_code})")
            continue

        conf_data = yaml.safe_load(file_response.text)
        # The data is a list with a single item
        if isinstance(conf_data, list) and conf_data:
            conferences.append(conf_data[0])
    return conferences
def parse_date_range(date_str: str, year: str) -> tuple[str, str]:
    """Parse a conference date string into ISO-formatted start and end dates.

    Accepted shapes include a single day (``"May 19, 2025"``), a range within
    one month (``"July 21-27, 2024"``) and a range spanning two months
    (``"April 29-May 4"``). Month names may be full or abbreviated, and a
    ``", <year>"`` fragment matching ``year`` is ignored.

    Args:
        date_str: Human-readable date or date range.
        year: Year applied to both ends of the range.

    Returns:
        A ``(start, end)`` tuple, each formatted as ``YYYY-MM-DD``.

    Raises:
        ValueError: If the string cannot be parsed.
    """
    month_map = {
        'Sept': 'September',
        'Jan': 'January',
        'Feb': 'February',
        'Mar': 'March',
        'Apr': 'April',
        'Jun': 'June',
        'Jul': 'July',
        'Aug': 'August',
        'Sep': 'September',
        'Oct': 'October',
        'Nov': 'November',
        'Dec': 'December',
    }
    # "May" has no abbreviation, so it must be added to the full names by hand.
    full_months = set(month_map.values()) | {'May'}

    def expand_months(text: str) -> str:
        # Whole-word lookup: a substring replace would turn "January" into
        # "Januaryuary". Splitting and re-joining also collapses extra spaces.
        return ' '.join(
            month_map.get(word.rstrip('.'), word) for word in text.split()
        )

    try:
        # Remove the year if it appears in the string
        cleaned = date_str.replace(f", {year}", "")
        # Treat en and em dashes like a plain hyphen
        for dash in ('\u2013', '\u2014'):
            cleaned = cleaned.replace(dash, '-')

        # Split into start and end dates; a single date is its own range
        if '-' in cleaned:
            start, end = cleaned.split('-')
        else:
            start = end = cleaned

        start = expand_months(start)
        end = expand_months(end)

        # Handle cases like "July 21-27": the end is just a day number,
        # so borrow the month from the start.
        if not any(word in full_months for word in end.split()):
            start_words = start.split()
            if start_words:
                end = f"{start_words[0]} {end}"

        start_date = datetime.strptime(f"{start}, {year}", "%B %d, %Y")
        end_date = datetime.strptime(f"{end}, {year}", "%B %d, %Y")
        return start_date.strftime('%Y-%m-%d'), end_date.strftime('%Y-%m-%d')
    except Exception as e:
        raise ValueError(f"Could not parse date: {date_str} ({e})") from e
def transform_conference_data(conferences: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Transform ccfddl conference entries to our format.

    For each entry, the first listed instance under ``confs`` whose ``year``
    is the current year or later is selected; entries without such an
    instance are skipped.

    Args:
        conferences: Parsed ccfddl conference entries.

    Returns:
        One dictionary per selected instance. ``abstract_deadline``,
        ``start``/``end`` and ``rankings`` are present only when the source
        data provides them (``start``/``end`` only when the date parses).

    Raises:
        KeyError: If an instance lacks ``year``, or the selected instance
            lacks ``id``.
    """
    transformed = []
    current_year = datetime.now().year

    for conf in conferences:
        # Get the first instance that is current or upcoming.
        # `or []` also covers an explicit `confs: null`.
        recent_conf = next(
            (
                instance
                for instance in conf.get('confs') or []
                if instance['year'] >= current_year
            ),
            None,
        )
        if not recent_conf:
            continue

        # Only the first timeline entry is used; a missing, null or empty
        # timeline falls back to an empty mapping instead of raising.
        timeline = (recent_conf.get('timeline') or [{}])[0] or {}

        # Transform to our format
        transformed_conf = {
            'title': conf.get('title', ''),
            'year': recent_conf['year'],
            'id': recent_conf['id'],
            'full_name': conf.get('description', ''),
            'link': recent_conf.get('link', ''),
            'deadline': timeline.get('deadline', ''),
            'timezone': recent_conf.get('timezone', ''),
            'place': recent_conf.get('place', ''),
            'date': recent_conf.get('date', ''),
            'tags': [],  # We'll need to maintain a mapping for tags
        }

        # Add optional fields
        if 'abstract_deadline' in timeline:
            transformed_conf['abstract_deadline'] = timeline['abstract_deadline']

        # Parse date range for start/end. This is best effort: an unparsable
        # date only produces a warning and leaves start/end unset.
        if transformed_conf['date']:
            try:
                start_date, end_date = parse_date_range(
                    transformed_conf['date'],
                    str(transformed_conf['year'])
                )
            except Exception as e:
                print(f"Warning: Could not parse date for {transformed_conf['title']}: {e}")
            else:
                transformed_conf['start'] = start_date
                transformed_conf['end'] = end_date

        # Add rankings as separate field. Only mapping-style ranks can be
        # rendered as "TYPE: value" pairs; anything else is ignored.
        rank = conf.get('rank')
        if isinstance(rank, dict) and rank:
            transformed_conf['rankings'] = ', '.join(
                f"{rank_type.upper()}: {rank_value}"
                for rank_type, rank_value in rank.items()
            )

        transformed.append(transformed_conf)

    return transformed
def main():
    """Merge freshly fetched ccfddl data into ``src/data/conferences.yml``.

    Existing entries are matched by ``id``. A matched entry is replaced by
    the newly transformed one, except that locally maintained fields (see
    ``preserved_fields``) keep their current values. Unmatched existing
    entries are kept, new entries are added, and the result is written back
    sorted by deadline with a blank line between conferences.

    Any exception is printed and then re-raised.
    """
    try:
        # Fetch current conferences.yml
        current_file = 'src/data/conferences.yml'
        # Read and write with an explicit encoding: the dump below uses
        # allow_unicode=True, so the platform default must not be relied on.
        with open(current_file, 'r', encoding='utf-8') as f:
            # An empty file parses to None; treat it as "no conferences".
            current_conferences = yaml.safe_load(f) or []

        # Fetch and transform new data
        new_conferences = fetch_conference_files()
        if not new_conferences:
            print("Warning: No conferences fetched from ccfddl")
            return

        transformed_conferences = transform_conference_data(new_conferences)
        if not transformed_conferences:
            print("Warning: No conferences transformed")
            return

        # Create a dictionary of current conferences by ID
        current_conf_dict = {conf['id']: conf for conf in current_conferences}

        # Fields whose existing value wins over freshly fetched data
        preserved_fields = (
            'tags', 'venue', 'hindex', 'submission_deadline',
            'timezone_submission', 'rebuttal_period_start',
            'rebuttal_period_end', 'final_decision_date',
            'review_release_date', 'commitment_deadline',
            'start', 'end', 'note',
            'rankings',
        )

        # Update or add new conferences while preserving existing ones
        for new_conf in transformed_conferences:
            curr_conf = current_conf_dict.get(new_conf['id'])
            if curr_conf is not None:
                # Parsed start/end and rankings survive automatically when
                # the existing entry does not define them.
                for field in preserved_fields:
                    if field in curr_conf:
                        new_conf[field] = curr_conf[field]
            current_conf_dict[new_conf['id']] = new_conf

        # Convert back to list and sort by deadline. str() keeps the sort
        # from failing if deadlines are not all strings.
        all_conferences = sorted(
            current_conf_dict.values(),
            key=lambda conf: str(conf.get('deadline', '9999')),
        )

        # Serialize everything before opening the file for writing, so a
        # dump failure cannot leave a truncated conferences.yml behind.
        documents = [
            yaml.dump(
                [conf],
                allow_unicode=True,
                sort_keys=False,
                default_flow_style=False,
                explicit_start=False,
                explicit_end=False,
                width=float("inf"),
                indent=2,
                default_style=None,
            ).rstrip()  # Remove trailing whitespace
            for conf in all_conferences
        ]

        # Write back to file with a blank line between conferences and a
        # final newline at the end.
        with open(current_file, 'w', encoding='utf-8') as f:
            f.write('\n\n'.join(documents))
            f.write('\n')

        print(f"Successfully updated {len(all_conferences)} conferences")
    except Exception as e:
        print(f"Error: {e}")
        raise


if __name__ == "__main__":
    main()