ai-deadlines-copy

Sleeping

App Files Files Community

ai-deadlines-copy / .github /scripts /update_conferences.py

nielsr HF staff

Add rankings as separate field

5c4b399 26 days ago

raw

history blame contribute delete

9.16 kB

	import yaml
	import requests
	from datetime import datetime
	from typing import Dict, List, Any


	def fetch_conference_files() -> List[Dict[str, Any]]:
	"""Fetch all conference YAML files from ccfddl repository."""

	# First get the directory listing from GitHub API
	api_url = "https://api.github.com/repos/ccfddl/ccf-deadlines/contents/conference/AI"
	response = requests.get(api_url)
	files = response.json()

	conferences = []
	for file in files:
	if file['name'].endswith('.yml'):
	yaml_content = requests.get(file['download_url']).text
	conf_data = yaml.safe_load(yaml_content)
	# The data is a list with a single item
	if isinstance(conf_data, list) and len(conf_data) > 0:
	conferences.append(conf_data[0])

	return conferences


	def parse_date_range(date_str: str, year: str) -> tuple[str, str]:
	"""Parse various date formats and return start and end dates."""
	# Remove the year if it appears at the end of the string
	date_str = date_str.replace(f", {year}", "")

	# Handle various date formats
	try:
	# Split into start and end dates
	if ' - ' in date_str:
	start, end = date_str.split(' - ')
	elif '-' in date_str:
	start, end = date_str.split('-')
	else:
	# For single date format like "May 19, 2025"
	start = end = date_str

	# Clean up month abbreviations
	month_map = {
	'Sept': 'September', # Handle Sept before Sep
	'Jan': 'January',
	'Feb': 'February',
	'Mar': 'March',
	'Apr': 'April',
	'Jun': 'June',
	'Jul': 'July',
	'Aug': 'August',
	'Sep': 'September',
	'Oct': 'October',
	'Nov': 'November',
	'Dec': 'December'
	}

	# Create a set of all month names (full and abbreviated)
	all_months = set(month_map.keys()) \| set(month_map.values())

	# Handle cases like "April 29-May 4"
	has_month = any(month in end for month in all_months)
	if not has_month:
	# End is just a day number, use start's month
	start_parts = start.split()
	if len(start_parts) >= 1:
	end = f"{start_parts[0]} {end.strip()}"

	# Replace month abbreviations
	for abbr, full in month_map.items():
	start = start.replace(abbr, full)
	end = end.replace(abbr, full)

	# Clean up any extra spaces
	start = ' '.join(start.split())
	end = ' '.join(end.split())

	# Parse start date
	start_date = datetime.strptime(f"{start}, {year}", "%B %d, %Y")

	# Parse end date
	end_date = datetime.strptime(f"{end}, {year}", "%B %d, %Y")

	return start_date.strftime('%Y-%m-%d'), end_date.strftime('%Y-%m-%d')

	except Exception as e:
	raise ValueError(f"Could not parse date: {date_str} ({e})")


	def transform_conference_data(conferences: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
	"""Transform ccfddl format to our format."""
	transformed = []
	current_year = datetime.now().year

	for conf in conferences:
	# Get the most recent or upcoming conference instance
	recent_conf = None
	if 'confs' in conf:
	for instance in conf['confs']:
	if instance['year'] >= current_year:
	recent_conf = instance
	break

	if not recent_conf:
	continue

	# Transform to our format
	transformed_conf = {
	'title': conf.get('title', ''),
	'year': recent_conf['year'],
	'id': recent_conf['id'],
	'full_name': conf.get('description', ''),
	'link': recent_conf.get('link', ''),
	'deadline': recent_conf.get('timeline', [{}])[0].get('deadline', ''),
	'timezone': recent_conf.get('timezone', ''),
	'place': recent_conf.get('place', ''),
	'date': recent_conf.get('date', ''),
	'tags': [], # We'll need to maintain a mapping for tags
	}

	# Add optional fields
	timeline = recent_conf.get('timeline', [{}])[0]
	if 'abstract_deadline' in timeline:
	transformed_conf['abstract_deadline'] = timeline['abstract_deadline']

	# Parse date range for start/end
	try:
	if transformed_conf['date']:
	start_date, end_date = parse_date_range(
	transformed_conf['date'],
	str(transformed_conf['year'])
	)
	transformed_conf['start'] = start_date
	transformed_conf['end'] = end_date
	except Exception as e:
	print(f"Warning: Could not parse date for {transformed_conf['title']}: {e}")

	# Add rankings as separate field
	if 'rank' in conf:
	rankings = []
	for rank_type, rank_value in conf['rank'].items():
	rankings.append(f"{rank_type.upper()}: {rank_value}")
	if rankings:
	transformed_conf['rankings'] = ', '.join(rankings)

	transformed.append(transformed_conf)

	return transformed


	def main():
	try:
	# Fetch current conferences.yml
	current_file = 'src/data/conferences.yml'
	with open(current_file, 'r') as f:
	current_conferences = yaml.safe_load(f)

	# Fetch and transform new data
	new_conferences = fetch_conference_files()
	if not new_conferences:
	print("Warning: No conferences fetched from ccfddl")
	return

	transformed_conferences = transform_conference_data(new_conferences)
	if not transformed_conferences:
	print("Warning: No conferences transformed")
	return

	# Create a dictionary of current conferences by ID
	current_conf_dict = {conf['id']: conf for conf in current_conferences}

	# Update or add new conferences while preserving existing ones
	for new_conf in transformed_conferences:
	if new_conf['id'] in current_conf_dict:
	# Update existing conference while preserving fields
	curr_conf = current_conf_dict[new_conf['id']]

	# Preserve existing fields
	preserved_fields = [
	'tags', 'venue', 'hindex', 'submission_deadline',
	'timezone_submission', 'rebuttal_period_start',
	'rebuttal_period_end', 'final_decision_date',
	'review_release_date', 'commitment_deadline',
	'start', 'end', 'note' # Added note to preserved fields
	]
	for field in preserved_fields:
	if field in curr_conf:
	new_conf[field] = curr_conf[field]

	# If start/end not in current conference but we parsed them, keep the parsed ones
	if 'start' not in curr_conf and 'start' in new_conf:
	new_conf['start'] = new_conf['start']
	if 'end' not in curr_conf and 'end' in new_conf:
	new_conf['end'] = new_conf['end']

	# Preserve existing rankings if available
	if 'rankings' in curr_conf:
	new_conf['rankings'] = curr_conf['rankings']

	# Update the conference in the dictionary
	current_conf_dict[new_conf['id']] = new_conf
	else:
	# Add new conference to the dictionary
	current_conf_dict[new_conf['id']] = new_conf

	# Convert back to list and sort by deadline
	all_conferences = list(current_conf_dict.values())
	all_conferences.sort(key=lambda x: x.get('deadline', '9999'))

	# Write back to file with newlines between conferences
	with open(current_file, 'w') as f:
	for i, conf in enumerate(all_conferences):
	if i > 0:
	f.write('\n\n') # Add two newlines between conferences

	yaml_str = yaml.dump(
	[conf],
	allow_unicode=True,
	sort_keys=False,
	default_flow_style=False,
	explicit_start=False,
	explicit_end=False,
	width=float("inf"),
	indent=2,
	default_style=None,
	)
	f.write(yaml_str.rstrip()) # Remove trailing whitespace

	# Add final newline
	f.write('\n')

	print(f"Successfully updated {len(all_conferences)} conferences")

	except Exception as e:
	print(f"Error: {e}")
	raise


	if __name__ == "__main__":
	main()