ai-deadlines-copy

Running

File size: 9,164 Bytes

import yaml
import requests
from datetime import datetime
from typing import Dict, List, Any


def fetch_conference_files() -> List[Dict[str, Any]]:
    """Fetch all conference YAML files from ccfddl repository.

    Lists the ``conference/AI`` directory through the GitHub contents API,
    downloads every ``.yml`` file in it and parses it with
    ``yaml.safe_load``. Each file is expected to hold a list; only its first
    element is kept.

    Returns:
        One dictionary per successfully downloaded and parsed file.

    Raises:
        requests.RequestException: If the directory listing cannot be
            fetched (network failure, timeout or non-success status).
        ValueError: If the directory listing is not a JSON array.
    """
    # First get the directory listing from GitHub API
    api_url = "https://api.github.com/repos/ccfddl/ccf-deadlines/contents/conference/AI"
    # Without a timeout, requests waits indefinitely on a stalled connection.
    timeout_seconds = 30

    response = requests.get(api_url, timeout=timeout_seconds)
    response.raise_for_status()
    files = response.json()
    if not isinstance(files, list):
        # Error payloads are JSON objects; iterating one would walk its keys.
        raise ValueError(
            f"Unexpected directory listing from {api_url}: "
            f"expected a list, got {type(files).__name__}"
        )

    conferences = []
    for file in files:
        if not file['name'].endswith('.yml'):
            continue

        try:
            file_response = requests.get(file['download_url'], timeout=timeout_seconds)
            file_response.raise_for_status()
        except requests.RequestException as e:
            # One unreachable file should not abort the whole sync; an error
            # page must not be parsed as if it were conference data either.
            print(f"Warning: Could not download {file['name']}: {e}")
            continue

        conf_data = yaml.safe_load(file_response.text)
        # The data is a list with a single item
        if isinstance(conf_data, list) and conf_data:
            conferences.append(conf_data[0])

    return conferences


def parse_date_range(date_str: str, year: str) -> tuple[str, str]:
    """Parse a conference date string and return its start and end dates.

    Accepted shapes (the trailing ``", <year>"`` is optional):

    * ``"July 13-19, 2025"``      - end day only, month taken from the start
    * ``"April 29-May 4, 2025"``  - both sides carry a month
    * ``"Feb 27 - Mar 4, 2025"``  - spaces around the separator
    * ``"May 19, 2025"``          - single day (start == end)

    Month names may be full or abbreviated ("Sep", "Sept", "September").
    En and em dashes are accepted as range separators.

    Args:
        date_str: Human-readable date or date range.
        year: Year applied to both ends of the range.

    Returns:
        ``(start, end)`` formatted as ``YYYY-MM-DD``.

    Raises:
        ValueError: If the string does not match any supported shape or
            names an impossible date.
    """
    # Remove the year if it appears at the end of the string
    date_str = date_str.replace(f", {year}", "")

    try:
        # Treat typographic dashes like a plain hyphen.
        normalised = date_str.replace('\u2013', '-').replace('\u2014', '-')
        parts = normalised.split('-')
        if len(parts) > 2:
            raise ValueError("more than one range separator")

        start_month, start_day = _split_month_day(parts[0])
        if len(parts) == 2:
            # A bare day number on the right inherits the start's month.
            end_month, end_day = _split_month_day(parts[1], default_month=start_month)
        else:
            end_month, end_day = start_month, start_day

        # Building from numbers avoids locale-dependent month-name parsing
        # and still rejects impossible days such as February 30.
        start_date = datetime(int(year), start_month, start_day)
        end_date = datetime(int(year), end_month, end_day)

        return start_date.strftime('%Y-%m-%d'), end_date.strftime('%Y-%m-%d')

    except ValueError as e:
        raise ValueError(f"Could not parse date: {date_str} ({e})") from e


_MONTH_NAMES = (
    'january', 'february', 'march', 'april', 'may', 'june',
    'july', 'august', 'september', 'october', 'november', 'december',
)


def _month_number(token: str) -> int:
    """Return 1-12 for a full or abbreviated English month name, else 0."""
    name = token.strip('.,').lower()
    # Require at least three letters so "ma" or "ju" are not guessed at.
    if len(name) < 3:
        return 0
    for number, full_name in enumerate(_MONTH_NAMES, start=1):
        if full_name.startswith(name):
            return number
    return 0


def _split_month_day(part: str, default_month: int = 0) -> tuple[int, int]:
    """Split "<month> <day>" (or "<day>" when a default month is given)."""
    tokens = part.replace(',', ' ').split()
    if len(tokens) == 2:
        month = _month_number(tokens[0])
        if not month:
            raise ValueError(f"unknown month {tokens[0]!r}")
        day_token = tokens[1]
    elif len(tokens) == 1 and default_month:
        month, day_token = default_month, tokens[0]
    else:
        raise ValueError(f"unrecognised date fragment {part.strip()!r}")
    return month, int(day_token)


def transform_conference_data(conferences: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Transform ccfddl format to our format.

    For every conference the first listed instance whose ``year`` is the
    current year or later is selected; conferences without such an instance
    are dropped. Only the first entry of the instance's ``timeline`` is used
    for ``deadline`` and ``abstract_deadline``.

    Args:
        conferences: Parsed ccfddl conference records.

    Returns:
        One flattened record per retained conference. ``start``/``end`` are
        present only when the ``date`` string could be parsed, ``rankings``
        only when the source record has a non-empty ``rank`` mapping.

    Raises:
        KeyError: If a listed instance lacks ``year`` or the selected
            instance lacks ``id``.
    """
    transformed = []
    current_year = datetime.now().year

    for conf in conferences:
        # Get the most recent or upcoming conference instance
        recent_conf = next(
            (
                instance
                for instance in conf.get('confs') or []
                if instance['year'] >= current_year
            ),
            None,
        )
        if not recent_conf:
            continue

        # A missing, null or empty timeline is treated as "nothing known yet"
        # instead of failing on the [0] lookup.
        timeline_entries = recent_conf.get('timeline') or [{}]
        timeline = timeline_entries[0]

        # Transform to our format (key order is the order written to YAML)
        transformed_conf = {
            'title': conf.get('title', ''),
            'year': recent_conf['year'],
            'id': recent_conf['id'],
            'full_name': conf.get('description', ''),
            'link': recent_conf.get('link', ''),
            'deadline': timeline.get('deadline', ''),
            'timezone': recent_conf.get('timezone', ''),
            'place': recent_conf.get('place', ''),
            'date': recent_conf.get('date', ''),
            'tags': [],  # We'll need to maintain a mapping for tags
        }

        # Add optional fields
        if 'abstract_deadline' in timeline:
            transformed_conf['abstract_deadline'] = timeline['abstract_deadline']

        # Parse date range for start/end; an unparseable date only costs the
        # start/end fields, the conference itself is still kept.
        if transformed_conf['date']:
            try:
                start_date, end_date = parse_date_range(
                    transformed_conf['date'],
                    str(transformed_conf['year']),
                )
            except (ValueError, TypeError, AttributeError) as e:
                print(f"Warning: Could not parse date for {transformed_conf['title']}: {e}")
            else:
                transformed_conf['start'] = start_date
                transformed_conf['end'] = end_date

        # Add rankings as separate field
        if 'rank' in conf:
            rankings = [
                f"{rank_type.upper()}: {rank_value}"
                for rank_type, rank_value in conf['rank'].items()
            ]
            if rankings:
                transformed_conf['rankings'] = ', '.join(rankings)

        transformed.append(transformed_conf)

    return transformed


def main():
    """Merge freshly fetched ccfddl data into ``src/data/conferences.yml``.

    Loads the current file, fetches and transforms the upstream records, then
    merges them by ``id``: upstream values win, except for a fixed set of
    locally maintained fields (and ``rankings``) which are carried over from
    the existing entry when present. Entries that exist only locally are kept
    untouched. The merged list is sorted by ``deadline`` and written back
    with a blank line between entries.

    Any exception is printed and re-raised so the caller sees the failure.
    """
    try:
        # Fetch current conferences.yml
        current_file = 'src/data/conferences.yml'
        # The file is written with allow_unicode=True below, so read and
        # write it as UTF-8 regardless of the platform default encoding.
        with open(current_file, 'r', encoding='utf-8') as f:
            # An empty file loads as None; treat it as "no conferences yet".
            current_conferences = yaml.safe_load(f) or []

        # Fetch and transform new data
        new_conferences = fetch_conference_files()
        if not new_conferences:
            print("Warning: No conferences fetched from ccfddl")
            return

        transformed_conferences = transform_conference_data(new_conferences)
        if not transformed_conferences:
            print("Warning: No conferences transformed")
            return

        # Create a dictionary of current conferences by ID
        current_conf_dict = {conf['id']: conf for conf in current_conferences}

        # Fields whose existing value takes precedence over upstream data.
        # 'start'/'end' are listed too, so freshly parsed dates are only used
        # when the existing entry has none.
        preserved_fields = [
            'tags', 'venue', 'hindex', 'submission_deadline',
            'timezone_submission', 'rebuttal_period_start',
            'rebuttal_period_end', 'final_decision_date',
            'review_release_date', 'commitment_deadline',
            'start', 'end', 'note',
            'rankings',
        ]

        # Update or add new conferences while preserving existing ones
        for new_conf in transformed_conferences:
            curr_conf = current_conf_dict.get(new_conf['id'])
            if curr_conf is not None:
                for field in preserved_fields:
                    if field in curr_conf:
                        new_conf[field] = curr_conf[field]

            # Replaces the existing entry or adds a new one
            current_conf_dict[new_conf['id']] = new_conf

        # Convert back to list and sort by deadline. str() keeps the sort
        # working if a deadline was loaded as a non-string YAML value.
        all_conferences = sorted(
            current_conf_dict.values(),
            key=lambda conf: str(conf.get('deadline', '9999')),
        )

        # Write back to file with newlines between conferences
        with open(current_file, 'w', encoding='utf-8') as f:
            for i, conf in enumerate(all_conferences):
                if i > 0:
                    f.write('\n\n')  # Add two newlines between conferences

                yaml_str = yaml.dump(
                    [conf],
                    allow_unicode=True,
                    sort_keys=False,
                    default_flow_style=False,
                    explicit_start=False,
                    explicit_end=False,
                    width=float("inf"),
                    indent=2,
                    default_style=None,
                )
                f.write(yaml_str.rstrip())  # Remove trailing whitespace

            # Add final newline
            f.write('\n')

        print(f"Successfully updated {len(all_conferences)} conferences")

    except Exception as e:
        print(f"Error: {e}")
        raise


# Run the sync only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()