File size: 7,724 Bytes
ee6a180
 
 
3f2d847
144f299
ee6a180
 
144f299
 
ee6a180
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3f2d847
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ee6a180
3f2d847
 
 
 
 
 
 
 
ee6a180
3f2d847
 
 
ee6a180
 
 
 
 
 
 
144f299
 
 
 
 
ee6a180
 
 
 
 
 
 
 
 
144f299
 
 
 
ee6a180
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144f299
 
 
 
 
 
 
 
 
 
 
 
 
 
ee6a180
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144f299
 
ee6a180
 
 
144f299
 
ee6a180
662d6a3
144f299
 
662d6a3
 
ee6a180
af692e5
 
 
144f299
ee6a180
144f299
ee6a180
7ce5480
144f299
ee6a180
7ce5480
ee6a180
 
834ab51
 
 
 
 
 
 
 
 
 
 
 
 
 
ee6a180
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
import pandas as pd
import json
import os
import pycountry
import re

from src.collect_data import fetch_version_metadata, fetch_registry_data
import assets.text_content as tc

PRICING_PATH = os.path.join('assets', 'pricing.json')

def convert_parameters(param):
    """Convert a parameter-count string (e.g. '7B', '1.5T') to a float in billions.

    'T' (trillions) suffixes are scaled by 1000 so every value is expressed
    in billions. Matching is case-insensitive and surrounding whitespace is
    tolerated. Returns None for missing/empty values.
    """
    if pd.isna(param) or param == '':
        return None
    # Normalize so '7b', ' 1.5t ' etc. are handled like '7B' / '1.5T'.
    text = str(param).strip().upper()
    if 'T' in text:
        return float(text.replace('T', '')) * 1000
    return float(text.replace('B', ''))

def clean_price(price):
    """Convert a price string like '$1.50' to a float.

    Returns None for missing/empty values. Converts to str first so that
    already-numeric entries in pricing.json do not crash on `.replace`.
    """
    if pd.isna(price) or price == '':
        return None
    return float(str(price).replace('$', ''))

def map_languages(languages):
    """Map language codes/names to full English names via pycountry.

    Accepts a list, a comma-separated string, or any other iterable of
    language tokens; returns a comma-separated string of language names,
    or None for missing (NaN) values. Tokens pycountry cannot resolve are
    passed through unchanged.
    """
    if isinstance(languages, float) and pd.isna(languages):
        return None

    def get_language_name(lang):
        # Normalize to a trimmed, lowercase string before lookup.
        lang = str(lang).strip().lower()

        try:
            # First try as an ISO 639-1 code (en, fr, ...).
            language = pycountry.languages.get(alpha_2=lang)
            if not language:
                # Fall back to lookup by English name (English, French, ...).
                language = pycountry.languages.get(name=lang.capitalize())

            return language.name if language else lang
        except (AttributeError, LookupError):
            return lang

    # Normalize the input to a list of tokens.
    if isinstance(languages, list):
        lang_list = languages
    elif isinstance(languages, str):
        lang_list = [l.strip() for l in languages.split(',')]
    else:
        try:
            lang_list = list(languages)
        except TypeError:
            # Not iterable at all: fall back to its string representation.
            return str(languages)

    # Map all languages and join them
    return ', '.join(get_language_name(lang) for lang in lang_list)

def get_multimodality_field(model_data, field):
    """Read a boolean multimodality flag from nested model config.

    Looks up model_data['model_config']['multimodality'][field], returning
    False when any level is missing or when model_data is not dict-like
    (e.g. None, or a malformed registry entry).
    """
    try:
        return model_data.get('model_config', {}).get('multimodality', {}).get(field, False)
    except (AttributeError, TypeError):
        # model_data (or a nested level) does not support .get.
        return False

def clean_model_name(model_name: str) -> str:
    """Clean model name by removing a temperature suffix pattern.

    Strips everything from a suffix like '-t0.0--' or '-t0.7--' onward.
    The pattern is generalized to any decimal temperature (e.g. '-t0.75--',
    '-t2.0--'), which is a strict superset of the single-digit matches.
    """
    pattern = r'-t\d+\.\d+--'
    return re.split(pattern, model_name)[0]

def merge_data():
    """Merge benchmark, registry and pricing data into a single DataFrame.

    Pipeline:
      1. Fetch multimodal/text latency and result tables plus registry data.
      2. Average latency and clemscore per model across both modalities.
      3. Join registry metadata (parameters, license, modalities, ...).
      4. Join pricing (input/output cost), defaulting missing prices to 0.0.
      5. Normalize parameters, languages, context size; sort by clemscore.

    Returns:
        pd.DataFrame: one row per model, columns named via assets.text_content
        constants, sorted by clemscore in descending order.
    """

    mm_latency_df, mm_result_df, text_latency_df, text_result_df = fetch_version_metadata()
    registry_data = fetch_registry_data()
    with open(PRICING_PATH, 'r') as f:
        pricing_data = json.load(f)

    # Ensure the unnamed column is renamed to 'model'
    mm_result_df.rename(columns={tc.DEFAULT_MODEL_NAME: 'model', tc.DEFAULT_CLEMSCORE: 'clemscore'}, inplace=True)
    text_result_df.rename(columns={tc.DEFAULT_MODEL_NAME: 'model', tc.DEFAULT_CLEMSCORE: 'clemscore'}, inplace=True)
    # Strip temperature suffixes so mm/text rows for the same model align.
    mm_result_df['model'] = mm_result_df['model'].apply(clean_model_name)
    text_result_df['model'] = text_result_df['model'].apply(clean_model_name)

    # Merge datasets to compute average values
    avg_latency_df = pd.concat([mm_latency_df, text_latency_df], axis=0).groupby('model')['latency'].mean().reset_index()
    avg_clemscore_df = pd.concat([mm_result_df, text_result_df], axis=0).groupby('model')['clemscore'].mean().reset_index()

    # Merge latency, clemscore, registry, and pricing data
    # Outer join: keep models that have only latency or only clemscore data.
    lat_clem_df = pd.merge(avg_latency_df, avg_clemscore_df, on='model', how='outer')

    # Convert registry_data to DataFrame for easier merging
    registry_df = pd.DataFrame(registry_data)
    
    # Extract license info
    # Assumes every registry entry has a license dict with 'name' and 'url'
    # keys — entries missing either will raise here (TODO confirm schema).
    registry_df['license_name'] = registry_df['license'].apply(lambda x: x['name'])
    registry_df['license_url'] = registry_df['license'].apply(lambda x: x['url'])

    # Add individual multimodality columns
    registry_df['single_image'] = registry_df.apply(lambda x: get_multimodality_field(x, 'single_image'), axis=1)
    registry_df['multiple_images'] = registry_df.apply(lambda x: get_multimodality_field(x, 'multiple_images'), axis=1)
    registry_df['audio'] = registry_df.apply(lambda x: get_multimodality_field(x, 'audio'), axis=1)
    registry_df['video'] = registry_df.apply(lambda x: get_multimodality_field(x, 'video'), axis=1)

    # Update columns list to include new multimodality fields
    registry_df = registry_df[[
        'model_name', 'parameters', 'release_date', 'open_weight',
        'languages', 'context_size', 'license_name', 'license_url',
        'single_image', 'multiple_images', 'audio', 'video'
    ]]
    
    # Merge with previous data
    # Inner join: models absent from the registry are dropped entirely.
    merged_df = pd.merge(
        lat_clem_df,
        registry_df,
        left_on='model',
        right_on='model_name',
        how='inner'
    )
    
    # Update column renaming
    merged_df = merged_df.rename(columns={
        'model': tc.MODEL_NAME,
        'latency': tc.LATENCY,
        'clemscore': tc.CLEMSCORE,
        'parameters': tc.PARAMS,
        'release_date': tc.RELEASE_DATE,
        'open_weight': tc.OPEN_WEIGHT,
        'languages': tc.LANGS,
        'context_size': tc.CONTEXT,
        'license_name': tc.LICENSE_NAME,
        'license_url': tc.LICENSE_URL,
        'single_image': tc.SINGLE_IMG,
        'multiple_images': tc.MULT_IMG,
        'audio': tc.AUDIO,
        'video': tc.VIDEO
    })
    
    # Convert pricing_data list to DataFrame
    pricing_df = pd.DataFrame(pricing_data)
    pricing_df['input'] = pricing_df['input'].apply(clean_price)
    pricing_df['output'] = pricing_df['output'].apply(clean_price)
    
    # Merge pricing data with the existing dataframe
    # Left join: models without pricing stay and get NaN (filled below).
    # NOTE(review): 'Model Name' is assumed to equal tc.MODEL_NAME — confirm.
    merged_df = pd.merge(
        merged_df,
        pricing_df,
        left_on='Model Name',
        right_on='model_id',
        how='left'
    )
    
    # Drop duplicate model column and rename price columns
    merged_df = merged_df.drop('model_id', axis=1)
    merged_df = merged_df.rename(columns={
        'input': tc.INPUT,
        'output': tc.OUTPUT
    })
    
    # Fill NaN values with 0.0 for pricing columns
    merged_df[tc.INPUT] = merged_df[tc.INPUT].fillna(0.0)
    merged_df[tc.OUTPUT] = merged_df[tc.OUTPUT].fillna(0.0)
    
    # Convert parameters and set to None for commercial models
    # (closed-weight models don't publish parameter counts)
    merged_df[tc.PARAMS] = merged_df.apply(
        lambda row: None if not row[tc.OPEN_WEIGHT] else convert_parameters(row[tc.PARAMS]), 
        axis=1
    )

    # Render license as a markdown link for display.
    merged_df[tc.LICENSE] = merged_df.apply(
        lambda row: f'[{row[tc.LICENSE_NAME]}]({row[tc.LICENSE_URL]})', axis=1
    )
    # Duplicate release date into a temp column (presumably consumed
    # downstream for date filtering — verify against callers).
    merged_df[tc.TEMP_DATE] = merged_df[tc.RELEASE_DATE]

    merged_df[tc.LANGS] = merged_df[tc.LANGS].apply(map_languages)

    # Sort by Clemscore in descending order
    merged_df = merged_df.sort_values(by=tc.CLEMSCORE, ascending=False)
    
    # Drop model_name column
    merged_df.drop(columns=['model_name'], inplace=True)
    
    # Clean up context and convert to integer
    # Strips a trailing 'k'; unparseable values become 0.
    merged_df[tc.CONTEXT] = merged_df[tc.CONTEXT].astype(str).str.replace('k', '', regex=False)
    merged_df[tc.CONTEXT] = pd.to_numeric(merged_df[tc.CONTEXT], errors='coerce').fillna(0).astype(int)

    # Handle commercial model parameters / Set to max of open models
    # Find the maximum value of tc.PARAMS where tc.OPEN_WEIGHT is True
    max_params_value = merged_df.loc[merged_df[tc.OPEN_WEIGHT], tc.PARAMS].max()

    # Create a new dummy PARAM column
    merged_df[tc.DUMMY_PARAMS] = merged_df.apply(
        lambda row: max_params_value if not row[tc.OPEN_WEIGHT] else row[tc.PARAMS],
        axis=1
    )

    return merged_df

if __name__ == '__main__':
    # Build the merged leaderboard table and persist it under assets/.
    output_path = os.path.join('assets', 'merged_data.csv')
    merge_data().to_csv(output_path, index=False)