File size: 3,725 Bytes
68e6513
 
fbdc657
144f299
fbdc657
68e6513
 
 
144f299
 
 
 
 
 
 
 
 
68e6513
1580227
68e6513
 
 
fbdc657
 
 
ee6a180
3433b65
144f299
ee6a180
 
 
144f299
 
1580227
ee6a180
144f299
ee6a180
 
 
 
 
144f299
ee6a180
 
 
144f299
 
ee6a180
 
 
 
 
3433b65
144f299
3433b65
 
144f299
3433b65
 
144f299
 
 
 
 
 
 
 
3433b65
ee6a180
 
 
3433b65
144f299
 
 
 
 
7ce5480
 
fbdc657
3433b65
144f299
fbdc657
 
3433b65
144f299
fbdc657
 
 
 
 
 
3433b65
144f299
68e6513
 
144f299
ee6a180
68e6513
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
# Utility functions for filtering the dataframe

import pandas as pd
import assets.text_content as tc

def filter_cols(df):
    """Return ``df`` restricted to a fixed list of columns, in a fixed order.

    The column names are taken from constants in the ``tc`` module. Selecting
    with a list of labels means pandas raises ``KeyError`` if any of them is
    missing from ``df``.
    """
    selected_columns = [
        tc.MODEL_NAME,
        tc.CLEMSCORE,
        tc.INPUT,
        tc.OUTPUT,
        tc.LATENCY,
        tc.CONTEXT,
        tc.PARAMS,
        tc.RELEASE_DATE,
        tc.LICENSE,
    ]
    return df[selected_columns]


def _within_range(series, bounds):
    """Return a boolean mask of the values lying in the closed range ``bounds``."""
    return (series >= bounds[0]) & (series <= bounds[1])


def _epoch_seconds(value):
    """Convert one date-like value to whole seconds since the Unix epoch."""
    return int(pd.to_datetime(value).timestamp())


def filter(df, language_list, parameters, input_price, output_price, multimodal,
           context, open_weight, start, end, license):
    """Apply the user-selected filters to ``df`` and return the sorted result.

    Each filter step is skipped once the frame has become empty. The names
    ``filter`` and ``license`` shadow builtins; they are kept unchanged so
    that existing callers (positional or keyword) keep working.

    Args:
        df: DataFrame containing the columns named by the ``tc`` constants
            used below.
        language_list: Languages that must all be contained (via ``in``) in a
            row's ``tc.LANGS`` value for the row to be kept.
        parameters: ``(min, max)`` pair applied, inclusively, to ``tc.PARAMS``.
            Only rows whose ``tc.OPEN_WEIGHT`` equals ``True`` are subject to
            this bound.
        input_price: ``(min, max)`` inclusive bounds on ``tc.INPUT``.
        output_price: ``(min, max)`` inclusive bounds on ``tc.OUTPUT``.
        multimodal: Collection of modality column names (``tc.SINGLE_IMG``,
            ``tc.MULT_IMG``, ``tc.AUDIO``, ``tc.VIDEO``). For each one present,
            only rows where that column equals ``True`` are kept.
        context: Currently unused; accepted for interface compatibility.
        open_weight: Collection that may contain ``tc.OPEN`` and/or
            ``tc.COMM``. Only one of them restricts the rows to that kind;
            both leaves the rows untouched; neither yields an empty result.
        start: Date-like lower bound (inclusive) on ``tc.TEMP_DATE``; anything
            accepted by ``pd.to_datetime``.
        end: Date-like upper bound (inclusive) on ``tc.TEMP_DATE``.
        license: License strings; a row is kept if any of them is contained
            (via ``in``) in its ``tc.LICENSE_NAME`` value.

    Returns:
        The filtered rows, reduced to the columns chosen by ``filter_cols``
        and sorted by ``tc.CLEMSCORE`` in descending order.
    """
    if not df.empty:
        supports_all = df[tc.LANGS].apply(
            lambda langs: all(lang in langs for lang in language_list)
        )
        df = df[supports_all]

    if not df.empty:
        # Explicit comparisons (rather than truthiness / negation) so that a
        # row whose flag is neither True nor False matches neither mask.
        is_open = df[tc.OPEN_WEIGHT] == True  # noqa: E712
        is_commercial = df[tc.OPEN_WEIGHT] == False  # noqa: E712

        # The parameter range constrains open-weight models only; commercial
        # models pass through. A single two-sided bound is sufficient: when
        # the upper limit is at or above the largest open-weight size, every
        # open-weight row already satisfies it. Using one mask instead of
        # split + concat also keeps the original row order and avoids
        # concatenating a possibly empty frame.
        if is_open.any():
            in_range = _within_range(df[tc.PARAMS], parameters)
            df = df[is_commercial | (is_open & in_range)]

    if not df.empty:
        df = df[_within_range(df[tc.INPUT], input_price)]

    if not df.empty:
        df = df[_within_range(df[tc.OUTPUT], output_price)]

    if not df.empty:
        # Every requested modality must be supported (filters are cumulative).
        for modality in (tc.SINGLE_IMG, tc.MULT_IMG, tc.AUDIO, tc.VIDEO):
            if modality in multimodal:
                df = df[df[modality] == True]  # noqa: E712

    if not df.empty:
        want_open = tc.OPEN in open_weight
        want_commercial = tc.COMM in open_weight
        if want_open and not want_commercial:
            df = df[df[tc.OPEN_WEIGHT] == True]  # noqa: E712
        elif want_commercial and not want_open:
            df = df[df[tc.OPEN_WEIGHT] == False]  # noqa: E712
        elif not want_open and not want_commercial:
            # Nothing selected: empty result that keeps columns and dtypes.
            df = df.iloc[0:0]

    if not df.empty:
        has_license = df[tc.LICENSE_NAME].apply(
            lambda name: any(lic in name for lic in license)
        )
        df = df[has_license]

    # Bounds are converted even when no rows are left, so invalid dates are
    # reported regardless of the other filters.
    start_seconds = _epoch_seconds(start)
    end_seconds = _epoch_seconds(end)

    if not df.empty:
        # Compare on whole epoch seconds using a local Series instead of
        # writing the converted values back into the (filtered) frame, which
        # would overwrite the date column and trigger SettingWithCopyWarning.
        # The integer division assumes nanosecond-resolution datetimes.
        # NOTE(review): if tc.TEMP_DATE is one of the columns returned by
        # filter_cols, the output previously held epoch seconds there - confirm.
        released = pd.to_datetime(df[tc.TEMP_DATE]).astype("int64") // 10**9
        df = df[(released >= start_seconds) & (released <= end_seconds)]

    df = filter_cols(df)
    return df.sort_values(by=tc.CLEMSCORE, ascending=False)