File size: 4,654 Bytes
68e6513
 
fbdc657
144f299
890203a
 
 
 
 
fbdc657
68e6513
 
 
144f299
 
 
 
 
 
 
 
 
68e6513
1580227
68e6513
 
890203a
fd62121
 
 
 
 
 
 
890203a
 
 
fd62121
 
 
890203a
 
 
 
 
fd62121
890203a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fd62121
68e6513
fbdc657
890203a
 
 
ee6a180
890203a
3433b65
144f299
ee6a180
 
834ab51
ee6a180
3433b65
144f299
3433b65
 
144f299
3433b65
 
f3c3cd9
 
144f299
 
 
 
 
 
 
 
3433b65
834ab51
 
 
 
 
 
ee6a180
3433b65
144f299
 
 
 
 
7ce5480
 
fbdc657
3433b65
144f299
fbdc657
fd62121
68e6513
 
144f299
ee6a180
68e6513
 
fd62121
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
# Utility functions for filtering the dataframe

import pandas as pd
import assets.text_content as tc
import calendar
from typing import Union, List
from datetime import datetime

# Calendar year captured once at import time and kept as a string; it is the
# fallback end year used by filter_by_date when no end year is supplied.
current_year = f"{datetime.now().year}"

def filter_cols(df):
    """Return ``df`` reduced to a fixed, ordered set of columns.

    The columns kept, in order, are the ones named by the ``tc`` constants
    MODEL_NAME, CLEMSCORE, INPUT, OUTPUT, LATENCY, CONTEXT, PARAMS,
    RELEASE_DATE and LICENSE.  Every other column is dropped.
    """
    kept_columns = [
        tc.MODEL_NAME,
        tc.CLEMSCORE,
        tc.INPUT,
        tc.OUTPUT,
        tc.LATENCY,
        tc.CONTEXT,
        tc.PARAMS,
        tc.RELEASE_DATE,
        tc.LICENSE,
    ]
    return df[kept_columns]


def convert_date_components_to_timestamp(year: Union[int, str], month: Union[int, str]) -> int:
    """Convert a year and month to the timestamp of that month's first day.

    Args:
        year: Calendar year, as an int or a numeric string (``2024`` or ``"2024"``).
        month: Month number in 1..12, as an int or a numeric string
            (``3``, ``"3"`` or ``"03"``).

    Returns:
        Whole seconds since the epoch for ``year-month-01`` at midnight, as
        reported by ``pandas.Timestamp.timestamp()`` (pandas evaluates a
        timezone-naive timestamp as UTC).

    Raises:
        ValueError: If a component is not an integer value, or ``month`` is
            outside 1..12.
        TypeError: If a component cannot be converted with ``int()`` at all
            (for example ``None``).
    """
    # Normalise both components first: the ``02d`` format spec below only
    # works on integers, so a string month would otherwise raise.
    year_num = int(year)
    month_num = int(month)

    # Reject impossible months explicitly instead of relying on how the date
    # parser happens to treat a string such as "2024-13-01".
    if not 1 <= month_num <= 12:
        raise ValueError(f"month must be between 1 and 12, got {month!r}")

    # First day of the requested month, zero-padding the month number.
    date_str = f"{year_num}-{month_num:02d}-01"
    return int(pd.to_datetime(date_str).timestamp())

def filter_by_date(df: pd.DataFrame, 
                  start_year, start_month,
                  end_year, end_month,
                  date_column: str = tc.RELEASE_DATE) -> pd.DataFrame:
    """
    Filter DataFrame by date range using separate year and month components.

    A row is kept when its date falls on or after the first day of the start
    month and before the first day of the month following the end month, so
    the whole end month is included.

    Args:
        df: DataFrame to filter.
        start_year: First year of the range; any falsy value (``None``, ``""``,
            an empty list) selects ``tc.START_YEAR``.
        start_month: Month name used as a key of ``tc.MONTH_MAP``; falsy
            selects ``"January"``.
        end_year: Last year of the range; falsy selects the module-level
            ``current_year``.
        end_month: Month name used as a key of ``tc.MONTH_MAP``; falsy selects
            ``"December"``.
        date_column: Name of the column in ``df`` holding the dates.

    Returns:
        The rows of ``df`` inside the range.  If the date components or the
        column values cannot be converted, the error is printed and ``df`` is
        returned unfiltered.  An empty ``df`` is returned as is.
    """
    # The caller passes all four components at once, so unset ones arrive as
    # empty values (e.g. empty lists) rather than being omitted. Defaults are
    # therefore applied here instead of in the signature.
    if not start_year:
        start_year = tc.START_YEAR
    if not end_year:
        end_year = current_year

    if not start_month:
        start_month = "January"
    if not end_month:
        end_month = "December"

    # Nothing to filter; also avoids running the conversions on a frame with
    # no rows.
    if df.empty:
        return df

    try:
        # Resolve the month names and coerce everything to integers.
        start_year_num = int(start_year)
        start_month_num = int(tc.MONTH_MAP[start_month])
        end_year_num = int(end_year)
        end_month_num = int(tc.MONTH_MAP[end_month])

        start_timestamp = convert_date_components_to_timestamp(
            start_year_num,
            start_month_num
        )

        # Exclusive upper bound: first day of the month AFTER the end month,
        # rolling over into the next year after December. Using the first day
        # of the end month itself would drop every later day of that month.
        if end_month_num == 12:
            next_year_num, next_month_num = end_year_num + 1, 1
        else:
            next_year_num, next_month_num = end_year_num, end_month_num + 1

        end_timestamp_exclusive = convert_date_components_to_timestamp(
            next_year_num,
            next_month_num
        )
    except (KeyError, ValueError, TypeError) as e:
        # KeyError covers a month name that is not in tc.MONTH_MAP.
        print(f"Error processing dates: {e}")
        return df  # Return unfiltered DataFrame if there's an error

    # A missing date column is a programming error, so this lookup is kept
    # outside the best-effort handling and still raises.
    date_values = df[date_column]

    try:
        # Convert the DataFrame's date column to timestamps for comparison
        date_timestamps = pd.to_datetime(date_values).apply(lambda x: int(x.timestamp()))
    except (ValueError, TypeError) as e:
        print(f"Error processing dates: {e}")
        return df  # Return unfiltered DataFrame if there's an error

    # Filter the DataFrame
    return df[
        (date_timestamps >= start_timestamp) &
        (date_timestamps < end_timestamp_exclusive)
    ]


def filter(df, language_list, parameters, input_price, output_price, multimodal,
           context, open_weight, 
           start_year, start_month, end_year, end_month, 
           license ):
    """Apply all selected filters to ``df`` and return the sorted result.

    The row filters run in a fixed order and stop as soon as no rows are
    left.  Afterwards the date filter, the column selection and the sort are
    always applied.

    Args:
        df: DataFrame to filter.
        language_list: A row is kept only if its ``tc.LANGS`` value contains
            every entry of this list.
        parameters: ``[min, max]`` inclusive bounds on ``tc.DUMMY_PARAMS``.
        input_price: ``[min, max]`` inclusive bounds on ``tc.INPUT``.
        output_price: ``[min, max]`` inclusive bounds on ``tc.OUTPUT``.
        multimodal: Collection of selected modality constants (``tc.TEXT``,
            ``tc.SINGLE_IMG``, ``tc.MULT_IMG``, ``tc.AUDIO``, ``tc.VIDEO``).
        context: ``[min, max]`` inclusive bounds on the context-size column.
        open_weight: Collection that may contain ``tc.OPEN`` and/or ``tc.COMM``.
        start_year, start_month, end_year, end_month: Forwarded unchanged to
            ``filter_by_date`` together with the ``tc.TEMP_DATE`` column.
        license: A row is kept if any entry is contained in its
            ``tc.LICENSE_NAME`` value.

    Returns:
        The filtered rows, reduced by ``filter_cols`` and sorted by
        ``tc.CLEMSCORE`` in descending order.
    """

    def keep_languages(frame):
        # Every requested language must be present in the row's language value.
        def has_all_languages(langs):
            return all(lang in langs for lang in language_list)
        return frame[frame[tc.LANGS].apply(has_all_languages)]

    def keep_parameter_range(frame):
        sizes = frame[tc.DUMMY_PARAMS]
        return frame[(sizes >= parameters[0]) & (sizes <= parameters[1])]

    def keep_input_price_range(frame):
        prices = frame[tc.INPUT]
        return frame[(prices >= input_price[0]) & (prices <= input_price[1])]

    def keep_output_price_range(frame):
        prices = frame[tc.OUTPUT]
        return frame[(prices >= output_price[0]) & (prices <= output_price[1])]

    def keep_modalities(frame):
        # Selecting text keeps only rows with every other modality flag False.
        if tc.TEXT in multimodal:
            no_single_image = frame[tc.SINGLE_IMG] == False
            no_multi_image = frame[tc.MULT_IMG] == False
            no_audio = frame[tc.AUDIO] == False
            no_video = frame[tc.VIDEO] == False
            frame = frame[no_single_image & no_multi_image & no_audio & no_video]
        # Each further selected modality narrows the rows to those flagged True.
        for modality in (tc.SINGLE_IMG, tc.MULT_IMG, tc.AUDIO, tc.VIDEO):
            if modality in multimodal:
                frame = frame[frame[modality] == True]
        return frame

    def keep_context_range(frame):
        # Non-numeric context sizes are coerced to NaN and then counted as 0.
        # NOTE(review): the column name is hard-coded here; presumably it is
        # the same column as tc.CONTEXT -- TODO confirm.
        sizes = pd.to_numeric(frame['Context Size (k)'], errors='coerce').fillna(0)
        return frame[(sizes >= context[0]) & (sizes <= context[1])]

    def keep_weight_type(frame):
        wants_open = tc.OPEN in open_weight
        wants_commercial = tc.COMM in open_weight
        if wants_open and wants_commercial:
            # Both kinds selected: nothing to narrow down.
            return frame
        if wants_open:
            return frame[frame[tc.OPEN_WEIGHT] == True]
        if wants_commercial:
            return frame[frame[tc.OPEN_WEIGHT] == False]
        # Neither kind selected: empty DataFrame with the same columns.
        return pd.DataFrame(columns=frame.columns)

    def keep_licenses(frame):
        # A row matches when at least one selected license occurs in its name.
        def has_any_license(name):
            return any(lic in name for lic in license)
        return frame[frame[tc.LICENSE_NAME].apply(has_any_license)]

    row_filters = (
        keep_languages,
        keep_parameter_range,
        keep_input_price_range,
        keep_output_price_range,
        keep_modalities,
        keep_context_range,
        keep_weight_type,
        keep_licenses,
    )

    # Once no rows remain, no later row filter is applied.
    for row_filter in row_filters:
        if df.empty:
            break
        df = row_filter(df)

    # These steps run regardless of whether any rows are left.
    dated = filter_by_date(df, start_year, start_month, end_year, end_month, tc.TEMP_DATE)
    selected = filter_cols(dated)
    return selected.sort_values(by=tc.CLEMSCORE, ascending=False)