LLMCalc / src /filter_utils.py
carbonnnnn's picture
add date picker using dropdown
fd62121
raw
history blame
5.4 kB
# Utility functions for filtering the dataframe
import pandas as pd
import assets.text_content as tc
def filter_cols(df):
    """Return ``df`` narrowed to a fixed, ordered list of columns.

    The column names come from constants in ``tc`` (``assets.text_content``);
    every one of them must be present in ``df``.
    """
    selected_columns = [
        tc.MODEL_NAME,
        tc.CLEMSCORE,
        tc.INPUT,
        tc.OUTPUT,
        tc.LATENCY,
        tc.CONTEXT,
        tc.PARAMS,
        tc.RELEASE_DATE,
        tc.LICENSE,
    ]
    return df[selected_columns]
def convert_date_components_to_timestamp(year: "int | str", month: "int | str") -> int:
    """Convert a year and month to the Unix timestamp of that month's first day.

    Args:
        year: Calendar year as an int or a numeric string (e.g. 2023 or "2023").
        month: Month number as an int or a numeric string (e.g. 1, "1" or "01").

    Returns:
        Whole seconds since the Unix epoch for midnight on the first day of the
        given month. The date carries no timezone; pandas treats such naive
        timestamps as UTC when computing the epoch value.

    Raises:
        ValueError: If ``year`` or ``month`` cannot be converted to an integer.
    """
    # Coerce first: the ``02d`` format spec only accepts integers, so a string
    # month (as the signature advertises) would otherwise raise ValueError.
    date_str = f"{int(year)}-{int(month):02d}-01"
    return int(pd.to_datetime(date_str).timestamp())
def filter_by_date(df: pd.DataFrame,
                   start_year: str,
                   start_month: str,
                   end_year: str,
                   end_month: str,
                   date_column: str) -> pd.DataFrame:
    """
    Filter DataFrame by date range using separate year and month components.

    The range is inclusive at month granularity: it runs from the first day of
    the start month through the last moment of the end month. Rows whose date
    is missing (NaT) are dropped. An empty DataFrame is returned unchanged in
    shape (empty, same columns).

    Args:
        df: DataFrame to filter
        start_year: Starting year (e.g., "2023")
        start_month: Starting month (e.g., "1" for January)
        end_year: Ending year (e.g., "2024")
        end_month: Ending month (e.g., "12" for December)
        date_column: Name of the date column to filter on

    Returns:
        The rows of ``df`` whose ``date_column`` value falls inside the range.

    Raises:
        ValueError: If a year or month component cannot be converted to int.
    """
    range_start = pd.Timestamp(year=int(start_year), month=int(start_month), day=1)
    # Exclusive upper bound: the first day of the month *after* the end month,
    # so every day of the end month is kept (not just its first day).
    range_end = (
        pd.Timestamp(year=int(end_year), month=int(end_month), day=1)
        + pd.DateOffset(months=1)
    )

    dates = pd.to_datetime(df[date_column])
    # Timezone-aware values are normalised to naive UTC so they can be compared
    # with the naive bounds above.
    if getattr(dates.dtype, "tz", None) is not None:
        dates = dates.dt.tz_convert("UTC").dt.tz_localize(None)

    # Comparing datetimes directly (instead of per-row epoch integers) works
    # on empty frames and treats NaT as "not in range" rather than raising.
    in_range = (dates >= range_start) & (dates < range_end)
    return df[in_range]
def filter(df, language_list, parameters, input_price, output_price, multimodal,
           context, open_weight, start_year, start_month, end_year, end_month, license):
    """Apply all user-selected filters to ``df`` and return the result.

    The name shadows the builtin ``filter`` (as ``license`` shadows the
    builtin of that name); both are kept because callers depend on them.

    Args:
        df: DataFrame to filter; must contain the columns named by the ``tc``
            constants used below.
        language_list: Languages that must all be contained in a row's
            ``tc.LANGS`` value.
        parameters: ``(min, max)`` parameter-size range. Applied to open-weight
            rows only; when ``max`` is at or above the largest open-weight
            size, no upper bound is applied.
        input_price: ``(min, max)`` inclusive range for ``tc.INPUT``.
        output_price: ``(min, max)`` inclusive range for ``tc.OUTPUT``.
        multimodal: Collection of modality names (``tc.SINGLE_IMG``,
            ``tc.MULT_IMG``, ``tc.AUDIO``, ``tc.VIDEO``); each one present
            requires the column of the same name to be True.
        context: Currently unused (no context-size filter is applied).
        open_weight: Collection that may contain ``tc.OPEN`` and/or
            ``tc.COMM``; if it contains neither, the result is empty.
        start_year, start_month, end_year, end_month: Date range passed to
            ``filter_by_date`` against the ``tc.TEMP_DATE`` column.
        license: License names; a row is kept if any of them is contained in
            its ``tc.LICENSE_NAME`` value.

    Returns:
        The filtered rows, reduced to the columns chosen by ``filter_cols``
        and sorted by ``tc.CLEMSCORE`` in descending order.
    """
    if not df.empty:
        df = df[df[tc.LANGS].apply(lambda langs: all(lang in langs for lang in language_list))]

    if not df.empty:
        # Split by open weight: the parameter range applies to open models only.
        open_weight_true = df[df[tc.OPEN_WEIGHT] == True]
        open_weight_false = df[df[tc.OPEN_WEIGHT] == False]

        if not open_weight_true.empty:
            max_parameter_size = open_weight_true[tc.PARAMS].max()
            in_range = open_weight_true[tc.PARAMS] >= parameters[0]
            if not parameters[1] >= max_parameter_size:
                # Upper bound only matters when it sits below the largest model.
                in_range = in_range & (open_weight_true[tc.PARAMS] <= parameters[1])
            filtered_open = open_weight_true[in_range]

            # Combine filtered open weight models with unfiltered commercial
            # models. Done inside this branch so ``filtered_open`` is always
            # bound when it is used.
            df = pd.concat([filtered_open, open_weight_false])

    if not df.empty:
        df = df[(df[tc.INPUT] >= input_price[0]) & (df[tc.INPUT] <= input_price[1])]

    if not df.empty:
        df = df[(df[tc.OUTPUT] >= output_price[0]) & (df[tc.OUTPUT] <= output_price[1])]

    if not df.empty:
        # Each selected modality must be supported (column value True).
        for modality in (tc.SINGLE_IMG, tc.MULT_IMG, tc.AUDIO, tc.VIDEO):
            if modality in multimodal:
                df = df[df[modality] == True]

    if not df.empty:
        wants_open = tc.OPEN in open_weight
        wants_commercial = tc.COMM in open_weight
        if wants_open and not wants_commercial:
            df = df[df[tc.OPEN_WEIGHT] == True]
        elif wants_commercial and not wants_open:
            df = df[df[tc.OPEN_WEIGHT] == False]
        elif not wants_open and not wants_commercial:
            # Nothing selected: return empty DataFrame with same columns.
            df = pd.DataFrame(columns=df.columns)

    if not df.empty:
        df = df[df[tc.LICENSE_NAME].apply(lambda name: any(lic in name for lic in license))]

    # Guarded like every other step: there is nothing to filter in an empty
    # frame, and skipping the call avoids date conversion on it.
    if not df.empty:
        df = filter_by_date(df, start_year, start_month, end_year, end_month, tc.TEMP_DATE)

    df = filter_cols(df)
    df = df.sort_values(by=tc.CLEMSCORE, ascending=False)

    return df