File size: 4,654 Bytes
68e6513 fbdc657 144f299 890203a fbdc657 68e6513 144f299 68e6513 1580227 68e6513 890203a fd62121 890203a fd62121 890203a fd62121 890203a fd62121 68e6513 fbdc657 890203a ee6a180 890203a 3433b65 144f299 ee6a180 834ab51 ee6a180 3433b65 144f299 3433b65 144f299 3433b65 f3c3cd9 144f299 3433b65 834ab51 ee6a180 3433b65 144f299 7ce5480 fbdc657 3433b65 144f299 fbdc657 fd62121 68e6513 144f299 ee6a180 68e6513 fd62121 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 |
# Utility functions for filtering the dataframe
import pandas as pd
import assets.text_content as tc
import calendar
from typing import Union, List
from datetime import datetime
# Current calendar year as a string, computed once when the module is imported;
# filter_by_date falls back to it when no end year is supplied.
current_year = f"{datetime.now().year}"
def filter_cols(df):
    """Return ``df`` restricted to a fixed, ordered selection of columns.

    The column names come from constants in ``tc``; the result contains
    exactly these columns, in the order listed below.
    """
    selected_columns = [
        tc.MODEL_NAME,
        tc.CLEMSCORE,
        tc.INPUT,
        tc.OUTPUT,
        tc.LATENCY,
        tc.CONTEXT,
        tc.PARAMS,
        tc.RELEASE_DATE,
        tc.LICENSE,
    ]
    return df[selected_columns]
def convert_date_components_to_timestamp(year: Union[int, str], month: Union[int, str]) -> int:
    """Return the Unix timestamp (whole seconds) of the first day of a month.

    Args:
        year: Calendar year, as an int or a numeric string (2024 or "2024").
        month: Month number, as an int or a numeric string (3 or "3").

    Returns:
        Seconds since the epoch for ``<year>-<month>-01 00:00``. The value is
        produced by ``pandas.Timestamp.timestamp()`` on a timezone-naive
        timestamp.

    Raises:
        ValueError: If a component is not numeric or the pair does not form a
            valid date.
        TypeError: If a component cannot be converted with ``int()``.
    """
    # Normalise to int first: the ``02d`` format spec only accepts integers,
    # so a string month would otherwise raise even though it is numeric.
    month_number = int(month)
    date_str = f"{int(year)}-{month_number:02d}-01"
    return int(pd.to_datetime(date_str).timestamp())
def filter_by_date(df: pd.DataFrame,
                   start_year, start_month,
                   end_year, end_month,
                   date_column: str = tc.RELEASE_DATE) -> pd.DataFrame:
    """
    Filter DataFrame by date range using separate year and month components.

    Args:
        df: Frame to filter.
        start_year: First year of the range; falsy values (e.g. an empty
            list) fall back to ``tc.START_YEAR``.
        start_month: Month name used as a key into ``tc.MONTH_MAP``; falsy
            values fall back to "January".
        end_year: Last year of the range; falsy values fall back to the
            current year.
        end_month: Month name used as a key into ``tc.MONTH_MAP``; falsy
            values fall back to "December".
        date_column: Name of the column holding the dates to compare.

    Returns:
        The rows whose date lies between the first day of the start month and
        the last moment of the end month (both months included). If the
        inputs cannot be interpreted, a message is printed and ``df`` is
        returned unfiltered.
    """
    # Defaults are applied here rather than in the signature because callers
    # pass every argument explicitly, and unset selections arrive as empty
    # values that would override signature defaults.
    if not start_year:
        start_year = tc.START_YEAR
    if not end_year:
        end_year = current_year
    if not start_month:
        start_month = "January"
    if not end_month:
        end_month = "December"

    try:
        first_year = int(start_year)
        first_month = int(tc.MONTH_MAP[start_month])
        last_year = int(end_year)
        last_month = int(tc.MONTH_MAP[end_month])
    except (KeyError, ValueError, TypeError) as e:
        # An unknown month name (KeyError) is treated like any other
        # unusable date input: report it and leave the frame unfiltered.
        print(f"Error processing dates: {e}")
        return df

    # The end month is inclusive, so the upper bound is the first instant of
    # the FOLLOWING month, compared with a strict "<". Comparing against the
    # first day of the end month itself would drop every row dated later in
    # that month.
    if last_month == 12:
        bound_year, bound_month = last_year + 1, 1
    else:
        bound_year, bound_month = last_year, last_month + 1

    try:
        start_timestamp = convert_date_components_to_timestamp(first_year, first_month)
        end_timestamp = convert_date_components_to_timestamp(bound_year, bound_month)
        # Convert the DataFrame's date column to timestamps for comparison
        date_timestamps = pd.to_datetime(df[date_column]).apply(lambda x: int(x.timestamp()))
        # Filter the DataFrame
        return df[
            (date_timestamps >= start_timestamp) &
            (date_timestamps < end_timestamp)
        ]
    except (ValueError, TypeError) as e:
        print(f"Error processing dates: {e}")
        return df  # Return unfiltered DataFrame if there's an error
def filter(df, language_list, parameters, input_price, output_price, multimodal,
           context, open_weight,
           start_year, start_month, end_year, end_month,
           license):
    """Run ``df`` through each filter stage in turn and return the sorted result.

    Every stage except the date filter and the final column selection/sort is
    skipped once the frame has become empty.

    Args:
        df: Frame to filter.
        language_list: Languages that must all be contained in a row's
            ``tc.LANGS`` value.
        parameters: Indexable pair; ``[0]`` and ``[1]`` are the inclusive
            lower and upper bounds for ``tc.DUMMY_PARAMS``.
        input_price: Inclusive ``[0]``/``[1]`` bounds for ``tc.INPUT``.
        output_price: Inclusive ``[0]``/``[1]`` bounds for ``tc.OUTPUT``.
        multimodal: Collection of selected modality labels.
        context: Inclusive ``[0]``/``[1]`` bounds for 'Context Size (k)'.
        open_weight: Collection that may contain ``tc.OPEN`` and/or ``tc.COMM``.
        start_year, start_month, end_year, end_month: Forwarded to
            ``filter_by_date``.
        license: Strings of which at least one must be contained in a row's
            ``tc.LICENSE_NAME`` value.

    Returns:
        The remaining rows, reduced by ``filter_cols`` and sorted by
        ``tc.CLEMSCORE`` in descending order.
    """

    def within(values, bounds):
        # Inclusive range mask: bounds[0] <= values <= bounds[1].
        return (values >= bounds[0]) & (values <= bounds[1])

    # Languages: keep rows that contain every requested language.
    if not df.empty:
        has_all_languages = df[tc.LANGS].apply(
            lambda supported: all(lang in supported for lang in language_list)
        )
        df = df[has_all_languages]

    # Numeric range stages, applied one after another.
    if not df.empty:
        df = df[within(df[tc.DUMMY_PARAMS], parameters)]

    if not df.empty:
        df = df[within(df[tc.INPUT], input_price)]

    if not df.empty:
        df = df[within(df[tc.OUTPUT], output_price)]

    # Modalities: tc.TEXT keeps rows where every media flag is False; each
    # selected media label then additionally requires its own flag to be True.
    if not df.empty:
        media_columns = (tc.SINGLE_IMG, tc.MULT_IMG, tc.AUDIO, tc.VIDEO)
        if tc.TEXT in multimodal:
            no_media = df[media_columns[0]] == False
            for column in media_columns[1:]:
                no_media = no_media & (df[column] == False)
            df = df[no_media]
        for column in media_columns:
            if column in multimodal:
                df = df[df[column] == True]

    # Context size: values that cannot be parsed as numbers count as 0.
    if not df.empty:
        context_size = pd.to_numeric(df['Context Size (k)'], errors='coerce').fillna(0)
        df = df[within(context_size, context)]

    # Weights: selecting both options keeps everything, selecting neither
    # leaves an empty frame with the same columns.
    if not df.empty:
        wants_open = tc.OPEN in open_weight
        wants_commercial = tc.COMM in open_weight
        if wants_open and not wants_commercial:
            df = df[df[tc.OPEN_WEIGHT] == True]
        elif wants_commercial and not wants_open:
            df = df[df[tc.OPEN_WEIGHT] == False]
        elif not wants_open and not wants_commercial:
            df = pd.DataFrame(columns=df.columns)

    # Licenses: keep rows matching at least one selected license.
    if not df.empty:
        has_license = df[tc.LICENSE_NAME].apply(
            lambda name: any(lic in name for lic in license)
        )
        df = df[has_license]

    df = filter_by_date(df, start_year, start_month, end_year, end_month, tc.TEMP_DATE)

    reduced = filter_cols(df)
    return reduced.sort_values(by=tc.CLEMSCORE, ascending=False)
|