nesticot committed
Commit b3ddf26 · verified · 1 Parent(s): 310c527

Delete stuff_model

stuff_model/__pycache__/feature_engineering.cpython-39.pyc DELETED
Binary file (2.17 kB)
 
stuff_model/__pycache__/stuff_apply.cpython-39.pyc DELETED
Binary file (1.33 kB)
 
stuff_model/feature_engineering.py DELETED
@@ -1,118 +0,0 @@
- import polars as pl
- import numpy as np
-
- def feature_engineering(df: pl.DataFrame) -> pl.DataFrame:
-     # Extract the year from the game_date column
-     df = df.with_columns(
-         pl.col('game_date').str.slice(0, 4).alias('year')
-     )
-
-     df = df.with_columns([
-
-         (-(pl.col('vy0')**2 - (2 * pl.col('ay') * (pl.col('y0') - 17/12)))**0.5).alias('vy_f'),
-     ])
-
-     df = df.with_columns([
-         ((pl.col('vy_f') - pl.col('vy0')) / pl.col('ay')).alias('t'),
-     ])
-
-     df = df.with_columns([
-         (pl.col('vz0') + (pl.col('az') * pl.col('t'))).alias('vz_f'),
-         (pl.col('vx0') + (pl.col('ax') * pl.col('t'))).alias('vx_f')
-     ])
-
-     df = df.with_columns([
-         (-np.arctan(pl.col('vz_f') / pl.col('vy_f')) * (180 / np.pi)).alias('vaa'),
-         (-np.arctan(pl.col('vx_f') / pl.col('vy_f')) * (180 / np.pi)).alias('haa')
-     ])
-
-     # Mirror horizontal break for left-handed pitchers
-     df = df.with_columns(
-         pl.when(pl.col('pitcher_hand') == 'L')
-         .then(-pl.col('ax'))
-         .otherwise(pl.col('ax'))
-         .alias('ax')
-     )
-
-     # Mirror horizontal break for left-handed pitchers
-     df = df.with_columns(
-         pl.when(pl.col('pitcher_hand') == 'L')
-         .then(-pl.col('hb'))
-         .otherwise(pl.col('hb'))
-         .alias('hb')
-     )
-
-     # Mirror horizontal release point for left-handed pitchers
-     df = df.with_columns(
-         pl.when(pl.col('pitcher_hand') == 'L')
-         .then(pl.col('x0'))
-         .otherwise(-pl.col('x0'))
-         .alias('x0')
-     )
-
-     # Define the pitch types to be considered
-     pitch_types = ['SI', 'FF', 'FC']
-
-     # Filter the DataFrame to include only the specified pitch types
-     df_filtered = df.filter(pl.col('pitch_type').is_in(pitch_types))
-
-     # Group by pitcher_id and year, then aggregate to calculate average speed and usage percentage
-     df_agg = df_filtered.group_by(['pitcher_id', 'year', 'pitch_type']).agg([
-         pl.col('start_speed').mean().alias('avg_fastball_speed'),
-         pl.col('az').mean().alias('avg_fastball_az'),
-         pl.col('ax').mean().alias('avg_fastball_ax'),
-         pl.len().alias('count')
-     ])
-
-     # Sort the aggregated data by count and average fastball speed
-     df_agg = df_agg.sort(['count', 'avg_fastball_speed'], descending=[True, True])
-     df_agg = df_agg.unique(subset=['pitcher_id', 'year'], keep='first')
-
-     # Join the aggregated data with the main DataFrame
-     df = df.join(df_agg, on=['pitcher_id', 'year'])
-
-     # If no fastball, use the fastest pitch for avg_fastball_speed
-     df = df.with_columns(
-         pl.when(pl.col('avg_fastball_speed').is_null())
-         .then(pl.col('start_speed').max().over('pitcher_id'))
-         .otherwise(pl.col('avg_fastball_speed'))
-         .alias('avg_fastball_speed')
-     )
-
-     # If no fastball, use the fastest pitch for avg_fastball_az
-     df = df.with_columns(
-         pl.when(pl.col('avg_fastball_az').is_null())
-         .then(pl.col('az').max().over('pitcher_id'))
-         .otherwise(pl.col('avg_fastball_az'))
-         .alias('avg_fastball_az')
-     )
-
-     # If no fastball, use the fastest pitch for avg_fastball_ax
-     df = df.with_columns(
-         pl.when(pl.col('avg_fastball_ax').is_null())
-         .then(pl.col('ax').max().over('ax'))
-         .otherwise(pl.col('avg_fastball_ax'))
-         .alias('avg_fastball_ax')
-     )
-
-     # Calculate pitch differentials
-     df = df.with_columns(
-         (pl.col('start_speed') - pl.col('avg_fastball_speed')).alias('speed_diff'),
-         (pl.col('az') - pl.col('avg_fastball_az')).alias('az_diff'),
-         (pl.col('ax') - pl.col('avg_fastball_ax')).abs().alias('ax_diff')
-     )
-
-     # Cast the year column to integer type
-     df = df.with_columns(
-         pl.col('year').cast(pl.Int64)
-     )
-
-     df = df.with_columns([
-         pl.lit('All').alias('all')
-     ])
-
-     return df
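
For reference, the vaa/haa columns in the file above follow standard constant-acceleration kinematics: the release-plane velocity components are propagated to the front of home plate at y = 17/12 ft, and the approach angles are the arctangents of the vertical and horizontal components against the forward component. Below is a minimal numpy sketch of the vertical case; the release values are made up for illustration and are not from this commit.

import numpy as np

# Hypothetical Statcast-style release values (ft and ft/s), for illustration only
vy0, ay = -130.0, 28.0    # forward velocity and drag deceleration at y0
vz0, az = -5.0, -16.0     # vertical velocity and net vertical acceleration
y0 = 50.0                 # measurement plane, 50 ft from home plate

vy_f = -np.sqrt(vy0**2 - 2 * ay * (y0 - 17/12))  # forward velocity at the plate front
t = (vy_f - vy0) / ay                            # flight time from y0 to the plate
vz_f = vz0 + az * t                              # vertical velocity at the plate
vaa = -np.degrees(np.arctan(vz_f / vy_f))        # vertical approach angle, degrees
print(round(vaa, 2))                             # roughly -5.4 for these inputs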
 
stuff_model/lgbm_model_2020_2023.joblib DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:41001a1acf6ce7dbe247f1b8b7e68a1bb1b112f39d080b7e95a83479e56cb7c1
- size 3092328
 
stuff_model/stuff_apply.py DELETED
@@ -1,57 +0,0 @@
- import polars as pl
- import joblib
-
- model = joblib.load('stuff_model/lgbm_model_2020_2023.joblib')
- # Read the values from the text file
- with open('stuff_model/target_stats.txt', 'r') as file:
-     lines = file.readlines()
-     target_mean = float(lines[0].strip())
-     target_std = float(lines[1].strip())
-
- # Define the features to be used for training
- features = ['start_speed',
-             'spin_rate',
-             'extension',
-             'az',
-             'ax',
-             'x0',
-             'z0',
-             'speed_diff',
-             'az_diff',
-             'ax_diff']
-
-
- def stuff_apply(df:pl.DataFrame) -> pl.DataFrame:
-     # Filter the dataframe to include only the rows for the year 2024 and drop rows with null values in the specified features and target column
-     # df_test = df.drop_nulls(subset=features)
-     df_test = df.clone()
-
-     # Predict the target values for the 2024 data using the trained model
-     df_test = df_test.with_columns(
-         pl.Series(name="target", values=model.predict(df_test[features].to_numpy()))
-     )
-     # Standardize the target column to create a z-score
-     df_test = df_test.with_columns(
-         ((pl.col('target') - target_mean) / target_std).alias('target_zscore')
-     )
-
-     # Convert the z-score to tj_stuff_plus
-     df_test = df_test.with_columns(
-         (100 - (pl.col('target_zscore') * 10)).alias('tj_stuff_plus')
-     )
-
-     df_pitch_types = pl.read_csv('stuff_model/tj_stuff_plus_pitch.csv')
-
-     # Join the pitch type statistics with the main DataFrame based on pitch_type
-     df_pitch_all = df_test.join(df_pitch_types, left_on='pitch_type', right_on='pitch_type')
-
-     # Normalize pitch_grade values to a range between -0.5 and 0.5 based on the percentiles
-     df_pitch_all = df_pitch_all.with_columns(
-         ((pl.col('tj_stuff_plus') - pl.col('mean')) / pl.col('std')).alias('pitch_grade')
-     )
-
-     # Scale the pitch_grade values to a range between 20 and 80
-     df_pitch_all = df_pitch_all.with_columns(
-         (pl.col('pitch_grade') * 10 + 50).clip(20, 80)
-     )
-     return df_pitch_all
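
For context, a minimal sketch of how these two deleted modules were presumably chained; the input file name and its columns are assumptions, not part of this commit.

import polars as pl
from stuff_model.feature_engineering import feature_engineering
from stuff_model.stuff_apply import stuff_apply

# Hypothetical pitch-level input containing the columns both functions expect
df = pl.read_csv('pitch_data.csv')

df = feature_engineering(df)   # adds speed_diff, az_diff, ax_diff, etc.
scored = stuff_apply(df)       # adds tj_stuff_plus and the 20-80 pitch_grade
print(scored.select(['pitcher_id', 'pitch_type', 'tj_stuff_plus', 'pitch_grade']).head())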
 
stuff_model/target_stats.txt DELETED
@@ -1,2 +0,0 @@
- 0.0034732498406374636
- 0.006846752748626548
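
Per stuff_apply.py above, the first value is the target mean and the second the target standard deviation used for the z-score. A worked example with a hypothetical model prediction of 0.010:

target_mean = 0.0034732498406374636
target_std = 0.006846752748626548

pred = 0.010                            # hypothetical model output
z = (pred - target_mean) / target_std   # ≈ 0.95
tj_stuff_plus = 100 - z * 10            # ≈ 90.5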
 
stuff_model/tj_stuff_plus_pitch.csv DELETED
@@ -1,16 +0,0 @@
- pitch_type,mean,std,median,min,max,percentile_1,percentile_99
- ST,106.44784631565936,5.593943599731136,106.24878922952112,91.18894850636659,125.29541262167034,91.69322149368426,125.25688309207108
- SV,103.73183202363764,3.001226780758946,103.50047554089315,93.3173875900245,111.34757479687066,93.32953434698274,111.33689503153641
- SL,103.49296290610897,5.265572779780409,103.19144262214559,88.84957017284297,121.88798777026031,89.76670287371176,121.36013955239422
- KC,101.8993919341341,4.271694896723436,100.79211889194949,93.69754063161618,119.4933202093256,93.75149298057133,119.38166236091195
- All,99.9275100894791,5.01699442232884,99.65265124489378,84.73033633038408,116.94934527087541,86.65905811630736,116.7610246502804
- CU,99.88832068607897,4.615228571103906,99.08993373693156,89.84495168337246,119.90089262632986,90.20429983334718,117.89567125997061
- FC,98.83449547008738,5.811964883678063,98.54483029899575,83.20928731685326,119.78700324933075,83.34007602984008,118.21186533190846
- FS,98.25541635267653,6.898952096824192,98.46204303842217,72.25450024197754,114.88400714657823,73.39595959354874,114.78967217449389
- FO,98.15224613640243,1.081819065809178,99.94816563615653,94.0023252668585,100.50624750619224,94.0142169475971,100.50513134245217
- FF,97.29024735737988,6.078459125845886,97.09670890504734,81.2230917971995,118.10419744965911,81.32311771953398,117.7938724746093
- SC,97.27958020025409,1.2452898498180456,97.27958020025409,93.536223938276,101.02293646223218,93.54371065079995,101.01544974970822
- CH,96.35866365133434,6.178939251378385,95.80884625564597,81.28802319264824,121.14136334013493,82.02275793969746,119.09639344796777
- SI,95.14161603816645,4.9734372581529955,95.11657827702109,82.5850956341191,112.99618112461533,82.8856383780296,112.72626192694757
- CS,93.97853627048322,0.0,93.97853627048322,93.97853627048322,93.97853627048322,93.97853627048322,93.97853627048322
- KN,93.41890096234394,0.0,93.41890096234394,93.41890096234394,93.41890096234394,93.41890096234394,93.41890096234394
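
These per-pitch-type means and standard deviations are what stuff_apply.py joins in to convert tj_stuff_plus into a 20-80 pitch_grade. A worked example using the ST row and a hypothetical tj_stuff_plus of 112:

mean, std = 106.44784631565936, 5.593943599731136   # ST row above
tj_stuff_plus = 112.0                               # hypothetical sweeper score
grade = (tj_stuff_plus - mean) / std * 10 + 50      # ≈ 59.9
grade = min(max(grade, 20), 80)                     # clipped to the 20-80 scale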