nesticot's picture
Upload 2 files
8dab285 verified
raw
history blame
26.7 kB
import polars as pl
import api_scraper
mlb_scrape = api_scraper.MLB_Scrape()
from stuff_model import *
from shiny import App, reactive, ui, render
from shiny.ui import h2, tags
from api_scraper import MLB_Scrape
import datetime
from stuff_model import feature_engineering as fe
from stuff_model import stuff_apply
from pytabulator import TableOptions, Tabulator, output_tabulator, render_tabulator, theme
theme.tabulator_site()
scraper = MLB_Scrape()
df_year_old_group = pl.read_parquet('pitch_data_agg_2024.parquet')
pitcher_old_dict = dict(zip(df_year_old_group['pitcher_id'],df_year_old_group['pitcher_name']))
app_ui = ui.page_fluid(
ui.card(
ui.card_header("2025 Spring Training Pitch Data App"),
ui.row(
ui.column(4,
ui.markdown("""This app generates a table which shows the 2025 Spring Training data.
* Differences are calculated based on 2024 regular season data
* If 2024 data does not exist for pitcher, 2023 Data is used
* If no difference exists, the pitch is labelled as a new pitch"""),
ui.input_action_button(
"refresh",
"Refresh Data",
class_="btn-primary",
width="100%"
)
),
ui.column(3,
ui.div(
"By: ",
ui.tags.a(
"@TJStats",
href="https://x.com/TJStats",
target="_blank"
)
),
ui.tags.p("Data: MLB"),
ui.tags.p(
ui.tags.a(
"Support me on Patreon for more baseball content",
href="https://www.patreon.com/TJ_Stats",
target="_blank"
)
)
)
),
ui.navset_tab(
ui.nav("All Pitches",
output_tabulator("table_all")
),
ui.nav("Daily Pitches",
output_tabulator("table_daily")
),
ui.nav("tjStuff+",
output_tabulator("table_tjstuff")
),
)
)
)
def server(input, output, session):
@output
@render_tabulator
@reactive.event(input.refresh)
def table_all():
import polars as pl
df_spring = pl.read_parquet(f"hf://datasets/TJStatsApps/mlb_data/data/mlb_pitch_data_2025_spring.parquet")
date = (datetime.datetime.now() - datetime.timedelta(hours=8)).date()
print(datetime.datetime.now())
date_str = date.strftime('%Y-%m-%d')
# Initialize the scraper
game_list_input = (scraper.get_schedule(year_input=[int(date_str[0:4])], sport_id=[1], game_type=['S'])
.filter(pl.col('date') == date)['game_id'])
data = scraper.get_data(game_list_input)
df = scraper.get_data_df(data)
df_spring = pl.concat([df_spring, df]).sort('game_date', descending=True)
# df_year_old = stuff_apply.stuff_apply(fe.feature_engineering(pl.concat([df_mlb,df_aaa,df_a,df_afl])))
# df_year_2old = stuff_apply.stuff_apply(fe.feature_engineering(pl.concat([df_mlb_2023])))
df_spring_stuff = stuff_apply.stuff_apply(fe.feature_engineering(pl.concat([df_spring])))
import polars as pl
# Compute total pitches for each pitcher
df_pitcher_totals = df_spring_stuff.group_by("pitcher_id").agg(
pl.col("start_speed").count().alias("pitcher_total")
)
df_spring_group = df_spring_stuff.group_by(['pitcher_id', 'pitcher_name', 'pitch_type']).agg([
pl.col('start_speed').count().alias('count'),
pl.col('start_speed').mean().alias('start_speed'),
pl.col('start_speed').max().alias('max_start_speed'),
pl.col('ivb').mean().alias('ivb'),
pl.col('hb').mean().alias('hb'),
pl.col('release_pos_z').mean().alias('release_pos_z'),
pl.col('release_pos_x').mean().alias('release_pos_x'),
pl.col('extension').mean().alias('extension'),
pl.col('tj_stuff_plus').mean().alias('tj_stuff_plus'),
(pl.col('start_speed').filter(pl.col('batter_hand')=='L').count()).alias('rhh_count'),
(pl.col('start_speed').filter(pl.col('batter_hand')=='R').count()).alias('lhh_count')
])
# Join total pitches per pitcher to the grouped DataFrame on pitcher_id
df_spring_group = df_spring_group.join(df_pitcher_totals, on="pitcher_id", how="left")
# Now calculate the pitch percent for each pitcher/pitch_type combination
df_spring_group = df_spring_group.with_columns(
(pl.col("count") / pl.col("pitcher_total")).alias("pitch_percent")
)
# Optionally, if you want the percentage of left/right-handed batters within the group:
df_spring_group = df_spring_group.with_columns([
(pl.col("rhh_count") / pl.col("pitcher_total")).alias("rhh_percent"),
(pl.col("lhh_count") / pl.col("pitcher_total")).alias("lhh_percent")
])
df_merge = df_spring_group.join(df_year_old_group,on=['pitcher_id','pitch_type'],how='left',suffix='_old')
df_merge = df_merge.with_columns(
pl.col('pitcher_id').is_in(df_year_old_group['pitcher_id']).alias('exists_in_old')
)
df_merge = df_merge.with_columns(
pl.when(pl.col('start_speed_old').is_null() & pl.col('exists_in_old'))
.then(pl.lit(True))
.otherwise(pl.lit(None))
.alias("new_pitch")
)
import polars as pl
# Define the columns to subtract
cols_to_subtract = [
("start_speed", "start_speed_old"),
("max_start_speed", "max_start_speed_old"),
("ivb", "ivb_old"),
("hb", "hb_old"),
("release_pos_z", "release_pos_z_old"),
("release_pos_x", "release_pos_x_old"),
("extension", "extension_old"),
("tj_stuff_plus", "tj_stuff_plus_old")
]
df_merge = df_merge.with_columns([
# Step 1: Create _diff columns with the default value (e.g., 80) if old is null
pl.when(pl.col(old).is_null())
.then(pl.lit(10000)) # If old is null, assign 80 as the default
.otherwise(pl.col(new) - pl.col(old)) # Otherwise subtract old from new
.alias(new + "_diff")
for new, old in cols_to_subtract
])
# Step 2: Format the columns with (value (+diff)) - exclude brackets if diff is 80
df_merge = df_merge.with_columns([
pl.when(pl.col(new + "_diff").eq(10000)) # If diff is 80, no need to include brackets
.then(pl.col(new).round(1).cast(pl.Utf8)+'\n\t') # Just return the new value as string
.otherwise(
pl.col(new).round(1).cast(pl.Utf8) +
"\n(" +
pl.col(new + "_diff").round(1)
.map_elements(lambda x: f"{x:+.1f}") +
")"
).alias(new + "_formatted")
for new, _ in cols_to_subtract
])
percent_cols = ['pitch_percent', 'rhh_percent', 'lhh_percent']
df_merge = df_merge.with_columns([
(pl.col(col) * 100) # Convert to percentage
.round(1) # Round to 1 decimal
.map_elements(lambda x: f"{x:.1f}%") # Format as string with '%'
.alias(col + "_formatted")
for col in percent_cols
]).sort(['pitcher_id','count'],descending=True)
columns = [
{ "title": "Pitcher Name", "field": "pitcher_name", "width": 250, "headerFilter":"input" ,"frozen":True,},
{ "title": "Team", "field": "pitcher_team", "width": 100, "headerFilter":"input" ,"frozen":True,},
{ "title": "Pitch Type", "field": "pitch_type", "width": 125, "headerFilter":"input" ,"frozen":True,},
{ "title": "New Pitch?", "field": "new_pitch", "width": 125, "headerFilter":"input" ,"frozen":False,},
{ "title": "Pitches", "field": "count", "width": 100 , "headerFilter":"input","contextMenu":True},
{ "title": "Pitch%", "field": "pitch_percent_formatted", "width": 100, "headerFilter":"input"},
{ "title": "RHH%", "field": "rhh_percent_formatted", "width": 100, "headerFilter":"input"},
{ "title": "LHH%", "field": "lhh_percent_formatted", "width": 100, "headerFilter":"input"},
{ "title": "Velocity", "field": "start_speed_formatted", "width": 100, "headerFilter":"input", "formatter":"textarea" },
{ "title": "Max Velo", "field": "max_start_speed_formatted", "width": 100, "headerFilter":"input", "formatter":"textarea" },
{ "title": "iVB", "field": "ivb_formatted", "width": 100, "headerFilter":"input", "formatter":"textarea" },
{ "title": "HB", "field": "hb_formatted", "width": 100, "headerFilter":"input", "formatter":"textarea" },
{ "title": "RelH", "field": "release_pos_z_formatted", "width": 100, "headerFilter":"input", "formatter":"textarea" },
{ "title": "RelS", "field": "release_pos_x_formatted", "width": 100, "headerFilter":"input", "formatter":"textarea" },
{ "title": "Extension", "field": "extension_formatted", "width": 125, "headerFilter":"input", "formatter":"textarea" },
{ "title": "tjStuff+", "field": "tj_stuff_plus_formatted", "width": 100, "headerFilter":"input", "formatter":"textarea" }
]
df_plot = df_merge.to_pandas()
team_dict = dict(zip(df_spring['pitcher_id'],df_spring['pitcher_team']))
df_plot['pitcher_team'] = df_plot['pitcher_id'].map(team_dict)
return Tabulator(
df_plot,
table_options=TableOptions(
height=750,
columns=columns,
)
)
@output
@render_tabulator
@reactive.event(input.refresh)
def table_daily():
import polars as pl
df_spring = pl.read_parquet(f"hf://datasets/TJStatsApps/mlb_data/data/mlb_pitch_data_2025_spring.parquet")
import datetime
date = (datetime.datetime.now() - datetime.timedelta(hours=8)).date()
print(datetime.datetime.now())
date_str = date.strftime('%Y-%m-%d')
# Initialize the scraper
game_list_input = (scraper.get_schedule(year_input=[int(date_str[0:4])], sport_id=[1], game_type=['S'])
.filter(pl.col('date') == date)['game_id'])
data = scraper.get_data(game_list_input)
df = scraper.get_data_df(data)
df_spring = pl.concat([df_spring, df]).sort('game_date', descending=True)
# df_year_old = stuff_apply.stuff_apply(fe.feature_engineering(pl.concat([df_mlb,df_aaa,df_a,df_afl])))
# df_year_2old = stuff_apply.stuff_apply(fe.feature_engineering(pl.concat([df_mlb_2023])))
df_spring_stuff = stuff_apply.stuff_apply(fe.feature_engineering(pl.concat([df_spring])))
import polars as pl
# Compute total pitches for each pitcher
df_pitcher_totals = df_spring_stuff.group_by(["pitcher_id",'game_id','game_date']).agg(
pl.col("start_speed").count().alias("pitcher_total")
)
df_spring_group = df_spring_stuff.group_by(['pitcher_id', 'pitcher_name', 'pitch_type','game_id','game_date']).agg([
pl.col('start_speed').count().alias('count'),
pl.col('start_speed').mean().alias('start_speed'),
pl.col('start_speed').max().alias('max_start_speed'),
pl.col('ivb').mean().alias('ivb'),
pl.col('hb').mean().alias('hb'),
pl.col('release_pos_z').mean().alias('release_pos_z'),
pl.col('release_pos_x').mean().alias('release_pos_x'),
pl.col('extension').mean().alias('extension'),
pl.col('tj_stuff_plus').mean().alias('tj_stuff_plus'),
(pl.col('start_speed').filter(pl.col('batter_hand')=='L').count()).alias('rhh_count'),
(pl.col('start_speed').filter(pl.col('batter_hand')=='R').count()).alias('lhh_count')
])
# Join total pitches per pitcher to the grouped DataFrame on pitcher_id
df_spring_group = df_spring_group.join(df_pitcher_totals, on=["pitcher_id",'game_id','game_date'], how="left")
# Now calculate the pitch percent for each pitcher/pitch_type combination
df_spring_group = df_spring_group.with_columns(
(pl.col("count") / pl.col("pitcher_total")).alias("pitch_percent")
)
# Optionally, if you want the percentage of left/right-handed batters within the group:
df_spring_group = df_spring_group.with_columns([
(pl.col("rhh_count") / pl.col("pitcher_total")).alias("rhh_percent"),
(pl.col("lhh_count") / pl.col("pitcher_total")).alias("lhh_percent")
])
df_merge = df_spring_group.join(df_year_old_group,on=['pitcher_id','pitch_type'],how='left',suffix='_old')
df_merge = df_merge.with_columns(
pl.col('pitcher_id').is_in(df_year_old_group['pitcher_id']).alias('exists_in_old')
)
df_merge = df_merge.with_columns(
pl.when(pl.col('start_speed_old').is_null() & pl.col('exists_in_old'))
.then(pl.lit(True))
.otherwise(pl.lit(None))
.alias("new_pitch")
)
import polars as pl
# Define the columns to subtract
cols_to_subtract = [
("start_speed", "start_speed_old"),
("max_start_speed", "max_start_speed_old"),
("ivb", "ivb_old"),
("hb", "hb_old"),
("release_pos_z", "release_pos_z_old"),
("release_pos_x", "release_pos_x_old"),
("extension", "extension_old"),
("tj_stuff_plus", "tj_stuff_plus_old")
]
df_merge = df_merge.with_columns([
# Step 1: Create _diff columns with the default value (e.g., 80) if old is null
pl.when(pl.col(old).is_null())
.then(pl.lit(10000)) # If old is null, assign 80 as the default
.otherwise(pl.col(new) - pl.col(old)) # Otherwise subtract old from new
.alias(new + "_diff")
for new, old in cols_to_subtract
])
# Step 2: Format the columns with (value (+diff)) - exclude brackets if diff is 80
df_merge = df_merge.with_columns([
pl.when(pl.col(new + "_diff").eq(10000)) # If diff is 80, no need to include brackets
.then(pl.col(new).round(1).cast(pl.Utf8)+'\n\t') # Just return the new value as string
.otherwise(
pl.col(new).round(1).cast(pl.Utf8) +
"\n(" +
pl.col(new + "_diff").round(1)
.map_elements(lambda x: f"{x:+.1f}") +
")"
).alias(new + "_formatted")
for new, _ in cols_to_subtract
])
percent_cols = ['pitch_percent', 'rhh_percent', 'lhh_percent']
df_merge = df_merge.with_columns([
(pl.col(col) * 100) # Convert to percentage
.round(1) # Round to 1 decimal
.map_elements(lambda x: f"{x:.1f}%") # Format as string with '%'
.alias(col + "_formatted")
for col in percent_cols
]).sort(['pitcher_id','count'],descending=True)
columns = [
{ "title": "Pitcher Name", "field": "pitcher_name", "width": 250, "headerFilter":"input" ,"frozen":True,},
{ "title": "Team", "field": "pitcher_team", "width": 100, "headerFilter":"input" ,"frozen":True,},
{ "title": "Pitch Type", "field": "pitch_type", "width": 125, "headerFilter":"input" ,"frozen":True,},
{ "title": "New Pitch?", "field": "new_pitch", "width": 125, "headerFilter":"input" ,"frozen":False,},
{ "title": "Date", "field": "game_date", "width": 100, "headerFilter":"input" ,"frozen":True,},
{ "title": "Pitches", "field": "count", "width": 100 , "headerFilter":"input"},
{ "title": "Pitch%", "field": "pitch_percent_formatted", "width": 100, "headerFilter":"input"},
{ "title": "RHH%", "field": "rhh_percent_formatted", "width": 100, "headerFilter":"input"},
{ "title": "LHH%", "field": "lhh_percent_formatted", "width": 100, "headerFilter":"input"},
{ "title": "Velocity", "field": "start_speed_formatted", "width": 100, "headerFilter":"input", "formatter":"textarea" },
{ "title": "Max Velo", "field": "max_start_speed_formatted", "width": 100, "headerFilter":"input", "formatter":"textarea" },
{ "title": "iVB", "field": "ivb_formatted", "width": 100, "headerFilter":"input", "formatter":"textarea" },
{ "title": "HB", "field": "hb_formatted", "width": 100, "headerFilter":"input", "formatter":"textarea" },
{ "title": "RelH", "field": "release_pos_z_formatted", "width": 100, "headerFilter":"input", "formatter":"textarea" },
{ "title": "RelS", "field": "release_pos_x_formatted", "width": 100, "headerFilter":"input", "formatter":"textarea" },
{ "title": "Extension", "field": "extension_formatted", "width": 125, "headerFilter":"input", "formatter":"textarea" },
{ "title": "tjStuff+", "field": "tj_stuff_plus_formatted", "width": 100, "headerFilter":"input", "formatter":"textarea" }
]
df_plot = df_merge.to_pandas()
team_dict = dict(zip(df_spring['pitcher_id'],df_spring['pitcher_team']))
df_plot['pitcher_team'] = df_plot['pitcher_id'].map(team_dict)
return Tabulator(
df_plot,
table_options=TableOptions(
height=750,
columns=columns,
)
)
@output
@render_tabulator
@reactive.event(input.refresh)
def table_tjstuff():
import polars as pl
df_spring = pl.read_parquet(f"hf://datasets/TJStatsApps/mlb_data/data/mlb_pitch_data_2025_spring.parquet")
import datetime
date = (datetime.datetime.now() - datetime.timedelta(hours=8)).date()
print(datetime.datetime.now())
date_str = date.strftime('%Y-%m-%d')
# Initialize the scraper
game_list_input = (scraper.get_schedule(year_input=[int(date_str[0:4])], sport_id=[1], game_type=['S'])
.filter(pl.col('date') == date)['game_id'])
data = scraper.get_data(game_list_input)
df = scraper.get_data_df(data)
df_spring = pl.concat([df_spring, df]).sort('game_date', descending=True)
# df_year_old = stuff_apply.stuff_apply(fe.feature_engineering(pl.concat([df_mlb,df_aaa,df_a,df_afl])))
# df_year_2old = stuff_apply.stuff_apply(fe.feature_engineering(pl.concat([df_mlb_2023])))
df_spring_stuff = stuff_apply.stuff_apply(fe.feature_engineering(pl.concat([df_spring])))
import polars as pl
# Compute total pitches for each pitcher
df_pitcher_totals = df_spring_stuff.group_by(["pitcher_id"]).agg(
pl.col("start_speed").count().alias("pitcher_total")
)
df_spring_group = df_spring_stuff.group_by(['pitcher_id', 'pitcher_name', 'pitch_type']).agg([
pl.col('start_speed').count().alias('count'),
pl.col('start_speed').mean().alias('start_speed'),
pl.col('start_speed').max().alias('max_start_speed'),
pl.col('ivb').mean().alias('ivb'),
pl.col('hb').mean().alias('hb'),
pl.col('release_pos_z').mean().alias('release_pos_z'),
pl.col('release_pos_x').mean().alias('release_pos_x'),
pl.col('extension').mean().alias('extension'),
pl.col('tj_stuff_plus').mean().alias('tj_stuff_plus'),
(pl.col('start_speed').filter(pl.col('batter_hand')=='L').count()).alias('rhh_count'),
(pl.col('start_speed').filter(pl.col('batter_hand')=='R').count()).alias('lhh_count')
])
# Join total pitches per pitcher to the grouped DataFrame on pitcher_id
df_spring_group = df_spring_group.join(df_pitcher_totals, on=["pitcher_id"], how="left")
# Now calculate the pitch percent for each pitcher/pitch_type combination
df_spring_group = df_spring_group.with_columns(
(pl.col("count") / pl.col("pitcher_total")).alias("pitch_percent")
)
# Optionally, if you want the percentage of left/right-handed batters within the group:
df_spring_group = df_spring_group.with_columns([
(pl.col("rhh_count") / pl.col("pitcher_total")).alias("rhh_percent"),
(pl.col("lhh_count") / pl.col("pitcher_total")).alias("lhh_percent")
])
df_merge = df_spring_group.join(df_year_old_group,on=['pitcher_id','pitch_type'],how='left',suffix='_old')
df_merge = df_merge.with_columns(
pl.col('pitcher_id').is_in(df_year_old_group['pitcher_id']).alias('exists_in_old')
)
df_merge = df_merge.with_columns(
pl.when(pl.col('start_speed_old').is_null() & pl.col('exists_in_old'))
.then(pl.lit(True))
.otherwise(pl.lit(None))
.alias("new_pitch")
)
import polars as pl
# Define the columns to subtract
cols_to_subtract = [
("start_speed", "start_speed_old"),
("max_start_speed", "max_start_speed_old"),
("ivb", "ivb_old"),
("hb", "hb_old"),
("release_pos_z", "release_pos_z_old"),
("release_pos_x", "release_pos_x_old"),
("extension", "extension_old"),
("tj_stuff_plus", "tj_stuff_plus_old")
]
df_merge = df_merge.with_columns([
# Step 1: Create _diff columns with the default value (e.g., 80) if old is null
pl.when(pl.col(old).is_null())
.then(pl.lit(None)) # If old is null, assign 80 as the default
.otherwise(pl.col(new) - pl.col(old)) # Otherwise subtract old from new
.alias(new + "_diff")
for new, old in cols_to_subtract
])
# Step 2: Format the columns with (value (+diff)) - exclude brackets if diff is 80
# Step 2: Format the columns with (value (+diff)) - exclude brackets if diff is 80
df_merge = df_merge.with_columns([
pl.col(new).round(1).cast(pl.Utf8).alias(new + "_formatted")
for new, _ in cols_to_subtract
])
df_merge = df_merge.with_columns([
pl.col("tj_stuff_plus_old").round(1).cast(pl.Utf8).alias("tj_stuff_plus_old"),
pl.col("tj_stuff_plus_diff").round(1).map_elements(lambda x: f"{x:+.1f}").alias("tj_stuff_plus_diff")
])
percent_cols = ['pitch_percent', 'rhh_percent', 'lhh_percent']
df_merge = df_merge.with_columns([
(pl.col(col) * 100) # Convert to percentage
.round(1) # Round to 1 decimal
.map_elements(lambda x: f"{x:.1f}%") # Format as string with '%'
.alias(col + "_formatted")
for col in percent_cols
]).sort(['pitcher_id','count'],descending=True)
columns = [
{ "title": "Pitcher Name", "field": "pitcher_name", "width": 250, "headerFilter":"input" ,"frozen":True,},
{ "title": "Team", "field": "pitcher_team", "width": 90, "headerFilter":"input" ,"frozen":True,},
{ "title": "Pitch Type", "field": "pitch_type", "width": 125, "headerFilter":"input" ,"frozen":True,},
{ "title": "New?", "field": "new_pitch", "width": 125, "headerFilter":"input" ,"frozen":False,},
{ "title": "Pitches", "field": "count", "width": 100 , "headerFilter":"input"},
{ "title": "Pitch%", "field": "pitch_percent_formatted", "width": 100, "headerFilter":"input"},
{ "title": "RHH%", "field": "rhh_percent_formatted", "width": 90, "headerFilter":"input"},
{ "title": "LHH%", "field": "lhh_percent_formatted", "width": 90, "headerFilter":"input"},
{ "title": "Velocity", "field": "start_speed_formatted", "width": 100, "headerFilter":"input", "formatter":"textarea" },
{ "title": "Max Velo", "field": "max_start_speed_formatted", "width": 100, "headerFilter":"input", "formatter":"textarea" },
{ "title": "iVB", "field": "ivb_formatted", "width": 80, "headerFilter":"input", "formatter":"textarea" },
{ "title": "HB", "field": "hb_formatted", "width": 80, "headerFilter":"input", "formatter":"textarea" },
{ "title": "RelH", "field": "release_pos_z_formatted", "width": 80, "headerFilter":"input", "formatter":"textarea" },
{ "title": "RelS", "field": "release_pos_x_formatted", "width": 80, "headerFilter":"input", "formatter":"textarea" },
{ "title": "Extension", "field": "extension_formatted", "width": 125, "headerFilter":"input", "formatter":"textarea" },
{ "title": "tjStuff+", "field": "tj_stuff_plus_formatted", "width": 100, "headerFilter":"input", "formatter":"textarea" },
{ "title": "2024 tjStuff+", "field": "tj_stuff_plus_old", "width": 100, "headerFilter":"input", "formatter":"textarea" },
{ "title": "Δ", "field": "tj_stuff_plus_diff", "width": 100, "headerFilter":"input", "formatter":"textarea" }
]
df_plot = df_merge.sort(['pitcher_id','count'],descending=True).to_pandas()
team_dict = dict(zip(df_spring['pitcher_id'],df_spring['pitcher_team']))
df_plot['pitcher_team'] = df_plot['pitcher_id'].map(team_dict)
return Tabulator(
df_plot,
table_options=TableOptions(
height=750,
columns=columns,
)
)
app = App(app_ui, server)