nesticot committed on
Commit
40d77df
·
verified ·
1 Parent(s): b3ddf26

Delete api_scraper.py

Browse files
Files changed (1) hide show
  1. api_scraper.py +0 -927
api_scraper.py DELETED
@@ -1,927 +0,0 @@
1
- import requests
2
- import polars as pl
3
- import numpy as np
4
- from datetime import datetime
5
- from tqdm import tqdm
6
- from pytz import timezone
7
- import re
8
- from concurrent.futures import ThreadPoolExecutor, as_completed
9
- import time
10
-
11
-
12
- class MLB_Scrape:
13
-
14
def __init__(self):
    """Create the scraper; the methods shown here keep no instance state."""
    return None
17
-
18
def get_sport_id(self):
    """
    Fetch the list of sports from the MLB API as a Polars DataFrame.

    Returns:
    - df (pl.DataFrame): A DataFrame built from the 'sports' entry of the API response.
    """
    sports_url = 'https://statsapi.mlb.com/api/v1/sports'

    # Request the sports listing and decode the JSON body
    payload = requests.get(url=sports_url).json()

    # Only the 'sports' entry of the payload is tabulated
    return pl.DataFrame(payload['sports'])
32
-
33
def get_sport_id_check(self, sport_id: int = 1):
    """
    Report whether a sport ID is among the sports returned by get_sport_id().

    Parameters:
    - sport_id (int): The sport ID to look for. Default is 1.

    Returns:
    - bool: True when the ID is found. Otherwise False, after printing a prompt and the available sports.
    """
    # Pull the current sports listing
    available_sports = self.get_sport_id()

    # Known ID: nothing to report
    if sport_id in available_sports['id']:
        return True

    # Unknown ID: show the caller what can be chosen instead
    print('Please Select a New Sport ID from the following')
    print(available_sports)
    return False
53
-
54
-
55
def get_game_types(self):
    """
    Fetch the MLB game types from the MLB API as a Polars DataFrame.

    Returns:
    - df (pl.DataFrame): A DataFrame built directly from the decoded API response.
    """
    game_types_url = 'https://statsapi.mlb.com/api/v1/gameTypes'

    # Request the game type listing and decode the JSON body
    payload = requests.get(url=game_types_url).json()

    # The whole payload is handed to Polars as-is
    return pl.DataFrame(payload)
69
-
70
def get_schedule(self,
                 year_input: list = [2024],
                 sport_id: list = [1],
                 game_type: list = ['R']):
    """
    Retrieves the schedule of baseball games based on the specified parameters.

    Parameters:
    - year_input (list): A list of years (ints) to filter the schedule. Default is [2024].
    - sport_id (list): A list of sport IDs (ints) to filter the schedule. Default is [1].
    - game_type (list): A list of game types (strs) to filter the schedule. Default is ['R'].

    Returns:
    - game_df (pl.DataFrame): One row per unique game, sorted by date, with the columns
      game_id, time, date, away, home, state, venue_id and venue_name. 'time' is formatted
      as "%I:%M %p" after conversion to US/Eastern.
    - str: The message 'Schedule Length of 0, please select different parameters.' when the
      response contains no games.

    Raises:
    - ValueError: If an argument is not a list of the expected element type.
    """
    # NOTE: the list defaults are only read, never mutated, so sharing them across calls is safe.

    # Type checks (these run before any network access)
    if not isinstance(year_input, list) or not all(isinstance(year, int) for year in year_input):
        raise ValueError("year_input must be a list of integers.")
    if not isinstance(sport_id, list) or not all(isinstance(sid, int) for sid in sport_id):
        raise ValueError("sport_id must be a list of integers.")
    if not isinstance(game_type, list) or not all(isinstance(gt, str) for gt in game_type):
        raise ValueError("game_type must be a list of strings.")

    eastern = timezone('US/Eastern')

    # Convert input lists to comma-separated strings
    year_input_str = ','.join(str(x) for x in year_input)
    sport_id_str = ','.join(str(x) for x in sport_id)
    game_type_str = ','.join(str(x) for x in game_type)

    # Make API call to retrieve game schedule
    game_call = requests.get(url=f'https://statsapi.mlb.com/api/v1/schedule/?sportId={sport_id_str}&gameTypes={game_type_str}&season={year_input_str}&hydrate=lineup,players').json()

    # Flatten the per-date game lists once; a response without 'dates' is treated as empty
    games = [game for day in game_call.get('dates', []) for game in day['games']]

    if not games:
        return 'Schedule Length of 0, please select different parameters.'

    # Create a Polars DataFrame with the extracted data
    game_df = pl.DataFrame(data={
        'game_id': [game['gamePk'] for game in games],
        'time': [game['gameDate'] for game in games],
        'date': [game['officialDate'] for game in games],
        'away': [game['teams']['away']['team']['name'] for game in games],
        'home': [game['teams']['home']['team']['name'] for game in games],
        'state': [game['status']['codedGameState'] for game in games],
        'venue_id': [game['venue']['id'] for game in games],
        'venue_name': [game['venue']['name'] for game in games],
    })

    # Convert date and time columns to appropriate formats
    game_df = game_df.with_columns(
        game_df['date'].str.to_date(),
        game_df['time'].str.to_datetime().dt.convert_time_zone(eastern.zone).dt.strftime("%I:%M %p"))

    # Remove duplicate games and sort by date. De-duplicating a non-empty
    # frame cannot empty it, so no second emptiness check is needed.
    return game_df.unique(subset='game_id').sort('date')
141
-
142
def get_data(self, game_list_input: list):
    """
    Retrieves live game data for a list of game IDs.

    Games are requested one at a time, in the order given, while a progress
    bar reports how many have been fetched.

    Parameters:
    - game_list_input (list): A list of game IDs for which to retrieve live data.

    Returns:
    - data_total (list): A list of decoded JSON responses, one per game ID, in input order.
    """
    data_total = []
    print('This May Take a While. Progress Bar shows Completion of Data Retrieval.')

    # Iterate over the game IDs directly; tqdm wraps the iterable to draw the progress bar
    for game_pk in tqdm(game_list_input, desc="Processing", unit="iteration"):
        # Make a GET request to the MLB API for each game ID
        r = requests.get(f'https://statsapi.mlb.com/api/v1.1/game/{game_pk}/feed/live')
        # Append the JSON response to the data_total list
        data_total.append(r.json())

    return data_total
166
-
167
def get_data_df(self, data_list):
    """
    Converts a list of game data JSON objects into a Polars DataFrame.

    Every play event of every at-bat is inspected and produces at most one row:

    - an event that is flagged 'isPitch' or whose details contain a 'call'
      entry yields a fully populated row;
    - any other event whose count shows four balls yields a sparse row that
      carries only the game/matchup identifiers, the count, the event-level
      identifiers and the at-bat's event / event_type;
    - all remaining events are skipped.

    Fields missing from the feed are stored as None, so every column always
    receives exactly one value per row.

    Parameters:
    - data_list (list): A list of JSON objects (dicts) containing game data. Each must
      provide 'gamePk', 'gameData' and 'liveData' -> 'plays' -> 'allPlays'.

    Returns:
    - df (pl.DataFrame): A DataFrame containing the structured game data, one row per
      retained play event, built with strict=False.

    Raises:
    - KeyError: If a game lacks the required top-level entries, or a retained play
      lacks 'about' -> 'isTopInning'.
    """
    swing_codes = ('X', 'F', 'S', 'D', 'E', 'T', 'W')
    whiff_codes = ('S', 'T', 'W')
    print('Converting Data to Dataframe.')

    # Output columns, in the order they appear in the resulting DataFrame.
    columns = (
        'game_id', 'game_date',
        'batter_id', 'batter_name', 'batter_hand', 'batter_team', 'batter_team_id',
        'pitcher_id', 'pitcher_name', 'pitcher_hand', 'pitcher_team', 'pitcher_team_id',
        'ab_number', 'play_description', 'play_code', 'in_play', 'is_strike',
        'is_swing', 'is_whiff', 'is_out', 'is_ball', 'is_review',
        'pitch_type', 'pitch_description',
        'strikes', 'balls', 'outs', 'strikes_after', 'balls_after', 'outs_after',
        'start_speed', 'end_speed', 'sz_top', 'sz_bot', 'x', 'y',
        'ax', 'ay', 'az', 'pfxx', 'pfxz', 'px', 'pz',
        'vx0', 'vy0', 'vz0', 'x0', 'y0', 'z0',
        'zone', 'type_confidence', 'plate_time', 'extension',
        'spin_rate', 'spin_direction', 'vb', 'ivb', 'hb',
        'launch_speed', 'launch_angle', 'launch_distance', 'launch_location',
        'trajectory', 'hardness', 'hit_x', 'hit_y',
        'index_play', 'play_id', 'start_time', 'end_time', 'is_pitch', 'type_type',
        'type_ab', 'event', 'event_type', 'rbi', 'away_score', 'home_score',
    )

    # (output column, feed key) pairs, grouped by the feed mapping they are read from.
    detail_fields = (
        ('play_description', 'description'), ('play_code', 'code'),
        ('in_play', 'isInPlay'), ('is_strike', 'isStrike'),
        # NOTE(review): this column used to be filled from 'isOut'; 'isBall' is
        # assumed to be the matching flag next to 'isStrike' - confirm against the feed.
        ('is_ball', 'isBall'),
        ('is_review', 'hasReview'),
    )
    pitch_fields = (
        ('start_speed', 'startSpeed'), ('end_speed', 'endSpeed'),
        ('sz_top', 'strikeZoneTop'), ('sz_bot', 'strikeZoneBottom'),
        ('zone', 'zone'), ('type_confidence', 'typeConfidence'),
        ('plate_time', 'plateTime'), ('extension', 'extension'),
    )
    coordinate_fields = (
        ('x', 'x'), ('y', 'y'),
        ('ax', 'aX'), ('ay', 'aY'), ('az', 'aZ'),
        ('pfxx', 'pfxX'), ('pfxz', 'pfxZ'), ('px', 'pX'), ('pz', 'pZ'),
        ('vx0', 'vX0'), ('vy0', 'vY0'), ('vz0', 'vZ0'),
        ('x0', 'x0'), ('y0', 'y0'), ('z0', 'z0'),
    )
    break_fields = (
        ('spin_rate', 'spinRate'), ('spin_direction', 'spinDirection'),
        ('vb', 'breakVertical'), ('ivb', 'breakVerticalInduced'),
        ('hb', 'breakHorizontal'),
    )
    hit_fields = (
        ('launch_speed', 'launchSpeed'), ('launch_angle', 'launchAngle'),
        ('launch_distance', 'totalDistance'), ('launch_location', 'location'),
        ('trajectory', 'trajectory'), ('hardness', 'hardness'),
    )
    hit_coordinate_fields = (('hit_x', 'coordX'), ('hit_y', 'coordY'))
    event_fields = (
        ('index_play', 'index'), ('play_id', 'playId'),
        ('start_time', 'startTime'), ('end_time', 'endTime'),
        ('is_pitch', 'isPitch'), ('type_type', 'type'),
    )
    result_fields = (
        ('type_ab', 'type'), ('event', 'event'), ('event_type', 'eventType'),
        ('rbi', 'rbi'), ('away_score', 'awayScore'), ('home_score', 'homeScore'),
        ('is_out', 'isOut'),
    )

    # Column-oriented storage: one list per output column, all kept the same length.
    table = {column: [] for column in columns}

    def section(mapping, key):
        """Return the nested mapping stored under key, or {} when it is absent."""
        return mapping.get(key) or {}

    def pick(mapping, fields):
        """Read each feed key from mapping into its output column (None when missing)."""
        return {column: mapping.get(feed_key) for column, feed_key in fields}

    def participants(game, play):
        """Build the game, batter, pitcher and team columns shared by every row of a play."""
        matchup = section(play, 'matchup')
        batter = section(matchup, 'batter')
        pitcher = section(matchup, 'pitcher')
        teams = game['gameData']['teams']
        # When 'isTopInning' is set the away team supplies the batter and the
        # home team the pitcher; otherwise the roles are reversed.
        if play['about']['isTopInning']:
            batting, fielding = section(teams, 'away'), section(teams, 'home')
        else:
            batting, fielding = section(teams, 'home'), section(teams, 'away')
        return {
            'game_id': game['gamePk'],
            'game_date': game['gameData']['datetime']['officialDate'],
            'batter_id': batter.get('id'),
            'batter_name': batter.get('fullName'),
            'batter_hand': section(matchup, 'batSide').get('code'),
            'batter_team': batting.get('abbreviation'),
            'batter_team_id': batting.get('id'),
            'pitcher_id': pitcher.get('id'),
            'pitcher_name': pitcher.get('fullName'),
            'pitcher_hand': section(matchup, 'pitchHand').get('code'),
            'pitcher_team': fielding.get('abbreviation'),
            'pitcher_team_id': fielding.get('id'),
        }

    def emit(row):
        """Append one row to the table; columns the row does not set become None."""
        for column in columns:
            table[column].append(row.get(column))

    for data in data_list:
        for play in data['liveData']['plays']['allPlays']:
            events = play['playEvents']
            result = section(play, 'result')
            last_index = len(events) - 1

            for idx, play_event in enumerate(events):
                details = section(play_event, 'details')
                count = section(play_event, 'count')

                if play_event.get('isPitch') or 'call' in details:
                    row = participants(data, play)
                    row['ab_number'] = play.get('atBatIndex')

                    row.update(pick(details, detail_fields))
                    code = details.get('code')
                    # Swing / whiff flags are True or None (never False).
                    row['is_swing'] = True if code in swing_codes else None
                    row['is_whiff'] = True if code in whiff_codes else None
                    pitch_kind = section(details, 'type')
                    row['pitch_type'] = pitch_kind.get('code')
                    row['pitch_description'] = pitch_kind.get('description')

                    # Count before the event: 0-0 for the first pitch (or when no
                    # earlier event exists), otherwise the count recorded on the
                    # preceding event.
                    if play_event.get('pitchNumber') == 1 or idx == 0:
                        row['strikes'] = 0
                        row['balls'] = 0
                        row['outs'] = count.get('outs')
                    else:
                        previous = section(events[idx - 1], 'count')
                        row['strikes'] = previous.get('strikes')
                        row['balls'] = previous.get('balls')
                        row['outs'] = previous.get('outs')
                    row['strikes_after'] = count.get('strikes')
                    row['balls_after'] = count.get('balls')
                    row['outs_after'] = count.get('outs')

                    # Tracking data; every sub-mapping is optional.
                    pitch = section(play_event, 'pitchData')
                    row.update(pick(pitch, pitch_fields))
                    row.update(pick(section(pitch, 'coordinates'), coordinate_fields))
                    row.update(pick(section(pitch, 'breaks'), break_fields))

                    hit = section(play_event, 'hitData')
                    row.update(pick(hit, hit_fields))
                    row.update(pick(section(hit, 'coordinates'), hit_coordinate_fields))

                    row.update(pick(play_event, event_fields))

                    # The at-bat result is attached to the final event only.
                    if idx == last_index:
                        row.update(pick(result, result_fields))

                    emit(row)

                elif count.get('balls') == 4:
                    # Non-pitch event recorded with four balls: keep a sparse row.
                    # 'ab_number' is deliberately left as None here, as before.
                    row = participants(data, play)
                    row['event'] = result.get('event')
                    row['event_type'] = result.get('eventType')

                    # Before and after counts both come from this event.
                    row['strikes'] = row['strikes_after'] = count.get('strikes')
                    row['balls'] = row['balls_after'] = count.get('balls')
                    row['outs'] = row['outs_after'] = count.get('outs')

                    row.update(pick(play_event, event_fields))
                    emit(row)

    df = pl.DataFrame(data=table, strict=False)

    return df
719
-
720
- # def get_players(self,sport_id:int):
721
- # player_data = requests.get(url=f'https://statsapi.mlb.com/api/v1/sports/{sport_id}/players').json()
722
-
723
- # #Select relevant data that will help distinguish players from one another
724
- # fullName_list = [x['fullName'] for x in player_data['people']]
725
- # id_list = [x['id'] for x in player_data['people']]
726
- # position_list = [x['primaryPosition']['abbreviation'] for x in player_data['people']]
727
- # team_list = [x['currentTeam']['id']for x in player_data['people']]
728
- # age_list = [x['currentAge']for x in player_data['people']]
729
-
730
- # player_df = pl.DataFrame(data={'player_id':id_list,
731
- # 'name':fullName_list,
732
- # 'position':position_list,
733
- # 'team':team_list,
734
- # 'age':age_list})
735
- # return player_df
736
-
737
def get_teams(self):
    """
    Retrieves information about MLB teams from the MLB API and processes it into a Polars DataFrame.

    Only API entries that carry a 'franchiseName' key are kept. Missing parent
    organization IDs/names are filled from the team's own ID/franchise, and the
    parent organization's abbreviation is attached via a self-join.

    Returns:
    - mlb_teams_df (pl.DataFrame): A DataFrame sorted by team_id with the columns
      team_id, city, name, franchise, abbreviation, parent_org_id, parent_org,
      league_id, league_name and parent_org_abbreviation.
    """
    # Make API call to retrieve team information
    teams = requests.get(url='https://statsapi.mlb.com/api/v1/teams/').json()

    # Entries without a franchise name previously produced an all-null identity
    # (including team_id) and were then dropped; filter them out up front instead.
    franchise_teams = [x for x in teams['teams'] if 'franchiseName' in x]

    # Create a Polars DataFrame with the extracted data
    mlb_teams_df = pl.DataFrame(data={
        'team_id': [x['id'] for x in franchise_teams],
        # 'city' is the franchise (location) name; it used to be filled with the
        # full team name by mistake, duplicating the 'franchise' column.
        'city': [x['franchiseName'] for x in franchise_teams],
        'name': [x.get('teamName') for x in franchise_teams],
        'franchise': [x.get('name') for x in franchise_teams],
        'abbreviation': [x.get('abbreviation') for x in franchise_teams],
        'parent_org_id': [x.get('parentOrgId') for x in franchise_teams],
        'parent_org': [x.get('parentOrgName') for x in franchise_teams],
        'league_id': [x.get('league', {}).get('id') for x in franchise_teams],
        'league_name': [x.get('league', {}).get('name') for x in franchise_teams],
    }).unique().drop_nulls(subset=['team_id']).sort('team_id')

    # A team without a parent organization is treated as its own parent:
    # fall back to the team's ID and franchise name respectively.
    mlb_teams_df = mlb_teams_df.with_columns(
        pl.when(pl.col('parent_org_id').is_null())
        .then(pl.col('team_id'))
        .otherwise(pl.col('parent_org_id'))
        .alias('parent_org_id'),
        pl.when(pl.col('parent_org').is_null())
        .then(pl.col('franchise'))
        .otherwise(pl.col('parent_org'))
        .alias('parent_org'),
    )

    # Create a DataFrame for parent organization abbreviations
    abbreviation_df = mlb_teams_df.select(['team_id', 'abbreviation']).rename(
        {'team_id': 'parent_org_id', 'abbreviation': 'parent_org_abbreviation'}
    )

    # Join the parent organization abbreviations with the main DataFrame
    mlb_teams_df = mlb_teams_df.join(abbreviation_df, on='parent_org_id', how='left')

    return mlb_teams_df
797
-
798
def get_leagues(self):
    """
    Retrieves the leagues known to the MLB API and returns them as a Polars DataFrame.

    Any field that is absent from a league entry is recorded as None.

    Returns:
    - leagues_df (pl.DataFrame): A DataFrame with the columns league_id, league_name,
      league_abbreviation and sport_id, one row per league entry in the response.
    """
    # Fetch the raw league listing
    payload = requests.get(url='https://statsapi.mlb.com/api/v1/leagues/').json()
    entries = payload['leagues']

    # Build each output column; the sport ID lives one level down under 'sport'
    columns = {
        'league_id': [entry.get('id') for entry in entries],
        'league_name': [entry.get('name') for entry in entries],
        'league_abbreviation': [entry.get('abbreviation') for entry in entries],
        'sport_id': [entry['sport']['id'] if 'sport' in entry else None for entry in entries],
    }

    return pl.DataFrame(data=columns)
823
-
824
def get_player_games_list(self, player_id: int, season: int, start_date: str = None, end_date: str = None, sport_id: int = 1, game_type: list = ['R']):
    """
    Retrieves a list of game IDs for a specific player in a given season.

    Parameters:
    - player_id (int): The ID of the player.
    - season (int): The season year for which to retrieve the game list.
    - start_date (str): The start date (YYYY-MM-DD) of the range (default is January 1st of the specified season).
    - end_date (str): The end date (YYYY-MM-DD) of the range (default is December 31st of the specified season).
    - sport_id (int): The ID of the sport for which to retrieve player data.
    - game_type (list): A list of game types to filter the schedule. Default is ['R'].

    Returns:
    - player_game_list (list): A list of game IDs in which the player participated during the specified season.
      Empty when the response contains no people, stats or splits entries.

    Raises:
    - ValueError: If start_date or end_date is not in YYYY-MM-DD format.
    """
    # Set default start and end dates if not provided
    if not start_date:
        start_date = f'{season}-01-01'
    if not end_date:
        end_date = f'{season}-12-31'

    # Validate date format. fullmatch is used because '$' with match() also
    # accepts a trailing newline, which would then be embedded in the URL.
    date_pattern = re.compile(r'\d{4}-\d{2}-\d{2}')
    for label, value in (('start_date', start_date), ('end_date', end_date)):
        if not date_pattern.fullmatch(value):
            raise ValueError(f"{label} {value} is not in YYYY-MM-DD format")

    game_type_str = ','.join([str(x) for x in game_type])

    # Build the URL once so the request and the trace output cannot drift apart;
    # https matches every other statsapi call in this file.
    url = (
        f'https://statsapi.mlb.com/api/v1/people/{player_id}'
        f'?hydrate=stats(type=gameLog,season={season},startDate={start_date},'
        f'endDate={end_date},sportId={sport_id},gameType=[{game_type_str}]),hydrations'
    )

    # Make API call to retrieve player game logs
    response = requests.get(url=url).json()
    print(url)

    # Extract game IDs from the API response, tolerating responses that carry
    # no people / stats / splits entries instead of raising on them.
    people = response.get('people') or [{}]
    stats = people[0].get('stats') or [{}]
    splits = stats[0].get('splits') or []
    player_game_list = [x['game']['gamePk'] for x in splits]

    return player_game_list
864
-
865
-
866
def get_players(self, sport_id: int, season: int, game_type: list = ['R']):
    """
    Retrieves a data frame of players for a given sport and season.

    Parameters:
    - sport_id (int): The ID of the sport for which to retrieve player data.
    - season (int): The season year for which to retrieve player data.
    - game_type (list): A list of game types used to filter players. Default is ['R'].
      When it is exactly ['S'], a different stats endpoint is queried.

    Returns:
    - df (pl.DataFrame): A DataFrame with the columns player_id, first_name, last_name,
      name, position and team. For every game type other than exactly ['S'] it also
      contains weight, height, age and birthDate.
    """
    game_type_str = ','.join(str(code) for code in game_type)

    if game_type_str != 'S':
        people = requests.get(url=f'https://statsapi.mlb.com/api/v1/sports/{sport_id}/players?season={season}&gameType=[{game_type_str}]').json()['people']

        # Select relevant data that will help distinguish players from one another;
        # anything missing from a player record becomes None.
        columns = {
            'player_id': [person.get('id') for person in people],
            'first_name': [person.get('firstName') for person in people],
            'last_name': [person.get('lastName') for person in people],
            'name': [person.get('fullName') for person in people],
            'position': [person.get('primaryPosition', {}).get('abbreviation') for person in people],
            'team': [person.get('currentTeam', {}).get('id') for person in people],
            'weight': [person.get('weight') for person in people],
            'height': [person.get('height') for person in people],
            'age': [person.get('currentAge') for person in people],
            'birthDate': [person.get('birthDate') for person in people],
        }
        return pl.DataFrame(data=columns)

    # Spring-training branch.
    # NOTE(review): this URL hard-codes sportId=1 and group=pitching, so the
    # sport_id argument is not used here and only the pitching group is
    # requested - confirm whether that is intended.
    records = requests.get(f'https://bdfed.stitch.mlbinfra.com/bdfed/stats/player?&env=prod&season={season}&sportId=1&stats=season&group=pitching&gameType=S&limit=1000000&offset=0&sortStat=inningsPitched&order=asc').json()['stats']

    # Output column -> key in each stats record (order defines the column order)
    spring_fields = {
        'player_id': 'playerId',
        'first_name': 'playerFirstName',
        'last_name': 'playerLastName',
        'name': 'playerFullName',
        'position': 'primaryPositionAbbrev',
        'team': 'teamId',
    }
    df = pl.DataFrame(data={
        column: [record.get(key) for record in records]
        for column, key in spring_fields.items()
    })

    return df
926
-
927
-