Alvaro committed
Commit 7036785 · 1 Parent(s): 5f34bc0
output/fighters_data.json ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d35f0b17445b0e887c33d4bcb30f71e8b62bd9749897117ccfbde9069935aa1b
+size 2039299
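Note: output/fighters_data.json is committed as a Git LFS pointer, so the three lines above are the entire committed file; the actual JSON payload (2039299 bytes, per the size field) lives in LFS storage. A minimal sketch of reading it once `git lfs pull` has materialized the real file — the path comes from this commit, the plain-list-of-records shape is an assumption:

    import json

    # Assumes `git lfs pull` has replaced the pointer with the real payload.
    with open('output/fighters_data.json', 'r', encoding='utf-8') as f:
        fighters = json.load(f)

    print(f"Loaded {len(fighters)} fighter records")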
output/ufc_fighters_data.csv ADDED
The diff for this file is too large to render. See raw diff
 
src/scrape/main.py ADDED
@@ -0,0 +1,44 @@
+import os
+import json
+from scrape_fights import scrape_all_events
+from scrape_fighters import scrape_all_fighters
+from to_csv import json_to_csv, fighters_json_to_csv
+
+def main():
+    """
+    Main pipeline to scrape UFC data and convert it to CSV.
+    """
+    # Ensure the output directory exists
+    output_dir = 'output'
+    if not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+        print(f"Created directory: {output_dir}")
+
+    # --- File Paths ---
+    events_json_path = os.path.join(output_dir, 'ufc_events_detailed.json')
+    fighters_json_path = os.path.join(output_dir, 'fighters_data.json')
+    fights_csv_path = os.path.join(output_dir, 'ufc_fights.csv')
+    fighters_csv_path = os.path.join(output_dir, 'ufc_fighters_data.csv')
+
+    # --- Step 1: Scrape Events and Fights ---
+    print("\n--- Starting Events and Fights Scraping ---")
+    # all_events_data = scrape_all_events()
+    # with open(events_json_path, 'w') as f:
+    #     json.dump(all_events_data, f, indent=4)
+    print(f"Scraping for events complete. Data saved to {events_json_path}")
+
+    # --- Step 2: Scrape Fighters ---
+    print("\n--- Starting Fighters Scraping ---")
+    # all_fighters_data = scrape_all_fighters()
+    # with open(fighters_json_path, 'w') as f:
+    #     json.dump(all_fighters_data, f, indent=4)
+    print(f"Scraping for fighters complete. Data saved to {fighters_json_path}")
+
+    # --- Step 3: Convert JSON to CSV ---
+    print("\n--- Converting all JSON files to CSV ---")
+    json_to_csv(events_json_path, fights_csv_path)
+    fighters_json_to_csv(fighters_json_path, fighters_csv_path)
+    print("\n--- Pipeline Finished ---")
+
+if __name__ == '__main__':
+    main()
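As committed, both scraping calls in main() are commented out, so a run only rebuilds the CSVs from JSON files that must already exist in output/ (and the "complete. Data saved" messages print either way). A hedged sketch of making Step 3 tolerant of missing inputs — the guard is a suggestion, not part of this commit; the paths and converter functions are the committed ones:

    # Sketch only: skip conversions whose input JSON is absent.
    for src, dst, convert in [
        (events_json_path, fights_csv_path, json_to_csv),
        (fighters_json_path, fighters_csv_path, fighters_json_to_csv),
    ]:
        if os.path.exists(src):
            convert(src, dst)
        else:
            print(f"Skipping {dst}: missing input {src}")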
src/scrape/scrape_fighters.py ADDED
@@ -0,0 +1,150 @@
+import requests
+from bs4 import BeautifulSoup
+import json
+import time
+import string
+import concurrent.futures
+
+# --- Configuration ---
+# The number of parallel threads to use for scraping fighter details.
+# Increase this to scrape faster, but be mindful of rate limits.
+MAX_WORKERS = 10
+# The delay in seconds between each request to a fighter's detail page.
+# This is a politeness measure to avoid overwhelming the server.
+REQUEST_DELAY = 0.1
+# --- End Configuration ---
+
+BASE_URL = "http://ufcstats.com/statistics/fighters?page=all"
+
+def get_soup(url):
+    """Fetches and parses a URL into a BeautifulSoup object."""
+    try:
+        response = requests.get(url)
+        response.raise_for_status()
+        return BeautifulSoup(response.text, 'html.parser')
+    except requests.exceptions.RequestException as e:
+        print(f"Error fetching {url}: {e}")
+        return None
+
+def scrape_fighter_details(fighter_url):
+    """Scrapes detailed statistics for a single fighter from their page."""
+    print(f"  Scraping fighter details from: {fighter_url}")
+    soup = get_soup(fighter_url)
+    if not soup:
+        return None
+
+    details = {}
+
+    # Career stats are usually in a list format on the fighter's page.
+    # This finds all list items within the career statistics div and extracts the data.
+    career_stats_div = soup.find('div', class_='b-list__info-box_style_small-width')
+    if career_stats_div:
+        stats_list = career_stats_div.find_all('li', class_='b-list__box-list-item')
+        for item in stats_list:
+            text = item.text.strip()
+            if ":" in text:
+                parts = text.split(":", 1)
+                key = parts[0].strip().lower().replace(' ', '_').replace('.', '')
+                value = parts[1].strip()
+                details[key] = value
+
+    return details
+
+def process_fighter(fighter_data):
+    """
+    Worker function for the thread pool. Scrapes details for a single fighter,
+    updates the dictionary, and applies a delay.
+    """
+    fighter_url = fighter_data['url']
+    try:
+        details = scrape_fighter_details(fighter_url)
+        if details:
+            fighter_data.update(details)
+    except Exception as e:
+        print(f"  Could not scrape details for {fighter_url}: {e}")
+
+    time.sleep(REQUEST_DELAY)
+    return fighter_data
+
+def scrape_all_fighters():
+    """Scrapes all fighters from a-z pages using parallel processing."""
+
+    # Step 1: Sequentially scrape all fighter list pages. This is fast.
+    initial_fighter_list = []
+    alphabet = string.ascii_lowercase
+    print("--- Step 1: Collecting basic fighter info from all list pages ---")
+    for char in alphabet:
+        page_url = f"http://ufcstats.com/statistics/fighters?char={char}&page=all"
+        print(f"Scanning page: {page_url}")
+
+        soup = get_soup(page_url)
+        if not soup:
+            continue
+
+        table = soup.find('table', class_='b-statistics__table')
+        if not table:
+            print(f"Could not find fighters table on page {page_url}")
+            continue
+
+        fighter_rows = table.find('tbody').find_all('tr')[1:]
+        if not fighter_rows:
+            continue
+
+        for row in fighter_rows:
+            cols = row.find_all('td')
+            if len(cols) < 11:
+                continue
+
+            fighter_link_tag = cols[0].find('a')
+            if not fighter_link_tag or not fighter_link_tag.has_attr('href'):
+                continue
+
+            initial_fighter_list.append({
+                'first_name': cols[0].text.strip(),
+                'last_name': cols[1].text.strip(),
+                'nickname': cols[2].text.strip(),
+                'height': cols[3].text.strip(),
+                'weight_lbs': cols[4].text.strip(),
+                'reach_in': cols[5].text.strip(),
+                'stance': cols[6].text.strip(),
+                'wins': cols[7].text.strip(),
+                'losses': cols[8].text.strip(),
+                'draws': cols[9].text.strip(),
+                'belt': bool(cols[10].find('img')),
+                'url': fighter_link_tag['href']
+            })
+
+    print(f"\n--- Step 2: Scraping details for {len(initial_fighter_list)} fighters in parallel (using up to {MAX_WORKERS} workers) ---")
+    fighters_with_details = []
+    total_fighters = len(initial_fighter_list)
+
+    with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
+        results = executor.map(process_fighter, initial_fighter_list)
+
+        for i, fighter_data in enumerate(results):
+            fighters_with_details.append(fighter_data)
+            print(f"Progress: {i + 1}/{total_fighters} fighters scraped.")
+
+            if (i + 1) % 50 == 0:
+                print(f"--- Saving progress: {i + 1} fighters saved. ---")
+                # Sort before saving to maintain a consistent order in the file
+                fighters_with_details.sort(key=lambda x: (x['last_name'], x['first_name']))
+                with open('output/fighters_data.json', 'w') as f:
+                    json.dump(fighters_with_details, f, indent=4)
+
+    # Final sort for the complete dataset
+    fighters_with_details.sort(key=lambda x: (x['last_name'], x['first_name']))
+    return fighters_with_details
+
+if __name__ == "__main__":
+    all_fighters_data = scrape_all_fighters()
+
+    # Create output directory if it doesn't exist
+    import os
+    if not os.path.exists('output'):
+        os.makedirs('output')
+
+    with open('output/fighters_data.json', 'w') as f:
+        json.dump(all_fighters_data, f, indent=4)
+
+    print(f"\nScraping complete. Final data for {len(all_fighters_data)} fighters saved to output/fighters_data.json")
src/{scrape_fights.py → scrape/scrape_fights.py} RENAMED
@@ -2,6 +2,16 @@ import requests
 from bs4 import BeautifulSoup
 import json
 import time
+import concurrent.futures
+
+# --- Configuration ---
+# The number of parallel threads to use for scraping fight details.
+# Increase this to scrape faster, but be mindful of rate limits.
+MAX_WORKERS = 10
+# The delay in seconds between each request to a fight's detail page.
+# This is a politeness measure to avoid overwhelming the server.
+REQUEST_DELAY = 0.1
+# --- End Configuration ---
 
 BASE_URL = "http://ufcstats.com/statistics/events/completed?page=all"
 
@@ -69,6 +79,20 @@ def scrape_fight_details(fight_url):
 
     return fight_details
 
+def fetch_fight_details_worker(fight_url):
+    """
+    Worker function for the thread pool. Scrapes details for a single fight
+    and applies a delay to be polite to the server.
+    """
+    try:
+        details = scrape_fight_details(fight_url)
+        time.sleep(REQUEST_DELAY)
+        return details
+    except Exception as e:
+        print(f"  Could not scrape fight details for {fight_url}: {e}")
+        time.sleep(REQUEST_DELAY)  # Also sleep on failure to be safe
+        return None
+
 def scrape_event_details(event_url):
     print(f"Scraping event: {event_url}")
     soup = get_soup(event_url)
@@ -83,8 +107,8 @@ def scrape_event_details(event_url):
     event_details['date'] = list_items[0].text.split(':')[1].strip()
     event_details['location'] = list_items[1].text.split(':')[1].strip()
 
-    # Extract fights
-    fights = []
+    # Step 1: Gather base info and URLs for all fights on the event page.
+    fights_to_process = []
     fight_table = soup.find('table', class_='b-fight-details__table')
     if fight_table:
         rows = fight_table.find('tbody').find_all('tr', class_='b-fight-details__table-row')
@@ -100,21 +124,26 @@
                 'method': ' '.join(cols[7].stripped_strings),
                 'round': cols[8].text.strip(),
                 'time': cols[9].text.strip(),
+                'url': fight_url  # Temporarily store the URL for the worker
             }
+            fights_to_process.append(fight)
 
-            try:
-                details = scrape_fight_details(fight_url)
-                if details:
-                    fight['details'] = details
-                else:
-                    fight['details'] = None
-                time.sleep(0.1)  # a small delay to be polite to the server
-            except Exception as e:
-                print(f"  Could not scrape fight details for {fight_url}: {e}")
-
-            fights.append(fight)
-
-    event_details['fights'] = fights
+    # Step 2: Scrape the details for all fights in parallel.
+    fight_urls = [fight['url'] for fight in fights_to_process]
+    completed_fights = []
+
+    if fight_urls:
+        with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
+            # The map function maintains the order of results.
+            fight_details_list = executor.map(fetch_fight_details_worker, fight_urls)
+
+            for i, details in enumerate(fight_details_list):
+                fight_data = fights_to_process[i]
+                del fight_data['url']  # Clean up the temporary URL
+                fight_data['details'] = details if details else None
+                completed_fights.append(fight_data)
+
+    event_details['fights'] = completed_fights
     return event_details
 
 def scrape_all_events():
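One subtlety in the new code path: fetch_fight_details_worker returns None on failure, and the assignment details if details else None also collapses a successful-but-empty result ({}) to None, so the two cases are indistinguishable downstream. For running this module on its own, a minimal driver mirroring the pattern used elsewhere in the commit (the driver itself is not committed; scrape_all_events and the output path are):

    import json
    import os

    from scrape_fights import scrape_all_events

    if __name__ == '__main__':
        os.makedirs('output', exist_ok=True)
        events = scrape_all_events()
        with open('output/ufc_events_detailed.json', 'w') as f:
            json.dump(events, f, indent=4)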
src/{to_csv.py → scrape/to_csv.py} RENAMED
@@ -83,5 +83,60 @@ def json_to_csv(json_file_path, csv_file_path):
 
     print(f"Successfully converted {json_file_path} to {csv_file_path}")
 
+def fighters_json_to_csv(json_file_path, csv_file_path):
+    """
+    Converts a JSON file containing a list of fighter data to a CSV file.
+    It cleans the data by removing unwanted characters and standardizing formats.
+    """
+    try:
+        with open(json_file_path, 'r', encoding='utf-8') as json_file:
+            data = json.load(json_file)
+    except FileNotFoundError:
+        print(f"Error: The file {json_file_path} was not found.")
+        return
+    except json.JSONDecodeError:
+        print(f"Error: Could not decode JSON from {json_file_path}.")
+        return
+
+    if not data:
+        print(f"Warning: The file {json_file_path} is empty. No CSV will be created.")
+        return
+
+    # Dynamically determine headers by collecting all keys from all records
+    all_keys = set()
+    for item in data:
+        all_keys.update(item.keys())
+
+    # Define a preferred order for the most important columns
+    preferred_headers = [
+        'first_name', 'last_name', 'nickname', 'wins', 'losses', 'draws', 'belt',
+        'height', 'weight_lbs', 'reach_in', 'stance', 'dob', 'slpm',
+        'str_acc', 'sapm', 'str_def', 'td_avg', 'td_acc', 'td_def', 'sub_avg', 'url'
+    ]
+
+    # Create the final list of headers, with preferred ones first
+    headers = [h for h in preferred_headers if h in all_keys]
+    headers.extend(sorted([k for k in all_keys if k not in preferred_headers]))
+
+    def clean_value(value):
+        if isinstance(value, str):
+            # Clean data by removing unwanted characters and standardizing units
+            # As requested, this removes '"' and '--'. It also cleans up units.
+            cleaned_value = value.replace('--', '').replace('"', '').replace("'", " ft").replace(' lbs.', '')
+            return cleaned_value.strip()
+        return value
+
+    with open(csv_file_path, 'w', newline='', encoding='utf-8') as csv_file:
+        writer = csv.DictWriter(csv_file, fieldnames=headers)
+        writer.writeheader()
+
+        for fighter_data in data:
+            # Get a cleaned version of the row, using get() for safety
+            cleaned_row = {key: clean_value(fighter_data.get(key, '')) for key in headers}
+            writer.writerow(cleaned_row)
+
+    print(f"Successfully converted {json_file_path} to {csv_file_path}")
+
 if __name__ == '__main__':
-    json_to_csv('output/ufc_events_detailed.json', 'output/ufc_fights.csv')
+    json_to_csv('output/ufc_events_detailed.json', 'output/ufc_fights.csv')
+    fighters_json_to_csv('output/fighters_data.json', 'output/ufc_fighters_data.csv')
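The clean_value pass turns a height such as 5' 11" into "5 ft 11" and strips the ' lbs.' suffix from weights, but every value remains a string in the CSV. A hedged sketch of a follow-up numeric conversion (height_to_inches is a hypothetical helper, not part of this commit):

    import re

    def height_to_inches(raw):
        """Convert a cleaned height string like '5 ft 11' to total inches."""
        match = re.match(r'(\d+)\s*ft\s*(\d+)?', raw or '')
        if not match:
            return None
        feet = int(match.group(1))
        inches = int(match.group(2) or 0)
        return feet * 12 + inches

    print(height_to_inches('5 ft 11'))  # 71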