Spaces: Running
Alvaro committed
Commit · 7036785
1 Parent(s): 5f34bc0
Pipeline
- output/fighters_data.json +3 -0
- output/ufc_fighters_data.csv +0 -0
- src/scrape/main.py +44 -0
- src/scrape/scrape_fighters.py +150 -0
- src/{scrape_fights.py → scrape/scrape_fights.py} +44 -15
- src/{to_csv.py → scrape/to_csv.py} +56 -1
output/fighters_data.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d35f0b17445b0e887c33d4bcb30f71e8b62bd9749897117ccfbde9069935aa1b
+size 2039299
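
Note: output/fighters_data.json is tracked with Git LFS, so the diff above only shows the pointer (oid and size), not the roughly 2 MB of JSON itself. A minimal sketch of loading it locally, assuming git lfs pull has already materialized the file and that it holds the list of fighter dicts produced by scrape_all_fighters:

import json

# Assumes the LFS-tracked file has been pulled and sits at this path.
with open('output/fighters_data.json', 'r', encoding='utf-8') as f:
    fighters = json.load(f)

print(f"{len(fighters)} fighters loaded")
print(sorted(fighters[0].keys()))  # expected keys include first_name, last_name, wins, ...
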
output/ufc_fighters_data.csv
ADDED
The diff for this file is too large to render.
src/scrape/main.py
ADDED
@@ -0,0 +1,44 @@
+import os
+import json
+from scrape_fights import scrape_all_events
+from scrape_fighters import scrape_all_fighters
+from to_csv import json_to_csv, fighters_json_to_csv
+
+def main():
+    """
+    Main pipeline to scrape UFC data and convert it to CSV.
+    """
+    # Ensure the output directory exists
+    output_dir = 'output'
+    if not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+        print(f"Created directory: {output_dir}")
+
+    # --- File Paths ---
+    events_json_path = os.path.join(output_dir, 'ufc_events_detailed.json')
+    fighters_json_path = os.path.join(output_dir, 'fighters_data.json')
+    fights_csv_path = os.path.join(output_dir, 'ufc_fights.csv')
+    fighters_csv_path = os.path.join(output_dir, 'ufc_fighters_data.csv')
+
+    # --- Step 1: Scrape Events and Fights ---
+    print("\n--- Starting Events and Fights Scraping ---")
+    # all_events_data = scrape_all_events()
+    # with open(events_json_path, 'w') as f:
+    #     json.dump(all_events_data, f, indent=4)
+    print(f"Scraping for events complete. Data saved to {events_json_path}")
+
+    # --- Step 2: Scrape Fighters ---
+    print("\n--- Starting Fighters Scraping ---")
+    # all_fighters_data = scrape_all_fighters()
+    # with open(fighters_json_path, 'w') as f:
+    #     json.dump(all_fighters_data, f, indent=4)
+    print(f"Scraping for fighters complete. Data saved to {fighters_json_path}")
+
+    # --- Step 3: Convert JSON to CSV ---
+    print("\n--- Converting all JSON files to CSV ---")
+    json_to_csv(events_json_path, fights_csv_path)
+    fighters_json_to_csv(fighters_json_path, fighters_csv_path)
+    print("\n--- Pipeline Finished ---")
+
+if __name__ == '__main__':
+    main()
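
Usage note: main.py imports scrape_fights, scrape_fighters, and to_csv as flat modules, and the commit does not spell out how it is meant to be launched. A sketch of one possible driver, under the assumption (not stated in the diff) that it runs from the repository root so the relative 'output' paths resolve to ./output/:

# Hypothetical driver, assuming the working directory is the repository root.
import sys

sys.path.insert(0, 'src/scrape')  # let the flat imports inside main.py resolve

from main import main

main()
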
src/scrape/scrape_fighters.py
ADDED
@@ -0,0 +1,150 @@
+import requests
+from bs4 import BeautifulSoup
+import json
+import time
+import string
+import concurrent.futures
+
+# --- Configuration ---
+# The number of parallel threads to use for scraping fighter details.
+# Increase this to scrape faster, but be mindful of rate limits.
+MAX_WORKERS = 10
+# The delay in seconds between each request to a fighter's detail page.
+# This is a politeness measure to avoid overwhelming the server.
+REQUEST_DELAY = 0.1
+# --- End Configuration ---
+
+BASE_URL = "http://ufcstats.com/statistics/fighters?page=all"
+
+def get_soup(url):
+    """Fetches and parses a URL into a BeautifulSoup object."""
+    try:
+        response = requests.get(url)
+        response.raise_for_status()
+        return BeautifulSoup(response.text, 'html.parser')
+    except requests.exceptions.RequestException as e:
+        print(f"Error fetching {url}: {e}")
+        return None
+
+def scrape_fighter_details(fighter_url):
+    """Scrapes detailed statistics for a single fighter from their page."""
+    print(f" Scraping fighter details from: {fighter_url}")
+    soup = get_soup(fighter_url)
+    if not soup:
+        return None
+
+    details = {}
+
+    # Career stats are usually in a list format on the fighter's page.
+    # This finds all list items within the career statistics div and extracts the data.
+    career_stats_div = soup.find('div', class_='b-list__info-box_style_small-width')
+    if career_stats_div:
+        stats_list = career_stats_div.find_all('li', class_='b-list__box-list-item')
+        for item in stats_list:
+            text = item.text.strip()
+            if ":" in text:
+                parts = text.split(":", 1)
+                key = parts[0].strip().lower().replace(' ', '_').replace('.', '')
+                value = parts[1].strip()
+                details[key] = value
+
+    return details
+
+def process_fighter(fighter_data):
+    """
+    Worker function for the thread pool. Scrapes details for a single fighter,
+    updates the dictionary, and applies a delay.
+    """
+    fighter_url = fighter_data['url']
+    try:
+        details = scrape_fighter_details(fighter_url)
+        if details:
+            fighter_data.update(details)
+    except Exception as e:
+        print(f" Could not scrape details for {fighter_url}: {e}")
+
+    time.sleep(REQUEST_DELAY)
+    return fighter_data
+
+def scrape_all_fighters():
+    """Scrapes all fighters from a-z pages using parallel processing."""
+
+    # Step 1: Sequentially scrape all fighter list pages. This is fast.
+    initial_fighter_list = []
+    alphabet = string.ascii_lowercase
+    print("--- Step 1: Collecting basic fighter info from all list pages ---")
+    for char in alphabet:
+        page_url = f"http://ufcstats.com/statistics/fighters?char={char}&page=all"
+        print(f"Scanning page: {page_url}")
+
+        soup = get_soup(page_url)
+        if not soup:
+            continue
+
+        table = soup.find('table', class_='b-statistics__table')
+        if not table:
+            print(f"Could not find fighters table on page {page_url}")
+            continue
+
+        fighter_rows = table.find('tbody').find_all('tr')[1:]
+        if not fighter_rows:
+            continue
+
+        for row in fighter_rows:
+            cols = row.find_all('td')
+            if len(cols) < 11:
+                continue
+
+            fighter_link_tag = cols[0].find('a')
+            if not fighter_link_tag or not fighter_link_tag.has_attr('href'):
+                continue
+
+            initial_fighter_list.append({
+                'first_name': cols[0].text.strip(),
+                'last_name': cols[1].text.strip(),
+                'nickname': cols[2].text.strip(),
+                'height': cols[3].text.strip(),
+                'weight_lbs': cols[4].text.strip(),
+                'reach_in': cols[5].text.strip(),
+                'stance': cols[6].text.strip(),
+                'wins': cols[7].text.strip(),
+                'losses': cols[8].text.strip(),
+                'draws': cols[9].text.strip(),
+                'belt': False if not cols[10].find('img') else True,
+                'url': fighter_link_tag['href']
+            })
+
+    print(f"\n--- Step 2: Scraping details for {len(initial_fighter_list)} fighters in parallel (using up to {MAX_WORKERS} workers) ---")
+    fighters_with_details = []
+    total_fighters = len(initial_fighter_list)
+
+    with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
+        results = executor.map(process_fighter, initial_fighter_list)
+
+        for i, fighter_data in enumerate(results):
+            fighters_with_details.append(fighter_data)
+            print(f"Progress: {i + 1}/{total_fighters} fighters scraped.")
+
+            if (i + 1) > 0 and (i + 1) % 50 == 0:
+                print(f"--- Saving progress: {i + 1} fighters saved. ---")
+                # Sort before saving to maintain a consistent order in the file
+                fighters_with_details.sort(key=lambda x: (x['last_name'], x['first_name']))
+                with open('output/fighters_data.json', 'w') as f:
+                    json.dump(fighters_with_details, f, indent=4)
+
+    # Final sort for the complete dataset
+    fighters_with_details.sort(key=lambda x: (x['last_name'], x['first_name']))
+    return fighters_with_details
+
+if __name__ == "__main__":
+    all_fighters_data = scrape_all_fighters()
+
+    # Create output directory if it doesn't exist
+    import os
+    if not os.path.exists('output'):
+        os.makedirs('output')
+
+    with open('output/fighters_data.json', 'w') as f:
+        json.dump(all_fighters_data, f, indent=4)
+
+    print(f"\nScraping complete. Final data for {len(all_fighters_data)} fighters saved to output/fighters_data.json")
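
The keys that scrape_fighter_details produces come from normalizing the stat labels on each ufcstats.com fighter page: lowercase, spaces to underscores, periods stripped. That is why to_csv.py further down can list columns such as str_acc and td_avg. A standalone sketch of the normalization, using a sample label assumed to match the site's format:

# 'Str. Acc.: 45%' is an assumed example; the real text comes from the
# b-list__box-list-item elements parsed in scrape_fighter_details.
text = "Str. Acc.: 45%"

if ":" in text:
    raw_key, raw_value = text.split(":", 1)
    key = raw_key.strip().lower().replace(' ', '_').replace('.', '')
    print(key, '->', raw_value.strip())  # prints: str_acc -> 45%
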
src/{scrape_fights.py → scrape/scrape_fights.py}
RENAMED
@@ -2,6 +2,16 @@ import requests
 from bs4 import BeautifulSoup
 import json
 import time
+import concurrent.futures
+
+# --- Configuration ---
+# The number of parallel threads to use for scraping fight details.
+# Increase this to scrape faster, but be mindful of rate limits.
+MAX_WORKERS = 10
+# The delay in seconds between each request to a fight's detail page.
+# This is a politeness measure to avoid overwhelming the server.
+REQUEST_DELAY = 0.1
+# --- End Configuration ---
 
 BASE_URL = "http://ufcstats.com/statistics/events/completed?page=all"
 
@@ -69,6 +79,20 @@ def scrape_fight_details(fight_url):
 
     return fight_details
 
+def fetch_fight_details_worker(fight_url):
+    """
+    Worker function for the thread pool. Scrapes details for a single fight
+    and applies a delay to be polite to the server.
+    """
+    try:
+        details = scrape_fight_details(fight_url)
+        time.sleep(REQUEST_DELAY)
+        return details
+    except Exception as e:
+        print(f" Could not scrape fight details for {fight_url}: {e}")
+        time.sleep(REQUEST_DELAY)  # Also sleep on failure to be safe
+        return None
+
 def scrape_event_details(event_url):
     print(f"Scraping event: {event_url}")
     soup = get_soup(event_url)
@@ -83,8 +107,8 @@ def scrape_event_details(event_url):
     event_details['date'] = list_items[0].text.split(':')[1].strip()
     event_details['location'] = list_items[1].text.split(':')[1].strip()
 
-    #
-
+    # Step 1: Gather base info and URLs for all fights on the event page.
+    fights_to_process = []
     fight_table = soup.find('table', class_='b-fight-details__table')
     if fight_table:
         rows = fight_table.find('tbody').find_all('tr', class_='b-fight-details__table-row')
@@ -100,21 +124,26 @@ def scrape_event_details(event_url):
                 'method': ' '.join(cols[7].stripped_strings),
                 'round': cols[8].text.strip(),
                 'time': cols[9].text.strip(),
+                'url': fight_url  # Temporarily store the URL for the worker
             }
+            fights_to_process.append(fight)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
+    # Step 2: Scrape the details for all fights in parallel.
+    fight_urls = [fight['url'] for fight in fights_to_process]
+    completed_fights = []
+
+    if fight_urls:
+        with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
+            # The map function maintains the order of results.
+            fight_details_list = executor.map(fetch_fight_details_worker, fight_urls)
+
+            for i, details in enumerate(fight_details_list):
+                fight_data = fights_to_process[i]
+                del fight_data['url']  # Clean up the temporary URL
+                fight_data['details'] = details if details else None
+                completed_fights.append(fight_data)
+
+    event_details['fights'] = completed_fights
     return event_details
 
 def scrape_all_events():
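
Both this file and scrape_fighters.py lean on the fact that executor.map returns results in the same order as its input, which is what lets scrape_event_details pair result i with fights_to_process[i]. A self-contained sketch of that pattern; fake_fetch is a stand-in for the real worker, not part of the scraper:

import concurrent.futures
import time

def fake_fetch(url):
    # Stand-in for fetch_fight_details_worker: simulates a slow request.
    time.sleep(0.05)
    return {'source': url}

urls = ['http://example.com/a', 'http://example.com/b', 'http://example.com/c']
with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
    # map() yields results in input order even though the calls run concurrently.
    for url, details in zip(urls, executor.map(fake_fetch, urls)):
        print(url, '->', details)
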
src/{to_csv.py → scrape/to_csv.py}
RENAMED
@@ -83,5 +83,60 @@ def json_to_csv(json_file_path, csv_file_path):
 
     print(f"Successfully converted {json_file_path} to {csv_file_path}")
 
+def fighters_json_to_csv(json_file_path, csv_file_path):
+    """
+    Converts a JSON file containing a list of fighter data to a CSV file.
+    It cleans the data by removing unwanted characters and standardizing formats.
+    """
+    try:
+        with open(json_file_path, 'r', encoding='utf-8') as json_file:
+            data = json.load(json_file)
+    except FileNotFoundError:
+        print(f"Error: The file {json_file_path} was not found.")
+        return
+    except json.JSONDecodeError:
+        print(f"Error: Could not decode JSON from {json_file_path}.")
+        return
+
+    if not data:
+        print(f"Warning: The file {json_file_path} is empty. No CSV will be created.")
+        return
+
+    # Dynamically determine headers by collecting all keys from all records
+    all_keys = set()
+    for item in data:
+        all_keys.update(item.keys())
+
+    # Define a preferred order for the most important columns
+    preferred_headers = [
+        'first_name', 'last_name', 'nickname', 'wins', 'losses', 'draws', 'belt',
+        'height', 'weight_lbs', 'reach_in', 'stance', 'dob', 'slpm',
+        'str_acc', 'sapm', 'str_def', 'td_avg', 'td_acc', 'td_def', 'sub_avg', 'url'
+    ]
+
+    # Create the final list of headers, with preferred ones first
+    headers = [h for h in preferred_headers if h in all_keys]
+    headers.extend(sorted([k for k in all_keys if k not in preferred_headers]))
+
+    def clean_value(value):
+        if isinstance(value, str):
+            # Clean data by removing unwanted characters and standardizing units
+            # As requested, this removes '"' and '--'. It also cleans up units.
+            cleaned_value = value.replace('--', '').replace('"', '').replace("'", " ft").replace(' lbs.', '')
+            return cleaned_value.strip()
+        return value
+
+    with open(csv_file_path, 'w', newline='', encoding='utf-8') as csv_file:
+        writer = csv.DictWriter(csv_file, fieldnames=headers)
+        writer.writeheader()
+
+        for fighter_data in data:
+            # Get a cleaned version of the row, using get() for safety
+            cleaned_row = {key: clean_value(fighter_data.get(key, '')) for key in headers}
+            writer.writerow(cleaned_row)
+
+    print(f"Successfully converted {json_file_path} to {csv_file_path}")
+
 if __name__ == '__main__':
-    json_to_csv('output/ufc_events_detailed.json', 'output/ufc_fights.csv')
+    json_to_csv('output/ufc_events_detailed.json', 'output/ufc_fights.csv')
+    fighters_json_to_csv('output/fighters_data.json', 'output/ufc_fighters_data.csv')
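
For reference, clean_value above folds the site's formatting quirks into plain CSV values: '--' placeholders become empty strings, inch marks are dropped, foot marks become ' ft', and ' lbs.' is stripped. A minimal sketch with sample inputs assumed to look like the strings in fighters_data.json:

def clean_value(value):
    # Same cleaning chain as in fighters_json_to_csv.
    if isinstance(value, str):
        return (value.replace('--', '').replace('"', '')
                     .replace("'", " ft").replace(' lbs.', '').strip())
    return value

print(clean_value('5\' 11"'))   # -> 5 ft 11
print(clean_value('155 lbs.'))  # -> 155
print(clean_value('--'))        # -> '' (empty string)
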