Spaces: Running
Alvaro committed
Commit · 7036785
1 Parent(s): 5f34bc0
Pipeline
- output/fighters_data.json +3 -0
- output/ufc_fighters_data.csv +0 -0
- src/scrape/main.py +44 -0
- src/scrape/scrape_fighters.py +150 -0
- src/{scrape_fights.py → scrape/scrape_fights.py} +44 -15
- src/{to_csv.py → scrape/to_csv.py} +56 -1
output/fighters_data.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d35f0b17445b0e887c33d4bcb30f71e8b62bd9749897117ccfbde9069935aa1b
+size 2039299
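
Note: output/fighters_data.json is tracked with Git LFS, so the diff above only shows the pointer (oid and size), not the roughly 2 MB of JSON itself. A minimal sketch of loading it locally, assuming git lfs pull has already materialized the file and that it holds the list of fighter dicts produced by scrape_all_fighters:

import json

# Assumes the LFS-tracked file has been pulled and sits at this path.
with open('output/fighters_data.json', 'r', encoding='utf-8') as f:
    fighters = json.load(f)

print(f"{len(fighters)} fighters loaded")
print(sorted(fighters[0].keys()))  # expected keys include first_name, last_name, wins, ...
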
output/ufc_fighters_data.csv
ADDED
The diff for this file is too large to render.
src/scrape/main.py
ADDED
@@ -0,0 +1,44 @@
+import os
+import json
+from scrape_fights import scrape_all_events
+from scrape_fighters import scrape_all_fighters
+from to_csv import json_to_csv, fighters_json_to_csv
+
+def main():
+    """
+    Main pipeline to scrape UFC data and convert it to CSV.
+    """
+    # Ensure the output directory exists
+    output_dir = 'output'
+    if not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+        print(f"Created directory: {output_dir}")
+
+    # --- File Paths ---
+    events_json_path = os.path.join(output_dir, 'ufc_events_detailed.json')
+    fighters_json_path = os.path.join(output_dir, 'fighters_data.json')
+    fights_csv_path = os.path.join(output_dir, 'ufc_fights.csv')
+    fighters_csv_path = os.path.join(output_dir, 'ufc_fighters_data.csv')
+
+    # --- Step 1: Scrape Events and Fights ---
+    print("\n--- Starting Events and Fights Scraping ---")
+    # all_events_data = scrape_all_events()
+    # with open(events_json_path, 'w') as f:
+    #     json.dump(all_events_data, f, indent=4)
+    print(f"Scraping for events complete. Data saved to {events_json_path}")
+
+    # --- Step 2: Scrape Fighters ---
+    print("\n--- Starting Fighters Scraping ---")
+    # all_fighters_data = scrape_all_fighters()
+    # with open(fighters_json_path, 'w') as f:
+    #     json.dump(all_fighters_data, f, indent=4)
+    print(f"Scraping for fighters complete. Data saved to {fighters_json_path}")
+
+    # --- Step 3: Convert JSON to CSV ---
+    print("\n--- Converting all JSON files to CSV ---")
+    json_to_csv(events_json_path, fights_csv_path)
+    fighters_json_to_csv(fighters_json_path, fighters_csv_path)
+    print("\n--- Pipeline Finished ---")
+
+if __name__ == '__main__':
+    main()
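
Usage note: main.py imports scrape_fights, scrape_fighters, and to_csv as flat modules, and the commit does not spell out how it is meant to be launched. A sketch of one possible driver, under the assumption (not stated in the diff) that it runs from the repository root so the relative 'output' paths resolve to ./output/:

# Hypothetical driver, assuming the working directory is the repository root.
import sys

sys.path.insert(0, 'src/scrape')  # let the flat imports inside main.py resolve

from main import main

main()
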
src/scrape/scrape_fighters.py
ADDED
@@ -0,0 +1,150 @@
+import requests
+from bs4 import BeautifulSoup
+import json
+import time
+import string
+import concurrent.futures
+
+# --- Configuration ---
+# The number of parallel threads to use for scraping fighter details.
+# Increase this to scrape faster, but be mindful of rate limits.
+MAX_WORKERS = 10
+# The delay in seconds between each request to a fighter's detail page.
+# This is a politeness measure to avoid overwhelming the server.
+REQUEST_DELAY = 0.1
+# --- End Configuration ---
+
+BASE_URL = "http://ufcstats.com/statistics/fighters?page=all"
+
+def get_soup(url):
+    """Fetches and parses a URL into a BeautifulSoup object."""
+    try:
+        response = requests.get(url)
+        response.raise_for_status()
+        return BeautifulSoup(response.text, 'html.parser')
+    except requests.exceptions.RequestException as e:
+        print(f"Error fetching {url}: {e}")
+        return None
+
+def scrape_fighter_details(fighter_url):
+    """Scrapes detailed statistics for a single fighter from their page."""
+    print(f" Scraping fighter details from: {fighter_url}")
+    soup = get_soup(fighter_url)
+    if not soup:
+        return None
+
+    details = {}
+
+    # Career stats are usually in a list format on the fighter's page.
+    # This finds all list items within the career statistics div and extracts the data.
+    career_stats_div = soup.find('div', class_='b-list__info-box_style_small-width')
+    if career_stats_div:
+        stats_list = career_stats_div.find_all('li', class_='b-list__box-list-item')
+        for item in stats_list:
+            text = item.text.strip()
+            if ":" in text:
+                parts = text.split(":", 1)
+                key = parts[0].strip().lower().replace(' ', '_').replace('.', '')
+                value = parts[1].strip()
+                details[key] = value
+
+    return details
+
+def process_fighter(fighter_data):
+    """
+    Worker function for the thread pool. Scrapes details for a single fighter,
+    updates the dictionary, and applies a delay.
+    """
+    fighter_url = fighter_data['url']
+    try:
+        details = scrape_fighter_details(fighter_url)
+        if details:
+            fighter_data.update(details)
+    except Exception as e:
+        print(f" Could not scrape details for {fighter_url}: {e}")
+
+    time.sleep(REQUEST_DELAY)
+    return fighter_data
+
+def scrape_all_fighters():
+    """Scrapes all fighters from a-z pages using parallel processing."""
+
+    # Step 1: Sequentially scrape all fighter list pages. This is fast.
+    initial_fighter_list = []
+    alphabet = string.ascii_lowercase
+    print("--- Step 1: Collecting basic fighter info from all list pages ---")
+    for char in alphabet:
+        page_url = f"http://ufcstats.com/statistics/fighters?char={char}&page=all"
+        print(f"Scanning page: {page_url}")
+
+        soup = get_soup(page_url)
+        if not soup:
+            continue
+
+        table = soup.find('table', class_='b-statistics__table')
+        if not table:
+            print(f"Could not find fighters table on page {page_url}")
+            continue
+
+        fighter_rows = table.find('tbody').find_all('tr')[1:]
+        if not fighter_rows:
+            continue
+
+        for row in fighter_rows:
+            cols = row.find_all('td')
+            if len(cols) < 11:
+                continue
+
+            fighter_link_tag = cols[0].find('a')
+            if not fighter_link_tag or not fighter_link_tag.has_attr('href'):
+                continue
+
+            initial_fighter_list.append({
+                'first_name': cols[0].text.strip(),
+                'last_name': cols[1].text.strip(),
+                'nickname': cols[2].text.strip(),
+                'height': cols[3].text.strip(),
+                'weight_lbs': cols[4].text.strip(),
+                'reach_in': cols[5].text.strip(),
+                'stance': cols[6].text.strip(),
+                'wins': cols[7].text.strip(),
+                'losses': cols[8].text.strip(),
+                'draws': cols[9].text.strip(),
+                'belt': False if not cols[10].find('img') else True,
+                'url': fighter_link_tag['href']
+            })
+
+    print(f"\n--- Step 2: Scraping details for {len(initial_fighter_list)} fighters in parallel (using up to {MAX_WORKERS} workers) ---")
+    fighters_with_details = []
+    total_fighters = len(initial_fighter_list)
+
+    with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
+        results = executor.map(process_fighter, initial_fighter_list)
+
+        for i, fighter_data in enumerate(results):
+            fighters_with_details.append(fighter_data)
+            print(f"Progress: {i + 1}/{total_fighters} fighters scraped.")
+
+            if (i + 1) > 0 and (i + 1) % 50 == 0:
+                print(f"--- Saving progress: {i + 1} fighters saved. ---")
+                # Sort before saving to maintain a consistent order in the file
+                fighters_with_details.sort(key=lambda x: (x['last_name'], x['first_name']))
+                with open('output/fighters_data.json', 'w') as f:
+                    json.dump(fighters_with_details, f, indent=4)
+
+    # Final sort for the complete dataset
+    fighters_with_details.sort(key=lambda x: (x['last_name'], x['first_name']))
+    return fighters_with_details
+
+if __name__ == "__main__":
+    all_fighters_data = scrape_all_fighters()
+
+    # Create output directory if it doesn't exist
+    import os
+    if not os.path.exists('output'):
+        os.makedirs('output')
+
+    with open('output/fighters_data.json', 'w') as f:
+        json.dump(all_fighters_data, f, indent=4)
+
+    print(f"\nScraping complete. Final data for {len(all_fighters_data)} fighters saved to output/fighters_data.json")
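
The keys that scrape_fighter_details produces come from normalizing the stat labels on each ufcstats.com fighter page: lowercase, spaces to underscores, periods stripped. That is why to_csv.py further down can list columns such as str_acc and td_avg. A standalone sketch of the normalization, using a sample label assumed to match the site's format:

# 'Str. Acc.: 45%' is an assumed example; the real text comes from the
# b-list__box-list-item elements parsed in scrape_fighter_details.
text = "Str. Acc.: 45%"

if ":" in text:
    raw_key, raw_value = text.split(":", 1)
    key = raw_key.strip().lower().replace(' ', '_').replace('.', '')
    print(key, '->', raw_value.strip())  # prints: str_acc -> 45%
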
src/{scrape_fights.py → scrape/scrape_fights.py}
RENAMED
@@ -2,6 +2,16 @@ import requests
 from bs4 import BeautifulSoup
 import json
 import time
+import concurrent.futures
+
+# --- Configuration ---
+# The number of parallel threads to use for scraping fight details.
+# Increase this to scrape faster, but be mindful of rate limits.
+MAX_WORKERS = 10
+# The delay in seconds between each request to a fight's detail page.
+# This is a politeness measure to avoid overwhelming the server.
+REQUEST_DELAY = 0.1
+# --- End Configuration ---
 
 BASE_URL = "http://ufcstats.com/statistics/events/completed?page=all"
 
@@ -69,6 +79,20 @@ def scrape_fight_details(fight_url):
 
     return fight_details
 
+def fetch_fight_details_worker(fight_url):
+    """
+    Worker function for the thread pool. Scrapes details for a single fight
+    and applies a delay to be polite to the server.
+    """
+    try:
+        details = scrape_fight_details(fight_url)
+        time.sleep(REQUEST_DELAY)
+        return details
+    except Exception as e:
+        print(f" Could not scrape fight details for {fight_url}: {e}")
+        time.sleep(REQUEST_DELAY)  # Also sleep on failure to be safe
+        return None
+
 def scrape_event_details(event_url):
     print(f"Scraping event: {event_url}")
     soup = get_soup(event_url)
@@ -83,8 +107,8 @@ def scrape_event_details(event_url):
     event_details['date'] = list_items[0].text.split(':')[1].strip()
     event_details['location'] = list_items[1].text.split(':')[1].strip()
 
-    #
-
+    # Step 1: Gather base info and URLs for all fights on the event page.
+    fights_to_process = []
     fight_table = soup.find('table', class_='b-fight-details__table')
     if fight_table:
         rows = fight_table.find('tbody').find_all('tr', class_='b-fight-details__table-row')
@@ -100,21 +124,26 @@ def scrape_event_details(event_url):
                 'method': ' '.join(cols[7].stripped_strings),
                 'round': cols[8].text.strip(),
                 'time': cols[9].text.strip(),
+                'url': fight_url  # Temporarily store the URL for the worker
             }
+            fights_to_process.append(fight)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
+    # Step 2: Scrape the details for all fights in parallel.
+    fight_urls = [fight['url'] for fight in fights_to_process]
+    completed_fights = []
+
+    if fight_urls:
+        with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
+            # The map function maintains the order of results.
+            fight_details_list = executor.map(fetch_fight_details_worker, fight_urls)
+
+            for i, details in enumerate(fight_details_list):
+                fight_data = fights_to_process[i]
+                del fight_data['url']  # Clean up the temporary URL
+                fight_data['details'] = details if details else None
+                completed_fights.append(fight_data)
+
+    event_details['fights'] = completed_fights
     return event_details
 
 def scrape_all_events():
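
Both this file and scrape_fighters.py lean on the fact that executor.map returns results in the same order as its input, which is what lets scrape_event_details pair result i with fights_to_process[i]. A self-contained sketch of that pattern; fake_fetch is a stand-in for the real worker, not part of the scraper:

import concurrent.futures
import time

def fake_fetch(url):
    # Stand-in for fetch_fight_details_worker: simulates a slow request.
    time.sleep(0.05)
    return {'source': url}

urls = ['http://example.com/a', 'http://example.com/b', 'http://example.com/c']
with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
    # map() yields results in input order even though the calls run concurrently.
    for url, details in zip(urls, executor.map(fake_fetch, urls)):
        print(url, '->', details)
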
src/{to_csv.py → scrape/to_csv.py}
RENAMED
@@ -83,5 +83,60 @@ def json_to_csv(json_file_path, csv_file_path):
 
     print(f"Successfully converted {json_file_path} to {csv_file_path}")
 
+def fighters_json_to_csv(json_file_path, csv_file_path):
+    """
+    Converts a JSON file containing a list of fighter data to a CSV file.
+    It cleans the data by removing unwanted characters and standardizing formats.
+    """
+    try:
+        with open(json_file_path, 'r', encoding='utf-8') as json_file:
+            data = json.load(json_file)
+    except FileNotFoundError:
+        print(f"Error: The file {json_file_path} was not found.")
+        return
+    except json.JSONDecodeError:
+        print(f"Error: Could not decode JSON from {json_file_path}.")
+        return
+
+    if not data:
+        print(f"Warning: The file {json_file_path} is empty. No CSV will be created.")
+        return
+
+    # Dynamically determine headers by collecting all keys from all records
+    all_keys = set()
+    for item in data:
+        all_keys.update(item.keys())
+
+    # Define a preferred order for the most important columns
+    preferred_headers = [
+        'first_name', 'last_name', 'nickname', 'wins', 'losses', 'draws', 'belt',
+        'height', 'weight_lbs', 'reach_in', 'stance', 'dob', 'slpm',
+        'str_acc', 'sapm', 'str_def', 'td_avg', 'td_acc', 'td_def', 'sub_avg', 'url'
+    ]
+
+    # Create the final list of headers, with preferred ones first
+    headers = [h for h in preferred_headers if h in all_keys]
+    headers.extend(sorted([k for k in all_keys if k not in preferred_headers]))
+
+    def clean_value(value):
+        if isinstance(value, str):
+            # Clean data by removing unwanted characters and standardizing units
+            # As requested, this removes '"' and '--'. It also cleans up units.
+            cleaned_value = value.replace('--', '').replace('"', '').replace("'", " ft").replace(' lbs.', '')
+            return cleaned_value.strip()
+        return value
+
+    with open(csv_file_path, 'w', newline='', encoding='utf-8') as csv_file:
+        writer = csv.DictWriter(csv_file, fieldnames=headers)
+        writer.writeheader()
+
+        for fighter_data in data:
+            # Get a cleaned version of the row, using get() for safety
+            cleaned_row = {key: clean_value(fighter_data.get(key, '')) for key in headers}
+            writer.writerow(cleaned_row)
+
+    print(f"Successfully converted {json_file_path} to {csv_file_path}")
+
 if __name__ == '__main__':
-    json_to_csv('output/ufc_events_detailed.json', 'output/ufc_fights.csv')
+    json_to_csv('output/ufc_events_detailed.json', 'output/ufc_fights.csv')
+    fighters_json_to_csv('output/fighters_data.json', 'output/ufc_fighters_data.csv')
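
For reference, clean_value above folds the site's formatting quirks into plain CSV values: '--' placeholders become empty strings, inch marks are dropped, foot marks become ' ft', and ' lbs.' is stripped. A minimal sketch with sample inputs assumed to look like the strings in fighters_data.json:

def clean_value(value):
    # Same cleaning chain as in fighters_json_to_csv.
    if isinstance(value, str):
        return (value.replace('--', '').replace('"', '')
                     .replace("'", " ft").replace(' lbs.', '').strip())
    return value

print(clean_value('5\' 11"'))   # -> 5 ft 11
print(clean_value('155 lbs.'))  # -> 155
print(clean_value('--'))        # -> '' (empty string)
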