Spaces:
Running
Running
import os | |
import json | |
from .scrape_fights import scrape_all_events | |
from .scrape_fighters import scrape_all_fighters | |
from .to_csv import json_to_csv, fighters_json_to_csv | |
from .preprocess import preprocess_fighters_csv | |
from .. import config | |
def main(): | |
""" | |
Main pipeline to scrape UFC data and convert it to CSV. | |
""" | |
# Ensure the output directory exists | |
if not os.path.exists(config.OUTPUT_DIR): | |
os.makedirs(config.OUTPUT_DIR) | |
print(f"Created directory: {config.OUTPUT_DIR}") | |
# --- Step 1: Scrape Events and Fights --- | |
print("\n--- Starting Events and Fights Scraping ---") | |
all_events_data = scrape_all_events() | |
with open(config.EVENTS_JSON_PATH, 'w') as f: | |
json.dump(all_events_data, f, indent=4) | |
print(f"Scraping for events complete. Data saved to {config.EVENTS_JSON_PATH}") | |
# --- Step 2: Scrape Fighters --- | |
print("\n--- Starting Fighters Scraping ---") | |
all_fighters_data = scrape_all_fighters() | |
with open(config.FIGHTERS_JSON_PATH, 'w') as f: | |
json.dump(all_fighters_data, f, indent=4) | |
print(f"Scraping for fighters complete. Data saved to {config.FIGHTERS_JSON_PATH}") | |
# --- Step 3: Convert JSON to CSV --- | |
print("\n--- Converting all JSON files to CSV ---") | |
json_to_csv(config.EVENTS_JSON_PATH, config.FIGHTS_CSV_PATH) | |
fighters_json_to_csv(config.FIGHTERS_JSON_PATH, config.FIGHTERS_CSV_PATH) | |
# --- Step 4: Preprocess CSV data --- | |
print("\n--- Preprocessing fighter data (converting height to cm) ---") | |
preprocess_fighters_csv(config.FIGHTERS_CSV_PATH) | |
# --- Step 5: Clean up temporary JSON files --- | |
print("\n--- Deleting temporary JSON files ---") | |
try: | |
if os.path.exists(config.EVENTS_JSON_PATH): | |
os.remove(config.EVENTS_JSON_PATH) | |
print(f"Deleted: {config.EVENTS_JSON_PATH}") | |
if os.path.exists(config.FIGHTERS_JSON_PATH): | |
os.remove(config.FIGHTERS_JSON_PATH) | |
print(f"Deleted: {config.FIGHTERS_JSON_PATH}") | |
except OSError as e: | |
print(f"Error deleting JSON files: {e}") | |
print("\n--- Pipeline Finished ---") | |
if __name__ == '__main__': | |
main() | |