import os
import json
import pandas as pd
import requests
from huggingface_hub import hf_hub_download
from datasets import load_dataset

# Constants
REPO_ID = "danielrosehill/ifvi_valuefactors_deriv"
DATASET_ID = "danielrosehill/ifvi_valuefactors_deriv"
HF_API_URL = "https://huggingface.co/api/datasets/danielrosehill/ifvi_valuefactors_deriv/tree/main/data"


def _relative_data_path(file_path):
    """
    Normalize a local or remote path to a path relative to the repo's data/ directory.
    Consolidates the normalization repeated across the loaders below and avoids
    doubling the "data/" prefix when a caller already passed a repo-relative path.
    """
    if '/data/' in file_path:
        return file_path.split('/data/')[1]
    if file_path.startswith('data/'):
        return file_path[len('data/'):]
    return file_path


def get_hf_directory_structure(path=""):
    """
    Get the directory structure from the Hugging Face API
    """
    url = f"{HF_API_URL}/{path}" if path else HF_API_URL
    response = requests.get(url, timeout=30)
    if response.status_code == 200:
        return response.json()
    print(f"Error fetching directory structure: {response.status_code}")
    return []


def download_file_from_hf(file_path):
    """
    Download a file from Hugging Face
    """
    try:
        hf_path = _relative_data_path(file_path)
        # repo_type="dataset" is required here: hf_hub_download defaults to model repos.
        return hf_hub_download(
            repo_id=REPO_ID,
            filename=f"data/{hf_path}",
            repo_type="dataset",
        )
    except Exception as e:
        print(f"Error downloading file: {e}")
        return None


def load_dataset_from_hf(path=None):
    """
    Load dataset from Hugging Face
    """
    try:
        # If a path is specified, load that specific file; otherwise load the whole dataset.
        if path:
            return load_dataset(DATASET_ID, data_files={"data": path}, split="data")
        return load_dataset(DATASET_ID)
    except Exception as e:
        print(f"Error loading dataset from HF: {e}")
        return None


def load_json_data(file_path):
    """
    Load JSON data from a file, falling back to Hugging Face if it is not local
    """
    try:
        # Check if the file exists locally.
        if os.path.exists(file_path):
            with open(file_path, 'r') as f:
                return json.load(f)

        # Try to load it via the datasets library.
        try:
            relative_path = _relative_data_path(file_path)
            dataset = load_dataset_from_hf(f"data/{relative_path}")
            if dataset:
                # Convert the dataset to list/dict format.
                return dataset.to_dict()
        except Exception as dataset_error:
            print(f"Error loading from dataset: {dataset_error}")

        # Fall back to a direct download from the Hub.
        downloaded_file = download_file_from_hf(file_path)
        if downloaded_file:
            with open(downloaded_file, 'r') as f:
                return json.load(f)

        # Final fallback: bundled sample data.
        sample_data_path = os.path.join(os.getcwd(), "data/sample_data.json")
        if os.path.exists(sample_data_path):
            print(f"Using sample data for {file_path}")
            with open(sample_data_path, 'r') as f:
                return json.load(f)
        return None
    except Exception as e:
        print(f"Error loading JSON data: {e}")
        return None


def load_csv_data(file_path):
    """
    Load CSV data from a file, falling back to Hugging Face if it is not local
    """
    try:
        # Check if the file exists locally.
        if os.path.exists(file_path):
            return pd.read_csv(file_path)

        # Try to load it via the datasets library.
        try:
            relative_path = _relative_data_path(file_path)
            dataset = load_dataset_from_hf(f"data/{relative_path}")
            if dataset:
                # Convert the dataset to a pandas DataFrame.
                return dataset.to_pandas()
        except Exception as dataset_error:
            print(f"Error loading from dataset: {dataset_error}")

        # Fall back to a direct download from the Hub.
        downloaded_file = download_file_from_hf(file_path)
        if downloaded_file:
            return pd.read_csv(downloaded_file)
        return None
    except Exception as e:
        print(f"Error loading CSV data: {e}")
        return None


def get_continents():
    """
    Get list of continents
    """
    continents_path = os.path.join(os.getcwd(), "data/by-region/continental")
    if os.path.exists(continents_path):
        # Sorted for a stable ordering, matching the HF branch below.
        return sorted(d for d in os.listdir(continents_path)
                      if os.path.isdir(os.path.join(continents_path, d)))

    # Try to get the list from the HF API.
    try:
        dir_structure = get_hf_directory_structure("by-region/continental")
        continents = [item['path'].split('/')[-1]
                      for item in dir_structure if item['type'] == 'directory']
        if continents:
            return sorted(continents)
    except Exception as e:
        print(f"Error getting continents from HF API: {e}")

    # Fallback to a hardcoded list.
    return ["Africa", "Asia", "Europe", "North America", "Oceania", "South America"]

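
# ---------------------------------------------------------------------------
# Note on the tree API: the directory helpers in this module assume each entry
# returned by get_hf_directory_structure() is a dict shaped roughly like the
# (abridged, illustrative) examples below, with 'path' relative to the repo
# root and 'type' set to "file" or "directory":
#
#     {"type": "directory", "path": "data/by-region/continental/Europe"}
#     {"type": "file", "path": "data/by-region/continental/Europe/France.json"}
#
# The continent/country names above are hypothetical placeholders.
# ---------------------------------------------------------------------------
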

def get_countries(continent):
    """
    Get list of countries for a continent
    """
    countries_dir = f"data/by-region/continental/{continent}"
    countries_path = os.path.join(os.getcwd(), countries_dir)
    if os.path.exists(countries_path):
        countries = [os.path.splitext(f)[0] for f in os.listdir(countries_path)
                     if os.path.isfile(os.path.join(countries_path, f)) and f.endswith('.json')]
        return sorted(countries)

    # Try to get the list from the HF API.
    try:
        dir_structure = get_hf_directory_structure(f"by-region/continental/{continent}")
        countries = [os.path.splitext(item['path'].split('/')[-1])[0]
                     for item in dir_structure
                     if item['type'] == 'file' and item['path'].endswith('.json')]
        return sorted(countries)
    except Exception as e:
        print(f"Error getting countries from HF API: {e}")
        return []


def get_impact_types():
    """
    Get list of impact types
    """
    impact_types_path = os.path.join(os.getcwd(), "data/by-impact-type")
    if os.path.exists(impact_types_path):
        impact_types = [d for d in os.listdir(impact_types_path)
                        if os.path.isdir(os.path.join(impact_types_path, d))]
        # Add standalone JSON files (without extension).
        impact_types.extend(os.path.splitext(f)[0] for f in os.listdir(impact_types_path)
                            if os.path.isfile(os.path.join(impact_types_path, f)) and f.endswith('.json'))
        return sorted(impact_types)

    # Try to get the list from the HF API.
    try:
        dir_structure = get_hf_directory_structure("by-impact-type")
        impact_types = []
        # Add directories.
        impact_types.extend(item['path'].split('/')[-1]
                            for item in dir_structure if item['type'] == 'directory')
        # Add JSON files (without extension).
        impact_types.extend(os.path.splitext(item['path'].split('/')[-1])[0]
                            for item in dir_structure
                            if item['type'] == 'file' and item['path'].endswith('.json'))
        if impact_types:
            return sorted(impact_types)
    except Exception as e:
        print(f"Error getting impact types from HF API: {e}")

    # Fallback to a hardcoded list.
    return ["air-pollution", "GHG_Impacts", "waste", "water-consumption",
            "economic", "ecosystem", "health", "social"]


def get_impact_type_data(impact_type):
    """
    Get data for an impact type
    """
    # Map friendly names to directory/file names.
    impact_map = {
        "Air Pollution": "air-pollution",
        "GHG Impacts": "GHG_Impacts",
        "Waste": "waste",
        "Water Consumption": "water-consumption",
        "Economic": "economic",
        "Ecosystem": "ecosystem",
        "Health": "health",
        "Social": "social",
    }
    # If impact_type is already a directory/file name, use it directly.
    impact_dir = impact_map.get(impact_type, impact_type)

    # Check whether it is a file or a directory locally.
    base_path = os.path.join(os.getcwd(), "data/by-impact-type")
    impact_path = os.path.join(base_path, impact_dir)
    if os.path.isfile(f"{impact_path}.json"):
        # It's a single JSON file.
        return load_json_data(f"{impact_path}.json")
    if os.path.isdir(impact_path):
        # It's a directory; find all JSON files.
        json_files = sorted(f for f in os.listdir(impact_path) if f.endswith('.json'))
        if json_files:
            # Return the first JSON file for now.
            return load_json_data(os.path.join(impact_path, json_files[0]))

    # Not found locally; try Hugging Face.
    try:
        # Try to load via the datasets library.
        dataset_path = f"data/by-impact-type/{impact_dir}.json"
        dataset = load_dataset_from_hf(dataset_path)
        if dataset:
            return dataset.to_dict()

        # Check whether it is a single file.
        downloaded_file = download_file_from_hf(f"data/by-impact-type/{impact_dir}.json")
        if downloaded_file:
            return load_json_data(downloaded_file)

        # Check whether it is a directory.
        dir_structure = get_hf_directory_structure(f"by-impact-type/{impact_dir}")
        if dir_structure:
            json_files = [item['path'] for item in dir_structure if item['path'].endswith('.json')]
            if json_files:
                # item['path'] is already relative to the repo root, so pass it as-is.
                downloaded_file = download_file_from_hf(json_files[0])
                if downloaded_file:
                    return load_json_data(downloaded_file)
    except Exception as e:
        print(f"Error getting impact type data from HF: {e}")
    return None

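
# ---------------------------------------------------------------------------
# Resolution order in get_impact_type_data(), for reference: a friendly name
# such as "Air Pollution" is first mapped to "air-pollution" via impact_map,
# then the function tries a local data/by-impact-type/air-pollution.json file,
# a local directory of that name, and finally the Hugging Face fallbacks in
# the same order. (The example name here is taken from impact_map above.)
# ---------------------------------------------------------------------------
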
f"data/by-impact-type/{impact_dir}.json" dataset = load_dataset_from_hf(dataset_path) if dataset: return dataset.to_dict() # Check if it's a single file downloaded_file = download_file_from_hf(f"data/by-impact-type/{impact_dir}.json") if downloaded_file: return load_json_data(downloaded_file) # Check if it's a directory dir_structure = get_hf_directory_structure(f"by-impact-type/{impact_dir}") if dir_structure: json_files = [item['path'] for item in dir_structure if item['path'].endswith('.json')] if json_files: downloaded_file = download_file_from_hf(f"data/{json_files[0]}") if downloaded_file: return load_json_data(downloaded_file) except Exception as e: print(f"Error getting impact type data from HF: {e}") return None def get_country_data(continent, country): """ Get data for a country """ if not continent or not country: return None file_path = os.path.join(os.getcwd(), f"data/by-region/continental/{continent}/{country}.json") data = load_json_data(file_path) if not data: # Try to get from HF datasets try: dataset_path = f"data/by-region/continental/{continent}/{country}.json" dataset = load_dataset_from_hf(dataset_path) if dataset: return dataset.to_dict() except Exception as dataset_error: print(f"Error loading from dataset: {dataset_error}") # Fallback to direct download try: downloaded_file = download_file_from_hf(f"data/by-region/continental/{continent}/{country}.json") if downloaded_file: data = load_json_data(downloaded_file) except Exception as e: print(f"Error getting country data from HF: {e}") return data def create_summary_stats(data): """ Create summary statistics for data """ if not data: return "No data available for summary statistics." # Convert data to DataFrame if it's a list if isinstance(data, list): df = pd.DataFrame(data) else: # If it's already a DataFrame or another format, try to convert df = pd.DataFrame(data) if 'ValueFactor' in df.columns: stats = { "Count": len(df), "Mean Value Factor": f"${df['ValueFactor'].mean():.2f}", "Median Value Factor": f"${df['ValueFactor'].median():.2f}", "Min Value Factor": f"${df['ValueFactor'].min():.2f}", "Max Value Factor": f"${df['ValueFactor'].max():.2f}", "Standard Deviation": f"${df['ValueFactor'].std():.2f}" } # Create categories summary if available if 'Category' in df.columns: categories = df['Category'].unique() stats["Number of Categories"] = len(categories) stats["Categories"] = ", ".join(categories[:5]) + ("..." if len(categories) > 5 else "") # Create impacts summary if available if 'Impact' in df.columns: impacts = df['Impact'].unique() stats["Number of Impact Types"] = len(impacts) stats["Impact Types"] = ", ".join(impacts[:5]) + ("..." if len(impacts) > 5 else "") return "\n".join([f"**{k}**: {v}" for k, v in stats.items()]) else: return "Value Factor data not available for summary statistics."