import os
import json
import pandas as pd
import requests
from huggingface_hub import hf_hub_download
from datasets import load_dataset

# Constants
REPO_ID = "danielrosehill/ifvi_valuefactors_deriv"
DATASET_ID = "danielrosehill/ifvi_valuefactors_deriv"
HF_API_URL = "https://huggingface.co/api/datasets/danielrosehill/ifvi_valuefactors_deriv/tree/main/data"
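# NOTE (assumption): the /tree endpoint above returns a JSON list of entries,
# each shaped roughly like {"type": "file" | "directory", "path": "...", ...};
# the helpers below rely only on the "type" and "path" fields.
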
def get_hf_directory_structure(path=""):
    """
    Get the directory structure from the Hugging Face API.
    """
    url = f"{HF_API_URL}/{path}" if path else HF_API_URL
    response = requests.get(url, timeout=30)
    if response.status_code == 200:
        return response.json()
    else:
        print(f"Error fetching directory structure: {response.status_code}")
        return []

def download_file_from_hf(file_path):
    """
    Download a file from the Hugging Face Hub.
    """
    try:
        # Convert a local path to a path relative to the dataset repo
        hf_path = file_path.split('/data/')[1] if '/data/' in file_path else file_path
        # repo_type must be "dataset" since REPO_ID points at a dataset repo
        downloaded_file = hf_hub_download(repo_id=REPO_ID, filename=f"data/{hf_path}", repo_type="dataset")
        return downloaded_file
    except Exception as e:
        print(f"Error downloading file: {e}")
        return None

def load_dataset_from_hf(path=None):
    """
    Load the dataset (or a single file from it) from the Hugging Face Hub.
    """
    try:
        # If a path is given, load just that file; otherwise load the whole dataset
        if path:
            dataset = load_dataset(DATASET_ID, data_files={"data": path}, split="data")
        else:
            dataset = load_dataset(DATASET_ID)
        return dataset
    except Exception as e:
        print(f"Error loading dataset from HF: {e}")
        return None

def load_json_data(file_path):
    """
    Load JSON data from a local file, falling back to the Hugging Face Hub.
    """
    try:
        # Prefer the local copy if it exists
        if os.path.exists(file_path):
            with open(file_path, 'r') as f:
                return json.load(f)
        # Try to load via the datasets library
        try:
            # Extract the path relative to the dataset repo
            relative_path = file_path.split('/data/')[1] if '/data/' in file_path else file_path
            dataset = load_dataset_from_hf(f"data/{relative_path}")
            if dataset:
                # Convert the dataset to a plain dict
                return dataset.to_dict()
        except Exception as dataset_error:
            print(f"Error loading from dataset: {dataset_error}")
        # Fall back to a direct file download
        downloaded_file = download_file_from_hf(file_path)
        if downloaded_file:
            with open(downloaded_file, 'r') as f:
                return json.load(f)
        # Last resort: bundled sample data
        sample_data_path = os.path.join(os.getcwd(), "data/sample_data.json")
        if os.path.exists(sample_data_path):
            print(f"Using sample data for {file_path}")
            with open(sample_data_path, 'r') as f:
                return json.load(f)
        return None
    except Exception as e:
        print(f"Error loading JSON data: {e}")
        return None

def load_csv_data(file_path):
    """
    Load CSV data from a local file, falling back to the Hugging Face Hub.
    """
    try:
        # Prefer the local copy if it exists
        if os.path.exists(file_path):
            return pd.read_csv(file_path)
        # Try to load via the datasets library
        try:
            # Extract the path relative to the dataset repo
            relative_path = file_path.split('/data/')[1] if '/data/' in file_path else file_path
            dataset = load_dataset_from_hf(f"data/{relative_path}")
            if dataset:
                # Convert the dataset to a pandas DataFrame
                return pd.DataFrame(dataset)
        except Exception as dataset_error:
            print(f"Error loading from dataset: {dataset_error}")
        # Fall back to a direct file download
        downloaded_file = download_file_from_hf(file_path)
        if downloaded_file:
            return pd.read_csv(downloaded_file)
        return None
    except Exception as e:
        print(f"Error loading CSV data: {e}")
        return None

def get_continents():
    """
    Get the list of continents.
    """
    continents_path = os.path.join(os.getcwd(), "data/by-region/continental")
    if os.path.exists(continents_path):
        return sorted(d for d in os.listdir(continents_path)
                      if os.path.isdir(os.path.join(continents_path, d)))
    # Try the HF API
    try:
        dir_structure = get_hf_directory_structure("by-region/continental")
        continents = [item['path'].split('/')[-1] for item in dir_structure
                      if item['type'] == 'directory']
        if continents:
            return sorted(continents)
    except Exception as e:
        print(f"Error getting continents from HF API: {e}")
    # Fallback to a hardcoded list
    return ["Africa", "Asia", "Europe", "North America", "Oceania", "South America"]

def get_countries(continent):
    """
    Get the list of countries for a continent.
    """
    countries_path = os.path.join(os.getcwd(), f"data/by-region/continental/{continent}")
    if os.path.exists(countries_path):
        countries = [os.path.splitext(f)[0] for f in os.listdir(countries_path)
                     if os.path.isfile(os.path.join(countries_path, f)) and f.endswith('.json')]
        return sorted(countries)
    # Try the HF API
    try:
        dir_structure = get_hf_directory_structure(f"by-region/continental/{continent}")
        countries = [os.path.splitext(item['path'].split('/')[-1])[0] for item in dir_structure
                     if item['type'] == 'file' and item['path'].endswith('.json')]
        return sorted(countries)
    except Exception as e:
        print(f"Error getting countries from HF API: {e}")
        return []

def get_impact_types():
    """
    Get the list of impact types.
    """
    impact_types_path = os.path.join(os.getcwd(), "data/by-impact-type")
    if os.path.exists(impact_types_path):
        # Subdirectories are impact types
        impact_types = [d for d in os.listdir(impact_types_path)
                        if os.path.isdir(os.path.join(impact_types_path, d))]
        # Standalone JSON files are impact types too (without the extension)
        impact_types.extend(os.path.splitext(f)[0] for f in os.listdir(impact_types_path)
                            if os.path.isfile(os.path.join(impact_types_path, f)) and f.endswith('.json'))
        return sorted(impact_types)
    # Try the HF API
    try:
        dir_structure = get_hf_directory_structure("by-impact-type")
        impact_types = []
        # Add directories
        impact_types.extend(item['path'].split('/')[-1] for item in dir_structure
                            if item['type'] == 'directory')
        # Add JSON files (without the extension)
        impact_types.extend(os.path.splitext(item['path'].split('/')[-1])[0] for item in dir_structure
                            if item['type'] == 'file' and item['path'].endswith('.json'))
        if impact_types:
            return sorted(impact_types)
    except Exception as e:
        print(f"Error getting impact types from HF API: {e}")
    # Fallback to a hardcoded list
    return ["air-pollution", "GHG_Impacts", "waste", "water-consumption",
            "economic", "ecosystem", "health", "social"]

def get_impact_type_data(impact_type):
    """
    Get the data for an impact type.
    """
    # Map friendly names to directory/file names
    impact_map = {
        "Air Pollution": "air-pollution",
        "GHG Impacts": "GHG_Impacts",
        "Waste": "waste",
        "Water Consumption": "water-consumption",
        "Economic": "economic",
        "Ecosystem": "ecosystem",
        "Health": "health",
        "Social": "social"
    }
    # If impact_type is already a directory/file name, use it directly
    impact_dir = impact_map.get(impact_type, impact_type)
    # Check whether it is a single file or a directory locally
    base_path = os.path.join(os.getcwd(), "data/by-impact-type")
    impact_path = os.path.join(base_path, impact_dir)
    if os.path.isfile(f"{impact_path}.json"):
        # A single JSON file
        return load_json_data(f"{impact_path}.json")
    elif os.path.isdir(impact_path):
        # A directory: find its JSON files
        json_files = [f for f in os.listdir(impact_path) if f.endswith('.json')]
        if json_files:
            # Return the first JSON file for now
            return load_json_data(os.path.join(impact_path, json_files[0]))
    # If not found locally, try the Hub
    try:
        # Try to load via the datasets library
        dataset = load_dataset_from_hf(f"data/by-impact-type/{impact_dir}.json")
        if dataset:
            return dataset.to_dict()
        # Check whether it is a single file
        downloaded_file = download_file_from_hf(f"data/by-impact-type/{impact_dir}.json")
        if downloaded_file:
            return load_json_data(downloaded_file)
        # Check whether it is a directory
        dir_structure = get_hf_directory_structure(f"by-impact-type/{impact_dir}")
        if dir_structure:
            json_files = [item['path'] for item in dir_structure if item['path'].endswith('.json')]
            if json_files:
                downloaded_file = download_file_from_hf(f"data/{json_files[0]}")
                if downloaded_file:
                    return load_json_data(downloaded_file)
    except Exception as e:
        print(f"Error getting impact type data from HF: {e}")
    return None

def get_country_data(continent, country):
    """
    Get the data for a country.
    """
    if not continent or not country:
        return None
    file_path = os.path.join(os.getcwd(), f"data/by-region/continental/{continent}/{country}.json")
    data = load_json_data(file_path)
    if not data:
        hub_path = f"data/by-region/continental/{continent}/{country}.json"
        # Try to load via the datasets library
        try:
            dataset = load_dataset_from_hf(hub_path)
            if dataset:
                return dataset.to_dict()
        except Exception as dataset_error:
            print(f"Error loading from dataset: {dataset_error}")
        # Fall back to a direct file download
        try:
            downloaded_file = download_file_from_hf(hub_path)
            if downloaded_file:
                data = load_json_data(downloaded_file)
        except Exception as e:
            print(f"Error getting country data from HF: {e}")
    return data

def create_summary_stats(data):
    """
    Create summary statistics for the data.
    """
    if not data:
        return "No data available for summary statistics."
    # Convert to a DataFrame (handles lists of records and dicts of columns alike)
    df = pd.DataFrame(data)
    if 'ValueFactor' not in df.columns:
        return "Value Factor data not available for summary statistics."
    # Coerce to numeric in case the values arrived as strings from JSON
    values = pd.to_numeric(df['ValueFactor'], errors='coerce')
    stats = {
        "Count": len(df),
        "Mean Value Factor": f"${values.mean():.2f}",
        "Median Value Factor": f"${values.median():.2f}",
        "Min Value Factor": f"${values.min():.2f}",
        "Max Value Factor": f"${values.max():.2f}",
        "Standard Deviation": f"${values.std():.2f}"
    }
    # Summarize categories if available
    if 'Category' in df.columns:
        categories = df['Category'].unique()
        stats["Number of Categories"] = len(categories)
        stats["Categories"] = ", ".join(map(str, categories[:5])) + ("..." if len(categories) > 5 else "")
    # Summarize impact types if available
    if 'Impact' in df.columns:
        impacts = df['Impact'].unique()
        stats["Number of Impact Types"] = len(impacts)
        stats["Impact Types"] = ", ".join(map(str, impacts[:5])) + ("..." if len(impacts) > 5 else "")
    return "\n".join(f"**{k}**: {v}" for k, v in stats.items())