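"""Data-access helpers for the IFVI value factors Hugging Face Space.

Each loader checks for a local copy under data/ first, then falls back to
the Hugging Face dataset repo (via `datasets` / `huggingface_hub`), and
finally to bundled sample data or a hardcoded list where one exists.
"""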
import os
import json

import pandas as pd
import requests
from huggingface_hub import hf_hub_download
from datasets import load_dataset

# Constants: the Space reads from a single dataset repo, so the two IDs are
# intentionally identical (hf_hub_download and load_dataset take them separately)
REPO_ID = "danielrosehill/ifvi_valuefactors_deriv"
DATASET_ID = "danielrosehill/ifvi_valuefactors_deriv"
HF_API_URL = "https://huggingface.co/api/datasets/danielrosehill/ifvi_valuefactors_deriv/tree/main/data"

def get_hf_directory_structure(path=""):
    """
    Get the directory structure from the Hugging Face API.
    """
    url = f"{HF_API_URL}/{path}" if path else HF_API_URL
    response = requests.get(url, timeout=30)
    if response.status_code == 200:
        return response.json()
    else:
        print(f"Error fetching directory structure: {response.status_code}")
        return []

def download_file_from_hf(file_path):
    """
    Download a file from the Hugging Face dataset repo.
    """
    try:
        # Convert a local path to the path used inside the repo
        hf_path = file_path.split('/data/')[1] if '/data/' in file_path else file_path
        # repo_type must be "dataset"; the default ("model") would 404 here
        downloaded_file = hf_hub_download(repo_id=REPO_ID, repo_type="dataset",
                                          filename=f"data/{hf_path}")
        return downloaded_file
    except Exception as e:
        print(f"Error downloading file: {e}")
        return None

def load_dataset_from_hf(path=None):
    """
    Load the dataset from Hugging Face.
    """
    try:
        # If a path is specified, load that specific file; otherwise load the whole dataset
        if path:
            dataset = load_dataset(DATASET_ID, data_files={"data": path}, split="data")
        else:
            dataset = load_dataset(DATASET_ID)
        return dataset
    except Exception as e:
        print(f"Error loading dataset from HF: {e}")
        return None

def load_json_data(file_path):
    """
    Load JSON data from a file.
    """
    try:
        # Check if the file exists locally
        if os.path.exists(file_path):
            with open(file_path, 'r') as f:
                data = json.load(f)
            return data
        else:
            # Try to load from HF datasets
            try:
                # Extract the path relative to the dataset
                relative_path = file_path.split('/data/')[1] if '/data/' in file_path else file_path
                dataset = load_dataset_from_hf(f"data/{relative_path}")
                if dataset:
                    # Convert the dataset to list/dict format
                    return dataset.to_dict()
            except Exception as dataset_error:
                print(f"Error loading from dataset: {dataset_error}")

            # Fallback to direct download
            downloaded_file = download_file_from_hf(file_path)
            if downloaded_file:
                with open(downloaded_file, 'r') as f:
                    data = json.load(f)
                return data
            else:
                # Fallback to sample data
                sample_data_path = os.path.join(os.getcwd(), "data/sample_data.json")
                if os.path.exists(sample_data_path):
                    print(f"Using sample data for {file_path}")
                    with open(sample_data_path, 'r') as f:
                        return json.load(f)
            return None
    except Exception as e:
        print(f"Error loading JSON data: {e}")
        return None

def load_csv_data(file_path):
    """
    Load CSV data from a file.
    """
    try:
        # Check if the file exists locally
        if os.path.exists(file_path):
            return pd.read_csv(file_path)
        else:
            # Try to load from HF datasets
            try:
                # Extract the path relative to the dataset
                relative_path = file_path.split('/data/')[1] if '/data/' in file_path else file_path
                dataset = load_dataset_from_hf(f"data/{relative_path}")
                if dataset:
                    # Convert the dataset to a pandas DataFrame
                    return pd.DataFrame(dataset)
            except Exception as dataset_error:
                print(f"Error loading from dataset: {dataset_error}")

            # Fallback to direct download
            downloaded_file = download_file_from_hf(file_path)
            if downloaded_file:
                return pd.read_csv(downloaded_file)
            else:
                return None
    except Exception as e:
        print(f"Error loading CSV data: {e}")
        return None

def get_continents():
    """
    Get the list of continents.
    """
    continents_path = os.path.join(os.getcwd(), "data/by-region/continental")
    if os.path.exists(continents_path):
        # Sort for a stable order (os.listdir order is arbitrary)
        return sorted(d for d in os.listdir(continents_path)
                      if os.path.isdir(os.path.join(continents_path, d)))
    else:
        # Try to get from the HF API
        try:
            dir_structure = get_hf_directory_structure("by-region/continental")
            continents = [item['path'].split('/')[-1] for item in dir_structure
                          if item['type'] == 'directory']
            if continents:
                return sorted(continents)
        except Exception as e:
            print(f"Error getting continents from HF API: {e}")
        # Fallback to hardcoded list
        return ["Africa", "Asia", "Europe", "North America", "Oceania", "South America"]

def get_countries(continent):
    """
    Get the list of countries for a continent.
    """
    countries_dir = f"data/by-region/continental/{continent}"
    countries_path = os.path.join(os.getcwd(), countries_dir)
    if os.path.exists(countries_path):
        countries = [os.path.splitext(f)[0] for f in os.listdir(countries_path)
                     if os.path.isfile(os.path.join(countries_path, f)) and f.endswith('.json')]
        return sorted(countries)
    else:
        # Try to get from the HF API
        try:
            dir_structure = get_hf_directory_structure(f"by-region/continental/{continent}")
            countries = [os.path.splitext(item['path'].split('/')[-1])[0] for item in dir_structure
                         if item['type'] == 'file' and item['path'].endswith('.json')]
            return sorted(countries)
        except Exception as e:
            print(f"Error getting countries from HF API: {e}")
            return []

def get_impact_types():
    """
    Get the list of impact types.
    """
    impact_types_path = os.path.join(os.getcwd(), "data/by-impact-type")
    if os.path.exists(impact_types_path):
        impact_types = [d for d in os.listdir(impact_types_path)
                        if os.path.isdir(os.path.join(impact_types_path, d))]
        # Add standalone JSON files (without the extension)
        impact_types.extend(os.path.splitext(f)[0] for f in os.listdir(impact_types_path)
                            if os.path.isfile(os.path.join(impact_types_path, f)) and f.endswith('.json'))
        return sorted(impact_types)
    else:
        # Try to get from the HF API
        try:
            dir_structure = get_hf_directory_structure("by-impact-type")
            impact_types = []
            # Add directories
            impact_types.extend(item['path'].split('/')[-1] for item in dir_structure
                                if item['type'] == 'directory')
            # Add JSON files (without the extension)
            impact_types.extend(os.path.splitext(item['path'].split('/')[-1])[0] for item in dir_structure
                                if item['type'] == 'file' and item['path'].endswith('.json'))
            if impact_types:
                return sorted(impact_types)
        except Exception as e:
            print(f"Error getting impact types from HF API: {e}")
        # Fallback to hardcoded list
        return ["air-pollution", "GHG_Impacts", "waste", "water-consumption",
                "economic", "ecosystem", "health", "social"]

def get_impact_type_data(impact_type):
    """
    Get the data for an impact type.
    """
    # Map friendly names to directory/file names
    impact_map = {
        "Air Pollution": "air-pollution",
        "GHG Impacts": "GHG_Impacts",
        "Waste": "waste",
        "Water Consumption": "water-consumption",
        "Economic": "economic",
        "Ecosystem": "ecosystem",
        "Health": "health",
        "Social": "social"
    }
    # If impact_type is already a directory/file name, use it directly
    impact_dir = impact_map.get(impact_type, impact_type)

    # Check whether it's a file or a directory locally
    base_path = os.path.join(os.getcwd(), "data/by-impact-type")
    impact_path = os.path.join(base_path, impact_dir)
    if os.path.isfile(f"{impact_path}.json"):
        # It's a single JSON file
        return load_json_data(f"{impact_path}.json")
    elif os.path.isdir(impact_path):
        # It's a directory; find all JSON files
        json_files = [f for f in os.listdir(impact_path) if f.endswith('.json')]
        if json_files:
            # Return the first JSON file for now
            return load_json_data(os.path.join(impact_path, json_files[0]))

    # If not found locally, try to get it from HF
    try:
        # Try to load from HF datasets
        dataset_path = f"data/by-impact-type/{impact_dir}.json"
        dataset = load_dataset_from_hf(dataset_path)
        if dataset:
            return dataset.to_dict()
        # Check if it's a single file
        downloaded_file = download_file_from_hf(f"data/by-impact-type/{impact_dir}.json")
        if downloaded_file:
            return load_json_data(downloaded_file)
        # Check if it's a directory
        dir_structure = get_hf_directory_structure(f"by-impact-type/{impact_dir}")
        if dir_structure:
            # API paths are repo-relative and already start with "data/"; the
            # extra "data/" prefix below is stripped back off by
            # download_file_from_hf's '/data/' split
            json_files = [item['path'] for item in dir_structure if item['path'].endswith('.json')]
            if json_files:
                downloaded_file = download_file_from_hf(f"data/{json_files[0]}")
                if downloaded_file:
                    return load_json_data(downloaded_file)
    except Exception as e:
        print(f"Error getting impact type data from HF: {e}")
    return None

def get_country_data(continent, country):
    """
    Get the data for a country.
    """
    if not continent or not country:
        return None
    file_path = os.path.join(os.getcwd(), f"data/by-region/continental/{continent}/{country}.json")
    data = load_json_data(file_path)
    if not data:
        # Try to get from HF datasets
        try:
            dataset_path = f"data/by-region/continental/{continent}/{country}.json"
            dataset = load_dataset_from_hf(dataset_path)
            if dataset:
                return dataset.to_dict()
        except Exception as dataset_error:
            print(f"Error loading from dataset: {dataset_error}")

        # Fallback to direct download
        try:
            downloaded_file = download_file_from_hf(f"data/by-region/continental/{continent}/{country}.json")
            if downloaded_file:
                data = load_json_data(downloaded_file)
        except Exception as e:
            print(f"Error getting country data from HF: {e}")
    return data

def create_summary_stats(data):
    """
    Create summary statistics for data.
    """
    if not data:
        return "No data available for summary statistics."

    # pd.DataFrame handles both a list of records and a dict of columns (the
    # shape returned by Dataset.to_dict()), so one conversion covers every source
    df = pd.DataFrame(data)

    if 'ValueFactor' in df.columns:
        stats = {
            "Count": len(df),
            "Mean Value Factor": f"${df['ValueFactor'].mean():.2f}",
            "Median Value Factor": f"${df['ValueFactor'].median():.2f}",
            "Min Value Factor": f"${df['ValueFactor'].min():.2f}",
            "Max Value Factor": f"${df['ValueFactor'].max():.2f}",
            "Standard Deviation": f"${df['ValueFactor'].std():.2f}"
        }
        # Summarize categories if available
        if 'Category' in df.columns:
            categories = df['Category'].unique()
            stats["Number of Categories"] = len(categories)
            stats["Categories"] = ", ".join(map(str, categories[:5])) + ("..." if len(categories) > 5 else "")
        # Summarize impact types if available
        if 'Impact' in df.columns:
            impacts = df['Impact'].unique()
            stats["Number of Impact Types"] = len(impacts)
            stats["Impact Types"] = ", ".join(map(str, impacts[:5])) + ("..." if len(impacts) > 5 else "")
        return "\n".join([f"**{k}**: {v}" for k, v in stats.items()])
    else:
        return "Value Factor data not available for summary statistics."