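"""Data-access helpers for the IFVI value factors Hugging Face Space.

Each loader checks for a local copy under data/ first, then falls back to
the Hugging Face dataset repo (via `datasets` / `huggingface_hub`), and
finally to bundled sample data or a hardcoded list where one exists.
"""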
import os
import json

import pandas as pd
import requests
from huggingface_hub import hf_hub_download
from datasets import load_dataset

# Constants: the Space reads from a single dataset repo, so the two IDs are
# intentionally identical (hf_hub_download and load_dataset take them separately)
REPO_ID = "danielrosehill/ifvi_valuefactors_deriv"
DATASET_ID = "danielrosehill/ifvi_valuefactors_deriv"
HF_API_URL = "https://huggingface.co/api/datasets/danielrosehill/ifvi_valuefactors_deriv/tree/main/data"

def get_hf_directory_structure(path=""):
    """
    Get the directory structure from the Hugging Face API.
    """
    url = f"{HF_API_URL}/{path}" if path else HF_API_URL
    response = requests.get(url, timeout=30)
    if response.status_code == 200:
        return response.json()
    else:
        print(f"Error fetching directory structure: {response.status_code}")
        return []

def download_file_from_hf(file_path):
    """
    Download a file from the Hugging Face dataset repo.
    """
    try:
        # Convert a local path to the path used inside the repo
        hf_path = file_path.split('/data/')[1] if '/data/' in file_path else file_path
        # repo_type must be "dataset"; the default ("model") would 404 here
        downloaded_file = hf_hub_download(repo_id=REPO_ID, repo_type="dataset",
                                          filename=f"data/{hf_path}")
        return downloaded_file
    except Exception as e:
        print(f"Error downloading file: {e}")
        return None

def load_dataset_from_hf(path=None):
    """
    Load the dataset from Hugging Face.
    """
    try:
        # If a path is specified, load that specific file; otherwise load the whole dataset
        if path:
            dataset = load_dataset(DATASET_ID, data_files={"data": path}, split="data")
        else:
            dataset = load_dataset(DATASET_ID)
        return dataset
    except Exception as e:
        print(f"Error loading dataset from HF: {e}")
        return None

def load_json_data(file_path):
    """
    Load JSON data from a file.
    """
    try:
        # Check if the file exists locally
        if os.path.exists(file_path):
            with open(file_path, 'r') as f:
                data = json.load(f)
            return data
        else:
            # Try to load from HF datasets
            try:
                # Extract the path relative to the dataset
                relative_path = file_path.split('/data/')[1] if '/data/' in file_path else file_path
                dataset = load_dataset_from_hf(f"data/{relative_path}")
                if dataset:
                    # Convert the dataset to list/dict format
                    return dataset.to_dict()
            except Exception as dataset_error:
                print(f"Error loading from dataset: {dataset_error}")

            # Fallback to direct download
            downloaded_file = download_file_from_hf(file_path)
            if downloaded_file:
                with open(downloaded_file, 'r') as f:
                    data = json.load(f)
                return data
            else:
                # Fallback to sample data
                sample_data_path = os.path.join(os.getcwd(), "data/sample_data.json")
                if os.path.exists(sample_data_path):
                    print(f"Using sample data for {file_path}")
                    with open(sample_data_path, 'r') as f:
                        return json.load(f)
            return None
    except Exception as e:
        print(f"Error loading JSON data: {e}")
        return None

def load_csv_data(file_path):
    """
    Load CSV data from a file.
    """
    try:
        # Check if the file exists locally
        if os.path.exists(file_path):
            return pd.read_csv(file_path)
        else:
            # Try to load from HF datasets
            try:
                # Extract the path relative to the dataset
                relative_path = file_path.split('/data/')[1] if '/data/' in file_path else file_path
                dataset = load_dataset_from_hf(f"data/{relative_path}")
                if dataset:
                    # Convert the dataset to a pandas DataFrame
                    return pd.DataFrame(dataset)
            except Exception as dataset_error:
                print(f"Error loading from dataset: {dataset_error}")

            # Fallback to direct download
            downloaded_file = download_file_from_hf(file_path)
            if downloaded_file:
                return pd.read_csv(downloaded_file)
            else:
                return None
    except Exception as e:
        print(f"Error loading CSV data: {e}")
        return None

def get_continents():
    """
    Get the list of continents.
    """
    continents_path = os.path.join(os.getcwd(), "data/by-region/continental")
    if os.path.exists(continents_path):
        # Sort for a stable order (os.listdir order is arbitrary)
        return sorted(d for d in os.listdir(continents_path)
                      if os.path.isdir(os.path.join(continents_path, d)))
    else:
        # Try to get from the HF API
        try:
            dir_structure = get_hf_directory_structure("by-region/continental")
            continents = [item['path'].split('/')[-1] for item in dir_structure
                          if item['type'] == 'directory']
            if continents:
                return sorted(continents)
        except Exception as e:
            print(f"Error getting continents from HF API: {e}")
        # Fallback to hardcoded list
        return ["Africa", "Asia", "Europe", "North America", "Oceania", "South America"]

def get_countries(continent):
    """
    Get the list of countries for a continent.
    """
    countries_dir = f"data/by-region/continental/{continent}"
    countries_path = os.path.join(os.getcwd(), countries_dir)
    if os.path.exists(countries_path):
        countries = [os.path.splitext(f)[0] for f in os.listdir(countries_path)
                     if os.path.isfile(os.path.join(countries_path, f)) and f.endswith('.json')]
        return sorted(countries)
    else:
        # Try to get from the HF API
        try:
            dir_structure = get_hf_directory_structure(f"by-region/continental/{continent}")
            countries = [os.path.splitext(item['path'].split('/')[-1])[0] for item in dir_structure
                         if item['type'] == 'file' and item['path'].endswith('.json')]
            return sorted(countries)
        except Exception as e:
            print(f"Error getting countries from HF API: {e}")
            return []

def get_impact_types():
    """
    Get the list of impact types.
    """
    impact_types_path = os.path.join(os.getcwd(), "data/by-impact-type")
    if os.path.exists(impact_types_path):
        impact_types = [d for d in os.listdir(impact_types_path)
                        if os.path.isdir(os.path.join(impact_types_path, d))]
        # Add standalone JSON files (without the extension)
        impact_types.extend(os.path.splitext(f)[0] for f in os.listdir(impact_types_path)
                            if os.path.isfile(os.path.join(impact_types_path, f)) and f.endswith('.json'))
        return sorted(impact_types)
    else:
        # Try to get from the HF API
        try:
            dir_structure = get_hf_directory_structure("by-impact-type")
            impact_types = []
            # Add directories
            impact_types.extend(item['path'].split('/')[-1] for item in dir_structure
                                if item['type'] == 'directory')
            # Add JSON files (without the extension)
            impact_types.extend(os.path.splitext(item['path'].split('/')[-1])[0] for item in dir_structure
                                if item['type'] == 'file' and item['path'].endswith('.json'))
            if impact_types:
                return sorted(impact_types)
        except Exception as e:
            print(f"Error getting impact types from HF API: {e}")
        # Fallback to hardcoded list
        return ["air-pollution", "GHG_Impacts", "waste", "water-consumption",
                "economic", "ecosystem", "health", "social"]

def get_impact_type_data(impact_type):
    """
    Get the data for an impact type.
    """
    # Map friendly names to directory/file names
    impact_map = {
        "Air Pollution": "air-pollution",
        "GHG Impacts": "GHG_Impacts",
        "Waste": "waste",
        "Water Consumption": "water-consumption",
        "Economic": "economic",
        "Ecosystem": "ecosystem",
        "Health": "health",
        "Social": "social"
    }
    # If impact_type is already a directory/file name, use it directly
    impact_dir = impact_map.get(impact_type, impact_type)

    # Check whether it's a file or a directory locally
    base_path = os.path.join(os.getcwd(), "data/by-impact-type")
    impact_path = os.path.join(base_path, impact_dir)
    if os.path.isfile(f"{impact_path}.json"):
        # It's a single JSON file
        return load_json_data(f"{impact_path}.json")
    elif os.path.isdir(impact_path):
        # It's a directory; find all JSON files
        json_files = [f for f in os.listdir(impact_path) if f.endswith('.json')]
        if json_files:
            # Return the first JSON file for now
            return load_json_data(os.path.join(impact_path, json_files[0]))

    # If not found locally, try to get it from HF
    try:
        # Try to load from HF datasets
        dataset_path = f"data/by-impact-type/{impact_dir}.json"
        dataset = load_dataset_from_hf(dataset_path)
        if dataset:
            return dataset.to_dict()
        # Check if it's a single file
        downloaded_file = download_file_from_hf(f"data/by-impact-type/{impact_dir}.json")
        if downloaded_file:
            return load_json_data(downloaded_file)
        # Check if it's a directory
        dir_structure = get_hf_directory_structure(f"by-impact-type/{impact_dir}")
        if dir_structure:
            # API paths are repo-relative and already start with "data/"; the
            # extra "data/" prefix below is stripped back off by
            # download_file_from_hf's '/data/' split
            json_files = [item['path'] for item in dir_structure if item['path'].endswith('.json')]
            if json_files:
                downloaded_file = download_file_from_hf(f"data/{json_files[0]}")
                if downloaded_file:
                    return load_json_data(downloaded_file)
    except Exception as e:
        print(f"Error getting impact type data from HF: {e}")
    return None

def get_country_data(continent, country):
    """
    Get the data for a country.
    """
    if not continent or not country:
        return None
    file_path = os.path.join(os.getcwd(), f"data/by-region/continental/{continent}/{country}.json")
    data = load_json_data(file_path)
    if not data:
        # Try to get from HF datasets
        try:
            dataset_path = f"data/by-region/continental/{continent}/{country}.json"
            dataset = load_dataset_from_hf(dataset_path)
            if dataset:
                return dataset.to_dict()
        except Exception as dataset_error:
            print(f"Error loading from dataset: {dataset_error}")

        # Fallback to direct download
        try:
            downloaded_file = download_file_from_hf(f"data/by-region/continental/{continent}/{country}.json")
            if downloaded_file:
                data = load_json_data(downloaded_file)
        except Exception as e:
            print(f"Error getting country data from HF: {e}")
    return data

def create_summary_stats(data):
    """
    Create summary statistics for data.
    """
    if not data:
        return "No data available for summary statistics."

    # pd.DataFrame handles both a list of records and a dict of columns (the
    # shape returned by Dataset.to_dict()), so one conversion covers every source
    df = pd.DataFrame(data)

    if 'ValueFactor' in df.columns:
        stats = {
            "Count": len(df),
            "Mean Value Factor": f"${df['ValueFactor'].mean():.2f}",
            "Median Value Factor": f"${df['ValueFactor'].median():.2f}",
            "Min Value Factor": f"${df['ValueFactor'].min():.2f}",
            "Max Value Factor": f"${df['ValueFactor'].max():.2f}",
            "Standard Deviation": f"${df['ValueFactor'].std():.2f}"
        }
        # Summarize categories if available
        if 'Category' in df.columns:
            categories = df['Category'].unique()
            stats["Number of Categories"] = len(categories)
            stats["Categories"] = ", ".join(map(str, categories[:5])) + ("..." if len(categories) > 5 else "")
        # Summarize impact types if available
        if 'Impact' in df.columns:
            impacts = df['Impact'].unique()
            stats["Number of Impact Types"] = len(impacts)
            stats["Impact Types"] = ", ".join(map(str, impacts[:5])) + ("..." if len(impacts) > 5 else "")
        return "\n".join([f"**{k}**: {v}" for k, v in stats.items()])
    else:
        return "Value Factor data not available for summary statistics."