"""
Enhanced Data Utilities for TIPM
===============================

Replaces random data generation with proper economic models and data validation.
"""

import pandas as pd
import numpy as np
from typing import Dict, List, Optional, Tuple, Any
from datetime import datetime, timedelta
import logging
from dataclasses import dataclass
import requests
import json
from pathlib import Path

logger = logging.getLogger(__name__)


@dataclass
class EconomicIndicators:
    """Bundle of macroeconomic indicators, range-checked on construction.

    ``gdp_growth``, ``inflation_rate``, ``unemployment_rate`` and
    ``political_stability`` are bounds-checked in ``__post_init__``;
    ``trade_balance`` and ``exchange_rate_volatility`` are stored without
    any check.
    """

    gdp_growth: float
    inflation_rate: float
    unemployment_rate: float
    trade_balance: float
    exchange_rate_volatility: float
    political_stability: float

    def __post_init__(self):
        """Reject indicators that fall outside their accepted interval.

        Raises:
            ValueError: For the first checked field (in the order listed
                below) whose value is not within its inclusive bounds.
        """
        # (label, value, unit suffix, inclusive lower, inclusive upper)
        checks = (
            ("GDP growth", self.gdp_growth, "%", -50, 50),
            ("Inflation rate", self.inflation_rate, "%", 0, 100),
            ("Unemployment rate", self.unemployment_rate, "%", 0, 100),
            ("Political stability", self.political_stability, "", 0, 1),
        )
        for label, value, unit, lowest, highest in checks:
            if lowest <= value <= highest:
                continue
            raise ValueError(f"{label} {value}{unit} is outside valid range")


class EconomicModel:
    """Closed-form formulas for tariff and employment impact estimates."""

    def __init__(self):
        # Default model parameters. Only "import_demand" and
        # "price_passthrough" are read by the methods of this class.
        self.base_elasticities = {
            "import_demand": -0.8,  # Standard import demand elasticity
            "export_supply": 0.6,  # Export supply elasticity
            "price_passthrough": 0.7,  # Price passthrough rate
            "substitution": 0.4,  # Substitution elasticity
        }

    def calculate_tariff_impact(
        self,
        tariff_rate: float,
        trade_volume: float,
        elasticity: Optional[float] = None,
    ) -> Dict[str, float]:
        """
        Calculate tariff impact using economic models

        Args:
            tariff_rate: Tariff rate as decimal (e.g., 0.25 for 25%)
            trade_volume: Trade volume in USD
            elasticity: Custom elasticity, uses the default
                ``base_elasticities["import_demand"]`` if None

        Returns:
            Dictionary with impact metrics:
                import_reduction_pct: ``-elasticity * t / (1 + t)`` as a
                    percentage; positive for a negative elasticity.
                price_increase_pct: tariff rate times the passthrough rate,
                    as a percentage.
                trade_volume_impact_usd: ``trade_volume`` times the import
                    reduction fraction (same sign as import_reduction_pct).
                welfare_loss_usd: ``0.5 * t * trade_volume * |reduction|``.
                revenue_gain_usd: tariff rate times the volume still traded
                    after the tariff.

        Raises:
            ZeroDivisionError: If ``tariff_rate`` is exactly -1.
        """
        if elasticity is None:
            elasticity = self.base_elasticities["import_demand"]

        # Economic model calculations (not random!)
        import_reduction = -elasticity * tariff_rate / (1 + tariff_rate)
        price_increase = tariff_rate * self.base_elasticities["price_passthrough"]
        trade_volume_impact = trade_volume * import_reduction
        welfare_loss = 0.5 * tariff_rate * trade_volume * abs(import_reduction)

        # The sign of trade_volume_impact depends on whether the elasticity
        # was supplied as a negative number (the default) or as a positive
        # magnitude, so it cannot simply be added to trade_volume: with the
        # default it would *raise* the taxed volume. Work with the magnitude
        # and let the tariff's own sign decide the direction instead.
        volume_lost = abs(trade_volume_impact)
        if tariff_rate < 0:
            # A negative rate (subsidy) expands trade rather than shrinking it.
            volume_lost = -volume_lost
        remaining_volume = trade_volume - volume_lost

        return {
            "import_reduction_pct": import_reduction * 100,
            "price_increase_pct": price_increase * 100,
            "trade_volume_impact_usd": trade_volume_impact,
            "welfare_loss_usd": welfare_loss,
            "revenue_gain_usd": tariff_rate * remaining_volume,
        }

    def estimate_employment_impact(
        self,
        trade_volume_impact: float,
        gdp_per_capita: float,
        labor_intensity: float = 0.6,
    ) -> Dict[str, float]:
        """
        Estimate employment impact using labor market models

        Args:
            trade_volume_impact: Change in trade volume; only its magnitude
                is used.
            gdp_per_capita: GDP per capita in USD
            labor_intensity: Labor intensity of affected sector (0-1)

        Returns:
            Employment impact estimates. The three job counts are truncated
            to int; ``unemployment_rate_impact`` is the untruncated total
            expressed as a percentage of a fixed base of 1,000,000.

        Raises:
            ZeroDivisionError: If ``gdp_per_capita * labor_intensity`` is 0.
        """
        # Employment impact model (not random!)
        # NOTE(review): dividing by labor_intensity means a more
        # labor-intensive sector yields fewer jobs per unit of trade value;
        # confirm this is the intended relationship.
        jobs_per_unit_value = 1 / (gdp_per_capita * labor_intensity)
        direct_job_loss = abs(trade_volume_impact) * jobs_per_unit_value
        indirect_job_loss = direct_job_loss * 0.3  # Multiplier effect

        return {
            "direct_jobs_lost": int(direct_job_loss),
            "indirect_jobs_lost": int(indirect_job_loss),
            "total_job_impact": int(direct_job_loss + indirect_job_loss),
            "unemployment_rate_impact": (direct_job_loss + indirect_job_loss)
            / 1000000
            * 100,
        }


class DataValidator:
    """Data validation and quality assessment"""

    @staticmethod
    def validate_tariff_rate(rate: float) -> bool:
        """Return True if ``rate`` lies in the inclusive range 0 to 2.0 (0%-200%)."""
        return 0 <= rate <= 2.0  # 0% to 200%

    @staticmethod
    def validate_trade_volume(volume: float) -> bool:
        """Return True if ``volume`` is strictly between 0 and 1e15 USD."""
        return volume > 0 and volume < 1e15  # 0 to 1 quadrillion USD

    @staticmethod
    def validate_gdp(gdp: float) -> bool:
        """Return True if ``gdp`` is strictly between 0 and 1e15 USD."""
        return gdp > 0 and gdp < 1e15  # 0 to 1 quadrillion USD

    @staticmethod
    def assess_data_quality(
        data: pd.DataFrame, required_columns: List[str]
    ) -> Dict[str, Any]:
        """Assess data quality and completeness.

        Args:
            data: Frame to inspect.
            required_columns: Column names whose completeness is reported.

        Returns:
            Report with keys ``total_rows``, ``missing_values`` (null count
            per column), ``data_types`` (dtype per column), ``duplicates``
            (number of duplicated rows, as a plain int) and ``completeness``
            (fraction of non-null values per required column). A required
            column that is absent, or any column of a zero-row frame,
            reports a completeness of 0.0.
        """
        total_rows = len(data)
        quality_report = {
            "total_rows": total_rows,
            "missing_values": data.isnull().sum().to_dict(),
            "data_types": data.dtypes.to_dict(),
            "duplicates": int(data.duplicated().sum()),
            "completeness": {},
        }

        for col in required_columns:
            # Guard on total_rows: with no rows the ratio below is 0/0,
            # which would put NaN into the report instead of a usable number.
            if col in data.columns and total_rows > 0:
                completeness = 1 - (data[col].isnull().sum() / total_rows)
                quality_report["completeness"][col] = completeness
            else:
                quality_report["completeness"][col] = 0.0

        return quality_report


class RealDataConnector:
    """Placeholder gateway to external economic data providers.

    Both fetch methods are stubs: they log the request and return ``None``
    without contacting any service.
    """

    def __init__(self, api_keys: Optional[Dict[str, str]] = None):
        """Store the API keys and make sure the cache directory exists.

        Args:
            api_keys: Optional mapping of keys; an empty dict is used when
                none (or an empty mapping) is supplied.

        Side effects:
            Creates the relative directory ``data_cache`` if it is missing.
        """
        self.api_keys = api_keys if api_keys else {}
        cache_root = Path("data_cache")
        self.cache_dir = cache_root
        cache_root.mkdir(exist_ok=True)

    def get_world_bank_data(
        self, country_code: str, indicator: str, year: int = 2024
    ) -> Optional[float]:
        """Log a World Bank lookup request; always yields ``None`` for now."""
        value: Optional[float] = None
        try:
            # Stub: a real implementation would query the World Bank API
            # here. Until then the "no data available" value is returned.
            request_note = f"Attempting to fetch World Bank data for {country_code}, {indicator}, {year}"
            logger.info(request_note)
        except Exception as e:
            logger.error(f"Failed to fetch World Bank data: {e}")
        return value

    def get_us_census_trade_data(
        self, country_code: str, year: int = 2024
    ) -> Optional[float]:
        """Log a US Census trade lookup request; always yields ``None`` for now."""
        value: Optional[float] = None
        try:
            # Stub: a real implementation would query the US Census API here.
            request_note = f"Attempting to fetch US Census data for {country_code}, {year}"
            logger.info(request_note)
        except Exception as e:
            logger.error(f"Failed to fetch US Census data: {e}")
        return value


def generate_realistic_trade_data(
    countries: List[str], hs_codes: List[str], base_year: int = 2024
) -> pd.DataFrame:
    """
    Generate realistic trade data based on economic principles

    One row is produced for every (country, HS code) pair. Each row has
    destination "US" and is tagged with data quality "estimated".

    Args:
        countries: List of country codes
        hs_codes: List of HS product codes
        base_year: Base year for data generation

    Returns:
        DataFrame with columns ``hs_code``, ``origin_country``,
        ``destination_country``, ``trade_value``, ``year``,
        ``transport_cost``, ``lead_time`` and ``data_quality``. The columns
        are present even when no rows are generated, so callers can select
        them from an empty result.
    """
    # Fixed schema, passed explicitly so an empty input still yields a frame
    # with the expected columns rather than a column-less one.
    columns = [
        "hs_code",
        "origin_country",
        "destination_country",
        "trade_value",
        "year",
        "transport_cost",
        "lead_time",
        "data_quality",
    ]

    # Generate realistic trade patterns (not random!)
    trade_data = []

    for country in countries:
        for hs_code in hs_codes:
            # Base trade volume based on country size and product type
            base_volume = _estimate_base_trade_volume(country, hs_code)

            # Scale by the country/seasonal variation factor
            variation_factor = _calculate_variation_factor(country, hs_code)
            trade_volume = base_volume * variation_factor

            # Transport cost and lead time estimates for this pair
            transport_cost = _estimate_transport_cost(country, hs_code)
            lead_time = _estimate_lead_time(country, hs_code)

            trade_data.append(
                {
                    "hs_code": hs_code,
                    "origin_country": country,
                    "destination_country": "US",
                    "trade_value": trade_volume,
                    "year": base_year,
                    "transport_cost": transport_cost,
                    "lead_time": lead_time,
                    "data_quality": "estimated",
                }
            )

    return pd.DataFrame(trade_data, columns=columns)


def _estimate_base_trade_volume(country: str, hs_code: str) -> float:
    """Return a baseline trade volume for a country / HS-code pair.

    The result is ``100 * country weight * product weight``. The country
    weight is looked up by the full country code, the product weight by the
    first two characters of ``hs_code``; unknown keys use each table's
    ``"default"`` entry.
    """
    # Relative exporter size per origin country
    size_weights = {
        "CN": 1.0,  # China - largest exporter
        "DE": 0.8,  # Germany
        "JP": 0.7,  # Japan
        "KR": 0.6,  # South Korea
        "SG": 0.5,  # Singapore
        "default": 0.3,
    }

    # Relative weight per two-character HS prefix
    chapter_weights = {
        "84": 1.0,  # Machinery
        "85": 1.2,  # Electronics
        "87": 0.9,  # Automotive
        "27": 0.8,  # Energy
        "73": 0.7,  # Metals
        "default": 0.6,
    }

    if country in size_weights:
        size_weight = size_weights[country]
    else:
        size_weight = size_weights["default"]

    chapter = hs_code[:2]
    if chapter in chapter_weights:
        chapter_weight = chapter_weights[chapter]
    else:
        chapter_weight = chapter_weights["default"]

    # Base volume in millions USD
    return 100 * size_weight * chapter_weight


def _calculate_variation_factor(
    country: str, hs_code: str, month: Optional[int] = None
) -> float:
    """Calculate the variation factor applied to a base trade volume.

    The factor is the product of a seasonal term,
    ``1 + 0.1 * sin(2 * pi * month / 12)``, and a country multiplier
    (1.1 for CN/JP/KR, 1.05 for DE/FR/IT, otherwise 1.0).

    Args:
        country: Origin country code.
        hs_code: HS product code; accepted for signature symmetry with the
            other estimators but not used in the calculation.
        month: Calendar month (1-12) for the seasonal term. When None the
            current month from the system clock is used, so the result then
            depends on when the call is made; pass a month explicitly for a
            reproducible value.

    Returns:
        The multiplicative variation factor.

    Raises:
        ValueError: If ``month`` is given and is not between 1 and 12.
    """
    if month is None:
        month = datetime.now().month
    elif not 1 <= month <= 12:
        raise ValueError(f"month must be between 1 and 12, got {month}")

    variation = 1.0

    # Seasonal variation: one sine cycle per year, amplitude 10%
    seasonal_factor = 1.0 + 0.1 * np.sin(2 * np.pi * month / 12)
    variation *= seasonal_factor

    # Country-specific factors
    if country in ["CN", "JP", "KR"]:
        variation *= 1.1  # Asian manufacturing efficiency
    elif country in ["DE", "FR", "IT"]:
        variation *= 1.05  # European quality premium

    return variation


def _estimate_transport_cost(country: str, hs_code: str) -> float:
    """Return the transport cost figure for shipping a product to the US.

    A per-origin base rate (falling back to the ``"default"`` entry) is
    scaled by 0.8 for HS codes starting with "27" and by 1.2 for codes
    starting with "84"; every other code gets the base rate unchanged.
    """
    # Pick the product adjustment first; None means "no scaling".
    if hs_code.startswith("27"):  # Energy products
        scale = 0.8
    elif hs_code.startswith("84"):  # Machinery
        scale = 1.2
    else:
        scale = None

    # Base rate per origin country (not random)
    route_rates = {
        "CN": 0.08,  # China to US
        "JP": 0.06,  # Japan to US
        "DE": 0.05,  # Germany to US
        "default": 0.07,
    }
    rate = route_rates.get(country, route_rates["default"])

    return rate if scale is None else rate * scale


def _estimate_lead_time(country: str, hs_code: str) -> int:
    """Return the estimated lead time in days for shipping to the US.

    Starts from a per-origin number of days (falling back to the
    ``"default"`` entry). HS codes starting with "27" scale it by 0.8 and
    codes starting with "84" by 1.1, truncated to an int; all other codes
    return the unscaled value.
    """
    # Base lead times in days (not random)
    days_by_origin = {
        "CN": 25,  # China to US
        "JP": 20,  # Japan to US
        "DE": 18,  # Germany to US
        "default": 22,
    }
    days = days_by_origin.get(country, days_by_origin["default"])

    # (HS prefix, multiplier): energy ships faster, machinery slower
    adjustments = (("27", 0.8), ("84", 1.1))
    for prefix, multiplier in adjustments:
        if hs_code.startswith(prefix):
            return int(days * multiplier)

    return days