from typing import List, Optional

from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging


logger = logging.get_logger(__name__)

EGT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    # pcqm4mv1 now deprecated
    "graphormer-base": "https://huggingface.co/clefourrier/graphormer-base-pcqm4mv2/resolve/main/config.json",
    # See all Graphormer models at https://huggingface.co/models?filter=graphormer
}


class EGTConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of an [`~EGTModel`]. It is used to instantiate an EGT
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with
    the defaults will yield a similar configuration to that of the EGT
    [clefourrier/graphormer-base-pcqm4mv2](https://huggingface.co/clefourrier/graphormer-base-pcqm4mv2) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        feat_size (`int`, *optional*, defaults to 768):
            Node feature size.
        edge_feat_size (`int`, *optional*, defaults to 64):
            Edge feature size.
        num_heads (`int`, *optional*, defaults to 32):
            Number of attention heads. `feat_size` must be divisible by `num_heads`.
        num_layers (`int`, *optional*, defaults to 30):
            Number of layers.
        dropout (`float`, *optional*, defaults to 0.0):
            Dropout probability.
        attn_dropout (`float`, *optional*, defaults to 0.3):
            Attention dropout probability.
        activation (`str`, *optional*, defaults to `"ELU"`):
            Activation function.
        egt_simple (`bool`, *optional*, defaults to `False`):
            If `False`, update the edge embeddings at every layer.
        upto_hop (`int`, *optional*, defaults to 16):
            Maximum distance between nodes in the distance matrices.
        mlp_ratios (`List[float]`, *optional*, defaults to `[1.0, 1.0]`):
            Ratios of the inner dimensions to the input dimension in the MLP output block.
        num_virtual_nodes (`int`, *optional*, defaults to 4):
            Number of virtual nodes in the EGT model, aggregated into the graph embedding in the readout function.
        svd_pe_size (`int`, *optional*, defaults to 8):
            SVD positional encoding size.
        num_classes (`int`, *optional*, defaults to 1):
            Number of target classes or labels. Set to n for binary classification of n tasks.
    Example:

    ```python
    >>> from transformers import EGTForGraphClassification, EGTConfig

    >>> # Initializing an EGT graphormer-base-pcqm4mv2 style configuration
    >>> configuration = EGTConfig()

    >>> # Initializing a model from the graphormer-base-pcqm4mv2 style configuration
    >>> model = EGTForGraphClassification(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "egt"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        feat_size: int = 768,
        edge_feat_size: int = 64,
        num_heads: int = 32,
        num_layers: int = 30,
        dropout: float = 0.0,
        attn_dropout: float = 0.3,
        activation: str = "ELU",
        egt_simple: bool = False,
        upto_hop: int = 16,
        mlp_ratios: Optional[List[float]] = None,
        num_virtual_nodes: int = 4,
        svd_pe_size: int = 8,
        num_classes: int = 1,
        pad_token_id=0,
        bos_token_id=1,
        eos_token_id=2,
        **kwargs,
    ):
        self.feat_size = feat_size
        self.edge_feat_size = edge_feat_size
        self.num_heads = num_heads
        self.num_layers = num_layers
        self.dropout = dropout
        self.attn_dropout = attn_dropout
        self.activation = activation
        self.egt_simple = egt_simple
        self.upto_hop = upto_hop
        # Avoid sharing a mutable default list across instances.
        self.mlp_ratios = mlp_ratios if mlp_ratios is not None else [1.0, 1.0]
        self.num_virtual_nodes = num_virtual_nodes
        self.svd_pe_size = svd_pe_size
        self.num_classes = num_classes
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            **kwargs,
        )
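

# A minimal usage sketch, not part of the library: it shows the standard
# `PretrainedConfig` serialization round trip that `EGTConfig` inherits
# (`save_pretrained` / `from_pretrained` / `to_dict`). The directory name and
# the chosen hyperparameter values below are illustrative assumptions.
if __name__ == "__main__":
    # Build a small config; feat_size must be divisible by num_heads.
    config = EGTConfig(feat_size=256, num_heads=8, num_layers=4)
    assert config.feat_size % config.num_heads == 0

    # Write config.json to a local directory and read it back.
    config.save_pretrained("./egt-config-demo")
    reloaded = EGTConfig.from_pretrained("./egt-config-demo")

    # The round trip should preserve every serialized field.
    assert reloaded.to_dict() == config.to_dict()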