import os

import torch
import torch.nn as nn
from transformers import (
    AutoTokenizer,
    DebertaV2ForSequenceClassification,
    PretrainedConfig,
    PreTrainedModel,
)


class EnsembleDebertaConfig(PretrainedConfig):
    model_type = "ensemble-deberta"

    def __init__(self, num_models=3, **kwargs):
        super().__init__(**kwargs)
        self.num_models = num_models


class EnsembleDeberta(PreTrainedModel):
    config_class = EnsembleDebertaConfig
    base_model_prefix = "ensemble-deberta"

    def __init__(self, model_paths=None, config=None):
        # If no config is provided, initialize a default one
        if config is None:
            config = EnsembleDebertaConfig()
        super().__init__(config)

        if model_paths is None:
            raise ValueError("Please provide a list of model paths.")

        # Load each submodel into a ModuleList so its parameters are registered.
        # Additional kwargs (e.g. num_labels) can be passed to from_pretrained if needed.
        self.models = nn.ModuleList([
            DebertaV2ForSequenceClassification.from_pretrained(path)
            for path in model_paths
        ])

        # Keep the config in sync with the actual number of loaded models
        if len(self.models) != self.config.num_models:
            self.config.num_models = len(self.models)

        # Load the tokenizer from the first submodel (all submodels share the same tokenizer)
        self.tokenizer = AutoTokenizer.from_pretrained(model_paths[0])

    def forward(self, **kwargs):
        '''
        Forward pass that obtains the logits from each model, computes softmax
        probabilities and averages them (soft voting).
        '''
        # Collect softmax probabilities from each model
        probs = []
        for model in self.models:
            # Each model is assumed to return a ModelOutput with a 'logits' attribute
            outputs = model(**kwargs)
            logits = outputs.logits  # shape: (batch_size, num_labels)
            probs.append(torch.nn.functional.softmax(logits, dim=-1))

        # Stack and average the probabilities across models
        avg_probs = torch.stack(probs, dim=0).mean(dim=0)
        return avg_probs

    def save_pretrained(self, save_directory, **kwargs):
        '''
        Saves the ensemble model and its tokenizer.
        Each submodel is saved in its own subdirectory.
        '''
        os.makedirs(save_directory, exist_ok=True)

        # Save the ensemble configuration
        self.config.save_pretrained(save_directory)

        # Save each submodel to its own subdirectory, together with the tokenizer,
        # so that __init__ can reload the tokenizer from the first submodel.
        for idx, model in enumerate(self.models):
            sub_dir = os.path.join(save_directory, f"model_{idx}")
            model.save_pretrained(sub_dir)
            self.tokenizer.save_pretrained(sub_dir)

        # Save the tokenizer at the top level as well
        self.tokenizer.save_pretrained(save_directory)
        print(f"Ensemble saved to {save_directory}.")

    @classmethod
    def from_pretrained(cls, save_directory, **kwargs):
        '''
        Loads the ensemble model from a directory created by save_pretrained.
        '''
        # Load the ensemble configuration
        config = EnsembleDebertaConfig.from_pretrained(save_directory)
        num_models = config.num_models

        # Determine the paths for each submodel
        model_paths = [
            os.path.join(save_directory, f"model_{idx}")
            for idx in range(num_models)
        ]

        # Initialize the ensemble from the saved submodels
        ensemble = cls(model_paths=model_paths, config=config)
        return ensemble
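

# --- Usage sketch (illustrative, not from the original code) ---
# Assumes three fine-tuned DeBERTa-v3 sequence-classification checkpoints already
# exist on disk; the paths, example texts, and output directory below are placeholders.
if __name__ == "__main__":
    model_paths = [
        "./checkpoints/deberta_seed0",  # hypothetical checkpoint path
        "./checkpoints/deberta_seed1",  # hypothetical checkpoint path
        "./checkpoints/deberta_seed2",  # hypothetical checkpoint path
    ]
    ensemble = EnsembleDeberta(model_paths=model_paths)
    ensemble.eval()

    # Tokenize a small batch with the shared tokenizer and run soft voting
    inputs = ensemble.tokenizer(
        ["great movie", "terrible plot"],
        padding=True,
        truncation=True,
        return_tensors="pt",
    )
    with torch.no_grad():
        avg_probs = ensemble(**inputs)  # (batch_size, num_labels) averaged probabilities
    predictions = avg_probs.argmax(dim=-1)

    # Persist the whole ensemble and reload it later
    ensemble.save_pretrained("./ensemble-deberta")
    reloaded = EnsembleDeberta.from_pretrained("./ensemble-deberta")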