import os
import pickle

import joblib
import numpy as np
import torch
from transformers import PreTrainedModel

from .configuration_sm_subgroup_classifier import SmSubgroupClassifierConfig


class SmSubgroupClassifier(PreTrainedModel):
    """Hugging Face wrapper around per-(language, model) scikit-learn
    classifiers that predict subgroups from pre-computed embeddings."""

    config_class = SmSubgroupClassifierConfig

    def __init__(self, config):
        super().__init__(config)
        self.config = config
        self._loaded_classifiers = {}  # lazy cache: model_key -> classifier info
        self.model_dir = None

    def _get_available_models(self):
        """Discover available models by checking which subdirectories exist."""
        if not self.model_dir:
            return []

        available = []
        if os.path.exists(self.model_dir):
            for item in os.listdir(self.model_dir):
                item_path = os.path.join(self.model_dir, item)
                if os.path.isdir(item_path) and "_" in item:
                    # Only count directories that contain all required files.
                    required_files = ["model.pkl", "scaler.pkl", "metadata.pkl"]
                    if all(
                        os.path.exists(os.path.join(item_path, f))
                        for f in required_files
                    ):
                        available.append(item)
        return available

    def _load_classifier(self, model_key):
        """Load a specific classifier by model key (e.g., 'en_OP-ob')."""
        if model_key in self._loaded_classifiers:
            return self._loaded_classifiers[model_key]

        available_models = self._get_available_models()
        if model_key not in available_models:
            raise ValueError(
                f"Model '{model_key}' not available. Available: {available_models}"
            )

        # Path to this classifier's directory
        classifier_path = os.path.join(self.model_dir, model_key)

        # Load the three components: estimator, feature scaler, and metadata
        classifier = joblib.load(os.path.join(classifier_path, "model.pkl"))
        scaler = joblib.load(os.path.join(classifier_path, "scaler.pkl"))
        with open(os.path.join(classifier_path, "metadata.pkl"), "rb") as f:
            metadata = pickle.load(f)

        classifier_info = {
            "classifier": classifier,
            "scaler": scaler,
            "class_names": metadata["class_names"],
        }
        self._loaded_classifiers[model_key] = classifier_info
        return classifier_info

    def forward(self, language, model_name, embeddings):
        """
        Args:
            language: Language code (en, fi, sv)
            model_name: Model name (OP-ob, NA, etc.)
            embeddings: Pre-computed embeddings
        """
        # Build the key used to locate the classifier directory
        model_key = f"{language}_{model_name}"

        # Convert embeddings to a 2D numpy array if needed
        if torch.is_tensor(embeddings):
            embeddings = embeddings.detach().cpu().numpy()
        else:
            embeddings = np.asarray(embeddings)
        if embeddings.ndim == 1:
            embeddings = embeddings.reshape(1, -1)

        # Load (or fetch from cache) the requested classifier
        classifier_info = self._load_classifier(model_key)

        # Scale features, then predict labels and per-class probabilities
        embeddings_scaled = classifier_info["scaler"].transform(embeddings)
        predictions = classifier_info["classifier"].predict(embeddings_scaled)
        probabilities = classifier_info["classifier"].predict_proba(embeddings_scaled)

        # Format results using class names and per-class probabilities
        results = []
        for pred, probs in zip(predictions, probabilities):
            predicted_class_name = classifier_info["class_names"][pred]
            # Map every class name to its probability
            all_probs = {
                classifier_info["class_names"][i]: float(prob)
                for i, prob in enumerate(probs)
            }
            results.append(
                {
                    "predicted_class": predicted_class_name,
                    "confidence": float(max(probs)),
                    "all_probabilities": all_probs,
                }
            )

        return {
            "language": language,
            "model_name": model_name,
            "model_key": model_key,
            "predictions": results[0] if len(results) == 1 else results,
        }

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
        model = super().from_pretrained(pretrained_model_name_or_path, **kwargs)
        # Remember where the checkpoint lives so the per-key classifier
        # directories can be discovered and loaded lazily.
        model.model_dir = pretrained_model_name_or_path
        return model
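

# Minimal usage sketch (not part of the module's API): assumes a local
# checkout directory containing per-key subdirectories such as
# "en_OP-ob/{model.pkl,scaler.pkl,metadata.pkl}". The path
# "./sm_subgroup_classifier" and the 768-dim embedding are illustrative
# assumptions; substitute your real checkpoint path and embedding size.
if __name__ == "__main__":
    model = SmSubgroupClassifier.from_pretrained("./sm_subgroup_classifier")
    dummy_embedding = torch.randn(768)  # stand-in for a real precomputed embedding
    output = model(language="en", model_name="OP-ob", embeddings=dummy_embedding)
    # For a single input, "predictions" is one result dict, not a list
    print(output["predictions"]["predicted_class"],
          output["predictions"]["confidence"])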