erikhenriksson committed on
Commit 2911649 · verified · 1 Parent(s): a49a602

Upload folder using huggingface_hub
.DS_Store ADDED
Binary file (6.15 kB).
 
.gitignore ADDED
@@ -0,0 +1,207 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[codz]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py.cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # UV
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ #uv.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+ #poetry.toml
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
+ #pdm.lock
+ #pdm.toml
+ .pdm-python
+ .pdm-build/
+
+ # pixi
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
+ #pixi.lock
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
+ # in the .venv directory. It is recommended not to include this directory in version control.
+ .pixi
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .envrc
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
+
+ # Abstra
+ # Abstra is an AI-powered process automation framework.
+ # Ignore directories containing user credentials, local state, and settings.
+ # Learn more at https://abstra.io/docs
+ .abstra/
+
+ # Visual Studio Code
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
+ # you could uncomment the following to ignore the entire vscode folder
+ # .vscode/
+
+ # Ruff stuff:
+ .ruff_cache/
+
+ # PyPI configuration file
+ .pypirc
+
+ # Cursor
+ # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
+ # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
+ # refer to https://docs.cursor.com/context/ignore-files
+ .cursorignore
+ .cursorindexingignore
+
+ # Marimo
+ marimo/_static/
+ marimo/_lsp/
+ __marimo__/
config.json ADDED
@@ -0,0 +1,8 @@
+ {
+   "architectures": ["SmSubgroupClassifier"],
+   "auto_map": {
+     "AutoConfig": "configuration_sm_subgroup_classifier.SmSubgroupClassifierConfig",
+     "AutoModel": "modeling_sm_subgroup_classifier.SmSubgroupClassifier"
+   },
+   "model_type": "sm_subgroup_classifier"
+ }
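
The auto_map above wires AutoConfig and AutoModel to the two Python files in this repo, so loading goes through transformers' remote-code path. A minimal loading sketch, assuming a placeholder repo id (the commit itself does not state the final Hub path):

from transformers import AutoConfig, AutoModel

# "user/sm-subgroup-classifier" is a hypothetical repo id used only for illustration
config = AutoConfig.from_pretrained("user/sm-subgroup-classifier", trust_remote_code=True)
print(config.model_type)  # "sm_subgroup_classifier"

model = AutoModel.from_pretrained("user/sm-subgroup-classifier", trust_remote_code=True)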
configuration_sm_subgroup_classifier.py ADDED
@@ -0,0 +1,8 @@
+ from transformers import PretrainedConfig
+
+
+ class SmSubgroupClassifierConfig(PretrainedConfig):
+     model_type = "sm_subgroup_classifier"
+
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
fi_na-nb/metadata.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dcebec77f65e7a4edab907a0680e37b2ee48f4c384a7ff2dee7d00dc88f60749
+ size 76
fi_na-nb/model.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:da0a8b8d946ea6c2bb415f6f36b9cd5b08c5fda7c1faa67c44d093d2fdcb46e4
+ size 9055
fi_na-nb/scaler.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4193fa89889dce247553136aab6945fbbc389254eb8c9d203588a7fb04cdd7a1
+ size 25191
fi_na-nb/training_details.txt ADDED
@@ -0,0 +1,39 @@
+ Training Details for fi_na-nb
+ ========================================
+
+ Language: fi
+ Model Name: na-nb
+ Training Date: 2025-09-18 15:02:16
+
+ Data Summary:
+ - Total samples: 218088
+ - Training samples: 174470
+ - Test samples: 43618
+ - Embedding dimension: 1024
+
+ Classes:
+ - Number of classes: 2
+ - Class names: with_comments, general
+ - Class distribution: {'with_comments': 24862, 'general': 193226}
+
+ Cross-Validation Results:
+ - CV folds: 5
+ - CV scores: [0.9941823809250874, 0.9942396973691752, 0.9939817733707801, 0.9939817733707801, 0.9939817733707801]
+ - CV mean: 0.9941
+ - CV std: 0.0001
+ - CV confidence interval: 0.9941 ± 0.0002
+
+ Final Performance:
+ - Test accuracy: 0.9945
+
+ Model Configuration:
+ - Algorithm: Logistic Regression
+ - Regularization (C): 1.0
+ - Feature scaling: StandardScaler
+ - Random state: 42
+
+ Files:
+ - Classifier: model.pkl
+ - Scaler: scaler.pkl
+ - Metadata: metadata.pkl
+ - This file: training_details.txt
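
The recipe above (StandardScaler + LogisticRegression with C=1.0, random_state=42, 5-fold CV, roughly an 80/20 split) maps directly onto scikit-learn. A rough sketch of how the three committed artifacts could be produced follows; this illustrates the described recipe rather than reproducing the author's actual training script, and embeddings.npy, labels.npy, and the class-name order are assumptions made only for the example:

import pickle

import joblib
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler

# Hypothetical inputs: precomputed 1024-dim embeddings and integer class labels
X = np.load("embeddings.npy")
y = np.load("labels.npy")

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler().fit(X_train)
clf = LogisticRegression(C=1.0, random_state=42, max_iter=1000)

# 5-fold cross-validation on the scaled training split, then the final fit
cv_scores = cross_val_score(clf, scaler.transform(X_train), y_train, cv=5)
clf.fit(scaler.transform(X_train), y_train)
print("CV mean:", cv_scores.mean(), "Test acc:", clf.score(scaler.transform(X_test), y_test))

# Persist the three artifacts that modeling_sm_subgroup_classifier.py expects
joblib.dump(clf, "fi_na-nb/model.pkl")
joblib.dump(scaler, "fi_na-nb/scaler.pkl")
with open("fi_na-nb/metadata.pkl", "wb") as f:
    # class-name order is illustrative; the committed metadata.pkl holds the real mapping
    pickle.dump({"class_names": ["general", "with_comments"]}, f)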
modeling_sm_subgroup_classifier.py ADDED
@@ -0,0 +1,125 @@
+ import os
+ import pickle
+
+ import joblib
+ import numpy as np
+ import torch
+ from transformers import PreTrainedModel
+
+ from .configuration_sm_subgroup_classifier import SmSubgroupClassifierConfig
+
+
+ class SmSubgroupClassifier(PreTrainedModel):
+     config_class = SmSubgroupClassifierConfig
+
+     def __init__(self, config):
+         super().__init__(config)
+         self.config = config
+         self._loaded_classifiers = {}
+         self.model_dir = None
+
+     def _get_available_models(self):
+         """Discover available models by checking what directories exist"""
+         if not self.model_dir:
+             return []
+
+         available = []
+         if os.path.exists(self.model_dir):
+             for item in os.listdir(self.model_dir):
+                 item_path = os.path.join(self.model_dir, item)
+                 if os.path.isdir(item_path) and "_" in item:
+                     # Check if it has the required files
+                     required_files = ["model.pkl", "scaler.pkl", "metadata.pkl"]
+                     if all(
+                         os.path.exists(os.path.join(item_path, f))
+                         for f in required_files
+                     ):
+                         available.append(item)
+         return available
+
+     def _load_classifier(self, model_key):
+         """Load a specific classifier by model key (e.g., 'en_OP-ob')"""
+         if model_key in self._loaded_classifiers:
+             return self._loaded_classifiers[model_key]
+
+         available_models = self._get_available_models()
+         if model_key not in available_models:
+             raise ValueError(
+                 f"Model '{model_key}' not available. Available: {available_models}"
+             )
+
+         # Path to classifier
+         classifier_path = os.path.join(self.model_dir, model_key)
+
+         # Load components
+         classifier = joblib.load(os.path.join(classifier_path, "model.pkl"))
+         scaler = joblib.load(os.path.join(classifier_path, "scaler.pkl"))
+
+         with open(os.path.join(classifier_path, "metadata.pkl"), "rb") as f:
+             metadata = pickle.load(f)
+
+         classifier_info = {
+             "classifier": classifier,
+             "scaler": scaler,
+             "class_names": metadata["class_names"],
+         }
+
+         self._loaded_classifiers[model_key] = classifier_info
+         return classifier_info
+
+     def forward(self, language, model_name, embeddings):
+         """
+         Args:
+             language: Language code (en, fi, sv)
+             model_name: Model name (OP-ob, NA, etc.)
+             embeddings: Pre-computed embeddings
+         """
+         # Create model key
+         model_key = f"{language}_{model_name}"
+
+         # Convert embeddings to numpy if needed
+         if torch.is_tensor(embeddings):
+             embeddings = embeddings.detach().cpu().numpy()
+
+         if embeddings.ndim == 1:
+             embeddings = embeddings.reshape(1, -1)
+
+         # Load classifier
+         classifier_info = self._load_classifier(model_key)
+
+         # Scale and predict
+         embeddings_scaled = classifier_info["scaler"].transform(embeddings)
+         predictions = classifier_info["classifier"].predict(embeddings_scaled)
+         probabilities = classifier_info["classifier"].predict_proba(embeddings_scaled)
+
+         # Format results - just use class names and probabilities
+         results = []
+         for pred, probs in zip(predictions, probabilities):
+             predicted_class_name = classifier_info["class_names"][pred]
+
+             # Get all class probabilities
+             all_probs = {
+                 classifier_info["class_names"][i]: float(prob)
+                 for i, prob in enumerate(probs)
+             }
+
+             results.append(
+                 {
+                     "predicted_class": predicted_class_name,
+                     "confidence": float(max(probs)),
+                     "all_probabilities": all_probs,
+                 }
+             )
+
+         return {
+             "language": language,
+             "model_name": model_name,
+             "model_key": model_key,
+             "predictions": results[0] if len(results) == 1 else results,
+         }
+
+     @classmethod
+     def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
+         model = super().from_pretrained(pretrained_model_name_or_path, **kwargs)
+         model.model_dir = pretrained_model_name_or_path
+         return model
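
A hedged usage sketch for the model above: forward() takes a language code, a model name, and precomputed embeddings (1024-dimensional per the training details; the embedding model itself is not part of this commit). Because the from_pretrained override stores the given path as model_dir and scans it with os.path, the sketch downloads a local snapshot first. The repo id is a placeholder:

import torch
from huggingface_hub import snapshot_download
from transformers import AutoModel

# Hypothetical repo id; replace with the actual Hub path of this repository
local_dir = snapshot_download("user/sm-subgroup-classifier")
model = AutoModel.from_pretrained(local_dir, trust_remote_code=True)

# Stand-in for real 1024-dim document embeddings
embeddings = torch.randn(2, 1024)

out = model(language="fi", model_name="na-nb", embeddings=embeddings)
print(out["model_key"])      # "fi_na-nb"
print(out["predictions"])    # per-sample dicts with predicted_class / confidence / all_probabilities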