erikhenriksson committed on
Commit 2911649 · verified · 1 Parent(s): a49a602

Upload folder using huggingface_hub
.DS_Store ADDED
Binary file (6.15 kB).
 
.gitignore ADDED
@@ -0,0 +1,207 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[codz]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py.cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # UV
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ #uv.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+ #poetry.toml
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
+ #pdm.lock
+ #pdm.toml
+ .pdm-python
+ .pdm-build/
+
+ # pixi
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
+ #pixi.lock
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
+ # in the .venv directory. It is recommended not to include this directory in version control.
+ .pixi
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .envrc
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
+
+ # Abstra
+ # Abstra is an AI-powered process automation framework.
+ # Ignore directories containing user credentials, local state, and settings.
+ # Learn more at https://abstra.io/docs
+ .abstra/
+
+ # Visual Studio Code
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
+ # you could uncomment the following to ignore the entire vscode folder
+ # .vscode/
+
+ # Ruff stuff:
+ .ruff_cache/
+
+ # PyPI configuration file
+ .pypirc
+
+ # Cursor
+ # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
+ # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
+ # refer to https://docs.cursor.com/context/ignore-files
+ .cursorignore
+ .cursorindexingignore
+
+ # Marimo
+ marimo/_static/
+ marimo/_lsp/
+ __marimo__/
config.json ADDED
@@ -0,0 +1,8 @@
+ {
+   "architectures": ["SmSubgroupClassifier"],
+   "auto_map": {
+     "AutoConfig": "configuration_sm_subgroup_classifier.SmSubgroupClassifierConfig",
+     "AutoModel": "modeling_sm_subgroup_classifier.SmSubgroupClassifier"
+   },
+   "model_type": "sm_subgroup_classifier"
+ }
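
The auto_map above wires AutoConfig and AutoModel to the two Python files in this repo, so loading goes through transformers' remote-code path. A minimal loading sketch, assuming a placeholder repo id (the commit itself does not state the final Hub path):

from transformers import AutoConfig, AutoModel

# "user/sm-subgroup-classifier" is a hypothetical repo id used only for illustration
config = AutoConfig.from_pretrained("user/sm-subgroup-classifier", trust_remote_code=True)
print(config.model_type)  # "sm_subgroup_classifier"

model = AutoModel.from_pretrained("user/sm-subgroup-classifier", trust_remote_code=True)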
configuration_sm_subgroup_classifier.py ADDED
@@ -0,0 +1,8 @@
+ from transformers import PretrainedConfig
+
+
+ class SmSubgroupClassifierConfig(PretrainedConfig):
+     model_type = "sm_subgroup_classifier"
+
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
fi_na-nb/metadata.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dcebec77f65e7a4edab907a0680e37b2ee48f4c384a7ff2dee7d00dc88f60749
+ size 76
fi_na-nb/model.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:da0a8b8d946ea6c2bb415f6f36b9cd5b08c5fda7c1faa67c44d093d2fdcb46e4
+ size 9055
fi_na-nb/scaler.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4193fa89889dce247553136aab6945fbbc389254eb8c9d203588a7fb04cdd7a1
+ size 25191
fi_na-nb/training_details.txt ADDED
@@ -0,0 +1,39 @@
+ Training Details for fi_na-nb
+ ========================================
+
+ Language: fi
+ Model Name: na-nb
+ Training Date: 2025-09-18 15:02:16
+
+ Data Summary:
+ - Total samples: 218088
+ - Training samples: 174470
+ - Test samples: 43618
+ - Embedding dimension: 1024
+
+ Classes:
+ - Number of classes: 2
+ - Class names: with_comments, general
+ - Class distribution: {'with_comments': 24862, 'general': 193226}
+
+ Cross-Validation Results:
+ - CV folds: 5
+ - CV scores: [0.9941823809250874, 0.9942396973691752, 0.9939817733707801, 0.9939817733707801, 0.9939817733707801]
+ - CV mean: 0.9941
+ - CV std: 0.0001
+ - CV confidence interval: 0.9941 ± 0.0002
+
+ Final Performance:
+ - Test accuracy: 0.9945
+
+ Model Configuration:
+ - Algorithm: Logistic Regression
+ - Regularization (C): 1.0
+ - Feature scaling: StandardScaler
+ - Random state: 42
+
+ Files:
+ - Classifier: model.pkl
+ - Scaler: scaler.pkl
+ - Metadata: metadata.pkl
+ - This file: training_details.txt
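
The recipe above (StandardScaler + LogisticRegression with C=1.0, random_state=42, 5-fold CV, roughly an 80/20 split) maps directly onto scikit-learn. A rough sketch of how the three committed artifacts could be produced follows; this illustrates the described recipe rather than reproducing the author's actual training script, and embeddings.npy, labels.npy, and the class-name order are assumptions made only for the example:

import pickle

import joblib
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler

# Hypothetical inputs: precomputed 1024-dim embeddings and integer class labels
X = np.load("embeddings.npy")
y = np.load("labels.npy")

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler().fit(X_train)
clf = LogisticRegression(C=1.0, random_state=42, max_iter=1000)

# 5-fold cross-validation on the scaled training split, then the final fit
cv_scores = cross_val_score(clf, scaler.transform(X_train), y_train, cv=5)
clf.fit(scaler.transform(X_train), y_train)
print("CV mean:", cv_scores.mean(), "Test acc:", clf.score(scaler.transform(X_test), y_test))

# Persist the three artifacts that modeling_sm_subgroup_classifier.py expects
joblib.dump(clf, "fi_na-nb/model.pkl")
joblib.dump(scaler, "fi_na-nb/scaler.pkl")
with open("fi_na-nb/metadata.pkl", "wb") as f:
    # class-name order is illustrative; the committed metadata.pkl holds the real mapping
    pickle.dump({"class_names": ["general", "with_comments"]}, f)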
modeling_sm_subgroup_classifier.py ADDED
@@ -0,0 +1,125 @@
+ import os
+ import pickle
+
+ import joblib
+ import numpy as np
+ import torch
+ from transformers import PreTrainedModel
+
+ from .configuration_sm_subgroup_classifier import SmSubgroupClassifierConfig
+
+
+ class SmSubgroupClassifier(PreTrainedModel):
+     config_class = SmSubgroupClassifierConfig
+
+     def __init__(self, config):
+         super().__init__(config)
+         self.config = config
+         self._loaded_classifiers = {}
+         self.model_dir = None
+
+     def _get_available_models(self):
+         """Discover available models by checking what directories exist"""
+         if not self.model_dir:
+             return []
+
+         available = []
+         if os.path.exists(self.model_dir):
+             for item in os.listdir(self.model_dir):
+                 item_path = os.path.join(self.model_dir, item)
+                 if os.path.isdir(item_path) and "_" in item:
+                     # Check if it has the required files
+                     required_files = ["model.pkl", "scaler.pkl", "metadata.pkl"]
+                     if all(
+                         os.path.exists(os.path.join(item_path, f))
+                         for f in required_files
+                     ):
+                         available.append(item)
+         return available
+
+     def _load_classifier(self, model_key):
+         """Load a specific classifier by model key (e.g., 'en_OP-ob')"""
+         if model_key in self._loaded_classifiers:
+             return self._loaded_classifiers[model_key]
+
+         available_models = self._get_available_models()
+         if model_key not in available_models:
+             raise ValueError(
+                 f"Model '{model_key}' not available. Available: {available_models}"
+             )
+
+         # Path to classifier
+         classifier_path = os.path.join(self.model_dir, model_key)
+
+         # Load components
+         classifier = joblib.load(os.path.join(classifier_path, "model.pkl"))
+         scaler = joblib.load(os.path.join(classifier_path, "scaler.pkl"))
+
+         with open(os.path.join(classifier_path, "metadata.pkl"), "rb") as f:
+             metadata = pickle.load(f)
+
+         classifier_info = {
+             "classifier": classifier,
+             "scaler": scaler,
+             "class_names": metadata["class_names"],
+         }
+
+         self._loaded_classifiers[model_key] = classifier_info
+         return classifier_info
+
+     def forward(self, language, model_name, embeddings):
+         """
+         Args:
+             language: Language code (en, fi, sv)
+             model_name: Model name (OP-ob, NA, etc.)
+             embeddings: Pre-computed embeddings
+         """
+         # Create model key
+         model_key = f"{language}_{model_name}"
+
+         # Convert embeddings to numpy if needed
+         if torch.is_tensor(embeddings):
+             embeddings = embeddings.detach().cpu().numpy()
+
+         if embeddings.ndim == 1:
+             embeddings = embeddings.reshape(1, -1)
+
+         # Load classifier
+         classifier_info = self._load_classifier(model_key)
+
+         # Scale and predict
+         embeddings_scaled = classifier_info["scaler"].transform(embeddings)
+         predictions = classifier_info["classifier"].predict(embeddings_scaled)
+         probabilities = classifier_info["classifier"].predict_proba(embeddings_scaled)
+
+         # Format results - just use class names and probabilities
+         results = []
+         for pred, probs in zip(predictions, probabilities):
+             predicted_class_name = classifier_info["class_names"][pred]
+
+             # Get all class probabilities
+             all_probs = {
+                 classifier_info["class_names"][i]: float(prob)
+                 for i, prob in enumerate(probs)
+             }
+
+             results.append(
+                 {
+                     "predicted_class": predicted_class_name,
+                     "confidence": float(max(probs)),
+                     "all_probabilities": all_probs,
+                 }
+             )
+
+         return {
+             "language": language,
+             "model_name": model_name,
+             "model_key": model_key,
+             "predictions": results[0] if len(results) == 1 else results,
+         }
+
+     @classmethod
+     def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
+         model = super().from_pretrained(pretrained_model_name_or_path, **kwargs)
+         model.model_dir = pretrained_model_name_or_path
+         return model
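
A hedged usage sketch for the model above: forward() takes a language code, a model name, and precomputed embeddings (1024-dimensional per the training details; the embedding model itself is not part of this commit). Because the from_pretrained override stores the given path as model_dir and scans it with os.path, the sketch downloads a local snapshot first. The repo id is a placeholder:

import torch
from huggingface_hub import snapshot_download
from transformers import AutoModel

# Hypothetical repo id; replace with the actual Hub path of this repository
local_dir = snapshot_download("user/sm-subgroup-classifier")
model = AutoModel.from_pretrained(local_dir, trust_remote_code=True)

# Stand-in for real 1024-dim document embeddings
embeddings = torch.randn(2, 1024)

out = model(language="fi", model_name="na-nb", embeddings=embeddings)
print(out["model_key"])      # "fi_na-nb"
print(out["predictions"])    # per-sample dicts with predicted_class / confidence / all_probabilities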