jeffmeloy committed on
Commit b391c98 · verified · 1 Parent(s): 03c6a9b

Update ner_merge.py

Files changed (1)
  1. ner_merge.py +403 -437
ner_merge.py CHANGED
@@ -1,437 +1,403 @@
- """
- NER (Normalized Effective Rank) quantifies dimensional utilization across layers using entropy analysis of singular value distributions. The NER calculation starts from the Singular Value Decomposition (SVD) of a weight matrix A. The singular values are normalized into a probability distribution; the entropy H of that distribution yields the Effective Rank (ERank) as 2^H, and normalizing H by the maximum possible entropy H_max produces a value between 0 and 1 that measures dimensional utilization efficiency.
-
- Run the script with:
- python mastermerge.py --config mastermerge_config.yaml (the --config argument is optional)
-
- The script loads the configuration, then processes each model by downloading it, loading its weights, normalizing each layer, and calculating NER for each layer. It then uses NER to identify the best source model for each layer, and finally builds a composite model from the highest-NER version of each layer.
-
- **License**
- Use, modify, and distribute as you see fit. Good luck with that shit.
- Copyright 2024, nobody. No rights reserved.
- """
-
- import torch
- import json
- import argparse
- import shutil
- from tqdm import tqdm
- import os
- import yaml
- from typing import Optional
- from datetime import datetime
- import matplotlib.pyplot as plt
- from torch.cuda.amp import autocast
- from huggingface_hub import snapshot_download
- from transformers import AutoModelForCausalLM
- from transformers import AutoConfig
-
-
- def download_model(model_name: str, models_dir: str) -> Optional[str]:
-     """Download model from Hugging Face Hub."""
-     local_path = os.path.join(models_dir, model_name.replace("/", "_"))
-     if not os.path.exists(local_path):
-         print(f"Downloading {model_name} to {local_path}")
-         try:
-             snapshot_download(
-                 repo_id=model_name,
-                 local_dir=local_path,
-                 local_dir_use_symlinks=False,
-                 revision="main",
-             )
-             print(f"Successfully downloaded {model_name}")
-         except Exception as e:
-             print(f"Error downloading {model_name}: {e}")
-             return None
-     else:
-         print(f"Model {model_name} already exists at {local_path}")
-
-     return local_path
-
-
- def load_model(model_path: str, device: str = "cuda") -> Optional[AutoModelForCausalLM]:
-     """Load model from local path."""
-     try:
-         return AutoModelForCausalLM.from_pretrained(
-             model_path,
-             torch_dtype=torch.bfloat16,
-             low_cpu_mem_usage=True,
-             trust_remote_code=True,
-             device_map=device,
-         )
-     except Exception as e:
-         print(f"Error loading model: {e}")
-         return None
-
-
- def calculate_normalized_effective_rank(A: torch.Tensor) -> float:
-     """Calculate the Normalized Effective Rank (NER) of a matrix."""
-     try:
-         # get the singular values
-         if A.dtype != torch.float32:
-             A = A.float()
-         if A.dim() == 1:
-             A = A.unsqueeze(0)
-         if 1 in A.shape:
-             S = A.abs().view(-1)
-         else:
-             S = torch.linalg.svdvals(A)
-         S = S[S > 1e-12]
-         if S.numel() == 0:
-             return 0.0
-
-         # normalize the singular values
-         S_sum = S.sum()
-         S /= S_sum
-
-         # calculate and return normalized effective rank
-         log_S = torch.log2(S)
-         H = -torch.dot(S, log_S)
-         H_max = torch.log2(
-             torch.tensor(float(S.numel()), dtype=torch.float32, device=S.device)
-         )
-         return (H / H_max).item() if H_max > 0 else 0.0
-     except Exception as e:
-         print(f"Error calculating NER: {e}")
-         return 0.0
-
-
- def normalize_tensor(A: torch.Tensor) -> torch.Tensor:
-     """Normalize input tensor."""
-     A_min, A_max = A.min(), A.max()
-     return (A - A_min) / max(A_max - A_min, 1e-10)
-
-
- def save_metrics_to_json(model_name: str, layer_metrics: dict, output_dir: str) -> None:
-     model_name_slug = model_name.replace("/", "-").replace("_", "-")
-     filename = os.path.join(output_dir, f"metrics_results_{model_name_slug}.json")
-     with open(filename, "w") as f:
-         json.dump(layer_metrics, f, indent=4)
-     print(f"Metrics saved to {filename}")
-
-
- def load_config(config_path: str) -> dict:
-     """Load configuration from YAML file."""
-     with open(config_path, "r") as file:
-         return yaml.safe_load(file)
-
-
- def metric_file_exists(model_name: str, metric_dir: str) -> bool:
-     """Check if metric file already exists for the given model."""
-     model_name_slug = model_name.replace("/", "-").replace("_", "-")
-     filename = os.path.join(metric_dir, f"metrics_results_{model_name_slug}.json")
-     return os.path.exists(filename)
-
-
- def load_all_metrics(config: dict) -> dict:
-     """Load all metrics from the metric directory."""
-     all_metrics = {}
-     for model_name in [config["base_model"]] + config["fine_tuned_models"]:
-         model_name_slug = model_name.replace("/", "-").replace("_", "-")
-         filename = os.path.join(
-             config["metric_dir"], f"metrics_results_{model_name_slug}.json"
-         )
-         with open(filename, "r") as f:
-             all_metrics[model_name] = json.load(f)
-     return all_metrics
-
-
- def identify_common_layers(all_metrics: dict) -> list:
-     """Identify common layers across all models."""
-     layer_sets = [set(model_metrics.keys()) for model_metrics in all_metrics.values()]
-     common_layers = set.intersection(*layer_sets)
-     return list(common_layers)
-
-
- def identify_layers(all_metrics: dict) -> list:
-     """Identify the superset of layers across all models, maintaining their relative order."""
-     superset_layers = []
-     added_layers = set()
-     for model_metrics in all_metrics.values():
-         for layer in model_metrics.keys():
-             if layer not in added_layers:
-                 superset_layers.append(layer)
-                 added_layers.add(layer)
-     return superset_layers
-
-
- def select_best_layers(common_layers: list, all_metrics: dict) -> dict:
-     """For each layer, select the model whose version has the highest NER."""
-     layer_selection = {}
-     for layer in common_layers:
-         # .get() guards against models that lack this layer, since the
-         # superset from identify_layers() may include layers missing from some models
-         best_model = max(
-             all_metrics.keys(),
-             key=lambda model: all_metrics[model].get(layer, {}).get("ner", float("-inf")),
-         )
-         layer_selection[layer] = best_model
-
-     print("Selected layers:")
-     print(json.dumps(layer_selection, indent=4))
-     return layer_selection
-
-
- def save_composite_model(
-     composite_model: AutoModelForCausalLM, layer_selection: dict, config: dict
- ) -> None:
-     """Save composite model to the output directory."""
-     date_str = datetime.now().strftime("%Y%m%d_%H%M%S")
-     output_name = f"composite_model_{date_str}"
-     output_dir = os.path.join(config["output_dir"], output_name)
-     os.makedirs(output_dir, exist_ok=True)
-     composite_model.save_pretrained(output_dir)
-     generate_merge_report(layer_selection, output_dir, config)
-
-     # Copy tokenizer files from the base model to the output directory
-     base_model_path = os.path.join(
-         config["models_dir"], config["base_model"].replace("/", "_")
-     )
-     tokenizer_files = ["tokenizer_config.json", "tokenizer.json", "vocab.json"]
-
-     for file in tokenizer_files:
-         src_path = os.path.join(base_model_path, file)
-         dst_path = os.path.join(output_dir, file)
-         if os.path.exists(src_path):
-             shutil.copy2(src_path, dst_path)
-         else:
-             print(f"Warning: {file} not found in the base model directory.")
-
-     print(f"Composite model and tokenizer files saved to: {output_dir}")
-
-
- def generate_merge_report(layer_selection: dict, output_dir: str, config: dict) -> None:
-     """Generate merge report and save to the output directory."""
-     report = {
-         "base_model": config["base_model"],
-         "fine_tuned_models": config["fine_tuned_models"],
-         "layer_selection": layer_selection,
-     }
-     report_file = os.path.join(output_dir, "merge_report.json")
-     with open(report_file, "w") as f:
-         json.dump(report, f, indent=4)
-     print(f"Merge report saved to {report_file}")
-     print(json.dumps(report, indent=4))
-
-
- def create_composite_model(
-     base_model_name: str, layer_selection: dict, config: dict
- ) -> AutoModelForCausalLM:
-     """Create composite model by merging selected layers."""
-     models_dir = config["models_dir"]
-     base_model_path = os.path.join(models_dir, base_model_name.replace("/", "_"))
-     base_model = load_model(base_model_path)
-
-     for layer_name, source_model_name in layer_selection.items():
-         print(f"Processing: {source_model_name} - {layer_name}")
-         source_model_path = os.path.join(
-             models_dir, source_model_name.replace("/", "_")
-         )
-         source_model = load_model(source_model_path, device="cpu")
-
-         layer_parts = layer_name.split(".")
-         source_layer = source_model
-         for part in layer_parts:
-             source_layer = getattr(source_layer, part)
-         source_layer = source_layer.to("cuda")
-
-         target_layer = base_model
-         for part in layer_parts[:-1]:
-             target_layer = getattr(target_layer, part)
-         setattr(target_layer, layer_parts[-1], source_layer)
-
-         print("Added layer to composite model")
-         del source_model, source_layer, part, target_layer, layer_parts
-         torch.cuda.empty_cache()
-
-     return base_model
-
-
- def get_num_layers(model_path: str) -> int:
-     """Dynamically determine the number of layers in the model."""
-     config = AutoConfig.from_pretrained(model_path)
-     if hasattr(config, "num_hidden_layers"):
-         return config.num_hidden_layers
-     elif hasattr(config, "n_layer"):
-         return config.n_layer
-     else:
-         raise ValueError("Could not determine the number of layers in the model.")
-
-
- def get_model_metrics(config: dict) -> None:
-     """Get metrics for all models in the configuration."""
-     models_dir = config["models_dir"]
-     os.makedirs(models_dir, exist_ok=True)
-     os.makedirs(config["output_dir"], exist_ok=True)
-     models = [config["base_model"]] + config["fine_tuned_models"]
-     metrics = ["ner"]
-
-     for model_name in models:
-         if metric_file_exists(model_name, config["metric_dir"]):
-             print(f"Metric file for {model_name} already exists. Skipping...")
-             continue
-
-         local_model_path = download_model(model_name, models_dir)
-         if not local_model_path:
-             print(f"Skipping failed model: {model_name}")
-             continue
-
-         layer_metrics = process_model(model_name, local_model_path, metrics, config)
-         if layer_metrics:  # process_model returns None when the model fails to load
-             save_metrics_to_json(model_name, layer_metrics, config["metric_dir"])
-
-
- @torch.inference_mode()
- def process_model(
-     model_name: str, local_model_path: str, metrics: list, config: dict
- ) -> Optional[dict]:
-     """Process a single model to calculate and save metrics."""
-     print(f"Processing model: {model_name}")
-     with autocast(enabled=True):
-         model = load_model(local_model_path)
-         if not model:
-             print(f"Failed to load model: {model_name}")
-             return None
-
-         all_layers, layer_names = collect_and_normalize_weights(model)
-         del model
-         torch.cuda.synchronize()
-         torch.cuda.empty_cache()
-
-         layer_metrics = calculate_metrics_for_layers(layer_names, all_layers, metrics)
-         del all_layers
-         torch.cuda.synchronize()
-         torch.cuda.empty_cache()
-
-         save_metrics_to_json(model_name, layer_metrics, config["metric_dir"])
-         plot_normalized_metrics(layer_metrics, model_name, config["metric_dir"])
-
-     return layer_metrics
-
-
- def collect_and_normalize_weights(
-     model: AutoModelForCausalLM,
- ) -> tuple[list[torch.Tensor], list[str]]:
-     """Collect and normalize all layers from the model (only normalize once)."""
-     all_layers = [
-         module.weight.data
-         for name, module in model.named_modules()
-         if hasattr(module, "weight")
-     ]
-
-     for i, layer in enumerate(all_layers):  # Normalize weights
-         if layer.ndim < 2:
-             layer = layer.unsqueeze(0)  # Make it at least 2D
-         layer = normalize_tensor(layer.to(torch.float32))
-         all_layers[i] = layer.to(torch.bfloat16)  # Back to bfloat16 and original device
-
-     layer_names = [
-         name for name, module in model.named_modules() if hasattr(module, "weight")
-     ]
-     return all_layers, layer_names
-
-
- def calculate_metrics_for_layers(
-     layer_names: list[str], normalized_layers: list[torch.Tensor], metrics: list[str]
- ) -> dict:
-     """Calculate metrics for each layer."""
-     layer_metrics = {}
-     with torch.no_grad():
-         for name, normalized_layer in tqdm(
-             zip(layer_names, normalized_layers), desc="Processing:"
-         ):
-             print(f" Layer: {name}")
-             layer_metrics[name] = {}
-
-             print(f"Layer {name} shape: {normalized_layer.shape}")
-             for metric in metrics:
-                 print(f"Calculating {metric} for layer {name}")
-                 try:
-                     result = calculate_normalized_effective_rank(normalized_layer)
-                 except Exception as e:
-                     print(f"Error calculating {metric} for layer {name}: {e}")
-                     result = 0.0
-                 layer_metrics[name][metric] = result
-                 print(f"{metric} for layer {name}: {result}")
-
-             torch.cuda.empty_cache()
-     return layer_metrics
-
-
- def normalize_metrics(metrics: dict) -> dict:
-     """Normalize each metric to be between 0 and 1."""
-     normalized = {metric: [] for metric in next(iter(metrics.values())).keys()}
-
-     for metric in normalized.keys():
-         values = [layer_metrics[metric] for layer_metrics in metrics.values()]
-         min_val, max_val = min(values), max(values)
-         normalized[metric] = [
-             0 if max_val == min_val else (v - min_val) / (max_val - min_val)
-             for v in values
-         ]
-     return normalized
-
-
- def plot_normalized_metrics(metrics: dict, model_name: str, output_dir: str):
-     """Plot normalized metrics for each layer and save as a PNG file."""
-     normalized = normalize_metrics(metrics)
-     layers = list(metrics.keys())
-
-     plt.figure(figsize=(10, 10))  # 960x960 pixels at 96 DPI
-     for metric, values in normalized.items():
-         plt.plot(values, label=metric)
-
-     plt.xlabel("Layers")
-     plt.ylabel("Normalized Metric Value")
-     plt.title(f"Normalized Metrics Across Layers - {model_name}")
-     plt.legend()
-
-     # Set x-axis ticks
-     num_layers = len(layers)
-     if num_layers > 20:
-         step = num_layers // 10
-         plt.xticks(range(0, num_layers, step), layers[::step], rotation=45, ha="right")
-     else:
-         plt.xticks(range(num_layers), layers, rotation=45, ha="right")
-
-     # Save the plot as a PNG file
-     plt.tight_layout()
-     model_name_slug = model_name.replace("/", "-").replace("_", "-")
-     filename = os.path.join(output_dir, f"metrics_plot_{model_name_slug}.png")
-     plt.savefig(filename, dpi=96, bbox_inches="tight")
-     plt.close()
-
-     print(f"Metrics plot saved to {filename}")
-
-
- def merge_models(config: dict) -> None:
-     """Merge models based on the given configuration."""
-     all_metrics = load_all_metrics(config)
-     layers = identify_layers(all_metrics)
-     layer_selection = select_best_layers(layers, all_metrics)
-     layer_selection = dict(sorted(layer_selection.items()))
-     composite_model = create_composite_model(
-         config["base_model"], layer_selection, config
-     )
-     save_composite_model(composite_model, layer_selection, config)
-
-
- def main(config_path: str) -> None:
-     """Main function to run the model merging process."""
-     config = load_config(config_path)
-
-     get_model_metrics(config)
-     print("Metric calculation completed.")
-
-     merge_models(config)
-     print(f"Saved composite model and merge report to: {config['output_dir']}")
-
-
- if __name__ == "__main__":
-     parser = argparse.ArgumentParser(
-         description="mastermerge: Advanced model merging tool"
-     )
-     parser.add_argument(
-         "--config",
-         type=str,
-         default="mastermerge_config.yaml",
-         help="Path to configuration file",
-     )
-     args = parser.parse_args()
-     main(args.config)
 
+ """
+ NER (Normalized Effective Rank) quantifies dimensional utilization across layers using entropy analysis of singular value distributions. The NER calculation starts from the Singular Value Decomposition (SVD) of a weight matrix A. The singular values are normalized into a probability distribution; the entropy H of that distribution yields the Effective Rank (ERank) as 2^H, and normalizing H by the maximum possible entropy H_max produces a value between 0 and 1 that measures dimensional utilization efficiency.
+
+ Run the script with:
+ python mastermerge.py --config mastermerge_config.yaml (the --config argument is optional)
+
+ The script loads the configuration, then processes each model by downloading it, loading its weights, normalizing each layer, and calculating NER for each layer. It then uses NER to identify the best source model for each layer, and finally builds a composite model from the highest-NER version of each layer.
+
+ **License**
+ Use, modify, and distribute as you see fit. Good luck with that shit.
+ Copyright 2024, nobody. No rights reserved.
+ """
+
+ import torch
+ import json
+ import argparse
+ import shutil
+ from tqdm import tqdm
+ import os
+ import yaml
+ from typing import Optional
+ from datetime import datetime
+ from torch.cuda.amp import autocast
+ from huggingface_hub import snapshot_download
+ from transformers import AutoModelForCausalLM
+ from transformers import AutoConfig
+
+
+ def download_model(model_name: str, models_dir: str) -> Optional[str]:
+     """Download model from Hugging Face Hub."""
+     local_path = os.path.join(models_dir, model_name.replace("/", "_"))
+     if not os.path.exists(local_path):
+         print(f"Downloading {model_name} to {local_path}")
+         try:
+             snapshot_download(
+                 repo_id=model_name,
+                 local_dir=local_path,
+                 local_dir_use_symlinks=False,
+                 revision="main",
+             )
+             print(f"Successfully downloaded {model_name}")
+         except Exception as e:
+             print(f"Error downloading {model_name}: {e}")
+             return None
+     else:
+         print(f"Model {model_name} already exists at {local_path}")
+
+     return local_path
+
+
+ def load_model(model_path: str, device: str = "cuda") -> Optional[AutoModelForCausalLM]:
+     """Load model from local path."""
+     try:
+         return AutoModelForCausalLM.from_pretrained(
+             model_path,
+             torch_dtype=torch.bfloat16,
+             low_cpu_mem_usage=True,
+             trust_remote_code=True,
+             device_map=device,
+         )
+     except Exception as e:
+         print(f"Error loading model: {e}")
+         return None
+
+
+ def calculate_normalized_effective_rank(A: torch.Tensor) -> float:
+     """Calculate the Normalized Effective Rank (NER) of a matrix."""
+     try:
+         # get the singular values
+         if A.dtype != torch.float32:
+             A = A.float()
+         if A.dim() == 1:
+             A = A.unsqueeze(0)
+         if 1 in A.shape:
+             S = A.abs().view(-1)
+         else:
+             S = torch.linalg.svdvals(A)
+         S = S[S > 1e-12]
+         if S.numel() == 0:
+             return 0.0
+
+         # normalize the singular values
+         S_sum = S.sum()
+         S /= S_sum
+
+         # calculate and return normalized effective rank
+         log_S = torch.log2(S)
+         H = -torch.dot(S, log_S)
+         H_max = torch.log2(
+             torch.tensor(float(S.numel()), dtype=torch.float32, device=S.device)
+         )
+         return (H / H_max).item() if H_max > 0 else 0.0
+     except Exception as e:
+         print(f"Error calculating NER: {e}")
+         return 0.0
+
+
+ def normalize_tensor(A: torch.Tensor) -> torch.Tensor:
+     """Normalize input tensor."""
+     A_min, A_max = A.min(), A.max()
+     return (A - A_min) / max(A_max - A_min, 1e-10)
+
+
+ def save_metrics_to_json(model_name: str, layer_metrics: dict, output_dir: str) -> None:
+     model_name_slug = model_name.replace("/", "-").replace("_", "-")
+     filename = os.path.join(output_dir, f"metrics_results_{model_name_slug}.json")
+     with open(filename, "w") as f:
+         json.dump(layer_metrics, f, indent=4)
+     print(f"Metrics saved to {filename}")
+
+
+ def load_config(config_path: str) -> dict:
+     """Load configuration from YAML file."""
+     with open(config_path, "r") as file:
+         return yaml.safe_load(file)
+
+
+ def metric_file_exists(model_name: str, metric_dir: str) -> bool:
+     """Check if metric file already exists for the given model."""
+     model_name_slug = model_name.replace("/", "-").replace("_", "-")
+     filename = os.path.join(metric_dir, f"metrics_results_{model_name_slug}.json")
+     return os.path.exists(filename)
+
+
+ def load_all_metrics(config: dict) -> dict:
+     """Load all metrics from the metric directory."""
+     all_metrics = {}
+     for model_name in [config["base_model"]] + config["fine_tuned_models"]:
+         model_name_slug = model_name.replace("/", "-").replace("_", "-")
+         filename = os.path.join(
+             config["metric_dir"], f"metrics_results_{model_name_slug}.json"
+         )
+         with open(filename, "r") as f:
+             all_metrics[model_name] = json.load(f)
+     return all_metrics
+
+
+ def identify_common_layers(all_metrics: dict) -> list:
+     """Identify common layers across all models."""
+     layer_sets = [set(model_metrics.keys()) for model_metrics in all_metrics.values()]
+     common_layers = set.intersection(*layer_sets)
+     return list(common_layers)
+
+
+ def identify_layers(all_metrics: dict) -> list:
+     """Identify the superset of layers across all models, maintaining their relative order."""
+     superset_layers = []
+     added_layers = set()
+     for model_metrics in all_metrics.values():
+         for layer in model_metrics.keys():
+             if layer not in added_layers:
+                 superset_layers.append(layer)
+                 added_layers.add(layer)
+     return superset_layers
+
+
+ def select_best_layers(common_layers: list, all_metrics: dict) -> dict:
+     """For each layer, select the model whose version has the highest NER."""
+     layer_selection = {}
+     for layer in common_layers:
+         # .get() guards against models that lack this layer, since the
+         # superset from identify_layers() may include layers missing from some models
+         best_model = max(
+             all_metrics.keys(),
+             key=lambda model: all_metrics[model].get(layer, {}).get("ner", float("-inf")),
+         )
+         layer_selection[layer] = best_model
+
+     print("Selected layers:")
+     print(json.dumps(layer_selection, indent=4))
+     return layer_selection
+
+
+ def save_composite_model(
+     composite_model: AutoModelForCausalLM, layer_selection: dict, config: dict
+ ) -> None:
+     """Save composite model to the output directory."""
+     date_str = datetime.now().strftime("%Y%m%d_%H%M%S")
+     output_name = f"composite_model_{date_str}"
+     output_dir = os.path.join(config["output_dir"], output_name)
+     os.makedirs(output_dir, exist_ok=True)
+     composite_model.save_pretrained(output_dir)
+     generate_merge_report(layer_selection, output_dir, config)
+
+     # Copy tokenizer files from the base model to the output directory
+     base_model_path = os.path.join(
+         config["models_dir"], config["base_model"].replace("/", "_")
+     )
+     tokenizer_files = ["tokenizer_config.json", "tokenizer.json", "vocab.json"]
+
+     for file in tokenizer_files:
+         src_path = os.path.join(base_model_path, file)
+         dst_path = os.path.join(output_dir, file)
+         if os.path.exists(src_path):
+             shutil.copy2(src_path, dst_path)
+         else:
+             print(f"Warning: {file} not found in the base model directory.")
+
+     print(f"Composite model and tokenizer files saved to: {output_dir}")
+
+
+ def generate_merge_report(layer_selection: dict, output_dir: str, config: dict) -> None:
+     """Generate merge report and save to the output directory."""
+     report = {
+         "base_model": config["base_model"],
+         "fine_tuned_models": config["fine_tuned_models"],
+         "layer_selection": layer_selection,
+     }
+     report_file = os.path.join(output_dir, "merge_report.json")
+     with open(report_file, "w") as f:
+         json.dump(report, f, indent=4)
+     print(f"Merge report saved to {report_file}")
+     print(json.dumps(report, indent=4))
+
+
+ def create_composite_model(
+     base_model_name: str, layer_selection: dict, config: dict
+ ) -> AutoModelForCausalLM:
+     """Create composite model by merging selected layers."""
+     models_dir = config["models_dir"]
+     base_model_path = os.path.join(models_dir, base_model_name.replace("/", "_"))
+     base_model = load_model(base_model_path)
+
+     for layer_name, source_model_name in layer_selection.items():
+         print(f"Processing: {source_model_name} - {layer_name}")
+         source_model_path = os.path.join(
+             models_dir, source_model_name.replace("/", "_")
+         )
+         source_model = load_model(source_model_path, device="cpu")
+
+         layer_parts = layer_name.split(".")
+         source_layer = source_model
+         for part in layer_parts:
+             source_layer = getattr(source_layer, part)
+         source_layer = source_layer.to("cuda")
+
+         target_layer = base_model
+         for part in layer_parts[:-1]:
+             target_layer = getattr(target_layer, part)
+         setattr(target_layer, layer_parts[-1], source_layer)
+
+         print("Added layer to composite model")
+         del source_model, source_layer, part, target_layer, layer_parts
+         torch.cuda.empty_cache()
+
+     return base_model
+
+
+ def get_num_layers(model_path: str) -> int:
+     """Dynamically determine the number of layers in the model."""
+     config = AutoConfig.from_pretrained(model_path)
+     if hasattr(config, "num_hidden_layers"):
+         return config.num_hidden_layers
+     elif hasattr(config, "n_layer"):
+         return config.n_layer
+     else:
+         raise ValueError("Could not determine the number of layers in the model.")
+
+
+ def get_model_metrics(config: dict) -> None:
+     """Get metrics for all models in the configuration."""
+     models_dir = config["models_dir"]
+     os.makedirs(models_dir, exist_ok=True)
+     os.makedirs(config["output_dir"], exist_ok=True)
+     models = [config["base_model"]] + config["fine_tuned_models"]
+     metrics = ["ner"]
+
+     for model_name in models:
+         if metric_file_exists(model_name, config["metric_dir"]):
+             print(f"Metric file for {model_name} already exists. Skipping...")
+             continue
+
+         local_model_path = download_model(model_name, models_dir)
+         if not local_model_path:
+             print(f"Skipping failed model: {model_name}")
+             continue
+
+         layer_metrics = process_model(model_name, local_model_path, metrics, config)
+         if layer_metrics:  # process_model returns None when the model fails to load
+             save_metrics_to_json(model_name, layer_metrics, config["metric_dir"])
+
+
+ @torch.inference_mode()
+ def process_model(
+     model_name: str, local_model_path: str, metrics: list, config: dict
+ ) -> Optional[dict]:
+     """Process a single model to calculate and save metrics."""
+     print(f"Processing model: {model_name}")
+     with autocast(enabled=True):
+         model = load_model(local_model_path)
+         if not model:
+             print(f"Failed to load model: {model_name}")
+             return None
+
+         all_layers, layer_names = collect_and_normalize_weights(model)
+         del model
+         torch.cuda.synchronize()
+         torch.cuda.empty_cache()
+
+         layer_metrics = calculate_metrics_for_layers(layer_names, all_layers, metrics)
+         del all_layers
+         torch.cuda.synchronize()
+         torch.cuda.empty_cache()
+
+         save_metrics_to_json(model_name, layer_metrics, config["metric_dir"])
+
+     return layer_metrics
+
+
+ def collect_and_normalize_weights(
+     model: AutoModelForCausalLM,
+ ) -> tuple[list[torch.Tensor], list[str]]:
+     """Collect and normalize all layers from the model (only normalize once)."""
+     all_layers = [
+         module.weight.data
+         for name, module in model.named_modules()
+         if hasattr(module, "weight")
+     ]
+
+     for i, layer in enumerate(all_layers):  # Normalize weights
+         if layer.ndim < 2:
+             layer = layer.unsqueeze(0)  # Make it at least 2D
+         layer = normalize_tensor(layer.to(torch.float32))
+         all_layers[i] = layer.to(torch.bfloat16)  # Back to bfloat16 and original device
+
+     layer_names = [
+         name for name, module in model.named_modules() if hasattr(module, "weight")
+     ]
+     return all_layers, layer_names
+
+
+ def calculate_metrics_for_layers(
+     layer_names: list[str], normalized_layers: list[torch.Tensor], metrics: list[str]
+ ) -> dict:
+     """Calculate metrics for each layer."""
+     layer_metrics = {}
+     with torch.no_grad():
+         for name, normalized_layer in tqdm(
+             zip(layer_names, normalized_layers), desc="Processing:"
+         ):
+             print(f" Layer: {name}")
+             layer_metrics[name] = {}
+
+             print(f"Layer {name} shape: {normalized_layer.shape}")
+             for metric in metrics:
+                 print(f"Calculating {metric} for layer {name}")
+                 try:
+                     result = calculate_normalized_effective_rank(normalized_layer)
+                 except Exception as e:
+                     print(f"Error calculating {metric} for layer {name}: {e}")
+                     result = 0.0
+                 layer_metrics[name][metric] = result
+                 print(f"{metric} for layer {name}: {result}")
+
+             torch.cuda.empty_cache()
+     return layer_metrics
+
+
+ def normalize_metrics(metrics: dict) -> dict:
+     """Normalize each metric to be between 0 and 1."""
+     normalized = {metric: [] for metric in next(iter(metrics.values())).keys()}
+
+     for metric in normalized.keys():
+         values = [layer_metrics[metric] for layer_metrics in metrics.values()]
+         min_val, max_val = min(values), max(values)
+         normalized[metric] = [
+             0 if max_val == min_val else (v - min_val) / (max_val - min_val)
+             for v in values
+         ]
+     return normalized
+
+
+ def merge_models(config: dict) -> None:
+     """Merge models based on the given configuration."""
+     all_metrics = load_all_metrics(config)
+     layers = identify_layers(all_metrics)
+     layer_selection = select_best_layers(layers, all_metrics)
+     layer_selection = dict(sorted(layer_selection.items()))
+     composite_model = create_composite_model(
+         config["base_model"], layer_selection, config
+     )
+     save_composite_model(composite_model, layer_selection, config)
+
+
+ def main(config_path: str) -> None:
+     """Main function to run the model merging process."""
+     config = load_config(config_path)
+
+     get_model_metrics(config)
+     print("Metric calculation completed.")
+
+     merge_models(config)
+     print(f"Saved composite model and merge report to: {config['output_dir']}")
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(
+         description="mastermerge: Advanced model merging tool"
+     )
+     parser.add_argument(
+         "--config",
+         type=str,
+         default="mastermerge_config.yaml",
+         help="Path to configuration file",
+     )
+     args = parser.parse_args()
+     main(args.config)
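
For reference, a minimal, self-contained sketch of the NER computation that calculate_normalized_effective_rank applies to each weight matrix; the 64x64 random matrix is a stand-in for a real layer, and only torch is required:

import torch

A = torch.randn(64, 64)  # stand-in for a layer's weight matrix
S = torch.linalg.svdvals(A)  # singular values of A
S = S[S > 1e-12]  # drop numerically zero values
S = S / S.sum()  # normalize into a probability distribution
H = -(S * torch.log2(S)).sum()  # entropy in bits
erank = (2.0 ** H).item()  # Effective Rank = 2^H
H_max = torch.log2(torch.tensor(float(S.numel())))  # maximum possible entropy
ner = (H / H_max).item()  # Normalized Effective Rank in [0, 1]
print(f"ERank {erank:.1f} of {S.numel()} dimensions, NER {ner:.3f}")

The merge itself is driven entirely by the YAML configuration. A hypothetical mastermerge_config.yaml covering the keys the script reads (base_model, fine_tuned_models, models_dir, metric_dir, output_dir); the repo ids are placeholders:

base_model: org/base-model
fine_tuned_models:
  - org/fine-tune-a
  - org/fine-tune-b
models_dir: ./models
metric_dir: ./metrics
output_dir: ./output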