# Mixture-of-experts merge configuration.
# The base model, all eight routed experts, and the single shared expert are
# taken from the same checkpoint (Qwen/QwQ-32B).
base_model: Qwen/QwQ-32B
gate_mode: random
architecture: qwen
dtype: bfloat16

# Routed experts: eight entries, each sourced from the same model as base_model.
experts:
  - source_model: Qwen/QwQ-32B
  - source_model: Qwen/QwQ-32B
  - source_model: Qwen/QwQ-32B
  - source_model: Qwen/QwQ-32B
  - source_model: Qwen/QwQ-32B
  - source_model: Qwen/QwQ-32B
  - source_model: Qwen/QwQ-32B
  - source_model: Qwen/QwQ-32B

# Shared expert: a single entry, also sourced from the base checkpoint.
shared_experts:
  - source_model: Qwen/QwQ-32B
    # Downweight output from shared expert to prevent overcooking the model.
    # NOTE(review): nested under the shared-expert entry because the setting
    # describes that expert's output; confirm against the consuming tool's
    # schema.
    residual_scale: 0.1