add AIBOM
#27
by
RiccardoDav
- opened
- HKUSTAudio_Llasa-3B.json +60 -0
HKUSTAudio_Llasa-3B.json
ADDED
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"bomFormat": "CycloneDX",
|
3 |
+
"specVersion": "1.6",
|
4 |
+
"serialNumber": "urn:uuid:17cdbb99-2ed4-4bc9-9737-da2b355b4ee0",
|
5 |
+
"version": 1,
|
6 |
+
"metadata": {
|
7 |
+
"timestamp": "2025-06-05T09:39:45.028801+00:00",
|
8 |
+
"component": {
|
9 |
+
"type": "machine-learning-model",
|
10 |
+
"bom-ref": "HKUSTAudio/Llasa-3B-99325ba3-05db-50d6-b483-d40c189f187b",
|
11 |
+
"name": "HKUSTAudio/Llasa-3B",
|
12 |
+
"externalReferences": [
|
13 |
+
{
|
14 |
+
"url": "https://huggingface.co/HKUSTAudio/Llasa-3B",
|
15 |
+
"type": "documentation"
|
16 |
+
}
|
17 |
+
],
|
18 |
+
"modelCard": {
|
19 |
+
"modelParameters": {
|
20 |
+
"task": "text-to-speech",
|
21 |
+
"architectureFamily": "llama",
|
22 |
+
"modelArchitecture": "LlamaForCausalLM"
|
23 |
+
},
|
24 |
+
"properties": [
|
25 |
+
{
|
26 |
+
"name": "base_model",
|
27 |
+
"value": "meta-llama/Llama-3.2-3B-Instruct"
|
28 |
+
}
|
29 |
+
]
|
30 |
+
},
|
31 |
+
"authors": [
|
32 |
+
{
|
33 |
+
"name": "HKUSTAudio"
|
34 |
+
}
|
35 |
+
],
|
36 |
+
"licenses": [
|
37 |
+
{
|
38 |
+
"license": {
|
39 |
+
"id": "CC-BY-NC-4.0",
|
40 |
+
"url": "https://spdx.org/licenses/CC-BY-NC-4.0.html"
|
41 |
+
}
|
42 |
+
}
|
43 |
+
],
|
44 |
+
"description": "Our model, Llasa, is a text-to-speech (TTS) system that extends the text-based LLaMA (1B, 3B, and 8B) language model by incorporating speech tokens from the XCodec2 codebook, which contains 65,536 tokens. We trained Llasa on a dataset comprising 250,000 hours of Chinese-English speech data. The model is capable of generating speech **either solely from input text or by utilizing a given speech prompt.** The method is seamlessly compatible with the Llama framework, making training TTS similar to training an LLM (convert audio into single-codebook tokens and simply view it as a special language). It opens the possibility of applying existing LLM methods for compression, acceleration, and finetuning.",
|
45 |
+
"tags": [
|
46 |
+
"safetensors",
|
47 |
+
"llama",
|
48 |
+
"Text-to-Speech",
|
49 |
+
"text-to-speech",
|
50 |
+
"zh",
|
51 |
+
"en",
|
52 |
+
"arxiv:2502.04128",
|
53 |
+
"base_model:meta-llama/Llama-3.2-3B-Instruct",
|
54 |
+
"base_model:finetune:meta-llama/Llama-3.2-3B-Instruct",
|
55 |
+
"license:cc-by-nc-4.0",
|
56 |
+
"region:us"
|
57 |
+
]
|
58 |
+
}
|
59 |
+
}
|
60 |
+
}
|