Files changed (1) hide show
  1. HKUSTAudio_Llasa-3B.json +60 -0
HKUSTAudio_Llasa-3B.json ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bomFormat": "CycloneDX",
3
+ "specVersion": "1.6",
4
+ "serialNumber": "urn:uuid:17cdbb99-2ed4-4bc9-9737-da2b355b4ee0",
5
+ "version": 1,
6
+ "metadata": {
7
+ "timestamp": "2025-06-05T09:39:45.028801+00:00",
8
+ "component": {
9
+ "type": "machine-learning-model",
10
+ "bom-ref": "HKUSTAudio/Llasa-3B-99325ba3-05db-50d6-b483-d40c189f187b",
11
+ "name": "HKUSTAudio/Llasa-3B",
12
+ "externalReferences": [
13
+ {
14
+ "url": "https://huggingface.co/HKUSTAudio/Llasa-3B",
15
+ "type": "documentation"
16
+ }
17
+ ],
18
+ "modelCard": {
19
+ "modelParameters": {
20
+ "task": "text-to-speech",
21
+ "architectureFamily": "llama",
22
+ "modelArchitecture": "LlamaForCausalLM"
23
+ },
24
+ "properties": [
25
+ {
26
+ "name": "base_model",
27
+ "value": "meta-llama/Llama-3.2-3B-Instruct"
28
+ }
29
+ ]
30
+ },
31
+ "authors": [
32
+ {
33
+ "name": "HKUSTAudio"
34
+ }
35
+ ],
36
+ "licenses": [
37
+ {
38
+ "license": {
39
+ "id": "CC-BY-NC-4.0",
40
+ "url": "https://spdx.org/licenses/CC-BY-NC-4.0.html"
41
+ }
42
+ }
43
+ ],
44
+ "description": "Our model, Llasa, is a text-to-speech (TTS) system that extends the text-based LLaMA (1B, 3B, and 8B) language model by incorporating speech tokens from the XCodec2 codebook, which contains 65,536 tokens. We trained Llasa on a dataset comprising 250,000 hours of Chinese-English speech data. The model is capable of generating speech **either solely from input text or by utilizing a given speech prompt.** The method is seamlessly compatible with the Llama framework, making training TTS similar to training an LLM (convert audio into single-codebook tokens and simply view them as a special language). It opens the possibility of applying existing LLM methods for compression, acceleration, and finetuning.",
45
+ "tags": [
46
+ "safetensors",
47
+ "llama",
48
+ "Text-to-Speech",
49
+ "text-to-speech",
50
+ "zh",
51
+ "en",
52
+ "arxiv:2502.04128",
53
+ "base_model:meta-llama/Llama-3.2-3B-Instruct",
54
+ "base_model:finetune:meta-llama/Llama-3.2-3B-Instruct",
55
+ "license:cc-by-nc-4.0",
56
+ "region:us"
57
+ ]
58
+ }
59
+ }
60
+ }