add AIBOM
#27
by
RiccardoDav
- opened
- HKUSTAudio_Llasa-3B.json +60 -0
HKUSTAudio_Llasa-3B.json
ADDED
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"bomFormat": "CycloneDX",
|
3 |
+
"specVersion": "1.6",
|
4 |
+
"serialNumber": "urn:uuid:17cdbb99-2ed4-4bc9-9737-da2b355b4ee0",
|
5 |
+
"version": 1,
|
6 |
+
"metadata": {
|
7 |
+
"timestamp": "2025-06-05T09:39:45.028801+00:00",
|
8 |
+
"component": {
|
9 |
+
"type": "machine-learning-model",
|
10 |
+
"bom-ref": "HKUSTAudio/Llasa-3B-99325ba3-05db-50d6-b483-d40c189f187b",
|
11 |
+
"name": "HKUSTAudio/Llasa-3B",
|
12 |
+
"externalReferences": [
|
13 |
+
{
|
14 |
+
"url": "https://huggingface.co/HKUSTAudio/Llasa-3B",
|
15 |
+
"type": "documentation"
|
16 |
+
}
|
17 |
+
],
|
18 |
+
"modelCard": {
|
19 |
+
"modelParameters": {
|
20 |
+
"task": "text-to-speech",
|
21 |
+
"architectureFamily": "llama",
|
22 |
+
"modelArchitecture": "LlamaForCausalLM"
|
23 |
+
},
|
24 |
+
"properties": [
|
25 |
+
{
|
26 |
+
"name": "base_model",
|
27 |
+
"value": "meta-llama/Llama-3.2-3B-Instruct"
|
28 |
+
}
|
29 |
+
]
|
30 |
+
},
|
31 |
+
"authors": [
|
32 |
+
{
|
33 |
+
"name": "HKUSTAudio"
|
34 |
+
}
|
35 |
+
],
|
36 |
+
"licenses": [
|
37 |
+
{
|
38 |
+
"license": {
|
39 |
+
"id": "CC-BY-NC-4.0",
|
40 |
+
"url": "https://spdx.org/licenses/CC-BY-NC-4.0.html"
|
41 |
+
}
|
42 |
+
}
|
43 |
+
],
|
44 |
+
"description": "Our model, Llasa, is a text-to-speech (TTS) system that extends the text-based LLaMA (1B, 3B, and 8B) language model by incorporating speech tokens from the XCodec2 codebook, which contains 65,536 tokens. We trained Llasa on a dataset comprising 250,000 hours of Chinese-English speech data. The model is capable of generating speech **either solely from input text or by utilizing a given speech prompt.** The method is seamlessly compatible with the Llama framework, making training TTS similar to training an LLM (convert audio into single-codebook tokens and simply view it as a special language). It opens the possibility of applying existing LLM methods for compression, acceleration, and finetuning.",
|
45 |
+
"tags": [
|
46 |
+
"safetensors",
|
47 |
+
"llama",
|
48 |
+
"Text-to-Speech",
|
49 |
+
"text-to-speech",
|
50 |
+
"zh",
|
51 |
+
"en",
|
52 |
+
"arxiv:2502.04128",
|
53 |
+
"base_model:meta-llama/Llama-3.2-3B-Instruct",
|
54 |
+
"base_model:finetune:meta-llama/Llama-3.2-3B-Instruct",
|
55 |
+
"license:cc-by-nc-4.0",
|
56 |
+
"region:us"
|
57 |
+
]
|
58 |
+
}
|
59 |
+
}
|
60 |
+
}
|