...
Browse files- decode_config.yaml +58 -0
- test_prompt.json +31 -0
decode_config.yaml
ADDED
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (2024) Tsinghua University, Bytedance Ltd. and/or its affiliates
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
model:
|
16 |
+
# paths
|
17 |
+
llama_path: "DeepSeek-R1-Distill-Qwen-1.5B/"
|
18 |
+
whisper_path: "distil-large-v3/"
|
19 |
+
beats_path: "BEATs_iter3_plus_AS2M_finetuned_on_AS2M_cpt2.pt"
|
20 |
+
|
21 |
+
ckpt: "tiny_all_tasks_319.pth"
|
22 |
+
|
23 |
+
freeze_whisper: True
|
24 |
+
freeze_beats: True
|
25 |
+
|
26 |
+
# window-level Q-Former
|
27 |
+
use_speech_Qformer: True
|
28 |
+
freeze_speech_QFormer: False
|
29 |
+
window_level_Qformer: True
|
30 |
+
num_speech_query_token: 1
|
31 |
+
second_per_window: 0.333333
|
32 |
+
second_stride: 0.333333
|
33 |
+
|
34 |
+
speech_llama_proj_model: ""
|
35 |
+
freeze_speech_llama_proj: False
|
36 |
+
|
37 |
+
# LoRA
|
38 |
+
lora: True
|
39 |
+
lora_rank: 8
|
40 |
+
lora_alpha: 32
|
41 |
+
lora_dropout: 0.1
|
42 |
+
|
43 |
+
multi_prompt: True
|
44 |
+
prompt_template: "USER: {}\nASSISTANT:"
|
45 |
+
prompt_path: "prompts/train_prompt.json"
|
46 |
+
test_prompt_path: "prompts/test_prompt.json"
|
47 |
+
max_txt_len: 300
|
48 |
+
end_sym: "</s>"
|
49 |
+
|
50 |
+
generate:
|
51 |
+
max_new_tokens: 200
|
52 |
+
num_beams: 4
|
53 |
+
do_sample: False
|
54 |
+
min_length: 1
|
55 |
+
temperature: 1.0
|
56 |
+
top_p: 0.9
|
57 |
+
repetition_penalty: 1.0
|
58 |
+
length_penalty: 1.0
|
test_prompt.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"asr": "<Speech><SpeechHere></Speech> Recognize the speech and give me the transcription.",
|
3 |
+
"gender_recognition": "<Speech><SpeechHere></Speech> What is the gender of the speaker?",
|
4 |
+
"dialect_identification": "<Speech><SpeechHere></Speech> What is the dialect of the speaker?",
|
5 |
+
"asr_zh": "<Speech><SpeechHere></Speech> 请将语音中的内容写下来。",
|
6 |
+
"summarization": "<Speech><SpeechHere></Speech> Could you capture the main points of this audio in a short summary?",
|
7 |
+
"translation_ae": "<Speech><SpeechHere></Speech> Listen to the speech and translate it into English.",
|
8 |
+
"asr_de": "<Speech><SpeechHere></Speech> Hören Sie sich die Rede an und schreiben Sie ihren Inhalt auf.",
|
9 |
+
"translation_ec": "<Speech><SpeechHere></Speech> Listen to the speech and translate it into Chinese.",
|
10 |
+
"audiocaption": "<Speech><SpeechHere></Speech> Please describe the audio.",
|
11 |
+
"audiocaption_v2": "<Speech><SpeechHere></Speech> Please write down what you hear in the audio.",
|
12 |
+
"QA": "<Speech><SpeechHere></Speech> {}",
|
13 |
+
"gender_QA": "<Speech><SpeechHere></Speech> {}",
|
14 |
+
"phone_recognition": "<Speech><SpeechHere></Speech> Provide the phonetic transcription for the speech.",
|
15 |
+
"speech_query": "<Speech><SpeechHere></Speech> Please answer the question in detail.",
|
16 |
+
"emotion_recognition": "<Speech><SpeechHere></Speech> Describe the emotion of the speaker in one word.",
|
17 |
+
"lyrics_recognition": "<Speech><SpeechHere></Speech> Listen to the song and write down its content.",
|
18 |
+
"audio_speech_description": "<Speech><SpeechHere></Speech> Describe the speech and the background audio",
|
19 |
+
"speaker_verification": "<Speech><SpeechHere></Speech> Do you only hear the same person talking? Answer yes or no.",
|
20 |
+
"fluent_speech_audio": "<Speech><SpeechHere></Speech> Describe the background audio and the speech in a fluent sentence.",
|
21 |
+
"speech_separation": "<Speech><SpeechHere></Speech> Please write down what you hear each person says.",
|
22 |
+
"audio_story_telling": "<Speech><SpeechHere></Speech> Based on the audio, write a story in detail. Your story should be highly related to the audio.",
|
23 |
+
"speech_audio_query": "<Speech><SpeechHere></Speech> Please answer the speaker's question in detail based on the background sound.",
|
24 |
+
"slot_filling": "<Speech><SpeechHere></Speech> According to the speech, what is the {}?",
|
25 |
+
"music_description": "<Speech><SpeechHere></Speech> Listen to this music clip and describe the music.",
|
26 |
+
"translation_en2ja": "<Speech><SpeechHere></Speech> Listen to the speech and translate it into Japanese.",
|
27 |
+
"translation_en2de": "<Speech><SpeechHere></Speech> Listen to the speech and translate it into German.",
|
28 |
+
"speech_audio_coreasoning": "<Speech><SpeechHere></Speech> Use your strong reasoning skills to answer the speaker's question in detail based on the background sound.",
|
29 |
+
"keywords": "<Speech><SpeechHere></Speech> Give me only three keywords of the text.",
|
30 |
+
"speaker_diarization_asr": "<Speech><SpeechHere></Speech> Please recognize each speaker and transcribe their speech content."
|
31 |
+
}
|