3v324v23 committed
Commit 686e87a · 1 Parent(s): 7063fbc

add usage samples to readme

README.md CHANGED
@@ -1155,7 +1155,14 @@ model.tts.float()
 
 ```python
 mimick_prompt = "Please repeat each user's speech, including voice style and speech content."
-audio_input, _ = librosa.load('xxx.wav', sr=16000, mono=True)
+audio_input, _ = librosa.load('./assets/input_examples/Trump_WEF_2018_10s.mp3', sr=16000, mono=True) # load the audio to be mimicked
+
+# can also try `./assets/input_examples/cxk_original.wav`,
+# `./assets/input_examples/fast-pace.wav`,
+# `./assets/input_examples/chi-english-1.wav`
+# `./assets/input_examples/exciting-emotion.wav`
+# for different aspects of speech-centric features.
+
 msgs = [{'role': 'user', 'content': [mimick_prompt, audio_input]}]
 res = model.chat(
     msgs=msgs,
@@ -1165,7 +1172,7 @@ res = model.chat(
     use_tts_template=True,
     temperature=0.3,
     generate_audio=True,
-    output_audio_path='output.wav', # save the tts result to output_audio_path
+    output_audio_path='output_mimick.wav', # save the tts result to output_audio_path
 )
 ```
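Once the call above finishes, `res` holds the text reply and the waveform is written to `output_mimick.wav`. A minimal sketch for sanity-checking the result, assuming the `soundfile` package is available (not part of this commit):

```python
import soundfile as sf

audio, sr = sf.read('output_mimick.wav')  # TTS result written by model.chat above
print(f"generated {len(audio) / sr:.2f}s of audio at {sr} Hz")
print(res)  # the accompanying text output
```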
 
@@ -1177,7 +1184,7 @@ A general usage scenario of `MiniCPM-o-2.6` is role-playing a specific character
 
 
 ```python
-ref_audio, _ = librosa.load('./assets/voice_01.wav', sr=16000, mono=True) # load the reference audio
+ref_audio, _ = librosa.load('./assets/input_examples/icl_20.wav', sr=16000, mono=True) # load the reference audio
 sys_prompt = model.get_sys_prompt(ref_audio=ref_audio, mode='audio_roleplay', language='en')
 
 # round one
@@ -1191,7 +1198,7 @@ res = model.chat(
     use_tts_template=True,
     generate_audio=True,
     temperature=0.3,
-    output_audio_path='result.wav',
+    output_audio_path='result_roleplay_round_1.wav',
 )
 
 # round two
@@ -1206,7 +1213,7 @@ res = model.chat(
     use_tts_template=True,
     generate_audio=True,
     temperature=0.3,
-    output_audio_path='result_round_2.wav',
+    output_audio_path='result_roleplay_round_2.wav',
 )
 print(res)
 ```
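The hunks shown here skip over how the round-two `msgs` list is assembled. A plausible sketch, under the assumption that the history reuses the same role/content format and that `res` is the round-one text reply ('xxx.wav' is a placeholder):

```python
# append the model's round-one reply, then the next user audio turn
audio_round_2, _ = librosa.load('xxx.wav', sr=16000, mono=True)  # placeholder path
msgs.append({'role': 'assistant', 'content': [res]})
msgs.append({'role': 'user', 'content': [audio_round_2]})
```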
@@ -1215,11 +1222,12 @@ print(res)
 
 #### Speech Conversation as an AI Assistant
 
-An enhanced feature of `MiniCPM-o-2.6` is to act as an AI assistant, but only with limited choice of voices. In this mode, `MiniCPM-o-2.6` is **less human-like and more like a voice assistant**. But it is more instruction-following.
+An enhanced feature of `MiniCPM-o-2.6` is to act as an AI assistant, but only with a limited choice of voices. In this mode, `MiniCPM-o-2.6` is **less human-like and more like a voice assistant**, but it follows instructions more reliably. For the demo, we suggest using `assistant_default_female_voice` or `assistant_male_voice`; other voices may work, but are not as stable as the default voices.
 
 ```python
+ref_audio, _ = librosa.load('./assets/input_examples/assistant_default_female_voice.wav', sr=16000, mono=True) # or use `./assets/input_examples/assistant_male_voice.wav`
 sys_prompt = model.get_sys_prompt(ref_audio=ref_audio, mode='audio_assistant', language='en')
-user_question = {'role': 'user', 'content': [librosa.load('xxx.wav', sr=16000, mono=True)[0]]}
+user_question = {'role': 'user', 'content': [librosa.load('xxx.wav', sr=16000, mono=True)[0]]} # load the user's audio question
 
 # round one
 msgs = [sys_prompt, user_question]
@@ -1231,7 +1239,7 @@ res = model.chat(
     use_tts_template=True,
     generate_audio=True,
     temperature=0.3,
-    output_audio_path='result.wav',
+    output_audio_path='result_assistant_round_1.wav',
 )
 
 # round two
@@ -1246,7 +1254,7 @@ res = model.chat(
     use_tts_template=True,
     generate_audio=True,
     temperature=0.3,
-    output_audio_path='result_round_2.wav',
+    output_audio_path='result_assistant_round_2.wav',
 )
 print(res)
 ```
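Every example here loads audio as 16 kHz mono, and `librosa.load(..., sr=16000, mono=True)` normalizes arbitrary recordings to that format. A small sketch (the file name is a placeholder):

```python
# librosa resamples and downmixes whatever format is on disk
user_audio, sr = librosa.load('my_question.mp3', sr=16000, mono=True)
assert sr == 16000 and user_audio.ndim == 1
user_question = {'role': 'user', 'content': [user_audio]}
```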
@@ -1272,7 +1280,7 @@ res = model.chat(
     use_tts_template=True,
     generate_audio=True,
     temperature=0.3,
-    output_audio_path='result.wav',
+    output_audio_path='result_voice_creation.wav',
 )
 ```
 
@@ -1284,6 +1292,7 @@ res = model.chat(
 
 
 ```python
+ref_audio, _ = librosa.load('./assets/input_examples/icl_20.wav', sr=16000, mono=True) # load the reference audio
 sys_prompt = model.get_sys_prompt(ref_audio=ref_audio, mode='voice_cloning', language='en')
 text_prompt = f"Please read the text below."
 user_question = {'role': 'user', 'content': [text_prompt, "content that you want to read"]}
@@ -1297,7 +1306,7 @@ res = model.chat(
     use_tts_template=True,
     generate_audio=True,
     temperature=0.3,
-    output_audio_path='result.wav',
+    output_audio_path='result_voice_cloning.wav',
 )
 
 ```
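The same `sys_prompt` can be reused to clone several utterances in one voice. A sketch; the middle `model.chat` arguments are elided in the hunks above, so `tokenizer` is assumed from the model setup earlier in the README:

```python
texts = ["First sentence to read.", "Second sentence to read."]
for i, text in enumerate(texts):
    msgs = [sys_prompt, {'role': 'user', 'content': [text_prompt, text]}]
    model.chat(
        msgs=msgs,
        tokenizer=tokenizer,  # assumed from the setup section
        use_tts_template=True,
        generate_audio=True,
        temperature=0.3,
        output_audio_path=f'result_voice_cloning_{i}.wav',
    )
```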
@@ -1308,7 +1317,6 @@ res = model.chat(
 
 `MiniCPM-o-2.6` can also be used to address various audio understanding tasks, such as ASR, speaker analysis, general audio captioning, and sound scene tagging.
 
-
 For audio-to-text tasks, you can use the following prompts:
 
 - ASR with ZH(same as AST en2zh): `请仔细听这段音频片段,并将其内容逐字记录。`
@@ -1319,7 +1327,7 @@ For audio-to-text tasks, you can use the following prompts:
 
 ```python
 task_prompt = "Please listen to the audio snippet carefully and transcribe the content." + "\n" # can change to other prompts.
-audio_input, _ = librosa.load('xxx.wav', sr=16000, mono=True)
+audio_input, _ = librosa.load('./assets/input_examples/audio_understanding.mp3', sr=16000, mono=True) # load the audio to be transcribed
 
 msgs = [{'role': 'user', 'content': [task_prompt, audio_input]}]
 
@@ -1331,7 +1339,7 @@ res = model.chat(
     use_tts_template=True,
     generate_audio=True,
     temperature=0.3,
-    output_audio_path='result.wav',
+    output_audio_path='result_audio_understanding.wav',
 )
 print(res)
 ```
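The other prompts from the list plug into the same pattern; for example, Chinese ASR might look like this (a sketch with a placeholder audio path, and `tokenizer` assumed from the setup section):

```python
zh_asr_prompt = "请仔细听这段音频片段,并将其内容逐字记录。" + "\n"
audio_input, _ = librosa.load('zh_speech.wav', sr=16000, mono=True)  # placeholder path
msgs = [{'role': 'user', 'content': [zh_asr_prompt, audio_input]}]
res = model.chat(
    msgs=msgs,
    tokenizer=tokenizer,  # assumed from the setup section
    use_tts_template=True,
    generate_audio=True,
    temperature=0.3,
    output_audio_path='result_asr_zh.wav',
)
print(res)
```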
 
assets/input_examples/assistant_default_female_voice.wav ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2ee6f84892e693bd2bb478608db0c9a2459b936af3283697b006cfd969c75484
+size 224044
assets/input_examples/assistant_male_voice.wav ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e6b5eff26be104bbfc039f31f8cebcd6f329c275ccafa01234856ec1a964e999
+size 144044
assets/input_examples/icl_20.wav ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:53892ece071342958403bc5643f84169a30b89cc0fc79eb69508bfa11dd85e68
+size 618528