ymgong3616 committed
Commit a1dd728 · 1 Parent(s): 92d093b

update space

Files changed (1): app.py (+21, -29)
app.py CHANGED
@@ -35,32 +35,35 @@ SAMPLE_RATE = feature_extractor.sampling_rate
 SEED = 42
 
 
-def accumulate_and_convert_to_mp3(audio_chunks, sampling_rate):
-    # Concatenate all chunks
-    accumulated_audio = np.concatenate(audio_chunks, axis=0)
-
-    # Normalize the entire audio at once
-    if np.issubdtype(accumulated_audio.dtype, np.floating):
-        max_val = np.max(np.abs(accumulated_audio))
-        accumulated_audio = (accumulated_audio / max_val) * 32767
-        accumulated_audio = accumulated_audio.astype(np.int16)
-
-    # Create an audio segment from the complete numpy array
+def numpy_to_mp3(audio_array, sampling_rate):
+    # Normalize audio_array if it's floating-point
+    if np.issubdtype(audio_array.dtype, np.floating):
+        max_val = np.max(np.abs(audio_array))
+        audio_array = (audio_array / max_val) * 32767 # Normalize to 16-bit range
+        audio_array = audio_array.astype(np.int16)
+
+    # Create an audio segment from the numpy array
     audio_segment = AudioSegment(
-        accumulated_audio.tobytes(),
+        audio_array.tobytes(),
         frame_rate=sampling_rate,
-        sample_width=accumulated_audio.dtype.itemsize,
+        sample_width=audio_array.dtype.itemsize,
         channels=1
     )
 
-    # Export to MP3 with high quality
+    # Export the audio segment to MP3 bytes - use a high bitrate to maximise quality
     mp3_io = io.BytesIO()
     audio_segment.export(mp3_io, format="mp3", bitrate="320k")
+
+    # Get the MP3 bytes
     mp3_bytes = mp3_io.getvalue()
     mp3_io.close()
 
     return mp3_bytes
 
+sampling_rate = model.audio_encoder.config.sampling_rate
+frame_rate = model.audio_encoder.config.frame_rate
+
+
 def generate_response(audio):
     gr.Info("Transcribing Audio", duration=5)
     question = client.automatic_speech_recognition(audio).text
@@ -76,7 +79,9 @@ def generate_response(audio):
 
 @spaces.GPU
 def read_response(answer):
-    play_steps_in_s = 10.0
+
+
+    play_steps_in_s = 6.0
     play_steps = int(frame_rate * play_steps_in_s)
 
     description = "Jenny speaks at an average pace with a calm delivery in a very confined sounding environment with clear audio quality."
@@ -97,23 +102,10 @@ def read_response(answer):
     set_seed(SEED)
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
-
-    # Accumulate audio chunks
-    audio_chunks = []
     start = time.time()
     for new_audio in streamer:
         print(f"Sample of length: {round(new_audio.shape[0] / sampling_rate, 2)} seconds after {time.time() - start} seconds")
-        audio_chunks.append(new_audio)
-        # Yield None for the audio to maintain the streaming interface
-        yield answer, None
-
-    # Convert the accumulated audio to MP3 at the end
-    if audio_chunks:
-        final_mp3 = accumulate_and_convert_to_mp3(audio_chunks, sampling_rate)
-        yield answer, final_mp3
-
-sampling_rate = model.audio_encoder.config.sampling_rate
-frame_rate = model.audio_encoder.config.frame_rate
+        yield answer, numpy_to_mp3(new_audio, sampling_rate=sampling_rate)
 
 
 with gr.Blocks() as block:
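
For reference, here is a minimal standalone sketch (not part of the commit) of the per-chunk conversion that read_response now performs on every array yielded by the streamer. It reuses the numpy_to_mp3 body from the diff above and feeds it a synthetic 440 Hz sine-wave chunk; the 44.1 kHz sample rate and the sine input are illustrative assumptions only, whereas in the Space the rate comes from model.audio_encoder.config.sampling_rate and the chunks come from the streamer. pydub needs an ffmpeg binary available for the MP3 export.

# Standalone sketch: convert one NumPy audio chunk to MP3 bytes with pydub.
# Assumptions: numpy and pydub are installed, ffmpeg is on PATH, and the
# 44.1 kHz sine-wave chunk stands in for a chunk produced by the streamer.
import io

import numpy as np
from pydub import AudioSegment


def numpy_to_mp3(audio_array, sampling_rate):
    # Normalize floating-point audio into the 16-bit integer range
    if np.issubdtype(audio_array.dtype, np.floating):
        max_val = np.max(np.abs(audio_array))
        audio_array = (audio_array / max_val) * 32767
        audio_array = audio_array.astype(np.int16)

    # Wrap the raw PCM bytes in a mono AudioSegment
    audio_segment = AudioSegment(
        audio_array.tobytes(),
        frame_rate=sampling_rate,
        sample_width=audio_array.dtype.itemsize,
        channels=1,
    )

    # Export to MP3 at a high bitrate and return the encoded bytes
    mp3_io = io.BytesIO()
    audio_segment.export(mp3_io, format="mp3", bitrate="320k")
    mp3_bytes = mp3_io.getvalue()
    mp3_io.close()
    return mp3_bytes


if __name__ == "__main__":
    sampling_rate = 44100  # assumed rate for this example only
    t = np.linspace(0, 1.0, sampling_rate, endpoint=False)
    chunk = 0.5 * np.sin(2 * np.pi * 440 * t).astype(np.float32)
    mp3_bytes = numpy_to_mp3(chunk, sampling_rate)
    print(f"Encoded {chunk.shape[0]} samples into {len(mp3_bytes)} MP3 bytes")

Encoding each chunk as it arrives, rather than accumulating all chunks and converting once at the end as the removed accumulate_and_convert_to_mp3 path did, lets the generator yield playable audio to the interface while generation is still running.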