vumichien committed on
Commit
cd09ca8
·
1 Parent(s): 0f2bf45

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +58 -2
utils.py CHANGED
@@ -1,15 +1,19 @@
 
1
  from io import BytesIO
2
  import base64
3
  from PIL import Image
4
  import cv2
5
  import numpy as np
6
- from gtts import gTTS
 
 
7
 
8
  def tts(text: str, language="ja", encode=False) -> object:
9
  """Converts text into autoplay html.
10
  Args:
11
  text (str): generated answer of bot
12
  language (str): language of text
 
13
  Returns:
14
  html: autoplay object
15
  """
@@ -25,6 +29,25 @@ def tts(text: str, language="ja", encode=False) -> object:
25
  return "temp.mp3"
26
 
27
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
def read_image_file(file) -> Image.Image:
    """Open an image from an in-memory bytes payload.

    Args:
        file: bytes-like object holding the encoded image data; it is
            wrapped in a BytesIO before being handed to PIL.

    Returns:
        Image.Image: the image object returned by ``Image.open``.
    """
    buffer = BytesIO(file)
    return Image.open(buffer)
@@ -54,4 +77,37 @@ def base64_to_pil(img_str):
54
def get_hist(image):
    """Compute a flattened, normalized 3-channel histogram of an image.

    Args:
        image: image convertible with ``np.array``; channels 0, 1 and 2 are
            read, so it presumably must have at least three channels
            (TODO confirm against callers).

    Returns:
        1-D array of 8 * 8 * 8 = 512 histogram values, normalized with
        ``cv2.normalize`` using its default settings.
    """
    pixels = np.array(image)
    channels = [0, 1, 2]
    bins_per_channel = [8, 8, 8]
    value_ranges = [0, 256, 0, 256, 0, 256]
    histogram = cv2.calcHist([pixels], channels, None, bins_per_channel, value_ranges)
    # normalize writes into (and returns) the same array that was passed as dst.
    normalized = cv2.normalize(histogram, histogram)
    return normalized.flatten()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from gtts import gTTS
2
  from io import BytesIO
3
  import base64
4
  from PIL import Image
5
  import cv2
6
  import numpy as np
7
+ import subprocess
8
+ from speech_recognition import AudioFile, Recognizer
9
+
10
 
11
  def tts(text: str, language="ja", encode=False) -> object:
12
  """Converts text into autoplay html.
13
  Args:
14
  text (str): generated answer of bot
15
  language (str): language of text
16
+ encode (bool): if True, return base64 encoded string
17
  Returns:
18
  html: autoplay object
19
  """
 
29
  return "temp.mp3"
30
 
31
 
32
def stt(audio: object, language='ja') -> str:
    """Transcribe recorded speech into text.

    Args:
        audio: recording of the user's speech, passed directly to
            ``AudioFile`` (presumably a file path or file-like object;
            confirm against callers).
        language (str): language code forwarded to the recognizer.

    Returns:
        str: the transcription returned by ``Recognizer.recognize_google``.
    """
    recognizer = Recognizer()
    with AudioFile(audio) as audio_source:
        # Load the whole recording into memory before sending it off.
        recording = recognizer.record(audio_source)
        # Transcription is delegated to Google's speech-to-text API.
        return recognizer.recognize_google(recording, language=language)
49
+
50
+
51
def read_image_file(file) -> Image.Image:
    """Open an image from an in-memory bytes payload.

    Args:
        file: bytes-like object holding the encoded image data; it is
            wrapped in a BytesIO before being handed to PIL.

    Returns:
        Image.Image: the image object returned by ``Image.open``.
    """
    buffer = BytesIO(file)
    return Image.open(buffer)
 
77
def get_hist(image):
    """Compute a flattened, normalized 3-channel histogram of an image.

    Args:
        image: image convertible with ``np.array``; channels 0, 1 and 2 are
            read, so it presumably must have at least three channels
            (TODO confirm against callers).

    Returns:
        1-D array of 8 * 8 * 8 = 512 histogram values, normalized with
        ``cv2.normalize`` using its default settings.
    """
    pixels = np.array(image)
    channels = [0, 1, 2]
    bins_per_channel = [8, 8, 8]
    value_ranges = [0, 256, 0, 256, 0, 256]
    histogram = cv2.calcHist([pixels], channels, None, bins_per_channel, value_ranges)
    # normalize writes into (and returns) the same array that was passed as dst.
    normalized = cv2.normalize(histogram, histogram)
    return normalized.flatten()
81
+
82
+
83
def ffmpeg_read(bpayload: bytes, sampling_rate: int) -> np.ndarray:
    """Decode an in-memory audio payload to mono float32 samples via ffmpeg.

    The payload is piped to an ``ffmpeg`` subprocess on stdin, and the raw
    32-bit little-endian float samples are read back from stdout.

    Args:
        bpayload (bytes): encoded audio file content.
        sampling_rate (int): target sampling rate passed to ffmpeg's ``-ar``.

    Returns:
        np.ndarray: 1-D float32 array of samples (single channel). The array
        is a read-only view over the bytes produced by ffmpeg.

    Raises:
        ValueError: if the ``ffmpeg`` executable cannot be found, or if
            ffmpeg produced no samples for the given payload.
    """
    ffmpeg_command = [
        "ffmpeg",
        "-i",
        "pipe:0",
        "-ac",
        "1",  # downmix to a single channel
        "-ar",
        str(sampling_rate),
        "-f",
        "f32le",  # raw float32 little-endian, matches the np.float32 read below
        "-hide_banner",
        "-loglevel",
        "quiet",
        "pipe:1",
    ]

    try:
        ffmpeg_process = subprocess.Popen(
            ffmpeg_command, stdin=subprocess.PIPE, stdout=subprocess.PIPE
        )
    except FileNotFoundError as error:
        # Chain the original error so the missing-executable cause stays visible.
        raise ValueError(
            "ffmpeg was not found but is required to load audio files from filename"
        ) from error

    # The context manager guarantees the pipes are closed and the child is
    # reaped even if communicate() raises.
    with ffmpeg_process:
        out_bytes, _ = ffmpeg_process.communicate(bpayload)

    audio = np.frombuffer(out_bytes, np.float32)
    if audio.shape[0] == 0:
        # ffmpeg runs with "-loglevel quiet", so an empty result is the only
        # signal available here that nothing could be decoded.
        raise ValueError("Malformed soundfile")
    return audio