rbcurzon committed on
Commit
c81e356
·
verified ·
1 Parent(s): 13abff5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +33 -21
app.py CHANGED
@@ -14,6 +14,9 @@ from transformers import VitsModel, AutoTokenizer
14
  import numpy as np
15
  import scipy
16
  from IPython.display import Audio
 
 
 
17
 
18
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
19
 
@@ -37,6 +40,28 @@ app = FastAPI(
37
  description="Process and transcribe audio in real-time using Whisper"
38
  )
39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  def translate(text, srcLang, tgtLang):
41
  sys_instruct = "You are a professional translator. Generate a translation of the text and return only the result. Return only the translated text."
42
  response = client.models.generate_content(
@@ -50,8 +75,8 @@ def translate(text, srcLang, tgtLang):
50
  @app.post("/translateAudio/")
51
  async def translate_audio(
52
  file: UploadFile = File(...),
53
- srcLang: str = Form(...),
54
- tgtLang: str = Form(...)
55
  ):
56
 
57
  try:
@@ -60,14 +85,6 @@ async def translate_audio(
60
  f.write(content)
61
  print(f"Successfully uploaded {file.filename}")
62
 
63
- wav = read_audio(file.filename)
64
- speech_timestamps = get_speech_timestamps(wav, model)
65
- save_audio(
66
- "only_speech.wav",
67
- collect_chunks(speech_timestamps, wav),
68
- sampling_rate=16000
69
- )
70
-
71
  generate_kwargs = {
72
  "language": "tagalog",
73
  "return_timestamps": True,
@@ -75,24 +92,19 @@ async def translate_audio(
75
  # "initial_prompt": "The sentence may be cut off, do not make up words to fill in the rest of the sentence."
76
  }
77
 
 
 
78
  result = pipe(
79
- "only_speech.wav", # Transcribe audio
80
  batch_size=8,
81
  return_timestamps=True,
82
  generate_kwargs=generate_kwargs
83
  )
84
  print(result)
85
-
86
- timestamp = result['chunks'][0]['timestamp']
87
- end_time = timestamp[1]
88
- if end_time is None:
89
- raise Exception("The speech is difficult to understand.")
90
 
91
- translatedResult = translate(result['text'], srcLang=srcLang, tgtLang=tgtLang)
92
-
93
  result_dict = {
94
  "transcribed_text": result['text'],
95
- "translated_text": translatedResult,
96
  "srcLang": srcLang,
97
  "tgtLang": tgtLang
98
  }
@@ -109,8 +121,8 @@ async def translate_audio(
109
  file.file.close()
110
  if os.path.exists(file.filename):
111
  os.remove(file.filename)
112
- if os.path.exists("only_speech.wav"):
113
- os.remove("only_speech.wav")
114
 
115
 
116
  @app.post("/translateText/")
 
14
  import numpy as np
15
  import scipy
16
  from IPython.display import Audio
17
+ import uuid
18
+ import os
19
+ import tempfile
20
 
21
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
22
 
 
40
  description="Process and transcribe audio in real-time using Whisper"
41
  )
42
 
43
def remove_silence(filename):
    """Write the detected-speech portion of an audio file to a new file.

    The audio at ``filename`` is loaded with ``read_audio``, speech
    regions are located with ``get_speech_timestamps`` using the
    module-level ``model``, and the chunks gathered by ``collect_chunks``
    are saved to a path obtained from ``create_temp_filename``.

    Args:
        filename: Path of the audio file to process.

    Returns:
        The path the speech-only audio was saved to. This function does
        not remove that file; the caller must clean it up.
    """
    audio = read_audio(filename)
    speech_segments = get_speech_timestamps(audio, model)

    output_path = create_temp_filename()
    speech_only = collect_chunks(speech_segments, audio)

    # NOTE(review): 16 kHz is hard-coded here; presumably it matches the
    # rate `read_audio` loads at -- confirm against the VAD helper's defaults.
    save_audio(output_path, speech_only, sampling_rate=16000)
    return output_path
54
+
55
def create_temp_filename(suffix: str = ".wav") -> str:
    """Build a unique file path inside the system temporary directory.

    Only the path string is produced; no file is created on disk. The
    caller is responsible for writing to the path and for deleting the
    file afterwards.

    Args:
        suffix: Extension appended to the generated name, including the
            leading dot. Defaults to ``".wav"``, the extension this
            helper always used before the parameter existed.

    Returns:
        ``<tempdir>/<uuid4><suffix>``, where ``<tempdir>`` is the value of
        ``tempfile.gettempdir()``.
    """
    # A random UUID keeps concurrent requests from colliding on one name.
    unique_name = f"{uuid.uuid4()}{suffix}"
    return os.path.join(tempfile.gettempdir(), unique_name)
64
+
65
  def translate(text, srcLang, tgtLang):
66
  sys_instruct = "You are a professional translator. Generate a translation of the text and return only the result. Return only the translated text."
67
  response = client.models.generate_content(
 
75
  @app.post("/translateAudio/")
76
  async def translate_audio(
77
  file: UploadFile = File(...),
78
+ srcLang: str = Form("Tagalog"),
79
+ tgtLang: str = Form("Cebuano"))
80
  ):
81
 
82
  try:
 
85
  f.write(content)
86
  print(f"Successfully uploaded {file.filename}")
87
 
 
 
 
 
 
 
 
 
88
  generate_kwargs = {
89
  "language": "tagalog",
90
  "return_timestamps": True,
 
92
  # "initial_prompt": "The sentence may be cut off, do not make up words to fill in the rest of the sentence."
93
  }
94
 
95
+ temp_file = remove_silence(file.filename)
96
+
97
  result = pipe(
98
+ temp_file,
99
  batch_size=8,
100
  return_timestamps=True,
101
  generate_kwargs=generate_kwargs
102
  )
103
  print(result)
 
 
 
 
 
104
 
 
 
105
  result_dict = {
106
  "transcribed_text": result['text'],
107
+ "translated_text": translate(result['text'], srcLang=srcLang, tgtLang=tgtLang),
108
  "srcLang": srcLang,
109
  "tgtLang": tgtLang
110
  }
 
121
  file.file.close()
122
  if os.path.exists(file.filename):
123
  os.remove(file.filename)
124
+ if os.path.exists(tempfile):
125
+ os.remove(tempfile)
126
 
127
 
128
  @app.post("/translateText/")