rbcurzon_laptop committed on
Commit
30bd972
·
1 Parent(s): 9b436d2

refactor: enhance ASR model initialization and update translation configuration

Browse files
Files changed (1) hide show
  1. app.py +18 -5
app.py CHANGED
@@ -16,7 +16,7 @@ from fastapi import FastAPI, UploadFile, File, HTTPException, Form
16
  from fastapi.responses import FileResponse
17
  from fastapi.middleware.cors import CORSMiddleware
18
  from starlette.background import BackgroundTask
19
- from transformers import pipeline, VitsModel, VitsTokenizer
20
 
21
  # External service imports
22
  from google import genai
@@ -36,12 +36,23 @@ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(
36
  async def lifespan(app: FastAPI):
37
  # Load models once at startup and store in app.state
38
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
 
39
  model_id = "rbcurzon/whisper-large-v3-turbo"
 
 
 
 
 
 
 
 
40
  app.state.pipe = pipeline(
41
  "automatic-speech-recognition",
42
- model=model_id,
43
- chunk_length_s=30,
44
- device=device
 
 
45
  )
46
  app.state.vad_model = load_silero_vad()
47
  app.state.client = genai.Client(api_key=os.environ.get("GENAI_API_KEY"))
@@ -81,6 +92,7 @@ def translate(text, srcLang, tgtLang):
81
  config=types.GenerateContentConfig(
82
  system_instruction=f"You are an expert translator. Your task is to translate from {srcLang} to {tgtLang}. You must provide ONLY the translated text. Do not include any explanations, additional commentary, or conversational language. Just the translated text.",
83
  thinking_config=types.ThinkingConfig(thinking_budget=0), # Disables thinking
 
84
  )
85
  )
86
  return response.text
@@ -116,10 +128,11 @@ async def translate_audio(
116
  "num_beams": 1,
117
  "condition_on_prev_tokens": False,
118
  "compression_ratio_threshold": 1.35,
119
- "temperature": 0.0,
120
  "logprob_threshold": -1.0,
121
  "no_speech_threshold": 0.6,
122
  "return_timestamps": True,
 
123
  }
124
 
125
  temp_file = remove_silence(file.filename)
 
16
  from fastapi.responses import FileResponse
17
  from fastapi.middleware.cors import CORSMiddleware
18
  from starlette.background import BackgroundTask
19
+ from transformers import pipeline, VitsModel, VitsTokenizer, AutoModelForSpeechSeq2Seq, AutoProcessor
20
 
21
  # External service imports
22
  from google import genai
 
36
  async def lifespan(app: FastAPI):
37
  # Load models once at startup and store in app.state
38
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
39
+ torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
40
  model_id = "rbcurzon/whisper-large-v3-turbo"
41
+
42
+ model = AutoModelForSpeechSeq2Seq.from_pretrained(
43
+ model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
44
+ )
45
+ model.to(device)
46
+
47
+ processor = AutoProcessor.from_pretrained(model_id)
48
+
49
  app.state.pipe = pipeline(
50
  "automatic-speech-recognition",
51
+ model=model,
52
+ tokenizer=processor.tokenizer,
53
+ feature_extractor=processor.feature_extractor,
54
+ torch_dtype=torch_dtype,
55
+ device=device,
56
  )
57
  app.state.vad_model = load_silero_vad()
58
  app.state.client = genai.Client(api_key=os.environ.get("GENAI_API_KEY"))
 
92
  config=types.GenerateContentConfig(
93
  system_instruction=f"You are an expert translator. Your task is to translate from {srcLang} to {tgtLang}. You must provide ONLY the translated text. Do not include any explanations, additional commentary, or conversational language. Just the translated text.",
94
  thinking_config=types.ThinkingConfig(thinking_budget=0), # Disables thinking
95
+ temperature=0.6,
96
  )
97
  )
98
  return response.text
 
128
  "num_beams": 1,
129
  "condition_on_prev_tokens": False,
130
  "compression_ratio_threshold": 1.35,
131
+ "temperature": 0.0, # reduce temperature for more deterministic output
132
  "logprob_threshold": -1.0,
133
  "no_speech_threshold": 0.6,
134
  "return_timestamps": True,
135
+ "language": "tl"
136
  }
137
 
138
  temp_file = remove_silence(file.filename)