Spaces:
Running
Running
rbcurzon_laptop
committed on
Commit
·
30bd972
1
Parent(s):
9b436d2
refactor: enhance ASR model initialization and update translation configuration
Browse files
app.py
CHANGED
@@ -16,7 +16,7 @@ from fastapi import FastAPI, UploadFile, File, HTTPException, Form
|
|
16 |
from fastapi.responses import FileResponse
|
17 |
from fastapi.middleware.cors import CORSMiddleware
|
18 |
from starlette.background import BackgroundTask
|
19 |
-
from transformers import pipeline, VitsModel, VitsTokenizer
|
20 |
|
21 |
# External service imports
|
22 |
from google import genai
|
@@ -36,12 +36,23 @@ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(
|
|
36 |
async def lifespan(app: FastAPI):
|
37 |
# Load models once at startup and store in app.state
|
38 |
device = "cuda:0" if torch.cuda.is_available() else "cpu"
|
|
|
39 |
model_id = "rbcurzon/whisper-large-v3-turbo"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
app.state.pipe = pipeline(
|
41 |
"automatic-speech-recognition",
|
42 |
-
model=
|
43 |
-
|
44 |
-
|
|
|
|
|
45 |
)
|
46 |
app.state.vad_model = load_silero_vad()
|
47 |
app.state.client = genai.Client(api_key=os.environ.get("GENAI_API_KEY"))
|
@@ -81,6 +92,7 @@ def translate(text, srcLang, tgtLang):
|
|
81 |
config=types.GenerateContentConfig(
|
82 |
system_instruction=f"You are an expert translator. Your task is to translate from {srcLang} to {tgtLang}. You must provide ONLY the translated text. Do not include any explanations, additional commentary, or conversational language. Just the translated text.",
|
83 |
thinking_config=types.ThinkingConfig(thinking_budget=0), # Disables thinking
|
|
|
84 |
)
|
85 |
)
|
86 |
return response.text
|
@@ -116,10 +128,11 @@ async def translate_audio(
|
|
116 |
"num_beams": 1,
|
117 |
"condition_on_prev_tokens": False,
|
118 |
"compression_ratio_threshold": 1.35,
|
119 |
-
"temperature": 0.0,
|
120 |
"logprob_threshold": -1.0,
|
121 |
"no_speech_threshold": 0.6,
|
122 |
"return_timestamps": True,
|
|
|
123 |
}
|
124 |
|
125 |
temp_file = remove_silence(file.filename)
|
|
|
16 |
from fastapi.responses import FileResponse
|
17 |
from fastapi.middleware.cors import CORSMiddleware
|
18 |
from starlette.background import BackgroundTask
|
19 |
+
from transformers import pipeline, VitsModel, VitsTokenizer, AutoModelForSpeechSeq2Seq, AutoProcessor
|
20 |
|
21 |
# External service imports
|
22 |
from google import genai
|
|
|
36 |
async def lifespan(app: FastAPI):
|
37 |
# Load models once at startup and store in app.state
|
38 |
device = "cuda:0" if torch.cuda.is_available() else "cpu"
|
39 |
+
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
|
40 |
model_id = "rbcurzon/whisper-large-v3-turbo"
|
41 |
+
|
42 |
+
model = AutoModelForSpeechSeq2Seq.from_pretrained(
|
43 |
+
model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
|
44 |
+
)
|
45 |
+
model.to(device)
|
46 |
+
|
47 |
+
processor = AutoProcessor.from_pretrained(model_id)
|
48 |
+
|
49 |
app.state.pipe = pipeline(
|
50 |
"automatic-speech-recognition",
|
51 |
+
model=model,
|
52 |
+
tokenizer=processor.tokenizer,
|
53 |
+
feature_extractor=processor.feature_extractor,
|
54 |
+
torch_dtype=torch_dtype,
|
55 |
+
device=device,
|
56 |
)
|
57 |
app.state.vad_model = load_silero_vad()
|
58 |
app.state.client = genai.Client(api_key=os.environ.get("GENAI_API_KEY"))
|
|
|
92 |
config=types.GenerateContentConfig(
|
93 |
system_instruction=f"You are an expert translator. Your task is to translate from {srcLang} to {tgtLang}. You must provide ONLY the translated text. Do not include any explanations, additional commentary, or conversational language. Just the translated text.",
|
94 |
thinking_config=types.ThinkingConfig(thinking_budget=0), # Disables thinking
|
95 |
+
temperature=0.6,
|
96 |
)
|
97 |
)
|
98 |
return response.text
|
|
|
128 |
"num_beams": 1,
|
129 |
"condition_on_prev_tokens": False,
|
130 |
"compression_ratio_threshold": 1.35,
|
131 |
+
"temperature": 0.0, # reduce temperature for more deterministic output
|
132 |
"logprob_threshold": -1.0,
|
133 |
"no_speech_threshold": 0.6,
|
134 |
"return_timestamps": True,
|
135 |
+
"language": "tl"
|
136 |
}
|
137 |
|
138 |
temp_file = remove_silence(file.filename)
|