Update app.py
app.py
CHANGED
@@ -20,9 +20,9 @@ generate_kwargs = {
     "num_beams": 1,
     "no_repeat_ngram_size": 5,
     "max_new_tokens": 64,
-    "return_timestamps": True,  # Needed to get the timestamps
 }
 
+
 model_dict = {
     "whisper-large-v3-turbo": "openai/whisper-large-v3-turbo",
     "kotoba-whisper-v2.0": "kotoba-tech/kotoba-whisper-v2.0",
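This first hunk drops `return_timestamps` from the shared generation defaults; the hunk in `transcribe_common` below re-enables it per call, but only when the clip runs past 30 s, presumably because transformers' long-form decoding of inputs beyond Whisper's 30 s window relies on predicted timestamps. For reference, a minimal sketch of what the flag changes in the pipeline output, using the pipeline-level `return_timestamps` kwarg (the app routes it through `generate_kwargs` instead; the model name is taken from `model_dict`, the audio path is a placeholder):

```python
from transformers import pipeline

pipe = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3-turbo")

# Default: only the joined transcription string.
print(pipe("clip.wav")["text"])

# With timestamps: the result also carries per-segment "chunks",
# each shaped like {"text": ..., "timestamp": (start_s, end_s)}.
result = pipe("clip.wav", return_timestamps=True)
for chunk in result["chunks"]:
    start, end = chunk["timestamp"]
    print(f"[{start:.2f} -> {end:.2f}] {chunk['text']}")
```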
@@ -42,18 +42,6 @@ pipe_dict = {
 logger.success("Pipelines initialized!")
 
 
-def save_as_srt(transcription, timestamps, output_path):
-    """Generate an .srt file from the transcriptions and their timestamps."""
-    with open(output_path, "w", encoding="utf-8") as f:
-        for idx, (text, (start, end)) in enumerate(zip(transcription, timestamps)):
-            start_time = time.strftime('%H:%M:%S', time.gmtime(start)) + f",{int(start % 1 * 1000):03d}"
-            end_time = time.strftime('%H:%M:%S', time.gmtime(end)) + f",{int(end % 1 * 1000):03d}"
-
-            f.write(f"{idx + 1}\n")
-            f.write(f"{start_time} --> {end_time}\n")
-            f.write(f"{text}\n\n")
-
-
 @spaces.GPU
 def transcribe_common(audio: str, model: str) -> str:
     if not audio:
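The deleted `save_as_srt` was fed `chunk["timestamp_start"]`/`chunk["timestamp_end"]` at its call site (see the next hunks), keys the transformers pipeline does not produce; each chunk exposes a single `timestamp` tuple, as in the sketch above. A corrected equivalent for that shape, offered as a reference sketch rather than code from this repo:

```python
import time

def srt_time(seconds: float) -> str:
    # SRT timestamps use the form HH:MM:SS,mmm (comma before the milliseconds).
    return time.strftime("%H:%M:%S", time.gmtime(seconds)) + f",{int(seconds % 1 * 1000):03d}"

def save_as_srt(chunks, output_path):
    """Write pipeline chunks ({"text": ..., "timestamp": (start, end)}) as SubRip."""
    with open(output_path, "w", encoding="utf-8") as f:
        for idx, chunk in enumerate(chunks, start=1):
            start, end = chunk["timestamp"]
            if end is None:  # the final chunk can lack an end time
                end = start
            f.write(f"{idx}\n")
            f.write(f"{srt_time(start)} --> {srt_time(end)}\n")
            f.write(f"{chunk['text'].strip()}\n\n")
```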
@@ -61,7 +49,7 @@ def transcribe_common(audio: str, model: str) -> str:
     filename = Path(audio).name
     logger.info(f"Model: {model}")
     logger.info(f"Audio: {filename}")
-
+
     try:
         y, sr = librosa.load(audio, mono=True, sr=16000)
     except Exception as e:
@@ -73,22 +61,23 @@ def transcribe_common(audio: str, model: str) -> str:
 
     duration = librosa.get_duration(y=y, sr=sr)
     logger.info(f"Duration: {duration:.2f}s")
+    kwargs = generate_kwargs.copy()
+    if duration > 30:
+        kwargs["return_timestamps"] = True
 
     start_time = time.time()
-    result = pipe_dict[model](y, generate_kwargs=generate_kwargs)
+    result = pipe_dict[model](y, generate_kwargs=kwargs)["text"]
     end_time = time.time()
-
-
-
+    logger.success(f"Finished in {end_time - start_time:.2f}s\n{result}")
+
+    # Save the result to a .str file
+    output_path = f"{Path(filename).stem}.str"
+    with open(output_path, "w", encoding="utf-8") as f:
+        f.write(result)
 
-    logger.success(f"Finished in {end_time - start_time:.2f}s\n{transcription}")
-
-    # Save the result to an .srt file
-    output_path = f"{Path(filename).stem}.srt"
-    save_as_srt([chunk["text"] for chunk in timestamps], [(chunk["timestamp_start"], chunk["timestamp_end"]) for chunk in timestamps], output_path)
-
     logger.info(f"Transcription saved to {output_path}")
-    return
+    return result
+
 
 
 def transcribe_others(audio) -> tuple[str, str]:
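Two things are worth noting about the replacement body: indexing the pipeline result with `["text"]` discards the per-chunk times even when `return_timestamps` was enabled, and the saved file is raw text under a `.str` extension rather than a SubRip `.srt`. A minimal sketch of keeping both outputs, reusing the `save_as_srt` sketch above (model and file names are placeholders, not this app's code):

```python
from pathlib import Path
from transformers import pipeline

pipe = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3-turbo")
result = pipe("clip.wav", return_timestamps=True)  # keep the full result dict

stem = Path("clip.wav").stem
with open(f"{stem}.txt", "w", encoding="utf-8") as f:
    f.write(result["text"])                        # plain transcription

if result.get("chunks"):                           # timestamped segments
    save_as_srt(result["chunks"], f"{stem}.srt")   # writer sketched earlier
```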
@@ -115,7 +104,6 @@ generate_kwargs = {
     "num_beams": 1,
     "no_repeat_ngram_size": 5,
     "max_new_tokens": 64,  # long outputs get cut off midway
-    "return_timestamps": True,  # To include timestamps
 }
 ```
 """
|