Yazael committed on
Commit
1885313
·
verified ·
1 Parent(s): 9a0ea31

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -26
app.py CHANGED
@@ -20,9 +20,9 @@ generate_kwargs = {
20
  "num_beams": 1,
21
  "no_repeat_ngram_size": 5,
22
  "max_new_tokens": 64,
23
- "return_timestamps": True, # Necesario para obtener los tiempos
24
  }
25
 
 
26
  model_dict = {
27
  "whisper-large-v3-turbo": "openai/whisper-large-v3-turbo",
28
  "kotoba-whisper-v2.0": "kotoba-tech/kotoba-whisper-v2.0",
@@ -42,18 +42,6 @@ pipe_dict = {
42
  logger.success("Pipelines initialized!")
43
 
44
 
45
- def save_as_srt(transcription, timestamps, output_path):
46
- """Genera un archivo .srt a partir de las transcripciones y sus marcas de tiempo."""
47
- with open(output_path, "w", encoding="utf-8") as f:
48
- for idx, (text, (start, end)) in enumerate(zip(transcription, timestamps)):
49
- start_time = time.strftime('%H:%M:%S', time.gmtime(start)) + f",{int(start % 1 * 1000):03d}"
50
- end_time = time.strftime('%H:%M:%S', time.gmtime(end)) + f",{int(end % 1 * 1000):03d}"
51
-
52
- f.write(f"{idx + 1}\n")
53
- f.write(f"{start_time} --> {end_time}\n")
54
- f.write(f"{text}\n\n")
55
-
56
-
57
  @spaces.GPU
58
  def transcribe_common(audio: str, model: str) -> str:
59
  if not audio:
@@ -61,7 +49,7 @@ def transcribe_common(audio: str, model: str) -> str:
61
  filename = Path(audio).name
62
  logger.info(f"Model: {model}")
63
  logger.info(f"Audio: {filename}")
64
-
65
  try:
66
  y, sr = librosa.load(audio, mono=True, sr=16000)
67
  except Exception as e:
@@ -73,22 +61,23 @@ def transcribe_common(audio: str, model: str) -> str:
73
 
74
  duration = librosa.get_duration(y=y, sr=sr)
75
  logger.info(f"Duration: {duration:.2f}s")
 
 
 
76
 
77
  start_time = time.time()
78
- result = pipe_dict[model](y, generate_kwargs=generate_kwargs)
79
  end_time = time.time()
80
-
81
- transcription = result["text"]
82
- timestamps = result["chunks"] # Esto contiene las marcas de tiempo
 
 
 
83
 
84
- logger.success(f"Finished in {end_time - start_time:.2f}s\n{transcription}")
85
-
86
- # Guardar resultado en un archivo .srt
87
- output_path = f"{Path(filename).stem}.srt"
88
- save_as_srt([chunk["text"] for chunk in timestamps], [(chunk["timestamp_start"], chunk["timestamp_end"]) for chunk in timestamps], output_path)
89
-
90
  logger.info(f"Transcription saved to {output_path}")
91
- return transcription
 
92
 
93
 
94
  def transcribe_others(audio) -> tuple[str, str]:
@@ -115,7 +104,6 @@ generate_kwargs = {
115
  "num_beams": 1,
116
  "no_repeat_ngram_size": 5,
117
  "max_new_tokens": 64, # 結果が長いときは途中で打ち切られる
118
- "return_timestamps": True, # Para incluir tiempos
119
  }
120
  ```
121
  """
 
20
  "num_beams": 1,
21
  "no_repeat_ngram_size": 5,
22
  "max_new_tokens": 64,
 
23
  }
24
 
25
+
26
  model_dict = {
27
  "whisper-large-v3-turbo": "openai/whisper-large-v3-turbo",
28
  "kotoba-whisper-v2.0": "kotoba-tech/kotoba-whisper-v2.0",
 
42
  logger.success("Pipelines initialized!")
43
 
44
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  @spaces.GPU
46
  def transcribe_common(audio: str, model: str) -> str:
47
  if not audio:
 
49
  filename = Path(audio).name
50
  logger.info(f"Model: {model}")
51
  logger.info(f"Audio: {filename}")
52
+
53
  try:
54
  y, sr = librosa.load(audio, mono=True, sr=16000)
55
  except Exception as e:
 
61
 
62
  duration = librosa.get_duration(y=y, sr=sr)
63
  logger.info(f"Duration: {duration:.2f}s")
64
+ kwargs = generate_kwargs.copy()
65
+ if duration > 30:
66
+ kwargs["return_timestamps"] = True
67
 
68
  start_time = time.time()
69
+ result = pipe_dict[model](y, generate_kwargs=kwargs)["text"]
70
  end_time = time.time()
71
+ logger.success(f"Finished in {end_time - start_time:.2f}s\n{result}")
72
+
73
+ # Guardar resultado en un archivo .str
74
+ output_path = f"{Path(filename).stem}.str"
75
+ with open(output_path, "w", encoding="utf-8") as f:
76
+ f.write(result)
77
 
 
 
 
 
 
 
78
  logger.info(f"Transcription saved to {output_path}")
79
+ return result
80
+
81
 
82
 
83
  def transcribe_others(audio) -> tuple[str, str]:
 
104
  "num_beams": 1,
105
  "no_repeat_ngram_size": 5,
106
  "max_new_tokens": 64, # 結果が長いときは途中で打ち切られる
 
107
  }
108
  ```
109
  """