Naphat Sornwichai committed
Commit b4c6511 · 1 Parent(s): 995e28f

update major files

Files changed (1)
  1. app.py +35 -55
app.py CHANGED
@@ -1,12 +1,11 @@
 import gradio as gr
 import torch
-from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor
+from transformers import pipeline
 import yt_dlp
 from openai import OpenAI
 import os
 import json
 import torchaudio
-import torchaudio.transforms as T
 import time
 
 # --- 1. Model & Pipeline Initialization ---
@@ -21,14 +20,16 @@ model_id = "nectec/Pathumma-whisper-th-medium"
 
 print(f"Using device: {device} with dtype: {torch_dtype}")
 
-# Load the model and processor directly
-# We will use the model's .generate() method for long-form transcription
-model = AutoModelForSpeechSeq2Seq.from_pretrained(
-    model_id, dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
+# Initialize the ASR pipeline, which is more robust for handling inputs
+pipe = pipeline(
+    task="automatic-speech-recognition",
+    model=model_id,
+    dtype=torch_dtype,
+    device=device,
 )
-model.to(device)
 
-processor = AutoProcessor.from_pretrained(model_id)
+# Set the language and task for the pipeline
+pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(language="th", task="transcribe")
 
 
 print("Transcription model loaded successfully.")
@@ -59,10 +60,11 @@ def download_youtube_audio(url: str) -> str:
 
 
 # --- 3. Core Logic ---
-def transcribe_and_summarize(audio_file: str, youtube_url: str, progress=gr.Progress()):
+def transcribe_and_summarize(audio_file: str, youtube_url: str):
     """
     Main function to process audio, transcribe, and summarize.
     This is a generator function to yield status updates and logs to the UI.
+    No longer uses gr.Progress, shows loading state in the output component itself.
     """
     log_history = ""
     def log(message):
@@ -71,8 +73,8 @@ def transcribe_and_summarize(audio_file: str, youtube_url: str, progress=gr.Prog
         log_history += f"[{timestamp}] {message}\n"
         return log_history
 
-    progress(0, desc="Starting...")
-    yield log("Process started."), "", "", "Starting..."
+    loading_message = "⏳ Please wait, your article is being generated..."
+    yield log("Process started."), "", loading_message
 
     # Step 1: Get API Key and validate inputs
    api_key = os.getenv('TYPHOON_API')
@@ -84,55 +86,34 @@ def transcribe_and_summarize(audio_file: str, youtube_url: str, progress=gr.Prog
     # Step 2: Determine audio source and get file path
     filepath = ""
     if youtube_url:
-        progress(0.1, desc="Downloading Audio...")
-        yield log("YouTube link detected. Starting download."), "", "", "Downloading Audio..."
+        yield log("YouTube link detected. Starting download."), "", loading_message
         try:
             filepath = download_youtube_audio(youtube_url)
-            yield log(f"Audio downloaded successfully to '{filepath}'."), "", "", "Download Complete"
+            yield log(f"Audio downloaded successfully to '{filepath}'."), "", loading_message
         except Exception as e:
-            yield log(f"Error downloading from YouTube: {e}"), "", "", f"Error: {e}"
+            yield log(f"Error downloading from YouTube: {e}"), "", ""
             return
     else:
         filepath = audio_file
-        yield log(f"Processing uploaded file: '{filepath}'."), "", "", "Processing File..."
+        yield log(f"Processing uploaded file: '{filepath}'."), "", loading_message
 
 
-    # Step 3: Transcribe audio using the model's generate method for long-form audio
-    progress(0.3, desc="Transcribing Audio...")
-    yield log("Beginning audio transcription..."), "", "", "Transcribing Audio..."
+    # Step 3: Transcribe audio using the pipeline for robustness
+    yield log("Beginning audio transcription... This may take a while for long audio."), "", loading_message
     try:
-        # Load audio file using torchaudio
-        waveform, sr = torchaudio.load(filepath)
-
-        # Resample to 16kHz if necessary, as Whisper expects this rate
-        if sr != 16000:
-            yield log(f"Original sample rate is {sr}Hz. Resampling to 16000Hz."), "", "", "Resampling..."
-            resampler = T.Resample(orig_freq=sr, new_freq=16000)
-            waveform = resampler(waveform)
-
-        # Process the audio waveform to get input features
-        input_features = processor(
-            waveform.squeeze().numpy(),
-            return_tensors="pt",
-            sampling_rate=16000
-        ).input_features.to(device, dtype=torch_dtype)
-
-        # Generate token IDs from the input features, passing task and language directly
-        predicted_ids = model.generate(input_features, language="th", task="transcribe")
-
-        # Decode the token IDs to text
-        transcribed_text = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
-        yield log("Transcription complete."), transcribed_text, "", "Transcription Complete"
+        # The pipeline handles resampling, chunking, and batching automatically
+        result = pipe(filepath, chunk_length_s=30, batch_size=8, return_timestamps=False)
+        transcribed_text = result["text"]
+        yield log("Transcription complete."), transcribed_text, loading_message
 
     except Exception as e:
         raise gr.Error(f"An error occurred during transcription: {str(e)}")
 
 
     # Step 4: Summarize with Typhoon LLM
-    progress(0.8, desc="Generating Summary...")
-    yield log("Sending transcription to Typhoon LLM for summarization."), transcribed_text, "", "Generating Summary..."
+    yield log("Sending transcription to Typhoon LLM for summarization."), transcribed_text, loading_message
     if not transcribed_text or not transcribed_text.strip():
-        yield log("Transcription is empty. Aborting summarization."), "", "Could not generate summary because the transcription is empty.", "Aborted"
+        yield log("Transcription is empty. Aborting summarization."), "", "Could not generate summary because the transcription is empty."
         return
 
     # Initialize OpenAI client for Typhoon
@@ -166,7 +147,7 @@ The JSON object must have the following structure:
             temperature=0.7
         )
         summary_json_string = response.choices[0].message.content
-        yield log("Received summary from Typhoon LLM. Parsing JSON."), transcribed_text, "", "Parsing Summary..."
+        yield log("Received summary from Typhoon LLM. Parsing JSON."), transcribed_text, loading_message
 
         # Parse the JSON and format it as Markdown
         try:
@@ -182,14 +163,15 @@ The JSON object must have the following structure:
 
             # Build the blog post in Markdown format
             summary_markdown = f"# {title}\n\n"
-            summary_markdown += f"{key_takeaway}\n\n"
+            summary_markdown += f"<p>{key_takeaway}</p>\n\n"
             if main_ideas:
                 summary_markdown += "## Key Features & Main Ideas\n\n"
+                summary_markdown += "<ul>\n"
                 for idea in main_ideas:
-                    summary_markdown += f"- {idea}\n"
-                summary_markdown += "\n"
-            summary_markdown += f"## Conclusion\n\n{conclusion}"
-            yield log("Successfully parsed and formatted summary."), transcribed_text, summary_markdown, "Formatting Complete"
+                    summary_markdown += f"  <li>{idea}</li>\n"
+                summary_markdown += "</ul>\n\n"
+            summary_markdown += f"## Conclusion\n\n<p>{conclusion}</p>"
+            yield log("Successfully parsed and formatted summary."), transcribed_text, summary_markdown
 
         except (json.JSONDecodeError, AttributeError) as e:
             error_message = f"Failed to parse the summary from the AI. Raw response: {summary_json_string}"
@@ -199,8 +181,7 @@ The JSON object must have the following structure:
         raise gr.Error(f"Could not connect to the Typhoon API. Please check your API key. Error: {str(e)}")
 
     # Step 5: Return final results
-    progress(1.0, desc="Done!")
-    yield log("Process finished successfully."), transcribed_text, summary_markdown, "Done!"
+    yield log("Process finished successfully."), transcribed_text, summary_markdown
 
 # --- 4. Gradio UI ---
 # Custom CSS for a beautiful, blog-like output.
@@ -270,8 +251,7 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="sky"), cs
             )
 
             submit_button = gr.Button("🚀 Generate Blog Post", variant="primary")
-            status_output = gr.Textbox(label="Status", interactive=False, lines=1)
-            with gr.Accordion("📝 View Process Log", open=False):
+            with gr.Accordion("📝 View Process Log", open=True):
                 log_output = gr.Textbox(label="Log", interactive=False, lines=10)
 
         with gr.Column(scale=2):
@@ -285,7 +265,7 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="sky"), cs
     submit_button.click(
         fn=transcribe_and_summarize,
        inputs=[audio_file_input, youtube_url_input],
-        outputs=[log_output, transcription_output, blog_summary_output, status_output]
+        outputs=[log_output, transcription_output, blog_summary_output]
     )
 
 if __name__ == "__main__":
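
For reference, a minimal standalone sketch of the chunked transcription path this commit switches to. It mirrors the model ID and pipeline settings from app.py above; "sample.wav" is a hypothetical local file, and the snippet assumes transformers and torch are installed.

import torch
from transformers import pipeline

device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# Same model and configuration as app.py after this commit
pipe = pipeline(
    task="automatic-speech-recognition",
    model="nectec/Pathumma-whisper-th-medium",
    dtype=torch_dtype,
    device=device,
)
pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(
    language="th", task="transcribe"
)

# chunk_length_s splits long audio into 30-second windows and batch_size runs
# several windows per forward pass; resampling, feature extraction, and
# decoding all happen inside the pipeline ("sample.wav" is hypothetical)
result = pipe("sample.wav", chunk_length_s=30, batch_size=8, return_timestamps=False)
print(result["text"])

This is why the commit can drop torchaudio.transforms and the manual load/resample/generate/decode sequence: the pipeline performs those steps internally for long-form audio.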