Naphat Sornwichai committed
Commit 81889f9 · 1 Parent(s): b4c6511

update major files

Files changed (3):
1. .gitignore +2 -1
2. app.py +143 -190
3. test.ipynb +74 -0
.gitignore CHANGED
@@ -1,3 +1,4 @@
  .venv
  __pycache__
- downloaded_audio.mp3
+ *.mp3
+ *.wav
app.py CHANGED
@@ -1,52 +1,36 @@
  import gradio as gr
  import torch
- from transformers import pipeline
+ from faster_whisper import WhisperModel
  import yt_dlp
  from openai import OpenAI
  import os
  import json
- import torchaudio
  import time
+ import uuid

- # --- 1. Model & Pipeline Initialization ---
- # Setup device and data type for PyTorch
- print("Initializing transcription model...")
- # Updated device selection logic for CUDA, Apple MPS, and CPU
- device = "cuda:0" if torch.cuda.is_available() else "mps" if hasattr(torch.backends, "mps") and torch.backends.mps.is_available() else "cpu"
- torch_dtype = torch.float16 if torch.cuda.is_available() or (hasattr(torch.backends, "mps") and torch.backends.mps.is_available()) else torch.float32
-
- # Switched to the medium model as requested
- model_id = "nectec/Pathumma-whisper-th-medium"
-
- print(f"Using device: {device} with dtype: {torch_dtype}")
-
- # Initialize the ASR pipeline, which is more robust for handling inputs
- pipe = pipeline(
-     task="automatic-speech-recognition",
-     model=model_id,
-     dtype=torch_dtype,
-     device=device,
- )
-
- # Set the language and task for the pipeline
- pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(language="th", task="transcribe")
-
+ # --- 1. Model Initialization (Efficient: Done Once at Startup) ---
+ print("Initializing transcription model (faster-whisper)...")
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ if device == "cuda":
+     compute_type = "float16"
+     print("CUDA detected. Using GPU with compute_type: 'float16'")
+ else:
+     compute_type = "int8"
+     print("No CUDA device found. Using CPU with compute_type: 'int8'")
+
+ model_size = "large-v3"
+ model = WhisperModel(model_size, device=device, compute_type=compute_type)
  print("Transcription model loaded successfully.")

  # --- 2. Helper Functions ---
  def download_youtube_audio(url: str) -> str:
-     """
-     Downloads audio from a YouTube URL and saves it as an mp3 file.
-     Returns the path to the downloaded file.
-     """
-     output_template = 'downloaded_audio.%(ext)s'
+     """Downloads audio from a YouTube URL and saves it as an MP3 file."""
+     unique_id = uuid.uuid4()
+     output_template = f'{unique_id}.%(ext)s'
+     final_filepath = f'{unique_id}.mp3'
      ydl_opts = {
          'format': 'bestaudio/best',
-         'postprocessors': [{
-             'key': 'FFmpegExtractAudio',
-             'preferredcodec': 'mp3',
-             'preferredquality': '192',
-         }],
+         'postprocessors': [{'key': 'FFmpegExtractAudio', 'preferredcodec': 'mp3', 'preferredquality': '192'}],
          'outtmpl': output_template,
          'quiet': True,
          'overwrite': True,
@@ -54,106 +38,92 @@ def download_youtube_audio(url: str) -> str:
      try:
          with yt_dlp.YoutubeDL(ydl_opts) as ydl:
              ydl.download([url])
-             return 'downloaded_audio.mp3'
+             return final_filepath
      except Exception as e:
-         raise gr.Error(f"Failed to download audio from YouTube. Please check the link. Error: {str(e)}")
+         raise gr.Error(f"Failed to download audio from YouTube. Error: {str(e)}")

- # --- 3. Core Logic ---
  def transcribe_and_summarize(audio_file: str, youtube_url: str):
-     """
-     Main function to process audio, transcribe, and summarize.
-     This is a generator function to yield status updates and logs to the UI.
-     No longer uses gr.Progress, shows loading state in the output component itself.
-     """
+     """The main processing pipeline: download, transcribe (with streaming), and summarize."""
      log_history = ""
      def log(message):
          nonlocal log_history
-         timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
+         timestamp = time.strftime("%H:%M:%S")  # Use shorter timestamp
          log_history += f"[{timestamp}] {message}\n"
          return log_history

-     loading_message = "⏳ Please wait, your article is being generated..."
-     yield log("Process started."), "", loading_message
+     loading_message = "⏳ Generating summary..."
+     yield log("Process started."), "", ""

-     # Step 1: Get API Key and validate inputs
      api_key = os.getenv('TYPHOON_API')
      if not api_key:
-         raise gr.Error("TYPHOON_API environment variable not set. Please set it before running the app.")
+         error_msg = "TYPHOON_API environment variable not set. Cannot summarize."
+         yield log(error_msg), "", gr.Markdown(f"## Error\n{error_msg}")
+         return
+
      if audio_file is None and not youtube_url:
          raise gr.Error("Please upload an audio file or provide a YouTube link.")

-     # Step 2: Determine audio source and get file path
      filepath = ""
-     if youtube_url:
-         yield log("YouTube link detected. Starting download."), "", loading_message
-         try:
-             filepath = download_youtube_audio(youtube_url)
-             yield log(f"Audio downloaded successfully to '{filepath}'."), "", loading_message
-         except Exception as e:
-             yield log(f"Error downloading from YouTube: {e}"), "", ""
-             return
-     else:
-         filepath = audio_file
-         yield log(f"Processing uploaded file: '{filepath}'."), "", loading_message
-
-     # Step 3: Transcribe audio using the pipeline for robustness
-     yield log("Beginning audio transcription... This may take a while for long audio."), "", loading_message
+     is_downloaded = False
      try:
-         # The pipeline handles resampling, chunking, and batching automatically
-         result = pipe(filepath, chunk_length_s=30, batch_size=8, return_timestamps=False)
-         transcribed_text = result["text"]
-         yield log("Transcription complete."), transcribed_text, loading_message
-
-     except Exception as e:
-         raise gr.Error(f"An error occurred during transcription: {str(e)}")
-
-     # Step 4: Summarize with Typhoon LLM
-     yield log("Sending transcription to Typhoon LLM for summarization."), transcribed_text, loading_message
-     if not transcribed_text or not transcribed_text.strip():
-         yield log("Transcription is empty. Aborting summarization."), "", "Could not generate summary because the transcription is empty."
-         return
-
-     # Initialize OpenAI client for Typhoon
-     client = OpenAI(
-         api_key=api_key,
-         base_url="https://api.opentyphoon.ai/v1"
-     )
-
-     system_prompt = """You are a professional editor and content creator. Your task is to take a raw transcript and reformat it into a beautiful, easy-to-read blog post.
- You MUST reply ONLY with a valid JSON object. Do not add any text before or after the JSON.
+         if youtube_url:
+             yield log("Downloading YouTube audio..."), "", ""
+             filepath = download_youtube_audio(youtube_url)
+             is_downloaded = True
+             yield log(f"Downloaded to {filepath}"), "", ""
+         else:
+             filepath = audio_file
+
+         yield log("Transcription started..."), "", ""
+         segments, info = model.transcribe(filepath, beam_size=5)
+         detected_lang = info.language
+         yield log(f"Detected language '{detected_lang}' with probability {info.language_probability:.2f}"), "", ""
+
+         transcribed_text = ""
+         for segment in segments:
+             line = f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text.strip()}"
+             transcribed_text += segment.text + " "
+             yield log(line), transcribed_text, ""
+
+         yield log("Transcription complete."), transcribed_text, ""
+         yield log("Sending to AI for summarization..."), transcribed_text, loading_message
+
+         client = OpenAI(api_key=api_key, base_url="https://api.opentyphoon.ai/v1")
+         system_prompt = f"""You are an automated system that converts transcripts into a blog post.
+ Your ONLY function is to output a valid JSON object.
+ Do NOT write any explanations, apologies, or introductory text.
+ The response MUST start with a `{{` and end with a `}}`.

  The JSON object must have the following structure:
- {
-     "title": "A catchy and relevant title for the blog post in Thai.",
-     "key_takeaway": "A single paragraph summarizing the most important point of the entire content in Thai.",
+ {{
+     "title": "A catchy and relevant title for the blog post in {detected_lang}.",
+     "key_takeaway": "A single paragraph summarizing the most important point of the entire content in {detected_lang}.",
      "main_ideas": [
-         "A key point or feature, written as a string in Thai.",
-         "Another key point or feature, written as a string in Thai.",
+         "A key point or feature, written as a string in {detected_lang}.",
+         "Another key point or feature...",
          "And so on..."
      ],
-     "conclusion": "A concluding paragraph that wraps up the main ideas in Thai."
- }"""
-
-     try:
+     "conclusion": "A concluding paragraph that wraps up the main ideas in {detected_lang}."
+ }}"""
+
          response = client.chat.completions.create(
              model="typhoon-v2.1-12b-instruct",
-             messages=[
-                 {"role": "system", "content": system_prompt},
-                 {"role": "user", "content": f"Please summarize and restructure the following transcript into the specified JSON format:\n\n---\n\n{transcribed_text}"}
-             ],
+             messages=[{"role": "system", "content": system_prompt}, {"role": "user", "content": transcribed_text}],
              max_tokens=2048,
              temperature=0.7
          )
+
          summary_json_string = response.choices[0].message.content
-         yield log("Received summary from Typhoon LLM. Parsing JSON."), transcribed_text, loading_message

-         # Parse the JSON and format it as Markdown
+         # --- THIS IS THE FIX ---
+         # Clean up the string to remove markdown fences if the AI included them
+         if summary_json_string.strip().startswith("```json"):
+             summary_json_string = summary_json_string.strip()[7:-4].strip()
+         # --- END OF FIX ---
+
          try:
-             # Clean potential markdown code blocks from the response
-             if summary_json_string.strip().startswith("```json"):
-                 summary_json_string = summary_json_string.strip()[7:-4]
+             if not summary_json_string or not summary_json_string.strip():
+                 raise json.JSONDecodeError("Empty response from API", summary_json_string, 0)

              data = json.loads(summary_json_string)
              title = data.get("title", "Title Not Found")
@@ -161,113 +131,96 @@ The JSON object must have the following structure:
              main_ideas = data.get("main_ideas", [])
              conclusion = data.get("conclusion", "")

-             # Build the blog post in Markdown format
-             summary_markdown = f"# {title}\n\n"
-             summary_markdown += f"<p>{key_takeaway}</p>\n\n"
-             if main_ideas:
-                 summary_markdown += "## Key Features & Main Ideas\n\n"
-                 summary_markdown += "<ul>\n"
-                 for idea in main_ideas:
-                     summary_markdown += f"    <li>{idea}</li>\n"
-                 summary_markdown += "</ul>\n\n"
-             summary_markdown += f"## Conclusion\n\n<p>{conclusion}</p>"
-             yield log("Successfully parsed and formatted summary."), transcribed_text, summary_markdown
-
-         except (json.JSONDecodeError, AttributeError) as e:
-             error_message = f"Failed to parse the summary from the AI. Raw response: {summary_json_string}"
-             raise gr.Error(error_message)
+             summary_markdown = f"# {title}\n\n<p>{key_takeaway}</p>\n\n## Key Ideas\n\n<ul>"
+             for idea in main_ideas:
+                 summary_markdown += f"<li>{idea}</li>"
+             summary_markdown += f"</ul>\n\n## Conclusion\n\n<p>{conclusion}</p>"
+
+             yield log("Summarization complete."), transcribed_text, summary_markdown
+
+         except json.JSONDecodeError:
+             error_log_message = "ERROR: Failed to decode JSON from AI response."
+             error_display_message = f"## Summarization Failed\n**The AI did not return a valid JSON article.**\n\n**Raw AI Response:**\n```\n{summary_json_string}\n```"
+             yield log(error_log_message), transcribed_text, gr.Markdown(error_display_message)

      except Exception as e:
-         raise gr.Error(f"Could not connect to the Typhoon API. Please check your API key. Error: {str(e)}")
-
-     # Step 5: Return final results
-     yield log("Process finished successfully."), transcribed_text, summary_markdown
-
- # --- 4. Gradio UI ---
- # Custom CSS for a beautiful, blog-like output.
+         yield log(f"An unexpected error occurred: {str(e)}"), "", f"## Error\nAn unexpected error occurred: {str(e)}"
+     finally:
+         if is_downloaded and filepath and os.path.exists(filepath):
+             print(f"Cleaning up temporary file: {filepath}")
+             os.remove(filepath)
+
+ def update_video_preview(url):
+     """Parses a YouTube URL to find the video ID, then returns an HTML iframe embed."""
+     if not url:
+         return gr.update(value=None, visible=False)
+
+     video_id = None
+     try:
+         if "youtube.com/shorts/" in url:
+             video_id = url.split("/shorts/")[1].split("?")[0]
+         elif "watch?v=" in url:
+             video_id = url.split("watch?v=")[1].split("&")[0]
+         elif "youtu.be/" in url:
+             video_id = url.split("youtu.be/")[1].split("?")[0]
+     except IndexError:
+         return gr.update(value=None, visible=False)
+
+     if video_id:
+         iframe_html = f'<iframe width="100%" height="315" src="https://www.youtube.com/embed/{video_id}" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>'
+         return gr.update(value=iframe_html, visible=True)
+     else:
+         return gr.update(value=None, visible=False)

+ # --- 3. Gradio UI Layout ---
  css = """
- @import url('https://fonts.googleapis.com/css2?family=Sarabun:wght@400;700&display=swap');
- .blog-output {
-     font-family: 'Sarabun', sans-serif;
-     line-height: 1.8;
-     max-width: 800px;
-     margin: auto;
-     padding: 2rem;
-     border-radius: 12px;
-     background-color: #ffffff;
-     border: 1px solid #e5e7eb;
- }
- .blog-output h1 {
-     font-size: 2.2em;
-     font-weight: 700;
-     border-bottom: 2px solid #f3f4f6;
-     padding-bottom: 15px;
-     margin-bottom: 25px;
-     color: #111827;
- }
- .blog-output h2 {
-     font-size: 1.6em;
-     font-weight: 700;
-     margin-top: 40px;
-     margin-bottom: 20px;
-     color: #1f2937;
- }
- .blog-output p {
-     font-size: 1.1em;
-     margin-bottom: 20px;
-     color: #374151;
- }
- .blog-output ul {
-     padding-left: 25px;
-     list-style-type: disc;
- }
- .blog-output li {
-     margin-bottom: 12px;
-     padding-left: 5px;
- }
+ @import url('https://fonts.googleapis.com/css2?family=Sarabun:wght@400;700&display=swap');
+ .blog-output { font-family: 'Sarabun', sans-serif; line-height: 1.8; max-width: 800px; margin: auto; padding: 2rem; border-radius: 12px; background-color: #ffffff; border: 1px solid #e5e7eb; }
+ .blog-output h1 { font-size: 2.2em; font-weight: 700; border-bottom: 2px solid #f3f4f6; padding-bottom: 15px; margin-bottom: 25px; color: #111827; }
+ .blog-output h2 { font-size: 1.6em; font-weight: 700; margin-top: 40px; margin-bottom: 20px; color: #1f2937; }
+ .blog-output p { font-size: 1.1em; margin-bottom: 20px; color: #374151; }
+ .blog-output ul { padding-left: 25px; list-style-type: disc; }
+ .blog-output li { margin-bottom: 12px; padding-left: 5px; }
  """
-
  with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="sky"), css=css) as demo:
-     gr.Markdown(
-         """
-         # 🎙️ Audio to Blog Summarizer ✒️
-         Upload an audio file (MP3, WAV) or paste a YouTube link to transcribe it to Thai text and summarize the content into a beautiful, blog-style article using AI from NECTEC and OpenTyphoon.
-         """
-     )
-
+     gr.Markdown("# 🎙️ Audio to Blog Summarizer ✒️")
      with gr.Row():
          with gr.Column(scale=1):
              with gr.Tabs():
-                 with gr.TabItem("⬆️ Upload Audio File"):
-                     audio_file_input = gr.Audio(
-                         label="Upload MP3 or WAV file",
-                         type="filepath",
-                         sources=["upload"]
-                     )
-                 with gr.TabItem("🔗 Paste YouTube Link"):
-                     youtube_url_input = gr.Textbox(
-                         label="Paste YouTube link here",
-                         placeholder="e.g., https://www.youtube.com/watch?v=..."
-                     )
-
+                 with gr.TabItem("⬆️ Upload Audio"):
+                     audio_file_input = gr.Audio(label="Upload Audio File", type="filepath")
+                 with gr.TabItem("🔗 YouTube Link"):
+                     youtube_url_input = gr.Textbox(label="YouTube URL", placeholder="Paste a YouTube link here...")
+
              submit_button = gr.Button("🚀 Generate Blog Post", variant="primary")
+             video_preview = gr.HTML(visible=False)
+
              with gr.Accordion("📝 View Process Log", open=True):
                  log_output = gr.Textbox(label="Log", interactive=False, lines=10)

          with gr.Column(scale=2):
              gr.Markdown("## ✨ Article Output")
              blog_summary_output = gr.Markdown(elem_classes=["blog-output"])
              with gr.Accordion("📜 View Full Transcription", open=False):
                  transcription_output = gr.Textbox(label="Full Text", interactive=False, lines=10)

-     # Link button to the main function
+     # --- 4. Event Listeners ---
      submit_button.click(
          fn=transcribe_and_summarize,
          inputs=[audio_file_input, youtube_url_input],
          outputs=[log_output, transcription_output, blog_summary_output]
      )
+     youtube_url_input.change(
+         fn=update_video_preview,
+         inputs=youtube_url_input,
+         outputs=video_preview
+     )
+     demo.load(
+         fn=update_video_preview,
+         inputs=youtube_url_input,
+         outputs=video_preview
+     )

+ # --- 5. App Launch ---
  if __name__ == "__main__":
-     demo.launch(debug=True)
+     demo.launch(debug=True)
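Editor's note on the "THIS IS THE FIX" block above: it only handles a response that opens with exactly ```json and blindly drops seven leading and four trailing characters, so a bare ``` fence, or extra whitespace around the fences, can still break json.loads. A more defensive helper is sketched below as a suggestion only; strip_json_fences is a hypothetical name and is not part of this commit:

import re

def strip_json_fences(raw: str) -> str:
    """Remove one pair of surrounding Markdown code fences, if present."""
    text = raw.strip()
    # Accepts ```json ... ``` as well as bare ``` ... ``` around the payload;
    # anything without surrounding fences is returned unchanged.
    match = re.match(r"```(?:json)?\s*(.*?)\s*```$", text, re.DOTALL)
    return match.group(1) if match else text

# Hypothetical usage in place of the slicing fix:
# data = json.loads(strip_json_fences(summary_json_string))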
 
test.ipynb ADDED
@@ -0,0 +1,74 @@
+ {
+  "cells": [
+   {
+    "cell_type": "code",
+    "execution_count": 6,
+    "id": "81d301b6",
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "Detected language 'th' with probability 0.993038\n",
+       "[0.00s -> 6.72s] เช่น ลุงแดงบอกว่า การเล่นเนี่ย สมมุติเล่นคอร์ดสี่ คอร์ดสี่อย่างงี้\n",
+       "[6.72s -> 11.88s] คอร์ดสี่อย่างงี้ มันถูกทั้งหมด แต่เวลาเอาไปใช้งานจริงจริง\n",
+       "[11.88s -> 15.60s] มันจะทําอย่างนั้นไม่ได้ มันต้องเลือกเอาว่าเล่นอะไรที่มันดีที่สุด\n",
+       "[15.60s -> 19.50s] เออ ลูกหลานลองฟังเสียงคอร์ด เงื้อเสียงมันต่างกัน\n",
+       "[19.50s -> 23.10s] ฟังแบบนี้มันกําแก่งนะ เนี้ย\n",
+       "[24.78s -> 30.58s] แล้วแขมเล่นไปต้องคอยระวัง ระวังไอ้สายห้ากับหกด้วย\n",
+       "[30.58s -> 32.98s] เดี๋ยวมันจะวิ่งออกมากวนกัน เพราะปลิ๊กมันขบยาก\n",
+       "[32.98s -> 35.54s] เดี๋ยวมันปลายไปโดนนิดหนึ่ง มันก็ออกแล้ว\n",
+       "[35.54s -> 40.58s] เราจะดิดหกสายฟังให้ดีนะลูกหลาย ถ้าจับแบบนี้\n",
+       "[40.58s -> 45.98s] บอร์ด ดัง บอร์ด เห็นไหม เล่นแบบนี้ก็เล่นในทั่วไป\n",
+       "[45.98s -> 50.18s] เสียงแรงต่างมา ไม่ผิดนะ แต่ก็ดีแบบนั้น เอาดี ๆ เลย\n",
+       "[50.18s -> 54.50s] บอร์ด ชัดเจน บอร์ด เห็นไหม แล้วดีดกันเลย\n"
+      ]
+     }
+    ],
+    "source": [
+     "from faster_whisper import WhisperModel\n",
+     "\n",
+     "model_size = \"large-v3\"\n",
+     "\n",
+     "model = WhisperModel(model_size, device=\"cpu\", compute_type=\"int8\")\n",
+     "\n",
+     "segments, info = model.transcribe(\"bacfd788-dd5c-4ff3-851a-45bbf742acd5.mp3\", beam_size=5)\n",
+     "\n",
+     "print(\"Detected language '%s' with probability %f\" % (info.language, info.language_probability))\n",
+     "\n",
+     "for segment in segments:\n",
+     "    print(\"[%.2fs -> %.2fs] %s\" % (segment.start, segment.end, segment.text))"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "0e94c566",
+    "metadata": {},
+    "outputs": [],
+    "source": []
+   }
+  ],
+  "metadata": {
+   "kernelspec": {
+    "display_name": "Jumps",
+    "language": "python",
+    "name": "python3"
+   },
+   "language_info": {
+    "codemirror_mode": {
+     "name": "ipython",
+     "version": 3
+    },
+    "file_extension": ".py",
+    "mimetype": "text/x-python",
+    "name": "python",
+    "nbconvert_exporter": "python",
+    "pygments_lexer": "ipython3",
+    "version": "3.11.11"
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 5
+ }
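A final observation on this commit: update_video_preview in app.py extracts video IDs by raw string splitting, which quietly misses variants such as m.youtube.com links. A standard-library alternative covering the same three URL shapes is sketched below, as a suggestion under those assumptions; extract_video_id is a hypothetical name and not part of the commit:

from urllib.parse import urlparse, parse_qs

def extract_video_id(url: str) -> str | None:
    """Return the YouTube video ID from watch, shorts, or youtu.be URLs."""
    parsed = urlparse(url)
    host = parsed.netloc.lower().removeprefix("www.").removeprefix("m.")
    if host == "youtu.be":
        # youtu.be/<id>?t=42
        return parsed.path.strip("/").split("/")[0] or None
    if host == "youtube.com":
        if parsed.path == "/watch":
            # youtube.com/watch?v=<id>&list=...
            return parse_qs(parsed.query).get("v", [None])[0]
        if parsed.path.startswith("/shorts/"):
            # youtube.com/shorts/<id>
            return parsed.path.split("/")[2] or None
    return None

Like the committed version, it returns None for anything it cannot parse, so a caller can keep hiding the preview with gr.update(visible=False).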