Spaces:

PogusTheWhisper
/

Audio-to-Blog-Summarizer

Sleeping

App Files Files Community

Naphat Sornwichai committed 30 days ago

Commit

9910e37

1 Parent(s): 2705d4a

update major files

Browse files

Files changed (3) hide show

.gitignore +3 -0
app.py +293 -4
requirements.txt +0 -341

.gitignore ADDED Viewed

	@@ -0,0 +1,3 @@

+.venv
+__pycache__
+downloaded_audio.mp3

app.py CHANGED Viewed

@@ -1,7 +1,296 @@
 import gradio as gr
-def greet(name):
-    return "Hello " + name + "!!"
-demo = gr.Interface(fn=greet, inputs="text", outputs="text")
-demo.launch()

 import gradio as gr
+import torch
+from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor
+import yt_dlp
+from openai import OpenAI
+import os
+import json
+import torchaudio
+import torchaudio.transforms as T
+import time
+# --- 1. Model & Pipeline Initialization ---
+# Everything in this section runs at import time, so the model is loaded once
+# and shared by every request.
+print("Initializing transcription model...")
+# Pick the accelerator: CUDA first, then Apple MPS, otherwise fall back to CPU.
+# The hasattr() guard covers torch builds that do not expose torch.backends.mps.
+device = "cuda:0" if torch.cuda.is_available() else "mps" if hasattr(torch.backends, "mps") and torch.backends.mps.is_available() else "cpu"
+# Half precision when an accelerator (CUDA or MPS) is available, full precision on CPU.
+torch_dtype = torch.float16 if torch.cuda.is_available() or (hasattr(torch.backends, "mps") and torch.backends.mps.is_available()) else torch.float32
+# Whisper checkpoint used for transcription.
+model_id = "nectec/Pathumma-whisper-th-medium"
+print(f"Using device: {device} with dtype: {torch_dtype}")
+# Load the model and processor directly (no pipeline wrapper); transcription
+# later calls model.generate() on features produced by the processor.
+# NOTE(review): the keyword used here is `dtype=`; some transformers releases
+# expect `torch_dtype=` instead - confirm against the installed version.
+model = AutoModelForSpeechSeq2Seq.from_pretrained(
+    model_id, dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
+)
+model.to(device)
+processor = AutoProcessor.from_pretrained(model_id)
+print("Transcription model loaded successfully.")
+# --- 2. Helper Functions ---
+def download_youtube_audio(url: str) -> str:
+    """
+    Download the audio track of a YouTube video and convert it to MP3.
+
+    yt-dlp fetches the best available audio stream and its FFmpegExtractAudio
+    post-processor converts it to MP3 (preferred quality '192').
+
+    Args:
+        url: Link to the YouTube video.
+
+    Returns:
+        Path of the resulting file, always 'downloaded_audio.mp3' relative to
+        the current working directory.
+
+    Raises:
+        gr.Error: If yt-dlp fails to download or convert the audio.
+    """
+    output_template = 'downloaded_audio.%(ext)s'
+    ydl_opts = {
+        'format': 'bestaudio/best',
+        'postprocessors': [{
+            'key': 'FFmpegExtractAudio',
+            'preferredcodec': 'mp3',
+            'preferredquality': '192',
+        }],
+        'outtmpl': output_template,
+        'quiet': True,
+        # yt-dlp spells this option 'overwrites'; the former 'overwrite' key
+        # was not a recognised option and was silently ignored.
+        'overwrites': True,
+    }
+    try:
+        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+            ydl.download([url])
+    except Exception as e:
+        # Chain the cause so the original yt-dlp traceback stays visible in logs.
+        raise gr.Error(f"Failed to download audio from YouTube. Please check the link. Error: {str(e)}") from e
+    # The post-processor replaces the downloaded extension with .mp3.
+    return 'downloaded_audio.mp3'
+# --- 3. Core Logic ---
+def transcribe_and_summarize(audio_file: str, youtube_url: str, progress=gr.Progress()):
+    """
+    Transcribe Thai audio and turn the transcript into a Markdown blog post.
+
+    This is a generator. Every yield is a 4-tuple
+    (log text, transcription, summary markdown, status text), in the same order
+    as the outputs wired to the submit button.
+
+    Args:
+        audio_file: Path of the uploaded audio file, or None when nothing was uploaded.
+        youtube_url: YouTube link; when non-empty it takes precedence over audio_file.
+        progress: Gradio progress tracker used for the progress bar.
+
+    Raises:
+        gr.Error: If TYPHOON_API is not set, no input was provided, transcription
+            fails, the Typhoon API call fails, or the reply is not the expected JSON.
+    """
+    log_history = ""
+    def log(message):
+        """Append a timestamped line to the running log and return the whole log."""
+        nonlocal log_history
+        timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
+        log_history += f"[{timestamp}] {message}\n"
+        return log_history
+    progress(0, desc="Starting...")
+    yield log("Process started."), "", "", "Starting..."
+    # Step 1: Get API Key and validate inputs
+    api_key = os.getenv('TYPHOON_API')
+    if not api_key:
+        raise gr.Error("TYPHOON_API environment variable not set. Please set it before running the app.")
+    if audio_file is None and not youtube_url:
+        raise gr.Error("Please upload an audio file or provide a YouTube link.")
+    # Step 2: Determine audio source and get file path
+    filepath = ""
+    if youtube_url:
+        progress(0.1, desc="Downloading Audio...")
+        yield log("YouTube link detected. Starting download."), "", "", "Downloading Audio..."
+        try:
+            filepath = download_youtube_audio(youtube_url)
+            yield log(f"Audio downloaded successfully to '{filepath}'."), "", "", "Download Complete"
+        except Exception as e:
+            # Report the failure through the UI outputs and stop instead of raising.
+            yield log(f"Error downloading from YouTube: {e}"), "", "", f"Error: {e}"
+            return
+    else:
+        filepath = audio_file
+        yield log(f"Processing uploaded file: '{filepath}'."), "", "", "Processing File..."
+    # Step 3: Transcribe audio with the model's generate method
+    progress(0.3, desc="Transcribing Audio...")
+    yield log("Beginning audio transcription..."), "", "", "Transcribing Audio..."
+    try:
+        # torchaudio returns a (channels, frames) tensor plus the sample rate
+        waveform, sr = torchaudio.load(filepath)
+        # Downmix to mono: squeeze() only drops a size-1 channel axis, so a
+        # stereo file would otherwise reach the processor as a 2-row array.
+        if waveform.dim() > 1 and waveform.size(0) > 1:
+            waveform = waveform.mean(dim=0, keepdim=True)
+        # Resample to 16kHz if necessary, as Whisper expects this rate
+        if sr != 16000:
+            yield log(f"Original sample rate is {sr}Hz. Resampling to 16000Hz."), "", "", "Resampling..."
+            resampler = T.Resample(orig_freq=sr, new_freq=16000)
+            waveform = resampler(waveform)
+        # NOTE(review): with default processor settings the Whisper features
+        # probably cover only the first 30 s of audio; longer recordings likely
+        # need truncation=False / long-form generation - confirm against the
+        # transformers Whisper documentation.
+        input_features = processor(
+            waveform.squeeze().numpy(),
+            return_tensors="pt",
+            sampling_rate=16000
+        ).input_features.to(device, dtype=torch_dtype)
+        # Set the generation language and task for Thai transcription
+        decoder_prompt_ids = processor.get_decoder_prompt_ids(language="th", task="transcribe")
+        # Generate token IDs from the input features
+        predicted_ids = model.generate(input_features, forced_decoder_ids=decoder_prompt_ids)
+        # Decode the token IDs to text
+        transcribed_text = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
+        yield log("Transcription complete."), transcribed_text, "", "Transcription Complete"
+    except Exception as e:
+        raise gr.Error(f"An error occurred during transcription: {str(e)}") from e
+    # Step 4: Summarize with Typhoon LLM
+    # Check for an empty transcript first so the log never claims a request
+    # was sent when none was.
+    if not transcribed_text or not transcribed_text.strip():
+        yield log("Transcription is empty. Aborting summarization."), "", "Could not generate summary because the transcription is empty.", "Aborted"
+        return
+    progress(0.8, desc="Generating Summary...")
+    yield log("Sending transcription to Typhoon LLM for summarization."), transcribed_text, "", "Generating Summary..."
+    # Typhoon exposes an OpenAI-compatible endpoint, so the OpenAI client is reused
+    client = OpenAI(
+        api_key=api_key,
+        base_url="https://api.opentyphoon.ai/v1"
+    )
+    system_prompt = """You are a professional editor and content creator. Your task is to take a raw transcript and reformat it into a beautiful, easy-to-read blog post.
+You MUST reply ONLY with a valid JSON object. Do not add any text before or after the JSON.
+The JSON object must have the following structure:
+{
+  "title": "A catchy and relevant title for the blog post in Thai.",
+  "key_takeaway": "A single paragraph summarizing the most important point of the entire content in Thai.",
+  "main_ideas": [
+    "A key point or feature, written as a string in Thai.",
+    "Another key point or feature, written as a string in Thai.",
+    "And so on..."
+  ],
+  "conclusion": "A concluding paragraph that wraps up the main ideas in Thai."
+}"""
+    # Only the network call lives in this try block, so a parsing failure
+    # further down is not misreported as a connection problem.
+    try:
+        response = client.chat.completions.create(
+            model="typhoon-v2.1-12b-instruct",
+            messages=[
+                {"role": "system", "content": system_prompt},
+                {"role": "user", "content": f"Please summarize and restructure the following transcript into the specified JSON format:\n\n---\n\n{transcribed_text}"}
+            ],
+            max_tokens=2048,
+            temperature=0.7
+        )
+        summary_json_string = response.choices[0].message.content
+    except Exception as e:
+        raise gr.Error(f"Could not connect to the Typhoon API. Please check your API key. Error: {str(e)}") from e
+    yield log("Received summary from Typhoon LLM. Parsing JSON."), transcribed_text, "", "Parsing Summary..."
+    # Parse the JSON and format it as Markdown
+    try:
+        # Keep only the outermost {...} span. This removes markdown code fences
+        # (with or without a trailing newline) and any stray text around the object.
+        cleaned = summary_json_string.strip()
+        first_brace = cleaned.find("{")
+        last_brace = cleaned.rfind("}")
+        if first_brace != -1 and last_brace > first_brace:
+            cleaned = cleaned[first_brace:last_brace + 1]
+        data = json.loads(cleaned)
+        title = data.get("title", "Title Not Found")
+        key_takeaway = data.get("key_takeaway", "")
+        main_ideas = data.get("main_ideas", [])
+        conclusion = data.get("conclusion", "")
+        # Build the blog post in Markdown format
+        parts = [f"# {title}\n\n", f"{key_takeaway}\n\n"]
+        if main_ideas:
+            parts.append("## Key Features & Main Ideas\n\n")
+            parts.extend(f"- {idea}\n" for idea in main_ideas)
+            parts.append("\n")
+        parts.append(f"## Conclusion\n\n{conclusion}")
+        summary_markdown = "".join(parts)
+    except (json.JSONDecodeError, AttributeError, TypeError) as e:
+        # AttributeError/TypeError cover a missing reply (None) or a JSON value
+        # that does not have the expected object/list shape.
+        raise gr.Error(f"Failed to parse the summary from the AI. Raw response: {summary_json_string}") from e
+    yield log("Successfully parsed and formatted summary."), transcribed_text, summary_markdown, "Formatting Complete"
+    # Step 5: Return final results
+    progress(1.0, desc="Done!")
+    yield log("Process finished successfully."), transcribed_text, summary_markdown, "Done!"
+# --- 4. Gradio UI ---
+# Custom CSS for the blog-style article panel (applied through the
+# "blog-output" element class). The @import pulls in the Sarabun web font;
+# it must be a plain URL - markdown link syntax inside url(...) is invalid CSS.
+css = """
+@import url('https://fonts.googleapis.com/css2?family=Sarabun:wght@400;700&display=swap');
+.blog-output {
+    font-family: 'Sarabun', sans-serif;
+    line-height: 1.8;
+    max-width: 800px;
+    margin: auto;
+    padding: 2rem;
+    border-radius: 12px;
+    background-color: #ffffff;
+    border: 1px solid #e5e7eb;
+}
+.blog-output h1 {
+    font-size: 2.2em;
+    font-weight: 700;
+    border-bottom: 2px solid #f3f4f6;
+    padding-bottom: 15px;
+    margin-bottom: 25px;
+    color: #111827;
+}
+.blog-output h2 {
+    font-size: 1.6em;
+    font-weight: 700;
+    margin-top: 40px;
+    margin-bottom: 20px;
+    color: #1f2937;
+}
+.blog-output p {
+    font-size: 1.1em;
+    margin-bottom: 20px;
+    color: #374151;
+}
+.blog-output ul {
+    padding-left: 25px;
+    list-style-type: disc;
+}
+.blog-output li {
+    margin-bottom: 12px;
+    padding-left: 5px;
+}
+"""
+# Page layout: inputs, status and log in the left column; the rendered article
+# and the raw transcription in the (wider) right column.
+with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="sky"), css=css) as demo:
+    gr.Markdown(
+        """
+        # 🎙️ Audio to Blog Summarizer ✒️
+        Upload an audio file (MP3, WAV) or paste a YouTube link to transcribe it to Thai text and summarize the content into a beautiful, blog-style article using AI from NECTEC and OpenTyphoon.
+        """
+    )
+    with gr.Row():
+        with gr.Column(scale=1):
+            with gr.Tabs():
+                with gr.TabItem("⬆️ Upload Audio File"):
+                    # type="filepath" hands the handler a path string rather than raw audio data
+                    audio_file_input = gr.Audio(
+                        label="Upload MP3 or WAV file",
+                        type="filepath",
+                        sources=["upload"]
+                    )
+                with gr.TabItem("🔗 Paste YouTube Link"):
+                    youtube_url_input = gr.Textbox(
+                        label="Paste YouTube link here",
+                        # Plain example URL; the previous text contained markdown link syntax
+                        placeholder="e.g., https://www.youtube.com/watch?v=..."
+                    )
+            submit_button = gr.Button("🚀 Generate Blog Post", variant="primary")
+            status_output = gr.Textbox(label="Status", interactive=False, lines=1)
+            with gr.Accordion("📝 View Process Log", open=False):
+                log_output = gr.Textbox(label="Log", interactive=False, lines=10)
+        with gr.Column(scale=2):
+            gr.Markdown("## ✨ Article Output")
+            # The "blog-output" class is what the custom CSS rules target
+            blog_summary_output = gr.Markdown(elem_classes=["blog-output"])
+            with gr.Accordion("📜 View Full Transcription", open=False):
+                transcription_output = gr.Textbox(label="Full Text", interactive=False, lines=10)
+    # Link button to the main function. The outputs order must match the
+    # (log, transcription, summary, status) tuples the generator yields.
+    submit_button.click(
+        fn=transcribe_and_summarize,
+        inputs=[audio_file_input, youtube_url_input],
+        outputs=[log_output, transcription_output, blog_summary_output, status_output]
+    )
+# Start the app only when the file is run as a script, not when it is imported.
+if __name__ == "__main__":
+    demo.launch(debug=True)

requirements.txt DELETED Viewed

@@ -1,341 +0,0 @@
-accelerate==1.8.1
-aiocache==0.12.3
-aiofiles==24.1.0
-aiohappyeyeballs==2.6.1
-aiohttp==3.11.11
-aiosignal==1.3.2
-alembic==1.14.0
-annotated-types==0.7.0
-anthropic==0.56.0
-anyio==4.9.0
-appdirs==1.4.4
-appnope==0.1.4
-APScheduler==3.10.4
-argon2-cffi==23.1.0
-argon2-cffi-bindings==21.2.0
-asgiref==3.8.1
-asttokens==3.0.0
-async-timeout==5.0.1
-attrs==25.3.0
-Authlib==1.4.1
-av==14.4.0
-azure-ai-documentintelligence==1.0.2
-azure-core==1.35.0
-azure-identity==1.20.0
-azure-storage-blob==12.24.1
-backoff==2.2.1
-bcrypt==4.3.0
-beautifulsoup4==4.13.4
-bidict==0.23.1
-bitarray==3.4.3
-black==25.1.0
-blinker==1.9.0
-boto3==1.35.53
-botocore==1.35.99
-Brotli==1.1.0
-build==1.2.2.post1
-cachetools==5.5.2
-certifi==2025.6.15
-cffi==1.17.1
-chardet==5.2.0
-charset-normalizer==3.4.2
-chroma-hnswlib==0.7.6
-chromadb==0.6.3
-click==8.2.1
-colbert-ai==0.2.21
-colorama==0.4.6
-colorclass==2.2.2
-coloredlogs==15.0.1
-comm==0.2.2
-compressed-rtf==1.0.7
-cryptography==45.0.5
-ctranslate2==4.6.0
-dataclasses-json==0.6.7
-datasets==3.6.0
-debugpy==1.8.5
-decorator==5.2.1
-defusedxml==0.7.1
-dill==0.3.8
-distro==1.9.0
-dnspython==2.7.0
-docker==7.1.0
-docx2txt==0.8
-duckduckgo_search==8.0.2
-durationpy==0.10
-easygui==0.98.3
-ebcdic==1.1.1
-ecdsa==0.19.1
-einops==0.8.1
-elastic-transport==8.17.1
-elasticsearch==9.0.1
-emoji==2.14.1
-et_xmlfile==2.0.0
-eval_type_backport==0.2.2
-Events==0.5
-executing==2.2.0
-extract-msg==0.54.1
-fake-useragent==2.1.0
-fastapi==0.115.7
-faster-whisper==1.1.1
-filelock==3.18.0
-filetype==1.2.0
-firecrawl-py==1.12.0
-Flask==3.1.1
-flatbuffers==25.2.10
-fonttools==4.58.5
-fpdf2==2.8.2
-frozenlist==1.7.0
-fs==2.4.16
-fsspec==2025.3.0
-ftfy==6.2.3
-gcp-storage-emulator==2024.8.3
-git-python==1.0.3
-gitdb==4.0.12
-GitPython==3.1.44
-google-ai-generativelanguage==0.6.15
-google-api-core==2.25.1
-google-api-python-client==2.174.0
-google-auth==2.40.3
-google-auth-httplib2==0.2.0
-google-auth-oauthlib==1.2.2
-google-cloud-core==2.4.3
-google-cloud-storage==2.19.0
-google-crc32c==1.7.1
-google-genai==1.15.0
-google-generativeai==0.8.5
-google-resumable-media==2.7.2
-googleapis-common-protos==1.63.2
-greenlet==3.1.1
-grpcio==1.67.1
-grpcio-status==1.67.1
-grpcio-tools==1.67.1
-h11==0.16.0
-h2==4.2.0
-hf-xet==1.1.5
-hf_transfer==0.1.9
-hpack==4.1.0
-html5lib==1.1
-httpcore==1.0.9
-httplib2==0.22.0
-httptools==0.6.4
-httpx==0.28.1
-httpx-sse==0.4.1
-huggingface-hub==0.33.2
-humanfriendly==10.0
-hyperframe==6.1.0
-idna==3.10
-importlib_metadata==8.7.0
-importlib_resources==6.5.2
-iniconfig==2.1.0
-ipykernel==6.29.5
-ipython==9.4.0
-ipython_pygments_lexers==1.1.1
-isodate==0.7.2
-itsdangerous==2.2.0
-jedi==0.19.2
-Jinja2==3.1.6
-jiter==0.10.0
-jmespath==1.0.1
-joblib==1.5.1
-jsonpatch==1.33
-jsonpointer==3.0.0
-jupyter_client==8.6.3
-jupyter_core==5.7.2
-kubernetes==33.1.0
-langchain==0.3.24
-langchain-community==0.3.23
-langchain-core==0.3.67
-langchain-text-splitters==0.3.8
-langdetect==1.0.9
-langfuse==2.44.0
-langsmith==0.3.45
-lark==1.1.9
-ldap3==2.9.1
-loguru==0.7.3
-lxml==6.0.0
-Mako==1.3.10
-Markdown==3.7
-markdown-it-py==3.0.0
-MarkupSafe==3.0.2
-marshmallow==3.26.1
-matplotlib-inline==0.1.7
-mdurl==0.1.2
-milvus-lite==2.5.1
-mmh3==5.1.0
-moto==5.1.6
-mpmath==1.3.0
-msal==1.32.3
-msal-extensions==1.3.1
-msoffcrypto-tool==5.4.2
-multidict==6.6.3
-multiprocess==0.70.16
-mypy_extensions==1.1.0
-nest-asyncio==1.6.0
-networkx==3.5
-ninja==1.11.1.4
-nltk==3.9.1
-numpy==1.26.4
-oauthlib==3.3.1
-olefile==0.47
-oletools==0.60.2
-onnxruntime==1.20.1
-open-webui==0.6.15
-openai==1.93.0
-opencv-python==4.11.0.86
-opencv-python-headless==4.11.0.86
-openpyxl==3.1.5
-opensearch-py==2.8.0
-opentelemetry-api==1.34.1
-opentelemetry-exporter-otlp-proto-common==1.34.1
-opentelemetry-exporter-otlp-proto-grpc==1.34.1
-opentelemetry-instrumentation==0.55b1
-opentelemetry-instrumentation-asgi==0.55b1
-opentelemetry-instrumentation-fastapi==0.55b1
-opentelemetry-proto==1.34.1
-opentelemetry-sdk==1.34.1
-opentelemetry-semantic-conventions==0.55b1
-opentelemetry-util-http==0.55b1
-orjson==3.10.18
-overrides==7.7.0
-packaging==23.2
-pandas==2.2.3
-parso==0.8.4
-passlib==1.7.4
-pathspec==0.12.1
-pcodedmp==1.2.6
-peewee==3.18.1
-peewee-migrate==1.12.2
-pexpect==4.9.0
-pgvector==0.4.0
-pillow==11.2.1
-pinecone==6.0.2
-pinecone-plugin-interface==0.0.7
-platformdirs==4.3.6
-playwright==1.49.1
-pluggy==1.6.0
-portalocker==2.10.1
-posthog==6.0.2
-primp==0.15.0
-prompt_toolkit==3.0.51
-propcache==0.3.2
-proto-plus==1.26.1
-protobuf==5.29.5
-psutil==7.0.0
-psycopg2-binary==2.9.9
-ptyprocess==0.7.0
-pure_eval==0.2.3
-py-partiql-parser==0.6.1
-pyarrow==20.0.0
-pyasn1==0.4.8
-pyasn1_modules==0.4.1
-pyclipper==1.3.0.post6
-pycparser==2.22
-pydantic==2.10.6
-pydantic-settings==2.10.1
-pydantic_core==2.27.2
-pydub==0.25.1
-pyee==12.0.0
-Pygments==2.19.2
-PyJWT==2.10.1
-pymdown-extensions==10.14.2
-pymilvus==2.5.0
-pymongo==4.13.2
-PyMySQL==1.1.1
-pypandoc==1.15
-pyparsing==3.2.3
-pypdf==4.3.1
-PyPika==0.48.9
-pyproject_hooks==1.2.0
-pytest==8.3.5
-pytest-docker==3.1.2
-python-dateutil==2.9.0.post0
-python-dotenv==1.1.1
-python-engineio==4.12.2
-python-iso639==2025.2.18
-python-jose==3.4.0
-python-magic==0.4.27
-python-multipart==0.0.20
-python-oxmsg==0.0.2
-python-pptx==1.0.2
-python-socketio==5.13.0
-pytube==15.0.0
-pytz==2025.2
-pyxlsb==1.0.10
-PyYAML==6.0.2
-pyzmq==26.2.0
-qdrant-client==1.12.2
-rank-bm25==0.2.2
-RapidFuzz==3.13.0
-rapidocr-onnxruntime==1.4.4
-red-black-tree-mod==1.22
-redis==6.2.0
-regex==2024.11.6
-requests==2.32.4
-requests-oauthlib==2.0.0
-requests-toolbelt==1.0.0
-responses==0.25.7
-RestrictedPython==8.0
-rich==14.0.0
-rsa==4.9.1
-RTFDE==0.1.2.1
-s3transfer==0.10.4
-safetensors==0.5.3
-scikit-learn==1.7.0
-scipy==1.16.0
-sentence-transformers==4.1.0
-sentencepiece==0.2.0
-shapely==2.1.1
-shellingham==1.5.4
-simple-websocket==1.1.0
-six==1.17.0
-smmap==5.0.2
-sniffio==1.3.1
-soundfile==0.13.1
-soupsieve==2.7
-SQLAlchemy==2.0.38
-stack-data==0.6.3
-starlette==0.45.3
-starlette-compress==1.6.0
-sympy==1.14.0
-tabulate==0.9.0
-tenacity==9.1.2
-tencentcloud-sdk-python==3.0.1336
-threadpoolctl==3.6.0
-tiktoken==0.9.0
-tokenizers==0.21.2
-torch==2.7.1
-tornado==6.4.1
-tqdm==4.67.1
-traitlets==5.14.3
-transformers==4.53.0
-typer==0.16.0
-typing-inspect==0.9.0
-typing-inspection==0.4.1
-typing_extensions==4.14.0
-tzdata==2025.2
-tzlocal==5.3.1
-ujson==5.10.0
-unstructured==0.16.17
-unstructured-client==0.32.3
-uritemplate==4.2.0
-urllib3==2.5.0
-uv==0.8.6
-uvicorn==0.34.2
-uvloop==0.21.0
-validators==0.35.0
-watchfiles==1.1.0
-wcwidth==0.2.13
-webencodings==0.5.1
-websocket-client==1.8.0
-websockets==15.0.1
-Werkzeug==3.1.3
-wrapt==1.17.2
-wsproto==1.2.0
-xlrd==2.0.1
-xlsxwriter==3.2.5
-xmltodict==0.14.2
-xxhash==3.5.0
-yarl==1.20.1
-youtube-transcript-api==1.1.0
-zipp==3.23.0
-zstandard==0.23.0