Naphat Sornwichai committed on
Commit
1d415e7
·
1 Parent(s): 0fe5052

update major files

Browse files
Files changed (4) hide show
  1. README.md +12 -0
  2. app.py +176 -0
  3. requirements.txt +5 -0
  4. test.ipynb +66 -0
README.md ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Jumps
3
+ emoji: 👀
4
+ colorFrom: yellow
5
+ colorTo: indigo
6
+ sdk: gradio
7
+ sdk_version: 5.44.1
8
+ app_file: app.py
9
+ pinned: false
10
+ ---
11
+
12
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ from faster_whisper import WhisperModel
4
+ import yt_dlp
5
+ from openai import OpenAI
6
+ import os
7
+ import json
8
+ import time
9
+ import uuid
10
+
11
+ print("Initializing transcription model (faster-whisper)...")
12
+ device = "cuda" if torch.cuda.is_available() else "cpu"
13
+ compute_type = "float16" if device == "cuda" else "int8"
14
+ model_size = "large-v3-turbo"
15
+ model = WhisperModel(model_size, device=device, compute_type=compute_type)
16
+ print("Transcription model loaded successfully.")
17
+
18
+ def download_youtube_audio(url: str) -> str:
19
+ unique_id = uuid.uuid4()
20
+ output_template = f'{unique_id}.%(ext)s'
21
+ final_filepath = f'{unique_id}.mp3'
22
+ ydl_opts = {
23
+ 'format': 'bestaudio/best',
24
+ 'postprocessors': [{'key': 'FFmpegExtractAudio', 'preferredcodec': 'mp3', 'preferredquality': '192'}],
25
+ 'outtmpl': output_template,
26
+ 'quiet': True,
27
+ 'overwrite': True,
28
+ }
29
+ with yt_dlp.YoutubeDL(ydl_opts) as ydl:
30
+ ydl.download([url])
31
+ return final_filepath
32
+
33
+ def transcribe_and_summarize(audio_file: str, youtube_url: str):
34
+ log_history = ""
35
+ def log(message):
36
+ nonlocal log_history
37
+ timestamp = time.strftime("%H:%M:%S")
38
+ log_history += f"[{timestamp}] {message}\n"
39
+ return log_history
40
+
41
+ loading_message = "⏳ Generating summary..."
42
+ yield log("Process started."), "", ""
43
+
44
+ api_key = os.getenv('TYPHOON_API')
45
+ if not api_key:
46
+ yield log("TYPHOON_API environment variable not set."), "", gr.Markdown("## Error\nAPI key missing")
47
+ return
48
+
49
+ if audio_file is None and not youtube_url:
50
+ raise gr.Error("Please upload an audio file or provide a YouTube link.")
51
+
52
+ filepath = ""
53
+ is_downloaded = False
54
+ try:
55
+ if youtube_url:
56
+ yield log("Downloading YouTube audio..."), "", ""
57
+ filepath = download_youtube_audio(youtube_url)
58
+ is_downloaded = True
59
+ yield log(f"Downloaded to {filepath}"), "", ""
60
+ else:
61
+ filepath = audio_file
62
+
63
+ yield log("Transcription started (Language: Thai)..."), "", ""
64
+ segments, info = model.transcribe(filepath, beam_size=5, language="th", task="transcribe")
65
+ transcribed_text = ""
66
+ for segment in segments:
67
+ line = f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text.strip()}"
68
+ transcribed_text += segment.text + " "
69
+ yield log(line), transcribed_text, ""
70
+
71
+ yield log("Transcription complete."), transcribed_text, ""
72
+ yield log("Sending to AI for summarization..."), transcribed_text, loading_message
73
+
74
+ client = OpenAI(api_key=api_key, base_url="https://api.opentyphoon.ai/v1")
75
+ system_prompt = f"""You are an automated system that converts transcripts into a blog post.
76
+ Your ONLY function is to output a valid JSON object. All text values in the JSON MUST be in the Thai language.
77
+ หน้าที่เดียวของคุณคือการส่งออกอ็อบเจกต์ JSON ที่ถูกต้อง โดยค่าที่เป็นข้อความทั้งหมดต้องเป็นภาษาไทยเท่านั้น
78
+ Do NOT write any explanations. The response MUST start with `{{` and end with `}}`.
79
+
80
+ The JSON object must have the following structure:
81
+ {{
82
+ "title": "หัวข้อบทความที่น่าสนใจและเกี่ยวข้อง (เป็นภาษาไทย)",
83
+ "key_takeaway": "สรุปใจความสำคัญของเนื้อหาทั้งหมดในหนึ่งย่อหน้า (เป็นภาษาไทย)",
84
+ "main_ideas": [
85
+ "ประเด็นหลักหรือใจความสำคัญ (เป็นภาษาไทย)",
86
+ "ประเด็นหลักถัดไป...",
87
+ "และต่อไปเรื่อยๆ..."
88
+ ],
89
+ "conclusion": "ย่อหน้าสรุปปิดท้าย (เป็นภาษาไทย)"
90
+ }}"""
91
+ response = client.chat.completions.create(
92
+ model="typhoon-v2.1-12b-instruct",
93
+ messages=[{"role": "system", "content": system_prompt}, {"role": "user", "content": transcribed_text}],
94
+ max_tokens=2048,
95
+ temperature=0.7
96
+ )
97
+ summary_json_string = response.choices[0].message.content
98
+ if summary_json_string.strip().startswith("```json"):
99
+ summary_json_string = summary_json_string.strip()[7:-4].strip()
100
+
101
+ data = json.loads(summary_json_string)
102
+ title = data.get("title", "Title Not Found")
103
+ key_takeaway = data.get("key_takeaway", "")
104
+ main_ideas = data.get("main_ideas", [])
105
+ conclusion = data.get("conclusion", "")
106
+ summary_markdown = f"# {title}\n\n<p>{key_takeaway}</p>\n\n## Key Ideas\n\n<ul>"
107
+ for idea in main_ideas:
108
+ summary_markdown += f"<li>{idea}</li>"
109
+ summary_markdown += f"</ul>\n\n## Conclusion\n\n<p>{conclusion}</p>"
110
+
111
+ yield log("Summarization complete."), transcribed_text, summary_markdown
112
+
113
+ finally:
114
+ if is_downloaded and os.path.exists(filepath):
115
+ os.remove(filepath)
116
+
117
+ def update_video_preview(url):
118
+ if not url:
119
+ return gr.update(value=None, visible=False)
120
+ video_id = None
121
+ try:
122
+ if "youtube.com/shorts/" in url:
123
+ video_id = url.split("/shorts/")[1].split("?")[0]
124
+ elif "watch?v=" in url:
125
+ video_id = url.split("watch?v=")[1].split("&")[0]
126
+ elif "youtu.be/" in url:
127
+ video_id = url.split("youtu.be/")[1].split("?")[0]
128
+ except IndexError:
129
+ pass
130
+ if video_id:
131
+ embed_url = f"https://www.youtube.com/embed/{video_id}"
132
+ iframe_html = f'<iframe width="100%" height="315" src="{embed_url}" frameborder="0" allowfullscreen></iframe>'
133
+ return gr.update(value=iframe_html, visible=True)
134
+ return gr.update(value=None, visible=False)
135
+
136
+ css = """
137
+ @import url('https://fonts.googleapis.com/css2?family=Sarabun:wght@400;700&display=swap');
138
+ .blog-output { font-family: 'Sarabun', sans-serif; line-height: 1.8; max-width: 800px; margin: auto; padding: 2rem; border-radius: 12px; background-color: #ffffff; border: 1px solid #e5e7eb; }
139
+ .blog-output h1 { font-size: 2.2em; font-weight: 700; border-bottom: 2px solid #f3f4f6; padding-bottom: 15px; margin-bottom: 25px; color: #111827; }
140
+ .blog-output h2 { font-size: 1.6em; font-weight: 700; margin-top: 40px; margin-bottom: 20px; color: #1f2937; }
141
+ .blog-output p { font-size: 1.1em; margin-bottom: 20px; color: #374151; }
142
+ .blog-output ul { padding-left: 25px; list-style-type: disc; }
143
+ .blog-output li { margin-bottom: 12px; padding-left: 5px; }
144
+ """
145
+
146
+ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="sky"), css=css) as demo:
147
+ gr.Markdown("# 🎙️ Audio to Blog Summarizer ✒️")
148
+ with gr.Row():
149
+ with gr.Column(scale=1):
150
+ with gr.Tabs():
151
+ with gr.TabItem("⬆️ Upload Audio"):
152
+ audio_file_input = gr.Audio(label="Upload Audio File", type="filepath")
153
+ with gr.TabItem("🔗 YouTube Link"):
154
+ youtube_url_input = gr.Textbox(label="YouTube URL", placeholder="Paste a YouTube link here...")
155
+ submit_button = gr.Button("🚀 Generate Blog Post", variant="primary")
156
+ video_preview = gr.HTML(visible=False)
157
+ with gr.Accordion("📝 View Process Log", open=True):
158
+ log_output = gr.Textbox(label="Log", interactive=False, lines=10)
159
+ with gr.Column(scale=2):
160
+ gr.Markdown("## ✨ Article Output")
161
+ blog_summary_output = gr.Markdown(elem_classes=["blog-output"])
162
+ with gr.Accordion("📜 View Full Transcription", open=False):
163
+ transcription_output = gr.Textbox(label="Full Text", interactive=False, lines=10)
164
+
165
+ submit_button.click(fn=transcribe_and_summarize,
166
+ inputs=[audio_file_input, youtube_url_input],
167
+ outputs=[log_output, transcription_output, blog_summary_output])
168
+ youtube_url_input.change(fn=update_video_preview,
169
+ inputs=youtube_url_input,
170
+ outputs=video_preview)
171
+ demo.load(fn=update_video_preview,
172
+ inputs=youtube_url_input,
173
+ outputs=video_preview)
174
+
175
+ if __name__ == "__main__":
176
+ demo.launch(debug=True)
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ gradio
2
+ torch
3
+ faster-whisper
4
+ yt-dlp
5
+ openai
test.ipynb ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 6,
6
+ "id": "81d301b6",
7
+ "metadata": {},
8
+ "outputs": [
9
+ {
10
+ "name": "stdout",
11
+ "output_type": "stream",
12
+ "text": [
13
+ "Detected language 'th' with probability 0.993038\n",
14
+ "[0.00s -> 6.72s] เช่น ลุงแดงบอกว่า การเล่นเนี่ย สมมุติเล่นคอร์ดสี่ คอร์ดสี่อย่างงี้\n",
15
+ "[6.72s -> 11.88s] คอร์ดสี่อย่างงี้ มันถูกทั้งหมด แต่เวลาเอาไปใช้งานจริงจริง\n",
16
+ "[11.88s -> 15.60s] มันจะทําอย่างนั้นไม่ได้ มันต้องเลือกเอาว่าเล่นอะไรที่มันดีที่สุด\n",
17
+ "[15.60s -> 19.50s] เออ ลูกหลานลองฟังเสียงคอร์ด เงื้อเสียงมันต่างกัน\n",
18
+ "[19.50s -> 23.10s] ฟังแบบนี้มันกําแก่งนะ เนี้ย\n",
19
+ "[24.78s -> 30.58s] แล้วแขมเล่นไปต้องคอยระวัง ระวังไอ้สายห้ากับหกด้วย\n",
20
+ "[30.58s -> 32.98s] เดี๋ยวมันจะวิ่งออกมากวนกัน เพราะปลิ๊กมันขบยาก\n",
21
+ "[32.98s -> 35.54s] เดี๋ยวมันปลายไปโดนนิดหนึ่ง มันก็ออกแล้ว\n",
22
+ "[35.54s -> 40.58s] เราจะดิดหกสายฟังให้ดีนะลูกหลาย ถ้าจับแบบนี้\n",
23
+ "[40.58s -> 45.98s] บอร์ด ดัง บอร์ด เห็นไหม เล่นแบบนี้ก็เล่นในทั่วไป\n",
24
+ "[45.98s -> 50.18s] เสียงแรงต่างมา ไม่ผิดนะ แต่ก็ดีแบบนั้น เอาดี ๆ เลย\n",
25
+ "[50.18s -> 54.50s] บอร์ด ชัดเจน บอร์ด เห็นไหม แล้วดีดกันเลย\n"
26
+ ]
27
+ }
28
+ ],
29
+ "source": [
30
+ "from faster_whisper import WhisperModel\n",
31
+ "\n",
32
+ "model_size = \"large-v3\"\n",
33
+ "\n",
34
+ "model = WhisperModel(model_size, device=\"cpu\", compute_type=\"int8\")\n",
35
+ "\n",
36
+ "segments, info = model.transcribe(\"bacfd788-dd5c-4ff3-851a-45bbf742acd5.mp3\", beam_size=5)\n",
37
+ "\n",
38
+ "print(\"Detected language '%s' with probability %f\" % (info.language, info.language_probability))\n",
39
+ "\n",
40
+ "for segment in segments:\n",
41
+ " print(\"[%.2fs -> %.2fs] %s\" % (segment.start, segment.end, segment.text))"
42
+ ]
43
+ }
44
+ ],
45
+ "metadata": {
46
+ "kernelspec": {
47
+ "display_name": "Jumps",
48
+ "language": "python",
49
+ "name": "python3"
50
+ },
51
+ "language_info": {
52
+ "codemirror_mode": {
53
+ "name": "ipython",
54
+ "version": 3
55
+ },
56
+ "file_extension": ".py",
57
+ "mimetype": "text/x-python",
58
+ "name": "python",
59
+ "nbconvert_exporter": "python",
60
+ "pygments_lexer": "ipython3",
61
+ "version": "3.11.11"
62
+ }
63
+ },
64
+ "nbformat": 4,
65
+ "nbformat_minor": 5
66
+ }