PirateXX committed on
Commit
3dfb32c
·
verified ·
1 Parent(s): 7586839

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +130 -0
app.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # imports
2
+
3
+ import os
4
+
5
+ import requests
6
+ from dotenv import load_dotenv
7
+ from IPython.display import Markdown
8
+
9
+ from openai import OpenAI
10
+ from youtube_transcript_api import YouTubeTranscriptApi
11
+ import re
12
+
13
# Module-level OpenAI client. Nothing else in this file uses it, so a failure
# to construct it (for example when no API key is configured) is reported and
# tolerated instead of preventing the app from starting.
try:
    openai = OpenAI()
except Exception as e:
    print(f"OpenAI client unavailable: {e}")
    openai = None
14
+
15
class YoutubeVideoID:
    """Hold a YouTube URL together with the 11-character video ID it contains.

    Instance attributes (set in ``__init__``):
      * ``url``: the URL exactly as it was passed in.
      * ``video_id``: the ID returned by :meth:`extract_video_id`.
    """

    # Compiled once at class creation. Accepts everything the previous pattern
    # accepted, plus: m./music. hosts, youtube-nocookie.com, /embed/, /shorts/,
    # /live/, /v/ paths, and a `v=` query parameter that is not the first one.
    _URL_PATTERN = re.compile(
        r"(?:https?://)?"
        r"(?:(?:www|m|music)\.)?"
        r"(?:"
        r"youtube(?:-nocookie)?\.com/"
        r"(?:(?:embed|shorts|live|v)/|[^/\n\s]+/\S+/|\S*[?&]v=)"
        r"|youtu\.be/"
        r")"
        r"([a-zA-Z0-9_-]{11})"
    )

    def __init__(self, url):
        """Store ``url`` and eagerly extract its video ID.

        :raises ValueError: if no video ID can be extracted from ``url``.
        """
        self.url = url
        self.video_id = self.extract_video_id(url)

    def extract_video_id(self, url):
        """
        Extracts the YouTube video ID from a given URL.
        Supports regular, shortened, embed, shorts and mobile URLs.

        :param url: str, the YouTube URL (surrounding whitespace is ignored)
        :return: str, the 11-character video ID
        :raises ValueError: if ``url`` is not a string or does not match a
            recognised YouTube URL form
        """
        # Non-string input would make the regex call raise TypeError; report it
        # with the same exception as any other unusable URL.
        if not isinstance(url, str):
            raise ValueError("Invalid YouTube URL")

        # The pattern is anchored at the start, so pasted URLs with leading
        # whitespace must be trimmed first.
        match = self._URL_PATTERN.match(url.strip())
        if match is None:
            raise ValueError("Invalid YouTube URL")
        return match.group(1)

    def __str__(self):
        """Return a short human-readable description of the video ID."""
        return f"Video ID: {self.video_id}"
36
+
37
def get_transcript(video_id, language='en'):
    """
    Fetches a video's transcript and flattens it into a single string.

    :param video_id: the video ID handed to ``YouTubeTranscriptApi.get_transcript``
    :param language: language code to request (English, ``'en'``, by default)
    :return: the ``'text'`` field of every transcript entry joined with single
        spaces, or ``None`` if fetching or joining raised any exception
    """
    transcript_text = None
    try:
        # Request the transcript only in the desired language.
        entries = YouTubeTranscriptApi.get_transcript(video_id, languages=[language])
        # Collapse the per-entry 'text' fields into one space-separated string.
        transcript_text = " ".join(entry['text'] for entry in entries)
    except Exception as e:
        # Best effort: report the problem and let the caller see None.
        print(f"Error fetching transcript: {e}")
    return transcript_text
46
+
47
+
48
+
49
# Lazily built summarization pipeline, shared by every summarize_text call so
# the model is loaded once rather than once per chunk.
_summarizer = None


def _get_summarizer():
    """Return the shared summarization pipeline, building it on first use."""
    global _summarizer
    if _summarizer is None:
        # Function-scope import: the file's import header does not provide torch.
        import torch

        # NOTE(review): `pipeline` is not imported anywhere in the visible file;
        # presumably `from transformers import pipeline` is intended -- confirm
        # and add it to the import header, otherwise this raises NameError
        # (which summarize_text reports and turns into a None result).
        _summarizer = pipeline(
            "summarization",
            model="sshleifer/distilbart-cnn-12-6",
            torch_dtype=torch.bfloat16,
        )
    return _summarizer


def summarize_text(text):
    """
    Summarizes a piece of text with the shared summarization pipeline.

    :param text: str, the text to summarize
    :return: str, the ``'summary_text'`` of the first pipeline result, or
        ``None`` if ``text`` is empty/``None`` or summarization raised
    """
    # Nothing to summarize; skip loading the model entirely.
    if not text:
        return None
    try:
        # The original passed an undefined name (`input_text`) here, so every
        # call failed; the parameter `text` is what must be summarized.
        output = _get_summarizer()(text)
        return output[0]['summary_text']
    except Exception as e:
        # Best effort: report the problem and let the caller see None.
        print(f"Error summarizing text: {e}")
        return None
57
+
58
def split_text(text, chunk_size=3000):
    """
    Splits large text into smaller chunks based on the given chunk size.
    Ensures that chunks end with a full stop where possible to maintain sentence integrity.

    No returned chunk is longer than ``chunk_size`` characters. When a window
    of ``chunk_size`` characters contains no full stop, the text is cut hard at
    ``chunk_size``. Intermediate chunks keep their whitespace; the final chunk
    is stripped, and dropped if nothing but whitespace remains.

    :param text: str, the text to be split (``None`` or ``""`` gives ``[]``)
    :param chunk_size: int, maximum size of each chunk (default 3000 characters)
    :return: list of str, where each str is a chunk of text
    :raises ValueError: if ``chunk_size`` is not positive
    """
    # A non-positive size can never make progress through the text.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not text:
        return []

    chunks = []
    remaining = text
    while len(remaining) > chunk_size:
        # Look for the last full stop that still fits inside the chunk, so the
        # chunk including that full stop never exceeds chunk_size characters.
        last_period = remaining.rfind('.', 0, chunk_size)
        # -1 means "no full stop in the window": fall back to a hard cut.
        cut = chunk_size if last_period == -1 else last_period + 1
        chunks.append(remaining[:cut])
        remaining = remaining[cut:]

    # Add whatever is left as the final chunk, unless it is only whitespace.
    tail = remaining.strip()
    if tail:
        chunks.append(tail)

    return chunks
83
+
84
+
85
def get_result(video_url, summarize=True):
    """
    Produces either the full transcript or a summary for a YouTube video.

    :param video_url: str, URL of the video; parsed with ``YoutubeVideoID``
    :param summarize: when falsy the raw transcript is returned; otherwise the
        transcript is split into chunks, each chunk is summarized and the
        summaries are joined with single spaces
    :return: the transcript string (``summarize`` falsy), a ``Markdown`` object
        wrapping the combined summary (``summarize`` truthy), or ``None`` when
        no transcript could be fetched or no chunk could be summarized
    :raises ValueError: propagated from ``YoutubeVideoID`` for an invalid URL
    """
    # Fetch transcript using the video ID
    yt_video = YoutubeVideoID(video_url)
    transcript_text = get_transcript(yt_video.video_id)
    if not summarize:
        return transcript_text

    # get_transcript returns None on failure; there is nothing to split then.
    if not transcript_text:
        return None

    # summarize_text returns None for a chunk it could not summarize; such
    # chunks are skipped so the join below only ever sees strings.
    chunk_summaries = (summarize_text(chunk) for chunk in split_text(transcript_text))
    summaries = [summary for summary in chunk_summaries if summary]
    if not summaries:
        return None

    return Markdown(" ".join(summaries))
98
+
99
+
100
def create_gradio_interface():
    """
    Builds the Gradio Blocks UI for the summarizer.

    The UI has a URL textbox, a radio button choosing between "Summary" and
    "Full Transcript", a result textbox and a "Generate" button wired to
    ``get_result``.

    NOTE(review): ``gr`` is not imported anywhere in the visible file;
    presumably ``import gradio as gr`` is intended -- confirm and add it to the
    import header.

    :return: the constructed ``gr.Blocks`` object (not yet launched)
    """

    def _on_submit(video_url, output_choice):
        """Translate the UI inputs into a get_result call and return plain text."""
        try:
            # The radio yields its label string, while get_result expects a
            # boolean flag, so map the label explicitly.
            result = get_result(video_url, summarize=(output_choice == "Summary"))
        except ValueError as e:
            # Raised for a URL that is not a recognisable YouTube link.
            return f"Error: {e}"
        if result is None:
            return "No transcript or summary could be generated for this video."
        # The summary path returns an IPython Markdown object whose text lives
        # in `.data`; the transcript path already returns a plain string.
        return getattr(result, "data", result)

    with gr.Blocks() as demo:
        gr.Markdown("# YouTube Video Summarizer")
        gr.Markdown(
            "\n"
            "This space provides summary of youtube video urls, you can also get full transcripts if you choose so.\n"
            "### Credits:\n"
            "Created by **Arsh** – Providing a simple solution for video summarization!\n"
        )

        # Input for YouTube URL
        video_url_input = gr.Textbox(label="Enter YouTube URL", lines=1)

        # Radio button for choosing output type (summary or full transcript)
        output_type = gr.Radio(choices=["Summary", "Full Transcript"], label="Choose Output Type", value="Summary")

        # Output for summarized or full transcript text
        output_text = gr.Textbox(label="Result", lines=6)

        # Submit button
        submit_button = gr.Button("Generate", variant="primary")

        # Define the action for the button press. The original referenced an
        # undefined `get_youtube_transcript`; the local adapter is used instead.
        submit_button.click(fn=_on_submit,
                            inputs=[video_url_input, output_type],
                            outputs=[output_text])

    return demo
127
+
128
# Build the interface at import time and start it with the `share` and
# `show_api` launch options both switched on.
_LAUNCH_OPTIONS = {"share": True, "show_api": True}
demo = create_gradio_interface()
demo.launch(**_LAUNCH_OPTIONS)