"""Gradio demo: classify a YouTube video with a Timesformer model.

The script loads the model once at import time, builds a Blocks UI that
accepts a YouTube URL, and shows the predictions next to a GIF preview of
the frames sampled from the video.
"""

import os

import gradio as gr
import torch  # not referenced directly in this script; kept from the original
from video_transformers import VideoModel

from utils import (
    convert_frames_to_gif,
    download_youtube_video,
    sample_frames_from_video_file,
)

# Loaded once at module import and shared by every call to ``predict``.
video_model = VideoModel.from_transformers("facebook/timesformer-base-finetuned-k400")

# One single-element row per example, matching the single Textbox input.
examples = [
    # ["https://www.youtube.com/watch?v=huAJ9dC5lmI"],
    ["https://www.youtube.com/watch?v=wvcWt6u5HTg"],
    ["https://www.youtube.com/watch?v=-3kZSi5qjRM"],
    ["https://www.youtube.com/watch?v=-6usjfP8hys"],
    ["https://www.youtube.com/watch?v=BDHub0gBGtc"],
    ["https://www.youtube.com/watch?v=B9ea7YyCP6E"],
    ["https://www.youtube.com/watch?v=BBkpaeJBKmk"],
    ["https://www.youtube.com/watch?v=BBqU8Apee_g"],
    ["https://www.youtube.com/watch?v=B8OdMwVwyXc"],
    ["https://www.youtube.com/watch?v=I7cwq6_4QtM"],
    ["https://www.youtube.com/watch?v=Z0mJDXpNhYA"],
    ["https://www.youtube.com/watch?v=QkQQjFGnZlg"],
    ["https://www.youtube.com/watch?v=IQaoRUQif14"],
]


def predict(youtube_url: str):
    """Download a YouTube video, classify it, and build a preview GIF.

    Args:
        youtube_url: URL of the video, as typed into the URL textbox.

    Returns:
        A ``(predictions, gif_path)`` tuple: ``result["predictions"]`` from
        the model (shown in the ``gr.Label`` output) and the value returned
        by ``convert_frames_to_gif`` (shown in the ``gr.Image`` output).

    Raises:
        Whatever the download, sampling, GIF conversion, or model helpers
        raise; errors are not swallowed here.
    """
    video_path = download_youtube_video(youtube_url)
    try:
        frames = sample_frames_from_video_file(video_path, num_frames=16)
        gif_path = convert_frames_to_gif(frames)
        result = video_model.predict(video_or_folder_path=video_path)
    finally:
        # Always delete the downloaded video, even when sampling or inference
        # fails, so failed requests do not pile up files on disk.
        os.remove(video_path)
    return result["predictions"], gif_path


app = gr.Blocks()
with app:
    # Page header and model description.
    gr.Markdown("# **<p align='center'>Video Classification with Timesformer</p>**")
    gr.Markdown(
        """
        <p style='text-align: center'>
        Timesformer is a video model that uses a Transformer architecture to process video frames.
        <br>It is released by Facebook AI Research in ICML 2021.
        <br>This version is trained on Kinetics-400 dataset and can classify videos into 400 classes.
        </p>
        """
    )
    gr.Markdown(
        """
        <p style='text-align: center'>
        Follow me for more!
        <br> <a href='https://twitter.com/fcakyon' target='_blank'>twitter</a> | <a href='https://github.com/fcakyon' target='_blank'>github</a> | <a href='https://www.linkedin.com/in/fcakyon/' target='_blank'>linkedin</a> | <a href='https://fcakyon.medium.com/' target='_blank'>medium</a>
        </p>
        """
    )

    # Three columns: URL input + button, sampled-clip preview, predictions.
    with gr.Row():
        with gr.Column():
            gr.Markdown("Provide a Youtube video URL.")
            youtube_url = gr.Textbox(label="Youtube URL:", show_label=True)
            predict_btn = gr.Button(value="Predict")

        with gr.Column():
            video_gif = gr.Image(
                label="Input Clip",
                show_label=True,
            )

        with gr.Column():
            predictions = gr.Label(
                label="Predictions:", show_label=True, num_top_classes=5
            )

    gr.Markdown("**Examples:**")
    # cache_examples=True makes Gradio run ``predict`` on the examples ahead
    # of time, so its cleanup path is exercised before any user request.
    gr.Examples(
        examples,
        youtube_url,
        [predictions, video_gif],
        fn=predict,
        cache_examples=True,
    )

    predict_btn.click(predict, inputs=youtube_url, outputs=[predictions, video_gif])
    gr.Markdown(
        """
        \n Demo created by: <a href=\"https://github.com/fcakyon\">fcakyon</a>
        <br> Based on this <a href=\"https://huggingface.co/docs/transformers/main/model_doc/timesformer">HuggingFace model</a>
        """
    )

app.launch()