|
import gradio as gr |
|
from transformers import AutoModelForCausalLM, AutoTokenizer |
|
from PIL import Image |
|
import torch |
|
|
|
# Pick the compute device: CUDA when PyTorch reports it as available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
|
|
|
|
|
# Hugging Face repository and revision used for BOTH the model and the
# tokenizer. They must always name the same snapshot, so the values are
# defined once here instead of being repeated in each loader call.
MODEL_ID = "vikhyatk/moondream2"
MODEL_REVISION = "2025-06-21"

# trust_remote_code=True runs Python code shipped inside the model repository;
# the explicit revision selects which version of that code (and of the
# weights) gets loaded.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    revision=MODEL_REVISION,
    trust_remote_code=True,
    device_map={"": device},  # the "" key places the whole model on `device`
)

# Tokenizer from the same repository/revision; used only to count caption tokens.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_ID,
    revision=MODEL_REVISION,
    trust_remote_code=True,
)
|
|
|
|
|
# The annotation is quoted so it is not evaluated when the function is defined.
def caption_image(image: "Image.Image | None", length: str = "normal"):
    """Caption an image with Moondream 2 and report the caption's size.

    Args:
        image: The uploaded picture as a PIL image, or ``None`` when the
            form is submitted without an upload.
        length: Caption length preset forwarded to ``model.caption``
            (the UI offers "short", "normal" and "long").

    Returns:
        A 3-tuple of strings: the caption with surrounding whitespace
        removed, the token count formatted as ``"<n> tokens"`` and the
        word count formatted as ``"<n> words"``. When ``image`` is
        ``None`` the caption is empty and both counts are zero.
    """
    # Gradio hands over None when the user submits before uploading anything;
    # return empty results instead of failing inside the model call.
    if image is None:
        return "", "0 tokens", "0 words"

    # With stream=True the "caption" entry is an iterable of text pieces.
    # This function returns a single final value, so the pieces are joined.
    caption_stream = model.caption(image, length=length, stream=True)["caption"]
    caption = "".join(caption_stream)

    # Both counts are taken from the raw caption, before it is stripped.
    tokens = tokenizer(caption, return_tensors="pt")["input_ids"].shape[1]
    words = len(caption.split())

    return caption.strip(), f"{tokens} tokens", f"{words} words"
|
|
|
|
|
# Heading and introductory blurb passed to the Gradio interface.
title = "🖼️ Image Captioner using Moondream 2"
description = " ".join(
    [
        "Upload an image and get a generated caption using Moondream 2 (streamed output).",
        "Choose caption length (short, normal, long). Outputs include caption, token count, and word count.",
    ]
)
|
|
|
# Input widgets: the picture to caption and the requested caption length.
input_components = [
    gr.Image(type="pil", label="Upload Image"),
    gr.Radio(["short", "normal", "long"], value="normal", label="Caption Length"),
]

# Output widgets, listed in the order caption_image returns its values.
output_components = [
    gr.Textbox(label="Generated Caption", lines=3, show_copy_button=True),
    gr.Textbox(label="Token Count", show_copy_button=True),
    gr.Textbox(label="Word Count", show_copy_button=True),
]

# Wire the captioning function and its widgets into a single-page app.
demo = gr.Interface(
    fn=caption_image,
    inputs=input_components,
    outputs=output_components,
    title=title,
    description=description,
    theme="soft",
)

# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()
|
|