import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
from PIL import Image
import torch

# Pick the best available device for inference
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the Moondream 2 model and tokenizer, pinned to a specific revision.
# trust_remote_code is required because Moondream 2 ships custom modeling code.
model = AutoModelForCausalLM.from_pretrained(
    "vikhyatk/moondream2",
    revision="2025-06-21",
    trust_remote_code=True,
    device_map={"": device},  # or "cpu" / "mps"
)
tokenizer = AutoTokenizer.from_pretrained(
    "vikhyatk/moondream2",
    revision="2025-06-21",
    trust_remote_code=True,
)


def caption_image(image: Image.Image, length: str = "normal"):
    """Generate a caption for the uploaded image and report token/word counts."""
    # Guard against submissions with no image uploaded
    if image is None:
        return "", "0 tokens", "0 words"

    # With stream=True, the "caption" value is a generator of text chunks;
    # join them into the full caption string.
    caption_stream = model.caption(image, length=length, stream=True)["caption"]
    caption = "".join(caption_stream)

    # Token and word counts for the generated caption
    tokens = tokenizer(caption, return_tensors="pt")["input_ids"].shape[1]
    words = len(caption.split())

    return caption.strip(), f"{tokens} tokens", f"{words} words"


# Gradio UI with copyable outputs
title = "🖼️ Image Captioner using Moondream 2"
description = (
    "Upload an image and get a generated caption from Moondream 2 (streamed output). "
    "Choose a caption length (short, normal, long). Outputs include the caption, "
    "token count, and word count."
)

demo = gr.Interface(
    fn=caption_image,
    inputs=[
        gr.Image(type="pil", label="Upload Image"),
        gr.Radio(["short", "normal", "long"], value="normal", label="Caption Length"),
    ],
    outputs=[
        gr.Textbox(label="Generated Caption", lines=3, show_copy_button=True),
        gr.Textbox(label="Token Count", show_copy_button=True),
        gr.Textbox(label="Word Count", show_copy_button=True),
    ],
    title=title,
    description=description,
    theme="soft",
)

if __name__ == "__main__":
    demo.launch()
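
# A minimal smoke test, kept commented out so it does not interfere with the
# app above. It assumes a local test image at "example.jpg" (a hypothetical
# path, not part of this script) and calls caption_image() directly,
# bypassing the Gradio UI, to verify the model loads and captions end to end:
#
#   img = Image.open("example.jpg")
#   caption, tokens, words = caption_image(img, length="short")
#   print(caption)
#   print(tokens, words)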