import os

import gradio as gr
import spaces
import torch
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info  # pip install qwen-vl-utils[decord]==0.0.8

# =============================================================================
# Qwen2.5-VL-7B-Instruct: model & processor
# =============================================================================
MODEL_ID = "Qwen/Qwen2.5-VL-7B-Instruct"

# Recommended: flash-attn2 (uncomment if it is installed in your environment)
# model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
#     MODEL_ID,
#     torch_dtype=torch.bfloat16,
#     attn_implementation="flash_attention_2",
#     device_map="auto",
# )

# Default load
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype="auto",
    device_map="auto",
)
model.eval()

# Resolution is adjusted automatically (defaults). Use min/max_pixels to cap the visual-token cost if needed.
processor = AutoProcessor.from_pretrained(MODEL_ID)
# e.g. min_pixels = 256 * 28 * 28; max_pixels = 1280 * 28 * 28
# processor = AutoProcessor.from_pretrained(MODEL_ID, min_pixels=min_pixels, max_pixels=max_pixels)


# =============================================================================
# Inference (image-only UI; the text prompt is optional)
# =============================================================================
@spaces.GPU
def qwen_vl_inference(image_path: str | None, text_input: str | None = None):
    if image_path is None:
        return "Please upload an image first."

    # Qwen's official examples pass local file paths as file:// URIs
    file_uri = f"file://{os.path.abspath(image_path)}"
    user_text = text_input.strip() if text_input else "Describe this image."

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": file_uri},
                {"type": "text", "text": user_text},
            ],
        }
    ]

    # Text/vision preprocessing
    chat_text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[chat_text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Move tensors to the model's device (safe even with device_map="auto")
    inputs = {k: (v.to(model.device) if isinstance(v, torch.Tensor) else v) for k, v in inputs.items()}

    # Generation
    gen_ids = model.generate(**inputs, max_new_tokens=512)

    # Strip the prompt tokens, then decode
    trimmed = [out[len(inp):] for inp, out in zip(inputs["input_ids"], gen_ids)]
    output = processor.batch_decode(
        trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    return output


# =============================================================================
# Gradio UI (Gradio 5)
# =============================================================================
DESCRIPTION = (
    "[Qwen2.5-VL-7B-Instruct demo](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct) — "
    "upload an image and ask anything about it."
)

css = """
#output_text {
    height: 500px;
    overflow: auto;
    border: 1px solid #ccc;
}
"""

with gr.Blocks(css=css, theme="origin") as demo:
    gr.Markdown(DESCRIPTION)
    with gr.Row():
        with gr.Column(scale=1):
            input_image = gr.Image(label="Upload Image", type="filepath")
            text_input = gr.Textbox(label="Question")
            submit_btn = gr.Button("Submit")
        with gr.Column(scale=1):
            output_text = gr.Textbox(label="Model Output", elem_id="output_text")

    gr.Examples(
        examples=[["example.webp", "Explain this image"]],
        inputs=[input_image, text_input],
        outputs=output_text,
        fn=qwen_vl_inference,
        cache_examples=True,
        label="Try an example",
    )

    submit_btn.click(qwen_vl_inference, [input_image, text_input], [output_text])

if __name__ == "__main__":
    demo.launch()