"""Gradio demo: extract text from an image with the Vintern-1B-v3_5 model on CPU."""

import numpy as np
import torch
import torchvision.transforms as T
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
import gradio as gr

# Use the CPU instead of a GPU.
device = torch.device("cpu")

MODEL_ID = "5CD-AI/Vintern-1B-v3_5"

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)


def _to_rgb(img):
    """Return *img* converted to RGB mode, or unchanged if it already is RGB."""
    return img if img.mode == "RGB" else img.convert("RGB")


def build_transform(input_size):
    """Build the preprocessing pipeline applied to an input PIL image.

    The pipeline converts the image to RGB, resizes it to a square of
    ``input_size`` x ``input_size`` pixels using bicubic interpolation,
    converts it to a tensor, and normalizes it with ``IMAGENET_MEAN`` and
    ``IMAGENET_STD``.

    Args:
        input_size: Side length, in pixels, of the resized square image.

    Returns:
        A ``torchvision.transforms.Compose`` object.
    """
    return T.Compose([
        T.Lambda(_to_rgb),
        T.Resize(
            (input_size, input_size),
            interpolation=InterpolationMode.BICUBIC,
        ),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ])


def load_image(image, input_size=448):
    """Preprocess a single PIL image into a batched tensor.

    Args:
        image: The PIL image to preprocess.
        input_size: Side length passed to :func:`build_transform`.

    Returns:
        The transformed image with a leading batch dimension of size 1.
    """
    transform = build_transform(input_size=input_size)
    # unsqueeze(0) adds the batch dimension.
    return transform(image).unsqueeze(0)


# Load the model on the CPU.
model = AutoModel.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float32,  # float32 is used for CPU execution
    low_cpu_mem_usage=True,
    trust_remote_code=True,
).eval().to(device)
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    use_fast=False,
)


def process_image(image):
    """Run the model on *image* and return the text it produces.

    Args:
        image: PIL image supplied by the Gradio ``Image`` input, or ``None``
            when the form is submitted without an image.

    Returns:
        The model's response string.

    Raises:
        gr.Error: If no image was provided.
    """
    # Gradio passes None when nothing was uploaded; fail with a readable
    # message instead of an AttributeError from the preprocessing pipeline.
    if image is None:
        raise gr.Error("Please upload an image before submitting.")

    pixel_values = load_image(image).to(device)
    generation_config = dict(
        max_new_tokens=1024,
        do_sample=False,
        num_beams=3,
        repetition_penalty=2.5,
    )
    question = "\nTrích xuất toàn bộ thông tin trong ảnh và trả về dạng text."
    # return_history=True makes chat() return (response, history); the
    # history is not needed for this single-turn demo.
    response, _ = model.chat(
        tokenizer,
        pixel_values,
        question,
        generation_config,
        history=None,
        return_history=True,
    )
    return response


iface = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type="pil"),
    outputs="text",
    title="Vietnamese Hand Writing OCR",
    description="Extract all the information from the image and return it in text form.",
)

if __name__ == "__main__":
    iface.launch()