|
import numpy as np |
|
import torch |
|
import torchvision.transforms as T |
|
from PIL import Image |
|
from torchvision.transforms.functional import InterpolationMode |
|
from transformers import AutoModel, AutoTokenizer |
|
import gradio as gr |
|
|
|
|
|
# All tensors and the model are placed on the CPU.
device = torch.device("cpu")

# Per-channel (R, G, B) mean and standard deviation passed to T.Normalize
# when preprocessing input images.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
|
|
|
def build_transform(input_size):
    """Build the preprocessing pipeline applied to an image before inference.

    The pipeline converts the image to RGB when needed, resizes it to a
    square of ``input_size`` x ``input_size`` using bicubic interpolation,
    converts it to a tensor, and normalizes it with ``IMAGENET_MEAN`` /
    ``IMAGENET_STD``.

    Args:
        input_size: Side length, in pixels, of the square output image.

    Returns:
        A ``T.Compose`` callable that performs the steps above in order.
    """

    def _ensure_rgb(img):
        # Images already in RGB mode are passed through untouched.
        if img.mode == 'RGB':
            return img
        return img.convert('RGB')

    steps = [
        T.Lambda(_ensure_rgb),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ]
    return T.Compose(steps)
|
|
|
def load_image(image, input_size=448):
    """Preprocess one image and return it with a leading batch dimension.

    Args:
        image: Image object accepted by the pipeline from ``build_transform``
            (it must expose ``mode`` and ``convert``, as a PIL image does).
        input_size: Side length of the square the image is resized to.

    Returns:
        The transformed image tensor with an extra dimension of size 1
        inserted at position 0.
    """
    preprocess = build_transform(input_size=input_size)
    tensor = preprocess(image)
    # unsqueeze(0) turns the single image into a batch of one.
    return tensor.unsqueeze(0)
|
|
|
|
|
# Hugging Face repository providing both the model weights and the tokenizer.
_MODEL_REPO = "5CD-AI/Vintern-1B-v3_5"

# NOTE: trust_remote_code=True runs Python code shipped with the repository.
model = AutoModel.from_pretrained(
    _MODEL_REPO,
    trust_remote_code=True,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float32,
)
# Switch to evaluation mode, then place the model on the configured device.
model = model.eval().to(device)

tokenizer = AutoTokenizer.from_pretrained(
    _MODEL_REPO,
    use_fast=False,
    trust_remote_code=True,
)
|
|
|
def process_image(image):
    """Run the model on one image and return the text it extracts.

    Args:
        image: PIL image supplied by the Gradio image input, or ``None`` when
            the form is submitted without an image.

    Returns:
        The response returned by ``model.chat`` for the fixed extraction
        prompt.

    Raises:
        gr.Error: If no image was provided.
    """
    # Without this guard a missing image fails deep inside the transform with
    # an opaque AttributeError; gr.Error shows a readable message in the UI.
    if image is None:
        raise gr.Error("Please upload an image before submitting.")

    pixel_values = load_image(image).to(device)
    generation_config = dict(max_new_tokens=1024, do_sample=False, num_beams=3, repetition_penalty=2.5)

    # Vietnamese prompt: "Extract all the information in the image and return
    # it as text."
    question = "<image>\nTrích xuất toàn bộ thông tin trong ảnh và trả về dạng text."

    # return_history=True makes chat() return (response, history); the
    # history is not needed for this single-turn request.
    response, _ = model.chat(tokenizer, pixel_values, question, generation_config, history=None, return_history=True)

    return response
|
|
|
# Gradio UI: one image input (delivered to process_image as a PIL image) and
# one text output holding the extracted text.
iface = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type="pil"),
    outputs="text",
    # Fixed typo in the user-facing title: "ORC" -> "OCR".
    title="Vietnamese Hand Writing OCR",
    description="Extract all the information from the image and return it in text form.",
)


if __name__ == "__main__":
    # Launch the web app only when run as a script, not when imported.
    iface.launch()