Juctxy committed on
Commit
fdecca6
·
verified ·
1 Parent(s): 19f9548

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +57 -0
app.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import torch
3
+ import torchvision.transforms as T
4
+ from PIL import Image
5
+ from torchvision.transforms.functional import InterpolationMode
6
+ from transformers import AutoModel, AutoTokenizer
7
+ import gradio as gr
8
+
9
# Run on the CPU instead of a GPU.
device = torch.device("cpu")

# Per-channel mean and standard deviation handed to T.Normalize in
# build_transform (three values each, one per RGB channel).
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
14
+
15
def build_transform(input_size):
    """Build the preprocessing pipeline applied to an image before inference.

    Args:
        input_size: Side length, in pixels, of the square the image is
            resized to.

    Returns:
        A ``T.Compose`` that converts the image to RGB when it is in another
        mode, resizes it to ``(input_size, input_size)`` with bicubic
        interpolation, converts it to a tensor and normalizes it with
        ``IMAGENET_MEAN`` / ``IMAGENET_STD``.
    """

    def _to_rgb(img):
        # Leave RGB images untouched; convert every other mode.
        return img if img.mode == 'RGB' else img.convert('RGB')

    return T.Compose([
        T.Lambda(_to_rgb),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ])
23
+
24
def load_image(image, input_size=448):
    """Preprocess a single image into a batched tensor.

    Args:
        image: Image to preprocess. It must expose ``mode`` and ``convert``;
            the Gradio interface in this file supplies a PIL image.
        input_size: Side length, in pixels, of the square the image is
            resized to. Defaults to 448.

    Returns:
        The transformed image with a leading batch dimension of size 1.
    """
    transform = build_transform(input_size=input_size)
    # unsqueeze(0) adds the batch dimension.
    return transform(image).unsqueeze(0)
28
+
29
# Load the model on the CPU.
# NOTE(review): trust_remote_code=True lets the checkpoint repository supply
# Python code that is executed locally; only keep it for a trusted source.
MODEL_ID = "5CD-AI/Vintern-1B-v3_5"

model = AutoModel.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float32,  # float32 is used for CPU execution
    low_cpu_mem_usage=True,
    trust_remote_code=True,
).eval().to(device)

tokenizer = AutoTokenizer.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    use_fast=False,
)
38
+
39
def process_image(image):
    """Run the model on one image and return the text it produces.

    Args:
        image: The image supplied by the Gradio input component, or ``None``
            when the form is submitted without an image.

    Returns:
        The model's response to the fixed extraction prompt.

    Raises:
        gr.Error: If no image was provided.
    """
    if image is None:
        # Without this guard the transform fails on ``None.mode`` with an
        # AttributeError that tells the user nothing useful.
        raise gr.Error("Please upload an image before submitting.")

    pixel_values = load_image(image).to(device)

    # Built on every call so the model's chat() always receives a fresh dict
    # (in case the remote implementation modifies it).
    generation_config = dict(
        max_new_tokens=1024,
        do_sample=False,
        num_beams=3,
        repetition_penalty=2.5,
    )

    # Vietnamese prompt sent to the model; the text is runtime input and is
    # kept verbatim.
    question = "<image>\nTrích xuất toàn bộ thông tin trong ảnh và trả về dạng text."
    # The returned history is not needed for a single-turn request.
    response, _ = model.chat(
        tokenizer,
        pixel_values,
        question,
        generation_config,
        history=None,
        return_history=True,
    )

    return response
47
+
48
# Web UI: one image input (delivered to process_image as a PIL image) and one
# text output holding the model's response.
iface = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type="pil"),
    outputs="text",
    title="Vietnamese Hand Writing OCR",  # was "ORC", a typo for OCR
    description="Extract all the information from the image and return it in text form.",
)

# Start the app only when this file is run as a script.
if __name__ == "__main__":
    iface.launch()