OP7 committed on
Commit
a329065
·
verified ·
1 Parent(s): 9c4039d

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +107 -0
app.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------
2
+ #
3
+ # This space was created by SANJOG GHONGE for testing and learning purposes.
4
+ #
5
+ # If you want to remove this space or credits please contact me on my email id [[email protected]].
6
+ #
7
+ # Citation : @misc{qvq-72b-preview,
8
+ # title = {QVQ: To See the World with Wisdom},
9
+ # url = {https://qwenlm.github.io/blog/qvq-72b-preview/},
10
+ # author = {Qwen Team},
11
+ # month = {December},
12
+ # year = {2024}
13
+ # }
14
+
15
+ # @article{Qwen2VL,
16
+ # title={Qwen2-VL: Enhancing Vision-Language Model's Perception of the World at Any Resolution},
17
+ # author={Wang, Peng and Bai, Shuai and Tan, Sinan and Wang, Shijie and Fan, Zhihao and Bai,
18
+ # Jinze and Chen, Keqin and Liu, Xuejing and Wang, Jialin and Ge, Wenbin and Fan, Yang and Dang,
19
+ # Kai and Du, Mengfei and Ren, Xuancheng and Men, Rui and Liu, Dayiheng and Zhou, Chang and Zhou,
20
+ # Jingren and Lin, Junyang},
21
+ # journal={arXiv preprint arXiv:2409.12191},
22
+ # year={2024}
23
+ # }
24
+ #
25
+ # -----------------------------------------------------------------------------------------------------------------------------------------------------------------------------
26
+
27
+ from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
28
+ from qwen_vl_utils import process_vision_info
29
+ import gradio as gr
30
+ from PIL import Image
31
+
32
# Load the vision-language model and its paired processor once, at import
# time, so every Gradio request reuses the same in-memory weights.
MODEL_ID = "Qwen/QVQ-72B-Preview"

model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype="auto",   # pick the checkpoint's native dtype
    device_map="auto",    # let accelerate place layers across devices
)
processor = AutoProcessor.from_pretrained(MODEL_ID)
37
+
38
+ # Function to process the image and question
39
+ def process_image_and_question(image, question):
40
+ if image is None or question.strip() == "":
41
+ return "Please provide both an image and a question."
42
+
43
+ # Prepare the input message
44
+ messages = [
45
+ {
46
+ "role": "system",
47
+ "content": [
48
+ {"type": "text", "text": "You are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step."}
49
+ ],
50
+ },
51
+ {
52
+ "role": "user",
53
+ "content": [
54
+ {"type": "image", "image": image},
55
+ {"type": "text", "text": question},
56
+ ],
57
+ }
58
+ ]
59
+
60
+ # Process the inputs
61
+ text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
62
+ image_inputs, video_inputs = process_vision_info(messages)
63
+
64
+ inputs = processor(
65
+ text=[text],
66
+ images=image_inputs,
67
+ videos=video_inputs,
68
+ padding=True,
69
+ return_tensors="pt",
70
+ )
71
+ inputs = inputs.to("cuda")
72
+
73
+ # Generate the output
74
+ generated_ids = model.generate(**inputs, max_new_tokens=8192)
75
+ generated_ids_trimmed = [
76
+ out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
77
+ ]
78
+ output_text = processor.batch_decode(
79
+ generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
80
+ )
81
+
82
+ return output_text[0] if output_text else "No output generated."
83
+
84
# --- Gradio front end ------------------------------------------------------
# Left column collects the image and question; right column shows the
# model's answer. "Clear" resets all three widgets, "Submit" runs the model.
with gr.Blocks() as demo:
    gr.Markdown("# Sanjog Test : Image and Question Answering\nProvide an image (JPG/PNG) and a related question to get an answer.")

    with gr.Row():
        with gr.Column():
            uploaded_image = gr.Image(type="pil", label="Upload Image (JPG/PNG)")
            question_box = gr.Textbox(label="Enter your question")

        with gr.Column():
            answer_box = gr.Textbox(label="Result", interactive=False)

    with gr.Row():
        btn_clear = gr.Button("Clear")
        btn_submit = gr.Button("Submit")

    # Wire the buttons: Clear blanks every widget, Submit calls the model.
    btn_clear.click(lambda: (None, "", ""), inputs=[], outputs=[uploaded_image, question_box, answer_box])
    btn_submit.click(process_image_and_question, inputs=[uploaded_image, question_box], outputs=answer_box)

# Start the Gradio server.
demo.launch()
106
+
107
+