OP7 committed
Commit 699b814 · verified · 1 Parent(s): 9ff62b8

Update app.py

Files changed (1):
  1. app.py +124 -124
app.py CHANGED
@@ -1,85 +1,135 @@
- # # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------
- # #
- # # This space is created by SANJOG GHONGE for testing and learning purpose.
- # #
- # # If you want to remove this space or credits please contact me on my email id [[email protected]].
- # #
- # # Citation : @misc{qvq-72b-preview,
- # #     title = {QVQ: To See the World with Wisdom},
- # #     url = {https://qwenlm.github.io/blog/qvq-72b-preview/},
- # #     author = {Qwen Team},
- # #     month = {December},
- # #     year = {2024}
- # # }
-
- # # @article{Qwen2VL,
- # #     title={Qwen2-VL: Enhancing Vision-Language Model's Perception of the World at Any Resolution},
- # #     author={Wang, Peng and Bai, Shuai and Tan, Sinan and Wang, Shijie and Fan, Zhihao and Bai,
- # #     Jinze and Chen, Keqin and Liu, Xuejing and Wang, Jialin and Ge, Wenbin and Fan, Yang and Dang,
- # #     Kai and Du, Mengfei and Ren, Xuancheng and Men, Rui and Liu, Dayiheng and Zhou, Chang and Zhou,
- # #     Jingren and Lin, Junyang},
- # #     journal={arXiv preprint arXiv:2409.12191},
- # #     year={2024}
- # # }
- # #
- # # -----------------------------------------------------------------------------------------------------------------------------------------------------------------------------
-
- # from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
- # from qwen_vl_utils import process_vision_info
  # import gradio as gr
- # from PIL import Image

- # # Load the model and processor
- # model = Qwen2VLForConditionalGeneration.from_pretrained(
- #     "Qwen/QVQ-72B-Preview", torch_dtype="auto", device_map="auto"
- # )
- # processor = AutoProcessor.from_pretrained("Qwen/QVQ-72B-Preview")

- # # Function to process the image and question
  # def process_image_and_question(image, question):
- #     if image is None or question.strip() == "":
  #         return "Please provide both an image and a question."
-
- #     # Prepare the input message
- #     messages = [
- #         {
- #             "role": "system",
- #             "content": [
- #                 {"type": "text", "text": "You are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step."}
- #             ],
- #         },
- #         {
- #             "role": "user",
- #             "content": [
- #                 {"type": "image", "image": image},
- #                 {"type": "text", "text": question},
- #             ],
- #         }
- #     ]
-
  #     # Process the inputs
- #     text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
- #     image_inputs, video_inputs = process_vision_info(messages)
-
- #     inputs = processor(
- #         text=[text],
- #         images=image_inputs,
- #         videos=video_inputs,
- #         padding=True,
- #         return_tensors="pt",
- #     )
- #     inputs = inputs.to("cuda")
-
  #     # Generate the output
- #     generated_ids = model.generate(**inputs, max_new_tokens=8192)
- #     generated_ids_trimmed = [
- #         out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
- #     ]
- #     output_text = processor.batch_decode(
- #         generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
- #     )

- #     return output_text[0] if output_text else "No output generated."

  # # Define the Gradio interface
  # with gr.Blocks() as demo:
@@ -103,53 +153,3 @@

  # # Launch the interface
  # demo.launch()
-
-
- # ------------------------------------------------------------------------------------------------------------------------------------
-
-
-
- import gradio as gr
- from transformers import AutoProcessor, AutoModelForImageTextToText
-
- # Load the processor and model
- model_name = "Qwen/QVQ-72B-Preview"
- processor = AutoProcessor.from_pretrained(model_name)
- model = AutoModelForImageTextToText.from_pretrained(model_name)
-
- # Define the prediction function
- def process_image_and_question(image, question):
-     if image is None or not question:
-         return "Please provide both an image and a question."
-
-     # Process the inputs
-     inputs = processor(images=image, text=question, return_tensors="pt")
-
-     # Generate the output
-     outputs = model.generate(**inputs)
-     answer = processor.batch_decode(outputs, skip_special_tokens=True)[0]
-
-     return answer
-
- # Define the Gradio interface
- with gr.Blocks() as demo:
-     gr.Markdown("# Image and Question Answering\nProvide an image (JPG/PNG) and a related question to get an answer.")
-
-     with gr.Row():
-         with gr.Column():
-             image_input = gr.Image(type="pil", label="Upload Image (JPG/PNG)")
-             question_input = gr.Textbox(label="Enter your question")
-
-         with gr.Column():
-             output_box = gr.Textbox(label="Result", interactive=False)
-
-     with gr.Row():
-         clear_button = gr.Button("Clear")
-         submit_button = gr.Button("Submit")
-
-     # Define button functionality
-     clear_button.click(lambda: (None, "", ""), inputs=[], outputs=[image_input, question_input, output_box])
-     submit_button.click(process_image_and_question, inputs=[image_input, question_input], outputs=output_box)
-
- # Launch the interface
- demo.launch()
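A note on the decode step above: for decoder-only checkpoints, `model.generate` returns the prompt tokens followed by the newly generated tokens, so `batch_decode(outputs, ...)` as written echoes the question back into the answer. A minimal sketch of the trimming idiom that the replacement code below uses (the `max_new_tokens=256` value is an illustrative assumption, not from the commit):

    # Slice off the prompt tokens row by row before decoding.
    outputs = model.generate(**inputs, max_new_tokens=256)  # assumed cap, for illustration
    trimmed = [out[len(inp):] for inp, out in zip(inputs.input_ids, outputs)]
    answer = processor.batch_decode(trimmed, skip_special_tokens=True)[0]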
 
+ # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ #
+ # This space is created by SANJOG GHONGE for testing and learning purposes.
+ #
+ # If you want this space or the credits removed, please contact me at my email id [[email protected]].
+ #
+ # Citation : @misc{qvq-72b-preview,
+ #     title = {QVQ: To See the World with Wisdom},
+ #     url = {https://qwenlm.github.io/blog/qvq-72b-preview/},
+ #     author = {Qwen Team},
+ #     month = {December},
+ #     year = {2024}
+ # }
+
+ # @article{Qwen2VL,
+ #     title={Qwen2-VL: Enhancing Vision-Language Model's Perception of the World at Any Resolution},
+ #     author={Wang, Peng and Bai, Shuai and Tan, Sinan and Wang, Shijie and Fan, Zhihao and Bai,
+ #     Jinze and Chen, Keqin and Liu, Xuejing and Wang, Jialin and Ge, Wenbin and Fan, Yang and Dang,
+ #     Kai and Du, Mengfei and Ren, Xuancheng and Men, Rui and Liu, Dayiheng and Zhou, Chang and Zhou,
+ #     Jingren and Lin, Junyang},
+ #     journal={arXiv preprint arXiv:2409.12191},
+ #     year={2024}
+ # }
+ #
+ # -----------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+
+ from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
+ from qwen_vl_utils import process_vision_info
+ import gradio as gr
+ from PIL import Image
+
+ # Load the model and processor
+ model = Qwen2VLForConditionalGeneration.from_pretrained(
+     "Qwen/QVQ-72B-Preview", torch_dtype="auto", device_map="auto"
+ )
+ processor = AutoProcessor.from_pretrained("Qwen/QVQ-72B-Preview")
+
+ # Function to process the image and question
+ def process_image_and_question(image, question):
+     if image is None or question.strip() == "":
+         return "Please provide both an image and a question."
+
+     # Prepare the input message
+     messages = [
+         {
+             "role": "system",
+             "content": [
+                 {"type": "text", "text": "You are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step."}
+             ],
+         },
+         {
+             "role": "user",
+             "content": [
+                 {"type": "image", "image": image},
+                 {"type": "text", "text": question},
+             ],
+         }
+     ]
+
+     # Process the inputs
+     text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+     image_inputs, video_inputs = process_vision_info(messages)
+
+     inputs = processor(
+         text=[text],
+         images=image_inputs,
+         videos=video_inputs,
+         padding=True,
+         return_tensors="pt",
+     )
+     inputs = inputs.to("cuda")
+
+     # Generate the output
+     generated_ids = model.generate(**inputs, max_new_tokens=8192)
+     generated_ids_trimmed = [
+         out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+     ]
+     output_text = processor.batch_decode(
+         generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+     )
+
+     return output_text[0] if output_text else "No output generated."
+
+ # Define the Gradio interface
+ with gr.Blocks() as demo:
+     gr.Markdown("# Sanjog Image and Question Answering\nProvide an image (JPG/PNG) and a related question to get an answer.")
+
+     with gr.Row():
+         with gr.Column():
+             image_input = gr.Image(type="pil", label="Upload Image (JPG/PNG)")
+             question_input = gr.Textbox(label="Enter your question")
+
+         with gr.Column():
+             output_box = gr.Textbox(label="Result", interactive=False)
+
+     with gr.Row():
+         clear_button = gr.Button("Clear")
+         submit_button = gr.Button("Submit")
+
+     # Define button functionality
+     clear_button.click(lambda: (None, "", ""), inputs=[], outputs=[image_input, question_input, output_box])
+     submit_button.click(process_image_and_question, inputs=[image_input, question_input], outputs=output_box)
+
+ # Launch the interface
+ demo.launch()
+
+
+ # ------------------------------------------------------------------------------------------------------------------------------------
+
+
+
  # import gradio as gr
+ # from transformers import AutoProcessor, AutoModelForImageTextToText

+ # # Load the processor and model
+ # model_name = "Qwen/QVQ-72B-Preview"
+ # processor = AutoProcessor.from_pretrained(model_name)
+ # model = AutoModelForImageTextToText.from_pretrained(model_name)

+ # # Define the prediction function
  # def process_image_and_question(image, question):
+ #     if image is None or not question:
  #         return "Please provide both an image and a question."
+
  #     # Process the inputs
+ #     inputs = processor(images=image, text=question, return_tensors="pt")
+
  #     # Generate the output
+ #     outputs = model.generate(**inputs)
+ #     answer = processor.batch_decode(outputs, skip_special_tokens=True)[0]

+ #     return answer

  # # Define the Gradio interface
  # with gr.Blocks() as demo:

  # # Launch the interface
  # demo.launch()
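Two practical notes on the active implementation: the import `qwen_vl_utils` is provided by the `qwen-vl-utils` package on PyPI, and the handler can be smoke-tested without going through the UI. A minimal sketch (the synthetic image and questions are illustrative assumptions, not from the commit; running it still loads the full 72B checkpoint at import time, so it needs the same GPU resources as the Space):

    # Hypothetical smoke test for the handler defined in app.py above.
    from PIL import Image

    img = Image.new("RGB", (64, 64), "white")  # tiny synthetic image, illustrative only
    print(process_image_and_question(img, "What color is this image?"))
    print(process_image_and_question(None, ""))  # exercises the input-validation branch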