Mizukiluke committed on
Commit acf0a85 · verified
1 Parent(s): bb9e1ff

Create app.py

Files changed (1)
  1. app.py +553 -0
app.py ADDED
@@ -0,0 +1,553 @@
#!/usr/bin/env python
# encoding: utf-8
import torch
import argparse
from transformers import AutoModel, AutoTokenizer
import gradio as gr
from PIL import Image
from decord import VideoReader, cpu
import io
import os
import copy
import requests
import base64
import json
import traceback
import re
import modelscope_studio as mgr
from modelscope.hub.snapshot_download import snapshot_download

model_dir = snapshot_download('iic/mPLUG-Owl3-7B-240728', cache_dir='./')
os.system('ls')

# README: how to run this demo on different devices.
#
# For Nvidia GPUs:
#   python web_demo_2.6.py --device cuda
#
# For Macs with MPS (Apple silicon or AMD GPUs):
#   PYTORCH_ENABLE_MPS_FALLBACK=1 python web_demo_2.6.py --device mps

# Argparser
parser = argparse.ArgumentParser(description='demo')
parser.add_argument('--device', type=str, default='cuda', help='cuda or mps')
parser.add_argument("--host", type=str, default="0.0.0.0")
parser.add_argument("--port", type=int)
args = parser.parse_args()
device = args.device
assert device in ['cuda', 'mps']

# Load model
model_path = './iic/mPLUG-Owl3-7B-240728'
if 'int4' in model_path:
    if device == 'mps':
        print('Error: running the int4 model with bitsandbytes on Mac is not supported right now.')
        exit()
    model = AutoModel.from_pretrained(model_path, attn_implementation='sdpa', trust_remote_code=True)
else:
    model = AutoModel.from_pretrained(model_path, attn_implementation='sdpa', trust_remote_code=True, torch_dtype=torch.bfloat16)
    model = model.to(device=device)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model.eval()

ERROR_MSG = "Error, please retry"
model_name = 'mPLUG-Owl3'
MAX_NUM_FRAMES = 64
IMAGE_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp'}
VIDEO_EXTENSIONS = {'.mp4', '.mkv', '.mov', '.avi', '.flv', '.wmv', '.webm', '.m4v'}

def get_file_extension(filename):
    return os.path.splitext(filename)[1].lower()

def is_image(filename):
    return get_file_extension(filename) in IMAGE_EXTENSIONS

def is_video(filename):
    return get_file_extension(filename) in VIDEO_EXTENSIONS


form_radio = {
    'choices': ['Beam Search', 'Sampling'],
    #'value': 'Beam Search',
    'value': 'Sampling',
    'interactive': True,
    'label': 'Decode Type'
}


def create_component(params, comp='Slider'):
    if comp == 'Slider':
        return gr.Slider(
            minimum=params['minimum'],
            maximum=params['maximum'],
            value=params['value'],
            step=params['step'],
            interactive=params['interactive'],
            label=params['label']
        )
    elif comp == 'Radio':
        return gr.Radio(
            choices=params['choices'],
            value=params['value'],
            interactive=params['interactive'],
            label=params['label']
        )
    elif comp == 'Button':
        return gr.Button(
            value=params['value'],
            interactive=True
        )


def create_multimodal_input(upload_image_disabled=False, upload_video_disabled=False):
    return mgr.MultimodalInput(
        upload_image_button_props={'label': 'Upload Image', 'disabled': upload_image_disabled, 'file_count': 'multiple'},
        upload_video_button_props={'label': 'Upload Video', 'disabled': upload_video_disabled, 'file_count': 'single'},
        submit_button_props={'label': 'Submit'})


def chat(img, msgs, ctx, params=None, vision_hidden_states=None):
    try:
        print('msgs:', msgs)
        images = []
        videos = []
        messages = []
        # Flatten each multimodal turn into text with <|image|> placeholders.
        for line in msgs:
            s = ""
            for item in line['content']:
                if isinstance(item, str):
                    s += item
                else:
                    s += '<|image|>'
                    images.append(item)
            messages.append({"role": line['role'], "content": s})
        messages.append({"role": "assistant", "content": ""})

        answer = model.chat(
            images=images,
            videos=videos,
            messages=messages,
            tokenizer=tokenizer,
            **params
        )
        # Strip grounding markup (<ref>...</ref>, <box>...</box>) from the answer.
        res = re.sub(r'(<box>.*</box>)', '', answer)
        res = res.replace('<ref>', '')
        res = res.replace('</ref>', '')
        res = res.replace('<box>', '')
        answer = res.replace('</box>', '')
        print('answer:', answer)
        return 0, answer, None, None
    except Exception as e:
        print(e)
        traceback.print_exc()
        return -1, ERROR_MSG, None, None

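# Illustrative example of the tag stripping in chat(), assuming a hypothetical
# grounded model output: 'A <ref>cat</ref><box>0,0,100,100</box> on a sofa.'
# becomes 'A cat on a sofa.' (box coordinates dropped, referent text kept).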
def encode_image(image):
    if not isinstance(image, Image.Image):
        if hasattr(image, 'path'):
            image = Image.open(image.path).convert("RGB")
        else:
            image = Image.open(image.file.path).convert("RGB")
    # resize to max_size
    max_size = 448 * 16
    if max(image.size) > max_size:
        w, h = image.size
        if w > h:
            new_w = max_size
            new_h = int(h * max_size / w)
        else:
            new_h = max_size
            new_w = int(w * max_size / h)
        image = image.resize((new_w, new_h), resample=Image.BICUBIC)
    return image
    ## save by BytesIO and convert to base64
    #buffered = io.BytesIO()
    #image.save(buffered, format="png")
    #im_b64 = base64.b64encode(buffered.getvalue()).decode()
    #return {"type": "image", "pairs": im_b64}

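# Worked example for the resize in encode_image(): max_size = 448 * 16 = 7168,
# so a hypothetical 10000x5000 input (w > h) is scaled to
# (7168, int(5000 * 7168 / 10000)) = (7168, 3584), preserving aspect ratio.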
def encode_video(video):
    def uniform_sample(l, n):
        gap = len(l) / n
        idxs = [int(i * gap + gap / 2) for i in range(n)]
        return [l[i] for i in idxs]

    if hasattr(video, 'path'):
        vr = VideoReader(video.path, ctx=cpu(0))
    else:
        vr = VideoReader(video.file.path, ctx=cpu(0))
    sample_fps = round(vr.get_avg_fps() / 1)  # sample roughly 1 frame per second
    frame_idx = [i for i in range(0, len(vr), sample_fps)]
    if len(frame_idx) > MAX_NUM_FRAMES:
        frame_idx = uniform_sample(frame_idx, MAX_NUM_FRAMES)
    video = vr.get_batch(frame_idx).asnumpy()
    video = [Image.fromarray(v.astype('uint8')) for v in video]
    video = [encode_image(v) for v in video]
    print('video frames:', len(video))
    return video

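# Worked example for the frame sampling in encode_video(), for a hypothetical
# 5-minute clip at 30 fps (9000 frames): stepping by sample_fps = 30 gives 300
# candidate indices; since 300 > MAX_NUM_FRAMES, uniform_sample() keeps 64 of
# them, one from the midpoint of each gap of 300/64 ≈ 4.7 candidates.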
def check_mm_type(mm_file):
    if hasattr(mm_file, 'path'):
        path = mm_file.path
    else:
        path = mm_file.file.path
    if is_image(path):
        return "image"
    if is_video(path):
        return "video"
    return None


def encode_mm_file(mm_file):
    if check_mm_type(mm_file) == 'image':
        return [encode_image(mm_file)]
    if check_mm_type(mm_file) == 'video':
        return encode_video(mm_file)
    return None

def make_text(text):
    #return {"type": "text", "pairs": text}  # For remote call
    return text

def encode_message(_question):
    files = _question.files
    question = _question.text
    pattern = r"\[mm_media\]\d+\[/mm_media\]"
    matches = re.split(pattern, question)
    message = []
    if len(matches) != len(files) + 1:
        gr.Warning("The number of images does not match the number of placeholders in the text; please refresh the page to restart!")
        assert len(matches) == len(files) + 1

    text = matches[0].strip()
    if text:
        message.append(make_text(text))
    for i in range(len(files)):
        message += encode_mm_file(files[i])
        text = matches[i + 1].strip()
        if text:
            message.append(make_text(text))
    return message

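# Illustrative example for encode_message(), assuming one uploaded image and
# the text '[mm_media]1[/mm_media] What is this?': re.split() yields
# ['', ' What is this?'], so the encoded message becomes
# [<PIL.Image>, 'What is this?'].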
def check_has_videos(_question):
    images_cnt = 0
    videos_cnt = 0
    for file in _question.files:
        if check_mm_type(file) == "image":
            images_cnt += 1
        else:
            videos_cnt += 1
    return images_cnt, videos_cnt


def count_video_frames(_context):
    num_frames = 0
    for message in _context:
        for item in message["content"]:
            #if item["type"] == "image":  # For remote call
            if isinstance(item, Image.Image):
                num_frames += 1
    return num_frames


def respond(_question, _chat_bot, _app_cfg, params_form):
    _context = _app_cfg['ctx'].copy()
    _context.append({'role': 'user', 'content': encode_message(_question)})

    images_cnt = _app_cfg['images_cnt']
    videos_cnt = _app_cfg['videos_cnt']
    files_cnts = check_has_videos(_question)
    if files_cnts[1] + videos_cnt > 1 or (files_cnts[1] + videos_cnt == 1 and files_cnts[0] + images_cnt > 0):
        gr.Warning("Only a single video file input is supported right now!")
        return _question, _chat_bot, _app_cfg

    if params_form == 'Beam Search':
        params = {
            'sampling': False,
            'num_beams': 3,
            'repetition_penalty': 1.2,
            "max_new_tokens": 2048
        }
    else:
        params = {
            'sampling': True,
            'top_p': 0.8,
            'top_k': 100,
            'temperature': 0.7,
            'repetition_penalty': 1.05,
            "max_new_tokens": 2048
        }

    if files_cnts[1] + videos_cnt > 0:
        params["max_inp_length"] = 4352  # 4096 + 256
        params["use_image_id"] = False
        params["max_slice_nums"] = 1 if count_video_frames(_context) > 16 else 2

    code, _answer, _, sts = chat("", _context, None, params)

    images_cnt += files_cnts[0]
    videos_cnt += files_cnts[1]
    _context.append({"role": "assistant", "content": [make_text(_answer)]})
    _chat_bot.append((_question, _answer))
    if code == 0:
        _app_cfg['ctx'] = _context
        _app_cfg['sts'] = sts
        _app_cfg['images_cnt'] = images_cnt
        _app_cfg['videos_cnt'] = videos_cnt

    upload_image_disabled = videos_cnt > 0
    upload_video_disabled = videos_cnt > 0 or images_cnt > 0
    return create_multimodal_input(upload_image_disabled, upload_video_disabled), _chat_bot, _app_cfg

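# Note on the guard in respond(): at most one video per conversation, and a
# video cannot be mixed with images. E.g. once videos_cnt == 1, any further
# image or video upload triggers the warning and the turn is rejected early.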
def fewshot_add_demonstration(_image, _user_message, _assistant_message, _chat_bot, _app_cfg):
    ctx = _app_cfg["ctx"]
    message_item = []
    if _image is not None:
        image = Image.open(_image).convert("RGB")
        ctx.append({"role": "user", "content": [encode_image(image), make_text(_user_message)]})
        message_item.append({"text": "[mm_media]1[/mm_media]" + _user_message, "files": [_image]})
    else:
        if _user_message:
            ctx.append({"role": "user", "content": [make_text(_user_message)]})
            message_item.append({"text": _user_message, "files": []})
        else:
            message_item.append(None)
    if _assistant_message:
        ctx.append({"role": "assistant", "content": [make_text(_assistant_message)]})
        message_item.append({"text": _assistant_message, "files": []})
    else:
        message_item.append(None)

    _chat_bot.append(message_item)
    return None, "", "", _chat_bot, _app_cfg


def fewshot_respond(_image, _user_message, _chat_bot, _app_cfg, params_form):
    user_message_contents = []
    _context = _app_cfg["ctx"].copy()
    if _image:
        image = Image.open(_image).convert("RGB")
        user_message_contents += [encode_image(image)]
    if _user_message:
        user_message_contents += [make_text(_user_message)]
    if user_message_contents:
        _context.append({"role": "user", "content": user_message_contents})

    if params_form == 'Beam Search':
        params = {
            'sampling': False,
            'num_beams': 3,
            'repetition_penalty': 1.2,
            "max_new_tokens": 2048
        }
    else:
        params = {
            'sampling': True,
            'top_p': 0.8,
            'top_k': 100,
            'temperature': 0.7,
            'repetition_penalty': 1.05,
            "max_new_tokens": 2048
        }

    code, _answer, _, sts = chat("", _context, None, params)

    _context.append({"role": "assistant", "content": [make_text(_answer)]})

    if _image:
        _chat_bot.append([
            {"text": "[mm_media]1[/mm_media]" + _user_message, "files": [_image]},
            {"text": _answer, "files": []}
        ])
    else:
        _chat_bot.append([
            {"text": _user_message, "files": []},  # no image in this branch
            {"text": _answer, "files": []}
        ])
    if code == 0:
        _app_cfg['ctx'] = _context
        _app_cfg['sts'] = sts
    return None, '', '', _chat_bot, _app_cfg

def regenerate_button_clicked(_question, _image, _user_message, _assistant_message, _chat_bot, _app_cfg, params_form):
    if len(_chat_bot) <= 1 or not _chat_bot[-1][1]:
        gr.Warning('No question for regeneration.')
        return '', _image, _user_message, _assistant_message, _chat_bot, _app_cfg
    if _app_cfg["chat_type"] == "Chat":
        images_cnt = _app_cfg['images_cnt']
        videos_cnt = _app_cfg['videos_cnt']
        _question = _chat_bot[-1][0]
        _chat_bot = _chat_bot[:-1]
        _app_cfg['ctx'] = _app_cfg['ctx'][:-2]
        files_cnts = check_has_videos(_question)
        images_cnt -= files_cnts[0]
        videos_cnt -= files_cnts[1]
        _app_cfg['images_cnt'] = images_cnt
        _app_cfg['videos_cnt'] = videos_cnt
        upload_image_disabled = videos_cnt > 0
        upload_video_disabled = videos_cnt > 0 or images_cnt > 0
        _question, _chat_bot, _app_cfg = respond(_question, _chat_bot, _app_cfg, params_form)
        return _question, _image, _user_message, _assistant_message, _chat_bot, _app_cfg
    else:
        last_message = _chat_bot[-1][0]
        last_image = None
        last_user_message = ''
        if last_message.text:
            last_user_message = last_message.text
        if last_message.files:
            last_image = last_message.files[0].file.path
        _chat_bot = _chat_bot[:-1]
        _app_cfg['ctx'] = _app_cfg['ctx'][:-2]
        _image, _user_message, _assistant_message, _chat_bot, _app_cfg = fewshot_respond(last_image, last_user_message, _chat_bot, _app_cfg, params_form)
        return _question, _image, _user_message, _assistant_message, _chat_bot, _app_cfg


def flushed():
    return gr.update(interactive=True)


def clear(txt_message, chat_bot, app_session):
    txt_message.files.clear()
    txt_message.text = ''
    chat_bot = copy.deepcopy(init_conversation)
    app_session['sts'] = None
    app_session['ctx'] = []
    app_session['images_cnt'] = 0
    app_session['videos_cnt'] = 0
    return create_multimodal_input(), chat_bot, app_session, None, '', ''


def select_chat_type(_tab, _app_cfg):
    _app_cfg["chat_type"] = _tab
    return _app_cfg

init_conversation = [
    [
        None,
        {
            # The bot's first message disables the typewriter effect.
            "text": "You can talk to me now",
            "flushing": False
        }
    ],
]


css = """
video { height: auto !important; }
.example label { font-size: 16px;}
"""

introduction = """

##
Github:

[mPLUG-Owl](https://github.com/X-PLUG/mPLUG-Owl)

Checkpoint:

[mPLUG-Owl3-7B-240728](https://huggingface.co/mPLUG/mPLUG-Owl3-7B-240728)

Paper:

[mPLUG-Owl3: Towards Long Image-Sequence Understanding in Multi-Modal Large Language Models](https://arxiv.org/abs/2408.04840)

"""

with gr.Blocks(css=css) as demo:
    with gr.Tab(model_name):
        with gr.Row():
            with gr.Column(scale=1, min_width=300):
                gr.Markdown(value=introduction)
                params_form = create_component(form_radio, comp='Radio')
                regenerate = create_component({'value': 'Regenerate'}, comp='Button')
                clear_button = create_component({'value': 'Clear History'}, comp='Button')

            with gr.Column(scale=3, min_width=500):
                app_session = gr.State({'sts': None, 'ctx': [], 'images_cnt': 0, 'videos_cnt': 0, 'chat_type': 'Chat'})
                chat_bot = mgr.Chatbot(label=f"Chat with {model_name}", value=copy.deepcopy(init_conversation), height=600, flushing=False, bubble_full_width=False)

                with gr.Tab("Chat") as chat_tab:
                    txt_message = create_multimodal_input()
                    chat_tab_label = gr.Textbox(value="Chat", interactive=False, visible=False)

                    txt_message.submit(
                        respond,
                        [txt_message, chat_bot, app_session, params_form],
                        [txt_message, chat_bot, app_session]
                    )

                with gr.Tab("Few Shot") as fewshot_tab:
                    fewshot_tab_label = gr.Textbox(value="Few Shot", interactive=False, visible=False)
                    with gr.Row():
                        with gr.Column(scale=1):
                            image_input = gr.Image(type="filepath", sources=["upload"])
                        with gr.Column(scale=3):
                            user_message = gr.Textbox(label="User")
                            assistant_message = gr.Textbox(label="Assistant")
                            with gr.Row():
                                add_demonstration_button = gr.Button("Add Example")
                                generate_button = gr.Button(value="Generate", variant="primary")
                    add_demonstration_button.click(
                        fewshot_add_demonstration,
                        [image_input, user_message, assistant_message, chat_bot, app_session],
                        [image_input, user_message, assistant_message, chat_bot, app_session]
                    )
                    generate_button.click(
                        fewshot_respond,
                        [image_input, user_message, chat_bot, app_session, params_form],
                        [image_input, user_message, assistant_message, chat_bot, app_session]
                    )

                chat_tab.select(
                    select_chat_type,
                    [chat_tab_label, app_session],
                    [app_session]
                )
                chat_tab.select(  # do clear
                    clear,
                    [txt_message, chat_bot, app_session],
                    [txt_message, chat_bot, app_session, image_input, user_message, assistant_message]
                )
                fewshot_tab.select(
                    select_chat_type,
                    [fewshot_tab_label, app_session],
                    [app_session]
                )
                fewshot_tab.select(  # do clear
                    clear,
                    [txt_message, chat_bot, app_session],
                    [txt_message, chat_bot, app_session, image_input, user_message, assistant_message]
                )
                chat_bot.flushed(
                    flushed,
                    outputs=[txt_message]
                )
                regenerate.click(
                    regenerate_button_clicked,
                    [txt_message, image_input, user_message, assistant_message, chat_bot, app_session, params_form],
                    [txt_message, image_input, user_message, assistant_message, chat_bot, app_session]
                )
                clear_button.click(
                    clear,
                    [txt_message, chat_bot, app_session],
                    [txt_message, chat_bot, app_session, image_input, user_message, assistant_message]
                )


# launch
demo.launch(share=False, debug=True, show_api=False, server_port=args.port, server_name=args.host)