Update app.py
app.py CHANGED
@@ -21,7 +21,7 @@ import io
 import datasets
 
 import gradio as gr
-from transformers import AutoProcessor, TextIteratorStreamer
+from transformers import AutoModel, AutoProcessor, TextIteratorStreamer
 from transformers import Idefics2ForConditionalGeneration
 import tempfile
 from streaming_stt_nemo import Model

@@ -30,17 +30,24 @@ import edge_tts
 import asyncio
 from transformers import pipeline
 
-
-
-
-
-
-
-
-
-
-
-
+model = AutoModel.from_pretrained("unum-cloud/uform-gen2-dpo", trust_remote_code=True)
+processor = AutoProcessor.from_pretrained("unum-cloud/uform-gen2-dpo", trust_remote_code=True)
+
+@spaces.GPU(duration=10, queue=False)
+def answer_question(image, prompt):
+    inputs = processor(text=[prompt], images=[image], return_tensors="pt")
+    with torch.inference_mode():
+        output = model.generate(
+            **inputs,
+            do_sample=False,
+            use_cache=True,
+            max_new_tokens=256,
+            eos_token_id=151645,
+            pad_token_id=processor.tokenizer.pad_token_id
+        )
+    prompt_len = inputs["input_ids"].shape[1]
+    decoded_text = processor.batch_decode(output[:, prompt_len:])[0]
+    return decoded_text
 
 from gradio import Image, Textbox
 

@@ -307,7 +314,7 @@ def extract_images_from_msg_list(msg_list):
     return all_images
 
 
-@spaces.GPU(duration=
+@spaces.GPU(duration=30, queue=False)
 def model_inference(
     user_prompt,
     chat_history,

@@ -535,7 +542,7 @@ with gr.Blocks() as voice2:
         outputs=[output], live=True)
 
 with gr.Blocks() as video:
-    gr.Markdown(" ## Live Chat
+    gr.Markdown(" ## Live Chat")
     gr.Markdown("### Click camera option to update image")
     gr.Interface(
        fn=answer_question,
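
The answer_question helper added in this commit is decorated with @spaces.GPU and relies on torch and spaces already being imported elsewhere in app.py. Below is a minimal sketch of the same generation call run outside the Space, assuming torch, transformers, and Pillow are installed; "example.jpg" and the prompt string are placeholders, not part of the commit.

import torch
from PIL import Image
from transformers import AutoModel, AutoProcessor

# Same checkpoint and generation settings as in the commit above;
# no @spaces.GPU decorator is needed outside a ZeroGPU Space.
model = AutoModel.from_pretrained("unum-cloud/uform-gen2-dpo", trust_remote_code=True)
processor = AutoProcessor.from_pretrained("unum-cloud/uform-gen2-dpo", trust_remote_code=True)

def answer_question(image, prompt):
    inputs = processor(text=[prompt], images=[image], return_tensors="pt")
    with torch.inference_mode():
        output = model.generate(
            **inputs,
            do_sample=False,
            use_cache=True,
            max_new_tokens=256,
            eos_token_id=151645,
            pad_token_id=processor.tokenizer.pad_token_id,
        )
    prompt_len = inputs["input_ids"].shape[1]
    return processor.batch_decode(output[:, prompt_len:])[0]

# Placeholder usage: any PIL image and free-text question will do.
print(answer_question(Image.open("example.jpg"), "Describe this image."))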