Added models in Voice chat and Improved UI
Browse files
    	
        app.py
    CHANGED
    
    | 
         @@ -54,9 +54,20 @@ def videochat(image3, prompt3): 
     | 
|
| 54 | 
         
             
                    decoded_text = decoded_text[:-10]
         
     | 
| 55 | 
         
             
                yield decoded_text
         
     | 
| 56 | 
         | 
| 57 | 
         
            -
            theme = gr.themes. 
     | 
| 58 | 
         
            -
                 
     | 
| 59 | 
         
            -
             
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 60 | 
         | 
| 61 | 
         
             
            MODEL_NAME = "openai/whisper-medium"
         
     | 
| 62 | 
         
             
            BATCH_SIZE = 10
         
     | 
| 
         @@ -78,18 +89,39 @@ def transcribe(inputs): 
     | 
|
| 78 | 
         
             
                text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": "transcribe"})["text"]
         
     | 
| 79 | 
         
             
                return  text
         
     | 
| 80 | 
         | 
| 81 | 
         
            -
             
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 82 | 
         | 
| 83 | 
         
             
            system_instructions1 = "[SYSTEM] Answer as Real OpenGPT 4o, Made by 'KingNish', Keep conversation very short, clear, friendly and concise. The text provided is a request for a specific type of response from you, the virtual assistant. You will avoid introductions and start answering the query directly, elaborating on all aspects of the request. As an AI-powered assistant, your task is to generate responses that appear as if they are created by the user and not an AI-powered assistant. [USER]"
         
     | 
| 84 | 
         | 
| 85 | 
         
            -
            def  
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 86 | 
         
             
                generate_kwargs = dict(
         
     | 
| 87 | 
         
             
                    temperature=0.7,
         
     | 
| 88 | 
         
             
                    max_new_tokens=512,
         
     | 
| 89 | 
         
             
                    top_p=0.95,
         
     | 
| 90 | 
         
             
                    repetition_penalty=1,
         
     | 
| 91 | 
         
             
                    do_sample=True,
         
     | 
| 92 | 
         
            -
                    seed= 
     | 
| 93 | 
         
             
                )
         
     | 
| 94 | 
         | 
| 95 | 
         
             
                formatted_prompt = system_instructions1 + text + "[OpenGPT 4o]"
         
     | 
| 
         @@ -115,7 +147,7 @@ DEVICE = torch.device("cuda") 
     | 
|
| 115 | 
         
             
            MODELS = {
         
     | 
| 116 | 
         
             
                "idefics2-8b-chatty": Idefics2ForConditionalGeneration.from_pretrained(
         
     | 
| 117 | 
         
             
                    "HuggingFaceM4/idefics2-8b-chatty",
         
     | 
| 118 | 
         
            -
                    torch_dtype=torch. 
     | 
| 119 | 
         
             
                    _attn_implementation="flash_attention_2",
         
     | 
| 120 | 
         
             
                ).to(DEVICE),
         
     | 
| 121 | 
         
             
            }
         
     | 
| 
         @@ -521,16 +553,12 @@ with gr.Blocks() as voice: 
     | 
|
| 521 | 
         
             
                                    autoplay=True,
         
     | 
| 522 | 
         
             
                                    elem_classes="audio")
         
     | 
| 523 | 
         
             
                    gr.Interface(
         
     | 
| 524 | 
         
            -
                        batch=True,
         
     | 
| 525 | 
         
            -
                        max_batch_size=10, 
         
     | 
| 526 | 
         
             
                        fn=respond, 
         
     | 
| 527 | 
         
             
                        inputs=[input],
         
     | 
| 528 | 
         
            -
                            outputs=[output], live=True)
         
     | 
| 529 | 
         | 
| 530 | 
         
             
            with gr.Blocks() as livechat:  
         
     | 
| 531 | 
         
             
                gr.Interface(
         
     | 
| 532 | 
         
            -
                    batch=True,
         
     | 
| 533 | 
         
            -
                    max_batch_size=10, 
         
     | 
| 534 | 
         
             
                    fn=videochat,
         
     | 
| 535 | 
         
             
                    inputs=[gr.Image(type="pil",sources="webcam", label="Upload Image"), gr.Textbox(label="Prompt", value="what he is doing")],
         
     | 
| 536 | 
         
             
                    outputs=gr.Textbox(label="Answer")
         
     | 
| 
         | 
|
| 54 | 
         
             
                    decoded_text = decoded_text[:-10]
         
     | 
| 55 | 
         
             
                yield decoded_text
         
     | 
| 56 | 
         | 
| 57 | 
         
            +
            theme = gr.themes.Soft(
         
     | 
| 58 | 
         
            +
                primary_hue="blue",
         
     | 
| 59 | 
         
            +
                secondary_hue="orange",
         
     | 
| 60 | 
         
            +
                neutral_hue="gray",
         
     | 
| 61 | 
         
            +
                font=[gr.themes.GoogleFont('Libre Franklin'), gr.themes.GoogleFont('Public Sans'), 'system-ui', 'sans-serif']).set(
         
     | 
| 62 | 
         
            +
                    body_background_fill_dark="#111111",
         
     | 
| 63 | 
         
            +
                    block_background_fill_dark="#111111",
         
     | 
| 64 | 
         
            +
                    block_border_width="1px",
         
     | 
| 65 | 
         
            +
                    block_title_background_fill_dark="#1e1c26",
         
     | 
| 66 | 
         
            +
                    input_background_fill_dark="#292733",
         
     | 
| 67 | 
         
            +
                    button_secondary_background_fill_dark="#24212b",
         
     | 
| 68 | 
         
            +
                    border_color_primary_dark="#343140",
         
     | 
| 69 | 
         
            +
                    background_fill_secondary_dark="#111111",
         
     | 
| 70 | 
         
            +
                    color_accent_soft_dark="transparent")
         
     | 
| 71 | 
         | 
| 72 | 
         
             
            MODEL_NAME = "openai/whisper-medium"
         
     | 
| 73 | 
         
             
            BATCH_SIZE = 10
         
     | 
| 
         | 
|
| 89 | 
         
             
                text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": "transcribe"})["text"]
         
     | 
| 90 | 
         
             
                return  text
         
     | 
| 91 | 
         | 
| 92 | 
         
            +
            HF_TOKEN = os.environ.get("HF_TOKEN", None)
         
     | 
| 93 | 
         
            +
             
     | 
| 94 | 
         
            +
            def client_fn(model):
                """Return an ``InferenceClient`` for the hosted model matching *model*.

                *model* is matched by substring against the keywords below, in
                order; the first keyword found decides the repository. When no
                keyword matches, the Phi-3 mini repository is used.
                """
                # Ordered (keyword, repository) pairs; the first hit wins.
                keyword_to_repo = (
                    ("Mixtral", "mistralai/Mixtral-8x7B-Instruct-v0.1"),
                    ("Llama", "meta-llama/Meta-Llama-3-8B-Instruct"),
                    ("Mistral", "mistralai/Mistral-7B-Instruct-v0.3"),
                    ("Phi", "microsoft/Phi-3-mini-4k-instruct"),
                )
                for keyword, repo_id in keyword_to_repo:
                    if keyword in model:
                        return InferenceClient(repo_id)
                # No keyword matched: same fallback as the original else-branch.
                return InferenceClient("microsoft/Phi-3-mini-4k-instruct")
         
     | 
| 105 | 
         
            +
             
     | 
| 106 | 
         
            +
            def randomize_seed_fn(seed: int) -> int:
                """Return a new random seed in the inclusive range 0..999999.

                The incoming *seed* is never read; a fresh value is always drawn
                from the module-level ``random`` generator.
                """
                return random.randint(0, 999999)
         
     | 
| 109 | 
         | 
| 110 | 
         
             
            system_instructions1 = "[SYSTEM] Answer as Real OpenGPT 4o, Made by 'KingNish', Keep conversation very short, clear, friendly and concise. The text provided is a request for a specific type of response from you, the virtual assistant. You will avoid introductions and start answering the query directly, elaborating on all aspects of the request. As an AI-powered assistant, your task is to generate responses that appear as if they are created by the user and not an AI-powered assistant. [USER]"
         
     | 
| 111 | 
         | 
| 112 | 
         
            +
            def models(text, model="Mixtral 8x7B", seed=42):
         
     | 
| 113 | 
         
            +
             
     | 
| 114 | 
         
            +
                seed = int(randomize_seed_fn(seed))
         
     | 
| 115 | 
         
            +
                generator = torch.Generator().manual_seed(seed)  
         
     | 
| 116 | 
         
            +
                
         
     | 
| 117 | 
         
            +
                client = client_fn(model)
         
     | 
| 118 | 
         
             
                generate_kwargs = dict(
         
     | 
| 119 | 
         
             
                    temperature=0.7,
         
     | 
| 120 | 
         
             
                    max_new_tokens=512,
         
     | 
| 121 | 
         
             
                    top_p=0.95,
         
     | 
| 122 | 
         
             
                    repetition_penalty=1,
         
     | 
| 123 | 
         
             
                    do_sample=True,
         
     | 
| 124 | 
         
            +
                    seed=seed,
         
     | 
| 125 | 
         
             
                )
         
     | 
| 126 | 
         | 
| 127 | 
         
             
                formatted_prompt = system_instructions1 + text + "[OpenGPT 4o]"
         
     | 
| 
         | 
|
| 147 | 
         
             
            MODELS = {
         
     | 
| 148 | 
         
             
                "idefics2-8b-chatty": Idefics2ForConditionalGeneration.from_pretrained(
         
     | 
| 149 | 
         
             
                    "HuggingFaceM4/idefics2-8b-chatty",
         
     | 
| 150 | 
         
            +
                    torch_dtype=torch.float16,
         
     | 
| 151 | 
         
             
                    _attn_implementation="flash_attention_2",
         
     | 
| 152 | 
         
             
                ).to(DEVICE),
         
     | 
| 153 | 
         
             
            }
         
     | 
| 
         | 
|
| 553 | 
         
             
                                    autoplay=True,
         
     | 
| 554 | 
         
             
                                    elem_classes="audio")
         
     | 
| 555 | 
         
             
                    gr.Interface(
         
     | 
| 
         | 
|
| 
         | 
|
| 556 | 
         
             
                        fn=respond, 
         
     | 
| 557 | 
         
             
                        inputs=[input],
         
     | 
| 558 | 
         
            +
                            outputs=[output], api_name="translate", live=True)
         
     | 
| 559 | 
         | 
| 560 | 
         
             
            with gr.Blocks() as livechat:  
         
     | 
| 561 | 
         
             
                gr.Interface(
         
     | 
| 
         | 
|
| 
         | 
|
| 562 | 
         
             
                    fn=videochat,
         
     | 
| 563 | 
         
             
                    inputs=[gr.Image(type="pil",sources="webcam", label="Upload Image"), gr.Textbox(label="Prompt", value="what he is doing")],
         
     | 
| 564 | 
         
             
                    outputs=gr.Textbox(label="Answer")
         
     |