# AI Doctor with Vision and Voice — Gradio app.
# (Removed scraped HuggingFace Spaces page text "Spaces: Sleeping" that had
# been pasted above the code; it was not valid Python.)
from voice_of_the_patient import record_audio, transcribe_with_groq | |
from brain_of_the_doctor import analyze_image_with_query, encode_image | |
from voice_of_the_doctor import text_to_speech_with_gtts_autoplay | |
import os | |
import gradio as gr | |
from dotenv import load_dotenv | |
load_dotenv()  # Load variables from a local .env file into the process environment.
# Groq API key; None if GROQ_API_KEY is not set in the environment or .env.
GROQ_API_KEY = os.environ.get('GROQ_API_KEY')
# Speech-to-text model name forwarded to transcribe_with_groq.
stt_model = "whisper-large-v3"
# Instruction prefix prepended to the patient's transcript before the
# combined text is sent as the vision-model query in process_input.
system_prompt="""You have to act as a professional doctor, i know you are not but this is for learning purpose.
What's in this image?. Do you find anything wrong with it medically?
If you make a differential, suggest some remedies for them. Donot add any numbers or special characters in
your response. Your response should be in one long paragraph. Also always answer as if you are answering to a real person.
Donot say 'In the image I see' but say 'With what I see, I think you have ....'
Dont respond as an AI model in markdown, your answer should mimic that of an actual doctor not an AI bot,
Keep your answer concise (max 2 sentences). No preamble, start your answer right away please"""
def process_input(audio_file_path, image_file_path):
    """Run the full patient-to-doctor pipeline for one Gradio submission.

    Transcribes the recorded audio, queries the vision model with the
    system prompt plus the transcript, and synthesizes the doctor's
    response to speech.

    Args:
        audio_file_path: Filesystem path to the recorded audio, or None
            when the user submits without recording.
        image_file_path: Filesystem path to the uploaded image, or None.

    Returns:
        Tuple of (transcript, doctor_response, generated_audio_path).
    """
    # Guard: Gradio passes None when nothing was recorded; calling the
    # transcriber with None would crash the handler.
    if audio_file_path:
        speech_to_text_output = transcribe_with_groq(
            stt_model=stt_model,
            audio_file_path=audio_file_path,
            GROQ_API_KEY=GROQ_API_KEY,
        )
    else:
        speech_to_text_output = ""

    if image_file_path:
        doctor_response = analyze_image_with_query(
            query=system_prompt + speech_to_text_output,
            model="llama-3.2-11b-vision-preview",
            encoded_image=encode_image(image_file_path),
        )
    else:
        # User-facing fallback; grammar fixed ("provide" -> "provided").
        doctor_response = "No image provided to me for analysis"

    # Synthesize the doctor's reply; the helper returns the output path.
    voice_of_doctor = text_to_speech_with_gtts_autoplay(
        input_text=doctor_response,
        output_filepath="final.mp3",
    )

    return speech_to_text_output, doctor_response, voice_of_doctor
# --- Gradio UI --------------------------------------------------------------
# Component creation order inside the nested context managers determines the
# on-screen layout, so the statements below must stay in this order.
with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="green")) as demo:
    gr.Markdown("## π₯ AI Doctor with Vision and Voice", elem_classes="title")
    with gr.Row():
        # Left column: patient inputs and action buttons.
        with gr.Column(scale=1):
            audio_input = gr.Audio(label="π€ Record Audio", sources="microphone", type="filepath")
            image_input = gr.Image(label="πΌοΈ Upload Image", type="filepath")
            with gr.Row():
                clear_btn = gr.Button("π§Ή Clear", variant="secondary")
                submit_btn = gr.Button("π Submit", variant="primary")
        # Right column: read-only model outputs.
        with gr.Column(scale=1):
            speech_to_text_output = gr.Textbox(label="π Speech to Text", interactive=False)
            doctor_response_output = gr.Textbox(label="π¬ Doctor's Response", interactive=False)
            audio_output = gr.Audio(label="π Output Audio", interactive=False)
            # NOTE(review): flag_btn has no click handler wired below —
            # confirm whether flagging was intended to be implemented.
            flag_btn = gr.Button("π© Flag", variant="secondary")
    # Clear resets both inputs and all three outputs to empty.
    clear_btn.click(
        lambda: [None, None, None, None, None],  # Return None for all inputs/outputs
        inputs=[],
        outputs=[audio_input, image_input, speech_to_text_output, doctor_response_output, audio_output]
    )
    # Submit runs the full pipeline defined in process_input.
    submit_btn.click(
        process_input,
        inputs=[audio_input, image_input],
        outputs=[speech_to_text_output, doctor_response_output, audio_output]
    )
# debug=True surfaces handler errors; share=True creates a public tunnel URL.
demo.launch(debug=True,share=True)