YashuSinghal committed on
Commit e98860f · verified · 1 Parent(s): 8ce3a32

Upload 4 files

brain_of_the_doctor.py ADDED
@@ -0,0 +1,46 @@
+ import os
+ import base64
+ from groq import Groq
+ from dotenv import load_dotenv
+ load_dotenv()
+
+ GROQ_API_KEY = os.environ.get('GROQ_API_KEY')
+ image_path = './images/doctor.png'
+
+
+ def encode_image(image_path):
+     """Read an image file and return it as a base64-encoded UTF-8 string."""
+     with open(image_path, "rb") as image_file:
+         encoded_image = base64.b64encode(image_file.read()).decode('utf-8')
+     return encoded_image
+
+ query = "What disease is shown in this image, and in which part of the body is it located?"
+ model = 'llama-3.2-90b-vision-preview'
+
+ # encoded_image = encode_image(image_path)
+
+ def analyze_image_with_query(query, model, encoded_image):
+     """Send a text query plus a base64-encoded image to a Groq vision model
+     and return the model's text response."""
+     client = Groq(api_key=GROQ_API_KEY)
+     messages = [{
+         "role": "user",
+         "content": [
+             {
+                 "type": "text",
+                 "text": query
+             },
+             {
+                 "type": "image_url",
+                 "image_url": {
+                     "url": f"data:image/jpeg;base64,{encoded_image}"
+                 }
+             }
+         ]
+     }]
+
+     chat_completion = client.chat.completions.create(
+         model=model,
+         messages=messages
+     )
+
+     return chat_completion.choices[0].message.content
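
A quick way to sanity-check this module on its own is to drive it directly. A minimal sketch, assuming ./images/doctor.png exists and GROQ_API_KEY is set in .env:

    from brain_of_the_doctor import analyze_image_with_query, encode_image

    # encode the sample image and ask the vision model about it
    encoded = encode_image('./images/doctor.png')
    print(analyze_image_with_query(
        query="What disease is shown in this image, and in which part of the body is it located?",
        model='llama-3.2-90b-vision-preview',
        encoded_image=encoded))
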
gradio_app.py ADDED
@@ -0,0 +1,72 @@
+ from voice_of_the_patient import record_audio, transcribe_with_groq
+ from brain_of_the_doctor import analyze_image_with_query, encode_image
+ from voice_of_the_doctor import text_to_speech_with_gtts_autoplay
+
+ import os
+ import gradio as gr
+ from dotenv import load_dotenv
+ load_dotenv()
+
+ GROQ_API_KEY = os.environ.get('GROQ_API_KEY')
+ stt_model = "whisper-large-v3"
+
+ system_prompt = """You have to act as a professional doctor, I know you are not, but this is for learning purposes.
+ What's in this image? Do you find anything wrong with it medically?
+ If you make a differential, suggest some remedies for it. Do not add any numbers or special characters in
+ your response. Your response should be in one long paragraph. Also always answer as if you are answering a real person.
+ Do not say 'In the image I see' but say 'With what I see, I think you have ...'.
+ Do not respond as an AI model in markdown; your answer should mimic that of an actual doctor, not an AI bot.
+ Keep your answer concise (max 2 sentences). No preamble, start your answer right away please."""
+
+ def process_input(audio_file_path, image_file_path):
+     """Transcribe the patient's audio, analyze the image, and voice the doctor's response."""
+     speech_to_text_output = transcribe_with_groq(stt_model=stt_model,
+                                                  audio_file_path=audio_file_path,
+                                                  GROQ_API_KEY=GROQ_API_KEY)
+
+     if image_file_path:
+         doctor_response = analyze_image_with_query(query=system_prompt + speech_to_text_output,
+                                                    model="llama-3.2-11b-vision-preview",
+                                                    encoded_image=encode_image(image_file_path))
+     else:
+         doctor_response = "No image was provided for analysis."
+
+     voice_of_doctor = text_to_speech_with_gtts_autoplay(input_text=doctor_response, output_filepath="final.mp3")
+
+     return speech_to_text_output, doctor_response, voice_of_doctor
+
+
+ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="green")) as demo:
+     gr.Markdown("## 🏥 AI Doctor with Vision and Voice", elem_classes="title")
+
+     with gr.Row():
+         with gr.Column(scale=1):
+             audio_input = gr.Audio(label="🎤 Record Audio", sources=["microphone"], type="filepath")
+             image_input = gr.Image(label="🖼️ Upload Image", type="filepath")
+
+             with gr.Row():
+                 clear_btn = gr.Button("🧹 Clear", variant="secondary")
+                 submit_btn = gr.Button("🚀 Submit", variant="primary")
+
+         with gr.Column(scale=1):
+             speech_to_text_output = gr.Textbox(label="📝 Speech to Text", interactive=False)
+             doctor_response_output = gr.Textbox(label="💬 Doctor's Response", interactive=False)
+             audio_output = gr.Audio(label="🔊 Output Audio", interactive=False)
+             flag_btn = gr.Button("🚩 Flag", variant="secondary")
+
+     clear_btn.click(
+         lambda: [None, None, None, None, None],  # return None for all inputs/outputs
+         inputs=[],
+         outputs=[audio_input, image_input, speech_to_text_output, doctor_response_output, audio_output]
+     )
+
+     submit_btn.click(
+         process_input,
+         inputs=[audio_input, image_input],
+         outputs=[speech_to_text_output, doctor_response_output, audio_output]
+     )
+
+
+ demo.launch(debug=True, share=True)
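
The same pipeline can be exercised without the UI by composing the three modules the way process_input does. A sketch, assuming a pre-recorded patient_voice_test.mp3 and a sample image at ./images/doctor.png:

    import os
    from dotenv import load_dotenv
    from voice_of_the_patient import transcribe_with_groq
    from brain_of_the_doctor import analyze_image_with_query, encode_image
    from voice_of_the_doctor import text_to_speech_with_gtts_autoplay

    load_dotenv()

    # speech -> text -> vision model -> speech, mirroring process_input
    transcript = transcribe_with_groq(stt_model="whisper-large-v3",
                                      audio_file_path="patient_voice_test.mp3",
                                      GROQ_API_KEY=os.environ.get("GROQ_API_KEY"))
    answer = analyze_image_with_query(query=transcript,
                                      model="llama-3.2-11b-vision-preview",
                                      encoded_image=encode_image("./images/doctor.png"))
    text_to_speech_with_gtts_autoplay(input_text=answer, output_filepath="final.mp3")
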
voice_of_the_doctor.py ADDED
@@ -0,0 +1,66 @@
+ import os
+ import subprocess
+ import platform
+ import elevenlabs
+ from elevenlabs.client import ElevenLabs
+ from gtts import gTTS
+ from playsound import playsound
+ from dotenv import load_dotenv
+
+ load_dotenv()
+
+ ELEVENLABS_API_KEY = os.environ.get('ELEVENLABS_API_KEY')
+
+
+ def text_to_speech_with_gtts(input_text, output_filepath):
+     """
+     Convert input text to speech and save it as an MP3 file.
+
+     Args:
+         input_text (str): The text to be converted to speech.
+         output_filepath (str): The path to save the generated MP3 file.
+     """
+     audioobj = gTTS(text=input_text, lang='en', slow=False)
+     audioobj.save(output_filepath)
+
+
+ def text_to_speech_with_elevenlabs(input_text, output_filepath):
+     """Generate speech with the ElevenLabs API and save it as an MP3 file."""
+     client = ElevenLabs(api_key=ELEVENLABS_API_KEY)
+     audio = client.generate(
+         text=input_text,
+         voice="Aria",
+         output_format="mp3_22050_32",
+         model="eleven_turbo_v2"
+     )
+     elevenlabs.save(audio, output_filepath)
+
+
+ def text_to_speech_with_gtts_autoplay(input_text, output_filepath):
+     """Generate speech with gTTS, play it with an OS-specific player,
+     and return the absolute path of the saved MP3 file."""
+     abs_filepath = os.path.abspath(output_filepath)
+     audioobj = gTTS(text=input_text, lang='en', slow=False)
+     audioobj.save(abs_filepath)
+
+     os_name = platform.system()
+     try:
+         if os_name == "Darwin":  # macOS
+             subprocess.run(['afplay', abs_filepath])
+         elif os_name == "Windows":
+             playsound(abs_filepath)
+         elif os_name == "Linux":
+             # aplay only decodes WAV/PCM, not the MP3 that gTTS produces;
+             # mpg123 (if installed) handles MP3 directly
+             subprocess.run(['mpg123', abs_filepath])
+         else:
+             raise OSError("Unsupported operating system")
+     except Exception as e:
+         print(f"An error occurred while trying to play the audio: {e}")
+
+     return abs_filepath
+
+
+ if __name__ == "__main__":
+     # Test calls run only when this file is executed directly, so that
+     # importing the module (e.g. from gradio_app.py) has no side effects.
+     input_text = "Hello, how can I help you today? How are you feeling?"
+     text_to_speech_with_gtts(input_text=input_text, output_filepath="gtts_testing.mp3")
+     # text_to_speech_with_elevenlabs(input_text=input_text, output_filepath="elevenlabs_testing.mp3")
+     text_to_speech_with_gtts_autoplay(input_text=input_text, output_filepath="gtts_testing.mp3")
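
The per-OS branching in text_to_speech_with_gtts_autoplay can be sidestepped when ffmpeg is available, since ffplay decodes MP3 on macOS, Windows, and Linux alike. A sketch under that assumption (play_audio_portable is a hypothetical helper, not part of the commit):

    import subprocess

    def play_audio_portable(filepath):
        # ffplay ships with ffmpeg; -nodisp suppresses the video window
        # and -autoexit returns as soon as playback finishes
        subprocess.run(['ffplay', '-nodisp', '-autoexit', filepath], check=False)
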
voice_of_the_patient.py ADDED
@@ -0,0 +1,60 @@
+ import os
+ import logging
+ import speech_recognition as sr
+ from pydub import AudioSegment
+ from io import BytesIO
+ from dotenv import load_dotenv
+ load_dotenv()
+
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
+ def record_audio(file_path, timeout=20, phrase_time_limit=10):
+     """Record from the default microphone and save the clip as an MP3.
+     Note: pydub needs ffmpeg installed to export MP3."""
+     recognizer = sr.Recognizer()
+
+     try:
+         with sr.Microphone() as source:
+             logging.info("Adjusting for ambient noise...")
+             recognizer.adjust_for_ambient_noise(source, duration=1)
+             logging.info("Start speaking now...")
+
+             audio_data = recognizer.listen(source, timeout=timeout, phrase_time_limit=phrase_time_limit)
+             logging.info("Recording complete.")
+
+             wav_data = audio_data.get_wav_data()
+             audio_segment = AudioSegment.from_wav(BytesIO(wav_data))
+             audio_segment.export(file_path, format="mp3", bitrate='128k')
+
+             logging.info(f"Audio saved to {file_path}")
+
+     except Exception as e:
+         logging.error(f"An error occurred: {str(e)}")
+
+
+ audio_file_path = 'patient_voice_test.mp3'
+ # record_audio(file_path=audio_file_path)
+
+
+ from groq import Groq
+ GROQ_API_KEY = os.environ.get('GROQ_API_KEY')
+
+ def transcribe_with_groq(stt_model, audio_file_path, GROQ_API_KEY):
+     """Transcribe an audio file with Groq's Whisper endpoint and return the text."""
+     try:
+         client = Groq(api_key=GROQ_API_KEY)
+
+         # Ensure the file exists
+         if not os.path.exists(audio_file_path):
+             return "Error: Audio file not found"
+
+         with open(audio_file_path, 'rb') as audio_file:
+             transcription = client.audio.transcriptions.create(
+                 model=stt_model,
+                 file=audio_file,
+                 language="en"
+             )
+         return transcription.text
+
+     except Exception as e:
+         logging.error(f"Transcription error: {str(e)}")
+         return f"Error during transcription: {str(e)}"
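
A minimal end-to-end check for this module, assuming a working microphone, ffmpeg for the MP3 export, and GROQ_API_KEY set in .env:

    import os
    from dotenv import load_dotenv
    from voice_of_the_patient import record_audio, transcribe_with_groq

    load_dotenv()

    # record up to ten seconds of speech, then transcribe it with Groq Whisper
    audio_path = "patient_voice_test.mp3"
    record_audio(file_path=audio_path)
    print(transcribe_with_groq(stt_model="whisper-large-v3",
                               audio_file_path=audio_path,
                               GROQ_API_KEY=os.environ.get("GROQ_API_KEY")))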