Upload 4 files
- brain_of_the_doctor.py +46 -0
- gradio_app.py +72 -0
- voice_of_the_doctor.py +66 -0
- voice_of_the_patient.py +60 -0
brain_of_the_doctor.py
ADDED
@@ -0,0 +1,46 @@
import os
import base64
from groq import Groq
from dotenv import load_dotenv
load_dotenv()

GROQ_API_KEY = os.environ.get('GROQ_API_KEY')
image_path = './images/doctor.png'


def encode_image(image_path):
    # Read the image file and return it as a base64-encoded UTF-8 string
    with open(image_path, "rb") as image_file:
        encoded_image = base64.b64encode(image_file.read()).decode('utf-8')
    return encoded_image

query = "What disease is shown in this image, and in which part of the body is it located?"
model = 'llama-3.2-90b-vision-preview'

# encoded_image = encode_image(image_path)

def analyze_image_with_query(query, model, encoded_image):
    # Send the text query plus the base64-encoded image to a Groq vision model
    client = Groq(api_key=GROQ_API_KEY)
    messages = [{
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": query
            },
            {
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/jpeg;base64,{encoded_image}"
                }
            }
        ]
    }]

    chat_completion = client.chat.completions.create(
        model=model,
        messages=messages
    )

    return chat_completion.choices[0].message.content
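A minimal usage sketch for this module, assuming GROQ_API_KEY is set and ./images/doctor.png exists (as the module itself assumes); the query text is only an example:

from brain_of_the_doctor import encode_image, analyze_image_with_query

# Encode a local image and ask the vision model about it
encoded_image = encode_image('./images/doctor.png')
print(analyze_image_with_query(
    query="What disease is shown in this image, and where on the body is it?",
    model='llama-3.2-90b-vision-preview',
    encoded_image=encoded_image,
))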
gradio_app.py
ADDED
@@ -0,0 +1,72 @@
from voice_of_the_patient import record_audio, transcribe_with_groq
from brain_of_the_doctor import analyze_image_with_query, encode_image
from voice_of_the_doctor import text_to_speech_with_gtts_autoplay

import os
import gradio as gr
from dotenv import load_dotenv
load_dotenv()

GROQ_API_KEY = os.environ.get('GROQ_API_KEY')
stt_model = "whisper-large-v3"

system_prompt = """You have to act as a professional doctor, I know you are not, but this is for learning purposes.
What's in this image? Do you find anything wrong with it medically?
If you make a differential, suggest some remedies for them. Do not add any numbers or special characters in
your response. Your response should be in one long paragraph. Also always answer as if you are answering a real person.
Do not say 'In the image I see' but say 'With what I see, I think you have ....'
Do not respond as an AI model in markdown; your answer should mimic that of an actual doctor, not an AI bot.
Keep your answer concise (max 2 sentences). No preamble, start your answer right away please"""

def process_input(audio_file_path, image_file_path):
    # Transcribe the patient's recorded audio with Groq's Whisper model
    speech_to_text_output = transcribe_with_groq(stt_model=stt_model,
                                                 audio_file_path=audio_file_path,
                                                 GROQ_API_KEY=GROQ_API_KEY)

    # Only query the vision model if the patient actually uploaded an image
    if image_file_path:
        doctor_response = analyze_image_with_query(query=system_prompt + speech_to_text_output,
                                                   model="llama-3.2-11b-vision-preview",
                                                   encoded_image=encode_image(image_file_path))
    else:
        doctor_response = "No image was provided for analysis."

    # Convert the doctor's text response to speech and autoplay it
    voice_of_doctor = text_to_speech_with_gtts_autoplay(input_text=doctor_response, output_filepath="final.mp3")

    return speech_to_text_output, doctor_response, voice_of_doctor


with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="green")) as demo:
    gr.Markdown("## 🏥 AI Doctor with Vision and Voice", elem_classes="title")

    with gr.Row():
        with gr.Column(scale=1):
            audio_input = gr.Audio(label="🎤 Record Audio", sources="microphone", type="filepath")
            image_input = gr.Image(label="🖼️ Upload Image", type="filepath")

            with gr.Row():
                clear_btn = gr.Button("🧹 Clear", variant="secondary")
                submit_btn = gr.Button("🚀 Submit", variant="primary")

        with gr.Column(scale=1):
            speech_to_text_output = gr.Textbox(label="📝 Speech to Text", interactive=False)
            doctor_response_output = gr.Textbox(label="💬 Doctor's Response", interactive=False)
            audio_output = gr.Audio(label="🔊 Output Audio", interactive=False)
            flag_btn = gr.Button("🚩 Flag", variant="secondary")

    clear_btn.click(
        lambda: [None, None, None, None, None],  # Return None for all inputs/outputs
        inputs=[],
        outputs=[audio_input, image_input, speech_to_text_output, doctor_response_output, audio_output]
    )

    submit_btn.click(
        process_input,
        inputs=[audio_input, image_input],
        outputs=[speech_to_text_output, doctor_response_output, audio_output]
    )


demo.launch(debug=True, share=True)
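One loose end worth noting: flag_btn is rendered but never wired to a handler. A minimal sketch of one way to wire it inside the same gr.Blocks context, using a hypothetical log_flag helper that appends the flagged exchange to a local CSV:

import csv

def log_flag(transcript, response):
    # Hypothetical helper: persist the flagged exchange for later review
    with open("flagged.csv", "a", newline="") as f:
        csv.writer(f).writerow([transcript, response])

flag_btn.click(
    log_flag,
    inputs=[speech_to_text_output, doctor_response_output],
    outputs=[],
)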
voice_of_the_doctor.py
ADDED
@@ -0,0 +1,66 @@
import os
from gtts import gTTS
from dotenv import load_dotenv

load_dotenv()


def text_to_speech_with_gtts(input_text, output_filepath):
    """
    Convert input text to speech and save it as an MP3 file.

    Args:
        input_text (str): The text to be converted to speech.
        output_filepath (str): The path to save the generated MP3 file.
    """
    audioobj = gTTS(text=input_text, lang='en', slow=False)
    audioobj.save(output_filepath)

input_text = "Hello, how can I help you today? How are you feeling?"
# Guarded so that importing this module (as gradio_app.py does) doesn't
# synthesize test audio as a side effect.
if __name__ == "__main__":
    text_to_speech_with_gtts(input_text=input_text, output_filepath="gtts_testing.mp3")


import elevenlabs
from elevenlabs.client import ElevenLabs

ELEVENLABS_API_KEY = os.environ.get('ELEVENLABS_API_KEY')

def text_to_speech_with_elevenlabs(input_text, output_filepath):
    # Generate speech with the ElevenLabs API and save it to disk
    client = ElevenLabs(api_key=ELEVENLABS_API_KEY)
    audio = client.generate(
        text=input_text,
        voice="Aria",
        output_format="mp3_22050_32",
        model="eleven_turbo_v2"
    )
    elevenlabs.save(audio, output_filepath)

# text_to_speech_with_elevenlabs(input_text=input_text, output_filepath="elevenlabs_testing.mp3")

import subprocess
import platform
from playsound import playsound

def text_to_speech_with_gtts_autoplay(input_text, output_filepath):
    # Generate the MP3, then play it back with a platform-appropriate player
    abs_filepath = os.path.abspath(output_filepath)
    audioobj = gTTS(text=input_text, lang='en', slow=False)
    audioobj.save(abs_filepath)

    os_name = platform.system()
    try:
        if os_name == "Darwin":  # macOS
            subprocess.run(['afplay', abs_filepath])
        elif os_name == "Windows":
            playsound(abs_filepath)
        elif os_name == "Linux":
            # aplay cannot decode MP3, so use mpg123 (assumed installed) instead
            subprocess.run(['mpg123', abs_filepath])
        else:
            raise OSError("Unsupported operating system")
    except Exception as e:
        print(f"An error occurred while trying to play the audio: {e}")

    return abs_filepath

if __name__ == "__main__":
    text_to_speech_with_gtts_autoplay(input_text=input_text, output_filepath="gtts_testing.mp3")
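A minimal usage sketch showing how gradio_app.py consumes this module: the function returns the absolute path of the generated MP3, which is handed straight to the gr.Audio output component (the reply text here is only an example):

from voice_of_the_doctor import text_to_speech_with_gtts_autoplay

# Synthesize a reply, play it, and get back the saved file's absolute path
saved_path = text_to_speech_with_gtts_autoplay(
    input_text="With what I see, I think you have mild dermatitis.",
    output_filepath="final.mp3",
)
print(saved_path)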
voice_of_the_patient.py
ADDED
@@ -0,0 +1,60 @@
import logging
import speech_recognition as sr
from pydub import AudioSegment
from io import BytesIO
from dotenv import load_dotenv
import os
load_dotenv()

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def record_audio(file_path, timeout=20, phrase_time_limit=10):
    # Record speech from the default microphone and save it as an MP3 file
    recognizer = sr.Recognizer()

    try:
        with sr.Microphone() as source:
            logging.info("Adjusting for ambient noise...")
            recognizer.adjust_for_ambient_noise(source, duration=1)
            logging.info("Start speaking now...")

            audio_data = recognizer.listen(source, timeout=timeout, phrase_time_limit=phrase_time_limit)
            logging.info("Recording complete.")

            # Convert the captured WAV data to MP3 via pydub
            wav_data = audio_data.get_wav_data()
            audio_segment = AudioSegment.from_wav(BytesIO(wav_data))
            audio_segment.export(file_path, format="mp3", bitrate='128k')

            logging.info(f"Audio saved to {file_path}")

    except Exception as e:
        logging.error(f"An error occurred: {str(e)}")


audio_file_path = 'patient_voice_test.mp3'
# record_audio(file_path=audio_file_path)


from groq import Groq
GROQ_API_KEY = os.environ.get('GROQ_API_KEY')

def transcribe_with_groq(stt_model, audio_file_path, GROQ_API_KEY):
    # Send the recorded audio to Groq's Whisper endpoint and return the transcript
    try:
        client = Groq(api_key=GROQ_API_KEY)

        # Ensure the file exists
        if not os.path.exists(audio_file_path):
            return "Error: Audio file not found"

        with open(audio_file_path, 'rb') as audio_file:
            transcription = client.audio.transcriptions.create(
                model=stt_model,
                file=audio_file,
                language="en"
            )
        return transcription.text

    except Exception as e:
        logging.error(f"Transcription error: {str(e)}")
        return f"Error during transcription: {str(e)}"
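A minimal end-to-end sketch tying both functions together, assuming a working microphone and GROQ_API_KEY in the environment:

import os
from voice_of_the_patient import record_audio, transcribe_with_groq

# Record up to 10 seconds of speech, then transcribe it with Whisper on Groq
audio_file_path = 'patient_voice_test.mp3'
record_audio(file_path=audio_file_path)
transcript = transcribe_with_groq(stt_model="whisper-large-v3",
                                  audio_file_path=audio_file_path,
                                  GROQ_API_KEY=os.environ.get('GROQ_API_KEY'))
print(transcript)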