YashuSinghal committed on
Commit e98860f · verified · 1 Parent(s): 8ce3a32

Upload 4 files

brain_of_the_doctor.py ADDED
@@ -0,0 +1,46 @@
+ import os
+ import base64
+ from groq import Groq
+ from dotenv import load_dotenv
+ load_dotenv()
+
+ GROQ_API_KEY = os.environ.get('GROQ_API_KEY')
+ image_path = './images/doctor.png'
+
+
+ def encode_image(image_path):
+     """Read an image file and return it as a base64-encoded UTF-8 string."""
+     with open(image_path, "rb") as image_file:
+         encoded_image = base64.b64encode(image_file.read()).decode('utf-8')
+     return encoded_image
+
+ query = "What disease is shown in this image, and in which part of the body is it located?"
+ model = 'llama-3.2-90b-vision-preview'
+
+ # encoded_image = encode_image(image_path)
+
+ def analyze_image_with_query(query, model, encoded_image):
+     """Send a text query plus a base64-encoded image to a Groq vision model
+     and return the model's text response."""
+     client = Groq(api_key=GROQ_API_KEY)
+     messages = [{
+         "role": "user",
+         "content": [
+             {
+                 "type": "text",
+                 "text": query
+             },
+             {
+                 "type": "image_url",
+                 "image_url": {
+                     "url": f"data:image/jpeg;base64,{encoded_image}"
+                 }
+             }
+         ]
+     }]
+
+     chat_completion = client.chat.completions.create(
+         model=model,
+         messages=messages
+     )
+
+     return chat_completion.choices[0].message.content
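
A quick way to sanity-check this module on its own is to drive it directly. A minimal sketch, assuming ./images/doctor.png exists and GROQ_API_KEY is set in .env:

    from brain_of_the_doctor import analyze_image_with_query, encode_image

    # encode the sample image and ask the vision model about it
    encoded = encode_image('./images/doctor.png')
    print(analyze_image_with_query(
        query="What disease is shown in this image, and in which part of the body is it located?",
        model='llama-3.2-90b-vision-preview',
        encoded_image=encoded))
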
gradio_app.py ADDED
@@ -0,0 +1,72 @@
+ from voice_of_the_patient import record_audio, transcribe_with_groq
+ from brain_of_the_doctor import analyze_image_with_query, encode_image
+ from voice_of_the_doctor import text_to_speech_with_gtts_autoplay
+
+ import os
+ import gradio as gr
+ from dotenv import load_dotenv
+ load_dotenv()
+
+ GROQ_API_KEY = os.environ.get('GROQ_API_KEY')
+ stt_model = "whisper-large-v3"
+
+ system_prompt = """You have to act as a professional doctor, I know you are not, but this is for learning purposes.
+ What's in this image? Do you find anything wrong with it medically?
+ If you make a differential, suggest some remedies for it. Do not add any numbers or special characters in
+ your response. Your response should be in one long paragraph. Also always answer as if you are answering a real person.
+ Do not say 'In the image I see' but say 'With what I see, I think you have ...'.
+ Do not respond as an AI model in markdown; your answer should mimic that of an actual doctor, not an AI bot.
+ Keep your answer concise (max 2 sentences). No preamble, start your answer right away please."""
+
+ def process_input(audio_file_path, image_file_path):
+     """Transcribe the patient's audio, analyze the image, and voice the doctor's response."""
+     speech_to_text_output = transcribe_with_groq(stt_model=stt_model,
+                                                  audio_file_path=audio_file_path,
+                                                  GROQ_API_KEY=GROQ_API_KEY)
+
+     if image_file_path:
+         doctor_response = analyze_image_with_query(query=system_prompt + speech_to_text_output,
+                                                    model="llama-3.2-11b-vision-preview",
+                                                    encoded_image=encode_image(image_file_path))
+     else:
+         doctor_response = "No image was provided for analysis."
+
+     voice_of_doctor = text_to_speech_with_gtts_autoplay(input_text=doctor_response, output_filepath="final.mp3")
+
+     return speech_to_text_output, doctor_response, voice_of_doctor
+
+
+ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="green")) as demo:
+     gr.Markdown("## 🏥 AI Doctor with Vision and Voice", elem_classes="title")
+
+     with gr.Row():
+         with gr.Column(scale=1):
+             audio_input = gr.Audio(label="🎤 Record Audio", sources=["microphone"], type="filepath")
+             image_input = gr.Image(label="🖼️ Upload Image", type="filepath")
+
+             with gr.Row():
+                 clear_btn = gr.Button("🧹 Clear", variant="secondary")
+                 submit_btn = gr.Button("🚀 Submit", variant="primary")
+
+         with gr.Column(scale=1):
+             speech_to_text_output = gr.Textbox(label="📝 Speech to Text", interactive=False)
+             doctor_response_output = gr.Textbox(label="💬 Doctor's Response", interactive=False)
+             audio_output = gr.Audio(label="🔊 Output Audio", interactive=False)
+             flag_btn = gr.Button("🚩 Flag", variant="secondary")
+
+     clear_btn.click(
+         lambda: [None, None, None, None, None],  # return None for all inputs/outputs
+         inputs=[],
+         outputs=[audio_input, image_input, speech_to_text_output, doctor_response_output, audio_output]
+     )
+
+     submit_btn.click(
+         process_input,
+         inputs=[audio_input, image_input],
+         outputs=[speech_to_text_output, doctor_response_output, audio_output]
+     )
+
+
+ demo.launch(debug=True, share=True)
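
The same pipeline can be exercised without the UI by composing the three modules the way process_input does. A sketch, assuming a pre-recorded patient_voice_test.mp3 and a sample image at ./images/doctor.png:

    import os
    from dotenv import load_dotenv
    from voice_of_the_patient import transcribe_with_groq
    from brain_of_the_doctor import analyze_image_with_query, encode_image
    from voice_of_the_doctor import text_to_speech_with_gtts_autoplay

    load_dotenv()

    # speech -> text -> vision model -> speech, mirroring process_input
    transcript = transcribe_with_groq(stt_model="whisper-large-v3",
                                      audio_file_path="patient_voice_test.mp3",
                                      GROQ_API_KEY=os.environ.get("GROQ_API_KEY"))
    answer = analyze_image_with_query(query=transcript,
                                      model="llama-3.2-11b-vision-preview",
                                      encoded_image=encode_image("./images/doctor.png"))
    text_to_speech_with_gtts_autoplay(input_text=answer, output_filepath="final.mp3")
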
voice_of_the_doctor.py ADDED
@@ -0,0 +1,66 @@
+ import os
+ import subprocess
+ import platform
+ import elevenlabs
+ from elevenlabs.client import ElevenLabs
+ from gtts import gTTS
+ from playsound import playsound
+ from dotenv import load_dotenv
+
+ load_dotenv()
+
+ ELEVENLABS_API_KEY = os.environ.get('ELEVENLABS_API_KEY')
+
+
+ def text_to_speech_with_gtts(input_text, output_filepath):
+     """
+     Convert input text to speech and save it as an MP3 file.
+
+     Args:
+         input_text (str): The text to be converted to speech.
+         output_filepath (str): The path to save the generated MP3 file.
+     """
+     audioobj = gTTS(text=input_text, lang='en', slow=False)
+     audioobj.save(output_filepath)
+
+
+ def text_to_speech_with_elevenlabs(input_text, output_filepath):
+     """Generate speech with the ElevenLabs API and save it as an MP3 file."""
+     client = ElevenLabs(api_key=ELEVENLABS_API_KEY)
+     audio = client.generate(
+         text=input_text,
+         voice="Aria",
+         output_format="mp3_22050_32",
+         model="eleven_turbo_v2"
+     )
+     elevenlabs.save(audio, output_filepath)
+
+
+ def text_to_speech_with_gtts_autoplay(input_text, output_filepath):
+     """Generate speech with gTTS, play it with an OS-specific player,
+     and return the absolute path of the saved MP3 file."""
+     abs_filepath = os.path.abspath(output_filepath)
+     audioobj = gTTS(text=input_text, lang='en', slow=False)
+     audioobj.save(abs_filepath)
+
+     os_name = platform.system()
+     try:
+         if os_name == "Darwin":  # macOS
+             subprocess.run(['afplay', abs_filepath])
+         elif os_name == "Windows":
+             playsound(abs_filepath)
+         elif os_name == "Linux":
+             # aplay only decodes WAV/PCM, not the MP3 that gTTS produces;
+             # mpg123 (if installed) handles MP3 directly
+             subprocess.run(['mpg123', abs_filepath])
+         else:
+             raise OSError("Unsupported operating system")
+     except Exception as e:
+         print(f"An error occurred while trying to play the audio: {e}")
+
+     return abs_filepath
+
+
+ if __name__ == "__main__":
+     # Test calls run only when this file is executed directly, so that
+     # importing the module (e.g. from gradio_app.py) has no side effects.
+     input_text = "Hello, how can I help you today? How are you feeling?"
+     text_to_speech_with_gtts(input_text=input_text, output_filepath="gtts_testing.mp3")
+     # text_to_speech_with_elevenlabs(input_text=input_text, output_filepath="elevenlabs_testing.mp3")
+     text_to_speech_with_gtts_autoplay(input_text=input_text, output_filepath="gtts_testing.mp3")
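
The per-OS branching in text_to_speech_with_gtts_autoplay can be sidestepped when ffmpeg is available, since ffplay decodes MP3 on macOS, Windows, and Linux alike. A sketch under that assumption (play_audio_portable is a hypothetical helper, not part of the commit):

    import subprocess

    def play_audio_portable(filepath):
        # ffplay ships with ffmpeg; -nodisp suppresses the video window
        # and -autoexit returns as soon as playback finishes
        subprocess.run(['ffplay', '-nodisp', '-autoexit', filepath], check=False)
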
voice_of_the_patient.py ADDED
@@ -0,0 +1,60 @@
+ import os
+ import logging
+ import speech_recognition as sr
+ from pydub import AudioSegment
+ from io import BytesIO
+ from dotenv import load_dotenv
+ load_dotenv()
+
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
+ def record_audio(file_path, timeout=20, phrase_time_limit=10):
+     """Record from the default microphone and save the clip as an MP3.
+     Note: pydub needs ffmpeg installed to export MP3."""
+     recognizer = sr.Recognizer()
+
+     try:
+         with sr.Microphone() as source:
+             logging.info("Adjusting for ambient noise...")
+             recognizer.adjust_for_ambient_noise(source, duration=1)
+             logging.info("Start speaking now...")
+
+             audio_data = recognizer.listen(source, timeout=timeout, phrase_time_limit=phrase_time_limit)
+             logging.info("Recording complete.")
+
+             wav_data = audio_data.get_wav_data()
+             audio_segment = AudioSegment.from_wav(BytesIO(wav_data))
+             audio_segment.export(file_path, format="mp3", bitrate='128k')
+
+             logging.info(f"Audio saved to {file_path}")
+
+     except Exception as e:
+         logging.error(f"An error occurred: {str(e)}")
+
+
+ audio_file_path = 'patient_voice_test.mp3'
+ # record_audio(file_path=audio_file_path)
+
+
+ from groq import Groq
+ GROQ_API_KEY = os.environ.get('GROQ_API_KEY')
+
+ def transcribe_with_groq(stt_model, audio_file_path, GROQ_API_KEY):
+     """Transcribe an audio file with Groq's Whisper endpoint and return the text."""
+     try:
+         client = Groq(api_key=GROQ_API_KEY)
+
+         # Ensure the file exists
+         if not os.path.exists(audio_file_path):
+             return "Error: Audio file not found"
+
+         with open(audio_file_path, 'rb') as audio_file:
+             transcription = client.audio.transcriptions.create(
+                 model=stt_model,
+                 file=audio_file,
+                 language="en"
+             )
+         return transcription.text
+
+     except Exception as e:
+         logging.error(f"Transcription error: {str(e)}")
+         return f"Error during transcription: {str(e)}"
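
A minimal end-to-end check for this module, assuming a working microphone, ffmpeg for the MP3 export, and GROQ_API_KEY set in .env:

    import os
    from dotenv import load_dotenv
    from voice_of_the_patient import record_audio, transcribe_with_groq

    load_dotenv()

    # record up to ten seconds of speech, then transcribe it with Groq Whisper
    audio_path = "patient_voice_test.mp3"
    record_audio(file_path=audio_path)
    print(transcribe_with_groq(stt_model="whisper-large-v3",
                               audio_file_path=audio_path,
                               GROQ_API_KEY=os.environ.get("GROQ_API_KEY")))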