import asyncio

import gradio as gr
import fal_client
import httpx

# System prompt to guide the vision model
SYSTEM_PROMPT = """
You are a kind assistant that compliments people's appearances and acknowledges their hard work for their families or the work they do as a part of the Ultratech Cement Family (if they are a man).

Look at the image and create a short, sincere compliment about:
1. Something positive about how they look (facial features, smile, eyes, etc.)
2. Acknowledge the hard work they likely do for their family

Be warm, genuine, and uplifting without being overly flowery or artificial.
Keep the response to 2-3 sentences maximum. Do not say anything apart from the compliment.
"""


async def process_image(image, language):
    # Upload the image to FAL storage and get back a public URL
    img_url = fal_client.upload_file(image)

    # Craft the prompt for the vision model
    prompt = (
        "Please look at this person and give them a sincere compliment about their "
        "appearance and acknowledge their hard work for their family and work for "
        "ultratech cement (if they are a man)."
    )

    try:
        # Call the FAL vision LLM to generate the compliment text from the image
        result = await fal_client.async_client.run(
            "fal-ai/any-llm-open",
            arguments={
                "prompt": prompt,
                "system_prompt": SYSTEM_PROMPT + f"Make sure to only respond in the {language} language.",
                "model": "google/gemini-2.0-flash-001",
                "image_url": img_url,
            },
        )
        compliment_text = result["output"]

        # Select the TTS model and voice based on the requested language
        tts_model = "fal-ai/kokoro/american-english"
        voice = "af_heart"
        if language == "Hindi":
            tts_model = "fal-ai/kokoro/hindi"
            voice = "hf_alpha"

        # Call the FAL text-to-speech endpoint
        tts_result = await fal_client.async_client.run(
            tts_model,
            arguments={
                "prompt": compliment_text,
                "voice": voice,
            },
        )
        audio_url = tts_result["audio"]["url"]

        # Download the generated audio file
        async with httpx.AsyncClient() as client:
            response = await client.get(audio_url)
            if response.status_code == 200:
                audio_data = response.content

                # Save the audio to a temporary file for Gradio to serve
                temp_file = "temp_audio.wav"
                with open(temp_file, "wb") as f:
                    f.write(audio_data)

                return compliment_text, temp_file
            else:
                return compliment_text, None
    except Exception as e:
        return f"Error: {str(e)}", None


def process_image_sync(image, language):
    # Gradio's click handler is synchronous here, so run the async pipeline to completion
    return asyncio.run(process_image(image, language))


with gr.Blocks() as demo:
    gr.Markdown("# Face Reader")
    gr.Markdown("Upload a photo of someone, and the app will generate a prediction about them using AI.")

    with gr.Row():
        with gr.Column():
            # Input components
            image_input = gr.Image(type="filepath", label="Upload Photo")
            language_selector = gr.Radio(["English", "Hindi"], label="Output Language", value="English")
            submit_button = gr.Button("Generate Prediction")
        with gr.Column():
            # Output components
            text_output = gr.Textbox(label="AI Response")
            audio_output = gr.Audio(label="AI Prediction", type="filepath")

    # Set up the click event
    submit_button.click(
        fn=process_image_sync,
        inputs=[image_input, language_selector],
        outputs=[text_output, audio_output],
    )

# Launch the app
if __name__ == "__main__":
    demo.launch()