# app.py
import sys
import os

import numpy as np            # noqa: F401 (not referenced directly in this file)
import librosa                # noqa: F401 (not referenced directly in this file)
from scipy.io import wavfile  # noqa: F401 (not referenced directly in this file)

# Make sibling modules (agent.py) importable regardless of the launch directory.
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

import gradio as gr
from agent import NewsReporterAgent, AgentState, add_messages
from langchain_core.messages import HumanMessage, AIMessage
#######################################
# Version check: log the versions of the key libraries at startup.
import torch
import transformers
import gradio

print("--- 🔍 CHECKING LIBRARY VERSIONS 🔍 ---")
print(f"PyTorch version: {torch.__version__}")
print(f"Transformers version: {transformers.__version__}")
print(f"Gradio version: {gradio.__version__}")
print("------------------------------------")
#########################################
# --- 1. Initialize the Agent ---
# This loads the model once when the app starts.
agent = NewsReporterAgent()
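# The agent object stays in memory for the lifetime of the process, so every request
# below reuses the already-loaded weights instead of reloading them per call.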
# --- 2. Define Gradio Logic Handlers ---
# These functions orchestrate the agent's actions based on UI events.
def run_initial_generation(audio_path, image_path):
    """Handles the first step: processing inputs and generating the initial report."""
    if not audio_path and not image_path:
        return "Please provide an audio or image file.", None, gr.update(visible=False), None, None, None
    # AgentState is used like a dict below (update/get), so it is presumably a TypedDict in agent.py.
    state = AgentState(audio_path=audio_path,
                       image_path=image_path,
                       news_report=[])
    # Run the pipeline sequentially: transcription, image description, then report drafting.
    state.update(agent.transcribe_audio(state))
    state.update(agent.describe_image(state))
    state.update(agent.create_report(state))
    latest_report = state["news_report"][-1].content
    transcribed_text = state.get("transcribed_text") or "No audio was provided to transcribe."
    image_description = state.get("image_description") or "No image was provided to describe."
    return latest_report, state, gr.update(visible=True), "", transcribed_text, image_description
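# Note: the six return values above map positionally onto the `outputs` list wired to
# generate_btn in section 4: report_output, agent_state, revision_group, status_output,
# transcribed_audio_output, image_description_output.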
def run_revision(feedback, current_state):
    """Handles the revision step based on user feedback."""
    if not feedback or not feedback.strip():
        # Re-populate UI fields if feedback is empty
        latest_report = next((msg.content for msg in reversed(current_state["news_report"])
                              if isinstance(msg, AIMessage)), "")
        transcribed_text = current_state.get("transcribed_text", "")
        image_description = current_state.get("image_description", "")
        return latest_report, current_state, "Please provide feedback.", transcribed_text, image_description
    # Append the feedback to the conversation history, then ask the agent to revise.
    current_state["news_report"] = add_messages(current_state["news_report"], [HumanMessage(content=feedback)])
    current_state.update(agent.revise_report(current_state))
    latest_report = current_state["news_report"][-1].content
    transcribed_text = current_state.get("transcribed_text") or "No audio was provided."
    image_description = current_state.get("image_description") or "No image was provided."
    return latest_report, current_state, "", transcribed_text, image_description
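# Note: because the feedback is appended as a HumanMessage before revise_report runs,
# the agent sees the full report/feedback history on every revision pass.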
def run_save(current_state):
    """Handles the save step."""
    save_update = agent.save_report(current_state)
    return save_update["final_message"]
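# save_report is assumed to return a dict containing a "final_message" string (a contract
# defined in agent.py); that message is displayed in the status Markdown below.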
# --- 3. Define the Gradio UI ---
# ------------------------------------------------------------------
# Build examples: audio-only, image-only, and combined
# ------------------------------------------------------------------
# Define known file extensions
AUDIO_EXTENSIONS = ['.wav', '.mp3', '.m4a', '.flac']
IMAGE_EXTENSIONS = ['.jpg', '.jpeg', '.png', '.webp', '.gif']
example_list = []
examples_dir = "examples"
if os.path.isdir(examples_dir):
    audio_files = sorted(
        f for f in os.listdir(examples_dir)
        if any(f.lower().endswith(ext) for ext in AUDIO_EXTENSIONS)
    )
    image_files = sorted(
        f for f in os.listdir(examples_dir)
        if any(f.lower().endswith(ext) for ext in IMAGE_EXTENSIONS)
    )
    # 1) audio-only
    for af in audio_files:
        example_list.append([os.path.join(examples_dir, af), None])
    # 2) image-only
    for imf in image_files:
        example_list.append([None, os.path.join(examples_dir, imf)])
    # 3) audio + image (pair first audio with first image, etc.)
    for af, imf in zip(audio_files, image_files):
        example_list.append([os.path.join(examples_dir, af),
                             os.path.join(examples_dir, imf)])
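# With hypothetical files "interview.wav" and "scene.jpg", example_list would be:
#   [["examples/interview.wav", None],
#    [None, "examples/scene.jpg"],
#    ["examples/interview.wav", "examples/scene.jpg"]]
# zip() stops at the shorter list, so unmatched files only appear as single-input examples.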
with gr.Blocks(theme=gr.themes.Soft(), title="Multimodal News Reporter") as demo:
    agent_state = gr.State(value=None)
    gr.Markdown("# 📰 Multimodal News Reporter AI")
    gr.Markdown(
        "- Upload an audio recording and/or a relevant image; the AI will generate a news report you can revise and save.\n"
        "- Token output is capped at 128 tokens purely for faster inference.\n"
        "- Note: This demo currently runs on CPU only.\n"
        "- Sample audio is trimmed to 10 seconds for faster inference.\n"
        "- Combined audio + image inference takes ~250-350 seconds; audio-only or image-only is much faster."
    )
    with gr.Row():
        with gr.Column(scale=1):
            audio_input = gr.Audio(label="Audio Interview Evidence", type="filepath")
            image_input = gr.Image(label="Image Evidence", type="filepath")
            generate_btn = gr.Button("🚀 Generate Initial Report", variant="primary")
            # Examples
            gr.Examples(
                examples=example_list,
                inputs=[audio_input, image_input],
                label="Click an example to test"
            )
        with gr.Column(scale=2):
            report_output = gr.Textbox(label="Generated News Report", lines=12, interactive=False)
            status_output = gr.Markdown(value="")
            with gr.Accordion("Show Source Information", open=False):
                transcribed_audio_output = gr.Textbox(label="🎤 Transcribed Audio", interactive=False, lines=5)
                image_description_output = gr.Textbox(label="🖼️ Image Description", interactive=False, lines=5)
            with gr.Group(visible=False) as revision_group:
                gr.Markdown("### ✍️ Provide Feedback for Revision")
                feedback_input = gr.Textbox(label="Your Feedback", placeholder="e.g., 'Make the tone more formal.'")
                with gr.Row():
                    revise_btn = gr.Button("🔄 Revise Report")
                    save_btn = gr.Button("💾 Save Final Report")
    # --- 4. Wire UI Components to Logic Handlers ---
    generate_btn.click(
        fn=run_initial_generation,
        inputs=[audio_input, image_input],
        outputs=[report_output, agent_state, revision_group, status_output,
                 transcribed_audio_output, image_description_output]
    )
    revise_btn.click(
        fn=run_revision,
        inputs=[feedback_input, agent_state],
        outputs=[report_output, agent_state, status_output,
                 transcribed_audio_output, image_description_output]
    ).then(fn=lambda: "", outputs=[feedback_input])  # clear the feedback box once the revision lands
    save_btn.click(
        fn=run_save,
        inputs=[agent_state],
        outputs=[status_output]
    )
# --- 5. Launch the App ---
if __name__ == "__main__":
    demo.launch(debug=True)
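# If this demo is hosted on Hugging Face Spaces, the Gradio SDK imports this module and
# serves `demo` itself, so the guarded launch() above only applies to local runs.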