import os
import sys

import gradio as gr
import torch
import transformers
from langchain_core.messages import HumanMessage, AIMessage

# Make the local `agent` module importable regardless of the working directory.
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

from agent import NewsReporterAgent, AgentState, add_messages

print("--- 🔍 CHECKING LIBRARY VERSIONS 🔍 ---")
print(f"PyTorch version: {torch.__version__}")
print(f"Transformers version: {transformers.__version__}")
print(f"Gradio version: {gr.__version__}")
print("------------------------------------")
agent = NewsReporterAgent()


def run_initial_generation(audio_path, image_path):
    """Handles the first step: processing inputs and generating the initial report."""
    if not audio_path and not image_path:
        return "Please provide an audio or image file.", None, gr.update(visible=False), None, None, None

    state = AgentState(audio_path=audio_path, image_path=image_path, news_report=[])

    # Run the pipeline: transcribe the audio, describe the image, draft the report.
    state.update(agent.transcribe_audio(state))
    state.update(agent.describe_image(state))
    state.update(agent.create_report(state))

    latest_report = state["news_report"][-1].content
    transcribed_text = state.get("transcribed_text") or "No audio was provided to transcribe."
    image_description = state.get("image_description") or "No image was provided to describe."

    # Outputs: report text, saved state, revision-group visibility, status message,
    # transcript, and image description (matching generate_btn's outputs list).
    return latest_report, state, gr.update(visible=True), "", transcribed_text, image_description
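

# Revision flow: the user's feedback joins the report history as a HumanMessage,
# and the agent appends its revised report as a new AIMessage.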
def run_revision(feedback, current_state):
    """Handles the revision step based on user feedback."""
    if not feedback or not feedback.strip():
        # No feedback given: re-show the latest AI-written report and prompt the user.
        latest_report = next(
            (msg.content for msg in reversed(current_state["news_report"]) if isinstance(msg, AIMessage)),
            "",
        )
        transcribed_text = current_state.get("transcribed_text", "")
        image_description = current_state.get("image_description", "")
        return latest_report, current_state, "Please provide feedback.", transcribed_text, image_description

    current_state["news_report"] = add_messages(current_state["news_report"], [HumanMessage(content=feedback)])
    current_state.update(agent.revise_report(current_state))

    latest_report = current_state["news_report"][-1].content
    transcribed_text = current_state.get("transcribed_text") or "No audio was provided."
    image_description = current_state.get("image_description") or "No image was provided."

    return latest_report, current_state, "", transcribed_text, image_description


def run_save(current_state):
    """Handles the save step."""
    save_update = agent.save_report(current_state)
    return save_update["final_message"]
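

# Build the Examples gallery from the ./examples directory: audio-only rows,
# then image-only rows, then paired audio + image rows.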
AUDIO_EXTENSIONS = (".wav", ".mp3", ".m4a", ".flac")
IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png", ".webp", ".gif")
example_list = []

examples_dir = "examples"
if os.path.isdir(examples_dir):
    audio_files = sorted(
        f for f in os.listdir(examples_dir)
        if f.lower().endswith(AUDIO_EXTENSIONS)
    )
    image_files = sorted(
        f for f in os.listdir(examples_dir)
        if f.lower().endswith(IMAGE_EXTENSIONS)
    )

    for af in audio_files:
        example_list.append([os.path.join(examples_dir, af), None])

    for imf in image_files:
        example_list.append([None, os.path.join(examples_dir, imf)])

    for af, imf in zip(audio_files, image_files):
        example_list.append([os.path.join(examples_dir, af), os.path.join(examples_dir, imf)])
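

# Two-column UI: uploads and examples on the left; the generated report, source
# details, and revision controls on the right.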
with gr.Blocks(theme=gr.themes.Soft(), title="Multimodal News Reporter") as demo:
    agent_state = gr.State(value=None)

    gr.Markdown("# 📰 Multimodal News Reporter AI")
    gr.Markdown(
        "- Upload an audio recording and/or a relevant image; the AI will generate a news report you can revise and save.\n"
        "- Output is capped at 128 tokens purely for faster inference.\n"
        "- Note: this demo currently runs on CPU only.\n"
        "- Sample audio is trimmed to 10 seconds for faster inference.\n"
        "- Combined audio + image inference takes ~250-350 seconds; audio-only or image-only is much faster."
    )

    with gr.Row():
        with gr.Column(scale=1):
            audio_input = gr.Audio(label="Audio Interview Evidence", type="filepath")
            image_input = gr.Image(label="Image Evidence", type="filepath")
            generate_btn = gr.Button("🚀 Generate Initial Report", variant="primary")

            gr.Examples(
                examples=example_list,
                inputs=[audio_input, image_input],
                label="Click an example to test",
            )

        with gr.Column(scale=2):
            report_output = gr.Textbox(label="Generated News Report", lines=12, interactive=False)
            status_output = gr.Markdown(value="")

            with gr.Accordion("Show Source Information", open=False):
                transcribed_audio_output = gr.Textbox(label="🎤 Transcribed Audio", interactive=False, lines=5)
                image_description_output = gr.Textbox(label="🖼️ Image Description", interactive=False, lines=5)

            with gr.Group(visible=False) as revision_group:
                gr.Markdown("### ✍️ Provide Feedback for Revision")
                feedback_input = gr.Textbox(label="Your Feedback", placeholder="e.g., 'Make the tone more formal.'")
                with gr.Row():
                    revise_btn = gr.Button("🔄 Revise Report")
                    save_btn = gr.Button("💾 Save Final Report")
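
    # Wire the buttons: generating reveals the revision controls, revising clears
    # the feedback box afterwards, and saving delegates to the agent's save step.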
    generate_btn.click(
        fn=run_initial_generation,
        inputs=[audio_input, image_input],
        outputs=[report_output, agent_state, revision_group, status_output,
                 transcribed_audio_output, image_description_output],
    )
    revise_btn.click(
        fn=run_revision,
        inputs=[feedback_input, agent_state],
        outputs=[report_output, agent_state, status_output,
                 transcribed_audio_output, image_description_output],
    ).then(fn=lambda: "", outputs=[feedback_input])
    save_btn.click(
        fn=run_save,
        inputs=[agent_state],
        outputs=[status_output],
    )
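
# debug=True prints errors raised by the event handlers to the console.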
if __name__ == "__main__":
    demo.launch(debug=True)