import gc
import os
from typing import Optional, Sequence

import torch
from langchain_core.messages import AIMessage, BaseMessage, HumanMessage
from transformers import AutoProcessor, AutoModelForImageTextToText
from typing_extensions import TypedDict
|
|
def add_messages(left: Sequence[BaseMessage], right: Sequence[BaseMessage]) -> Sequence[BaseMessage]:
    """Concatenates two sequences of messages."""
    # Coerce to lists so arbitrary Sequence types (e.g. tuples) concatenate safely.
    return list(left) + list(right)
|
|
class AgentState(TypedDict):
    """Defines the state of our agent."""
    audio_path: Optional[str]
    image_path: Optional[str]
    transcribed_text: Optional[str]
    image_description: Optional[str]
    news_report: Sequence[BaseMessage]
    final_message: Optional[str]
|
|
class NewsReporterAgent:
    def __init__(self):
        """Initializes the agent by loading the model and processor from the Hub."""
        print("--- 🚀 INITIALIZING MODEL (this may take a moment) ---")

        model_id = "google/gemma-3n-E2B-it"

        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        print(f" > Using device: {self.device}")

        self.processor = AutoProcessor.from_pretrained(model_id)
        self.model = AutoModelForImageTextToText.from_pretrained(
            model_id, torch_dtype="auto"
        ).to(self.device)

        print("--- ✅ MODEL READY ---")
|
    def _generate(self, messages: list) -> str:
        """Private helper to run model inference and handle memory."""
        inputs = self.processor.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors="pt",
        ).to(self.device, dtype=self.model.dtype)

        outputs = self.model.generate(**inputs, max_new_tokens=128, disable_compile=True)
        # Decode only the newly generated tokens (everything after the prompt),
        # skipping special tokens such as end-of-turn markers.
        text = self.processor.decode(
            outputs[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True
        )

        # Free the input and output tensors eagerly; multimodal tensors can be large.
        del inputs
        del outputs
        torch.cuda.empty_cache()
        gc.collect()
        return text
|
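    # `_generate` accepts any multimodal chat payload supported by the Gemma 3n
    # processor; the concrete message structures are built by the node methods
    # below (see `transcribe_audio` and `describe_image`).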
|
    def transcribe_audio(self, state: AgentState) -> dict:
        """Transcribes the audio file specified in the state."""
        print("--- 🎤 TRANSCRIBING AUDIO ---")
        audio_path = state.get("audio_path")
        if not audio_path:
            return {}
        messages = [{
            "role": "user",
            "content": [
                {"type": "audio", "audio": audio_path},
                {"type": "text", "text": "Transcribe the following audio. Provide only the transcribed text."},
            ],
        }]
        transcribed_text = self._generate(messages)
        print(" > Transcription generated.")
        return {"transcribed_text": transcribed_text}
|
    def describe_image(self, state: AgentState) -> dict:
        """Generates a description for the image specified in the state."""
        print("--- 🖼️ DESCRIBING IMAGE ---")
        image_path = state.get("image_path")
        if not image_path:
            return {}
        messages = [{
            "role": "user",
            "content": [
                {"type": "image", "image": image_path},
                {"type": "text", "text": "Describe this image in detail."},
            ],
        }]
        image_description = self._generate(messages)
        print(" > Description generated.")
        return {"image_description": image_description}
|
    def create_report(self, state: AgentState) -> dict:
        """Generates a news report from transcription and/or image description."""
        print("--- ✍️ GENERATING NEWS REPORT ---")
        context_parts = [
            "You are an expert news reporter. Your task is to write a clear, concise, and factual news report...",
            "Synthesize all available information into a single, coherent story...",
        ]
        transcribed_text = state.get("transcribed_text")
        image_description = state.get("image_description")
        if not transcribed_text and not image_description:
            return {"news_report": [AIMessage(content="No input provided to generate a report.")]}
        if transcribed_text:
            context_parts.append(f"--- Transcribed Audio ---\n\"{transcribed_text}\"")
        if image_description:
            context_parts.append(f"--- Image Description ---\n\"{image_description}\"")
        prompt = "\n\n".join(context_parts)
        report_content = self._generate([{"role": "user", "content": [{"type": "text", "text": prompt}]}])
        print(" > Report generated successfully.")
        return {"news_report": [AIMessage(content=report_content)]}
|
    def revise_report(self, state: AgentState) -> dict:
        """Revises the news report based on the latest human feedback."""
        print("--- 📝 REVISING REPORT ---")

        # `dict.get(key, default)` does not apply the default when the key is
        # present but set to None, so fall back explicitly.
        transcribed = state.get("transcribed_text") or "Not available."
        image_desc = state.get("image_description") or "Not available."
        human_feedback = next((msg.content for msg in reversed(state["news_report"]) if isinstance(msg, HumanMessage)), None)
        last_ai_report = next((msg.content for msg in reversed(state["news_report"]) if isinstance(msg, AIMessage)), None)

        prompt = f"""You are a professional news editor. Revise the news report to address the feedback...
**Original Source Information:**
--- Transcribed Audio ---
"{transcribed}"
--- Image Description ---
"{image_desc}"
**Current Draft of News Report:**
"{last_ai_report}"
**Latest Human Feedback:**
"{human_feedback}"
Provide only the full, revised news report..."""
        revised_content = self._generate([{"role": "user", "content": [{"type": "text", "text": prompt}]}])
        print(" > Revision complete.")
        return {"news_report": add_messages(state["news_report"], [AIMessage(content=revised_content)])}
|
    def save_report(self, state: AgentState) -> dict:
        """Saves the latest AI-generated news report to a text file."""
        print("--- 💾 SAVING REPORT ---")
        latest_report_msg = next((msg for msg in reversed(state["news_report"]) if isinstance(msg, AIMessage)), None)
        if not latest_report_msg:
            return {"final_message": "Error: No report available to save."}
        output_dir = "saved_reports"
        os.makedirs(output_dir, exist_ok=True)
        filename = os.path.join(output_dir, f"news_report_{len(os.listdir(output_dir)) + 1}.txt")
        with open(filename, "w", encoding="utf-8") as f:
            f.write(latest_report_msg.content)
        final_message = f"✅ Report saved to: **{filename}**"
        print(f" > {final_message}")
        return {"final_message": final_message}