jesusvilela commited on
Commit
d984c3b
·
verified ·
1 Parent(s): f404032

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +17 -12
app.py CHANGED
@@ -48,7 +48,7 @@ from langchain.prompts import PromptTemplate
48
  from langchain.tools import BaseTool, tool as lc_tool_decorator
49
  from langchain_google_genai import ChatGoogleGenerativeAI
50
  from langchain.agents import AgentExecutor, create_react_agent
51
- from langchain_community.tools import DuckDuckGoSearchRun
52
  from langchain_experimental.tools import PythonREPLTool
53
 
54
  # LangGraph Conditional Imports
@@ -127,7 +127,7 @@ except ImportError as e:
127
  # --- Constants ---
128
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
129
  GEMINI_MODEL_NAME = "gemini-2.5-pro"
130
- GEMINI_FLASH_MULTIMODAL_MODEL_NAME = "gemini-2.5-pro"
131
  SCORING_API_BASE_URL = os.getenv("SCORING_API_URL", DEFAULT_API_URL)
132
  MAX_FILE_SIZE_BYTES = 50 * 1024 * 1024
133
  LOCAL_FILE_STORE_PATH = "./Data"
@@ -139,6 +139,7 @@ WHISPER_MODEL: Optional[Any] = None
139
  # --- Environment Variables & API Keys ---
140
  GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
141
  HUGGINGFACE_TOKEN = os.environ.get("HF_TOKEN")
 
142
 
143
  # --- Setup Logging ---
144
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(name)s - %(module)s:%(lineno)d - %(message)s')
@@ -255,7 +256,7 @@ def _download_file(file_identifier: str, task_id_for_file: Optional[str] = None)
255
  name_without_ext, current_ext = os.path.splitext(effective_save_path)
256
  if not current_ext:
257
  content_type_header = r.headers.get('content-type', '')
258
- content_type_val = content_type_header.split(';')[0].strip() if content_type_header else ''
259
  if content_type_val:
260
  guessed_ext = mimetypes.guess_extension(content_type_val)
261
  if guessed_ext: effective_save_path += guessed_ext; logger.info(f"Added guessed ext: {guessed_ext}")
@@ -324,10 +325,10 @@ def transcribe_audio_tool(action_input_json_str: str) -> str:
324
 
325
  @lc_tool_decorator
326
  def direct_multimodal_gemini_tool(action_input_json_str: str) -> str:
327
- """Processes an image file (URL or local path) along with a text prompt using a Gemini multimodal model (gemini-1.5-flash-preview-0514) for tasks like image description, Q&A about the image, or text generation based on the image. Input: JSON '{\"file_identifier\": \"IMAGE_FILENAME_OR_URL\", \"text_prompt\": \"Your question or instruction related to the image.\", \"task_id\": \"TASK_ID_IF_GAIA_FILENAME_ONLY\" (optional)}'. Returns the model's text response."""
328
  global google_genai_client
329
  if not google_genai_client: return "Error: google-genai SDK client not initialized."
330
- if not PIL_TESSERACT_AVAILABLE : return "Error: Pillow (PIL) library not available for image processing." # Relies on PIL_TESSERACT_AVAILABLE for PIL
331
  try:
332
  data = json.loads(action_input_json_str)
333
  file_identifier = data.get("file_identifier")
@@ -342,7 +343,7 @@ def direct_multimodal_gemini_tool(action_input_json_str: str) -> str:
342
  except Exception as e_img_open: return f"Error opening image file {local_image_path}: {str(e_img_open)}"
343
 
344
  model_id_for_client = f"models/{GEMINI_FLASH_MULTIMODAL_MODEL_NAME}" if not GEMINI_FLASH_MULTIMODAL_MODEL_NAME.startswith("models/") else GEMINI_FLASH_MULTIMODAL_MODEL_NAME
345
- response = google_genai_client.models.generate_content( # Corrected to use google_genai_client.models
346
  model=model_id_for_client, contents=[pil_image, text_prompt]
347
  )
348
  logger.info(f"Direct Multimodal Tool: Response received from {model_id_for_client}.")
@@ -361,7 +362,7 @@ You have access to the following tools. Use them if necessary.
361
  TOOL USAGE:
362
  - To use a tool, your response must include a `tool_calls` attribute in the AIMessage. Each tool call should be a dictionary with "name", "args" (a dictionary of arguments), and "id".
363
  - For file tools ('read_pdf_tool', 'ocr_image_tool', 'transcribe_audio_tool', 'direct_multimodal_gemini_tool'): The `args` field must be a dictionary with a single key 'action_input_json_str' whose value is a JSON STRING. Example: {{"action_input_json_str": "{{\\"file_identifier\\": \\"file.pdf\\", \\"task_id\\": \\"123\\"}}"}}.
364
- - 'web_search': `args` is like '{{"query": "search query"}}'.
365
  - 'python_repl': `args` is like '{{"query": "python code string"}}'. Use print() for output.
366
  RESPONSE FORMAT:
367
  Final AIMessage should contain ONLY the answer in 'content' and NO 'tool_calls'. If using tools, 'content' can be thought process, with 'tool_calls'.
@@ -372,7 +373,7 @@ Goal: EXACT MATCH answer. No extra text/markdown.
372
  Tools: {tools}
373
  Process: Question -> Thought -> Action (ONE of [{tool_names}]) -> Action Input -> Observation -> Thought ... -> Final Answer: [exact answer]
374
  Tool Inputs:
375
- - web_search: Your search query string.
376
  - python_repl: Python code string. Use print(). For Excel/CSV, use pandas: import pandas as pd; df = pd.read_excel('./Data/TASKID_filename.xlsx'); print(df.head())
377
  - read_pdf_tool, ocr_image_tool, transcribe_audio_tool, direct_multimodal_gemini_tool: JSON string like '{{"file_identifier": "FILENAME_OR_URL", "task_id": "CURRENT_TASK_ID_IF_FILENAME"}}'.
378
  If tool fails or info missing, Final Answer: N/A. Do NOT use unlisted tools.
@@ -394,7 +395,7 @@ def initialize_agent_and_tools(force_reinit=False):
394
  google_api_key=GOOGLE_API_KEY,
395
  temperature=0.0,
396
  timeout=120,
397
- convert_system_message_to_human=False # Explicitly set to False
398
  )
399
  logger.info(f"LangChain LLM (Planner) initialized: {GEMINI_MODEL_NAME} (Using default safety settings, convert_system_message_to_human=False)")
400
  except Exception as e:
@@ -407,8 +408,11 @@ def initialize_agent_and_tools(force_reinit=False):
407
  if WHISPER_AVAILABLE: TOOLS.append(transcribe_audio_tool)
408
  if google_genai_client and PIL_TESSERACT_AVAILABLE: TOOLS.append(direct_multimodal_gemini_tool); logger.info("Added 'direct_multimodal_gemini_tool'.")
409
  else: logger.warning("'direct_multimodal_gemini_tool' NOT added (client or PIL missing).")
410
- try: search_tool = DuckDuckGoSearchRun(name="web_search"); search_tool.description = "Web search. Input: query."; TOOLS.append(search_tool)
411
- except Exception as e: logger.warning(f"DuckDuckGoSearchRun init failed: {e}")
 
 
 
412
  try: python_repl = PythonREPLTool(name="python_repl"); python_repl.description = "Python REPL. print() for output. The input is a single string of code."; TOOLS.append(python_repl)
413
  except Exception as e: logger.warning(f"PythonREPLTool init failed: {e}")
414
  logger.info(f"Final tools list for agent: {[t.name for t in TOOLS]}")
@@ -622,7 +626,7 @@ with gr.Blocks(css=".gradio-container {max-width:1280px !important;margin:auto !
622
  1. **Login with Hugging Face** using the button below. Your HF username will be used for submission.
623
  2. Click 'Run Evaluation & Submit' to process GAIA questions (typically 20).
624
  3. **Goal: 30%+ (6/20).** Agent uses Gemini Pro ({GEMINI_MODEL_NAME}) as planner. Tools include Web Search, Python, PDF, OCR, Audio/YouTube, and a new Direct Multimodal tool using Gemini Flash ({GEMINI_FLASH_MULTIMODAL_MODEL_NAME}).
625
- 4. Ensure `GOOGLE_API_KEY` and `HUGGINGFACE_TOKEN` are Space secrets.
626
  5. Check Space logs for details. LangGraph is attempted (ReAct fallback).""")
627
 
628
  agent_status_display = gr.Markdown("**Agent Status:** Initializing...")
@@ -667,6 +671,7 @@ if __name__ == "__main__":
667
  missing_vars_startup_list_global.clear()
668
  if not GOOGLE_API_KEY: missing_vars_startup_list_global.append("GOOGLE_API_KEY")
669
  if not HUGGINGFACE_TOKEN: missing_vars_startup_list_global.append("HUGGINGFACE_TOKEN (for GAIA API)")
 
670
 
671
  try:
672
  logger.info("Pre-initializing agent...")
 
48
  from langchain.tools import BaseTool, tool as lc_tool_decorator
49
  from langchain_google_genai import ChatGoogleGenerativeAI
50
  from langchain.agents import AgentExecutor, create_react_agent
51
+ from langchain_community.tools.tavily_search import TavilySearchResults
52
  from langchain_experimental.tools import PythonREPLTool
53
 
54
  # LangGraph Conditional Imports
 
127
  # --- Constants ---
128
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
129
  GEMINI_MODEL_NAME = "gemini-2.5-pro"
130
+ GEMINI_FLASH_MULTIMODAL_MODEL_NAME = "gemini-1.5-flash-latest"
131
  SCORING_API_BASE_URL = os.getenv("SCORING_API_URL", DEFAULT_API_URL)
132
  MAX_FILE_SIZE_BYTES = 50 * 1024 * 1024
133
  LOCAL_FILE_STORE_PATH = "./Data"
 
139
  # --- Environment Variables & API Keys ---
140
  GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
141
  HUGGINGFACE_TOKEN = os.environ.get("HF_TOKEN")
142
+ TAVILY_API_KEY = os.environ.get("TAVILY_API_KEY")
143
 
144
  # --- Setup Logging ---
145
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(name)s - %(module)s:%(lineno)d - %(message)s')
 
256
  name_without_ext, current_ext = os.path.splitext(effective_save_path)
257
  if not current_ext:
258
  content_type_header = r.headers.get('content-type', '')
259
+ content_type_val = content_type_header.split(';')[0].strip() if content_type_header else ''
260
  if content_type_val:
261
  guessed_ext = mimetypes.guess_extension(content_type_val)
262
  if guessed_ext: effective_save_path += guessed_ext; logger.info(f"Added guessed ext: {guessed_ext}")
 
325
 
326
  @lc_tool_decorator
327
  def direct_multimodal_gemini_tool(action_input_json_str: str) -> str:
328
+ """Processes an image file (URL or local path) along with a text prompt using a Gemini multimodal model (gemini-1.5-flash-latest) for tasks like image description, Q&A about the image, or text generation based on the image. Input: JSON '{\"file_identifier\": \"IMAGE_FILENAME_OR_URL\", \"text_prompt\": \"Your question or instruction related to the image.\", \"task_id\": \"TASK_ID_IF_GAIA_FILENAME_ONLY\" (optional)}'. Returns the model's text response."""
329
  global google_genai_client
330
  if not google_genai_client: return "Error: google-genai SDK client not initialized."
331
+ if not PIL_TESSERACT_AVAILABLE : return "Error: Pillow (PIL) library not available for image processing."
332
  try:
333
  data = json.loads(action_input_json_str)
334
  file_identifier = data.get("file_identifier")
 
343
  except Exception as e_img_open: return f"Error opening image file {local_image_path}: {str(e_img_open)}"
344
 
345
  model_id_for_client = f"models/{GEMINI_FLASH_MULTIMODAL_MODEL_NAME}" if not GEMINI_FLASH_MULTIMODAL_MODEL_NAME.startswith("models/") else GEMINI_FLASH_MULTIMODAL_MODEL_NAME
346
+ response = google_genai_client.models.generate_content(
347
  model=model_id_for_client, contents=[pil_image, text_prompt]
348
  )
349
  logger.info(f"Direct Multimodal Tool: Response received from {model_id_for_client}.")
 
362
  TOOL USAGE:
363
  - To use a tool, your response must include a `tool_calls` attribute in the AIMessage. Each tool call should be a dictionary with "name", "args" (a dictionary of arguments), and "id".
364
  - For file tools ('read_pdf_tool', 'ocr_image_tool', 'transcribe_audio_tool', 'direct_multimodal_gemini_tool'): The `args` field must be a dictionary with a single key 'action_input_json_str' whose value is a JSON STRING. Example: {{"action_input_json_str": "{{\\"file_identifier\\": \\"file.pdf\\", \\"task_id\\": \\"123\\"}}"}}.
365
+ - 'tavily_search_results_json': `args` is like '{{"query": "search query"}}'.
366
  - 'python_repl': `args` is like '{{"query": "python code string"}}'. Use print() for output.
367
  RESPONSE FORMAT:
368
  Final AIMessage should contain ONLY the answer in 'content' and NO 'tool_calls'. If using tools, 'content' can be thought process, with 'tool_calls'.
 
373
  Tools: {tools}
374
  Process: Question -> Thought -> Action (ONE of [{tool_names}]) -> Action Input -> Observation -> Thought ... -> Final Answer: [exact answer]
375
  Tool Inputs:
376
+ - tavily_search_results_json: Your search query string.
377
  - python_repl: Python code string. Use print(). For Excel/CSV, use pandas: import pandas as pd; df = pd.read_excel('./Data/TASKID_filename.xlsx'); print(df.head())
378
  - read_pdf_tool, ocr_image_tool, transcribe_audio_tool, direct_multimodal_gemini_tool: JSON string like '{{"file_identifier": "FILENAME_OR_URL", "task_id": "CURRENT_TASK_ID_IF_FILENAME"}}'.
379
  If tool fails or info missing, Final Answer: N/A. Do NOT use unlisted tools.
 
395
  google_api_key=GOOGLE_API_KEY,
396
  temperature=0.0,
397
  timeout=120,
398
+ convert_system_message_to_human=False
399
  )
400
  logger.info(f"LangChain LLM (Planner) initialized: {GEMINI_MODEL_NAME} (Using default safety settings, convert_system_message_to_human=False)")
401
  except Exception as e:
 
408
  if WHISPER_AVAILABLE: TOOLS.append(transcribe_audio_tool)
409
  if google_genai_client and PIL_TESSERACT_AVAILABLE: TOOLS.append(direct_multimodal_gemini_tool); logger.info("Added 'direct_multimodal_gemini_tool'.")
410
  else: logger.warning("'direct_multimodal_gemini_tool' NOT added (client or PIL missing).")
411
+ try:
412
+ search_tool = TavilySearchResults(max_results=3)
413
+ TOOLS.append(search_tool)
414
+ logger.info("Added 'TavilySearchResults' tool.")
415
+ except Exception as e: logger.warning(f"TavilySearchResults init failed: {e}")
416
  try: python_repl = PythonREPLTool(name="python_repl"); python_repl.description = "Python REPL. print() for output. The input is a single string of code."; TOOLS.append(python_repl)
417
  except Exception as e: logger.warning(f"PythonREPLTool init failed: {e}")
418
  logger.info(f"Final tools list for agent: {[t.name for t in TOOLS]}")
 
626
  1. **Login with Hugging Face** using the button below. Your HF username will be used for submission.
627
  2. Click 'Run Evaluation & Submit' to process GAIA questions (typically 20).
628
  3. **Goal: 30%+ (6/20).** Agent uses Gemini Pro ({GEMINI_MODEL_NAME}) as planner. Tools include Web Search, Python, PDF, OCR, Audio/YouTube, and a new Direct Multimodal tool using Gemini Flash ({GEMINI_FLASH_MULTIMODAL_MODEL_NAME}).
629
+ 4. Ensure `GOOGLE_API_KEY`, `HUGGINGFACE_TOKEN`, and `TAVILY_API_KEY` are Space secrets.
630
  5. Check Space logs for details. LangGraph is attempted (ReAct fallback).""")
631
 
632
  agent_status_display = gr.Markdown("**Agent Status:** Initializing...")
 
671
  missing_vars_startup_list_global.clear()
672
  if not GOOGLE_API_KEY: missing_vars_startup_list_global.append("GOOGLE_API_KEY")
673
  if not HUGGINGFACE_TOKEN: missing_vars_startup_list_global.append("HUGGINGFACE_TOKEN (for GAIA API)")
674
+ if not TAVILY_API_KEY: missing_vars_startup_list_global.append("TAVILY_API_KEY (for Tavily Search)")
675
 
676
  try:
677
  logger.info("Pre-initializing agent...")