Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -48,7 +48,7 @@ from langchain.prompts import PromptTemplate
|
|
48 |
from langchain.tools import BaseTool, tool as lc_tool_decorator
|
49 |
from langchain_google_genai import ChatGoogleGenerativeAI
|
50 |
from langchain.agents import AgentExecutor, create_react_agent
|
51 |
-
from langchain_community.tools import
|
52 |
from langchain_experimental.tools import PythonREPLTool
|
53 |
|
54 |
# LangGraph Conditional Imports
|
@@ -127,7 +127,7 @@ except ImportError as e:
|
|
127 |
# --- Constants ---
|
128 |
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
129 |
GEMINI_MODEL_NAME = "gemini-2.5-pro"
|
130 |
-
GEMINI_FLASH_MULTIMODAL_MODEL_NAME = "gemini-
|
131 |
SCORING_API_BASE_URL = os.getenv("SCORING_API_URL", DEFAULT_API_URL)
|
132 |
MAX_FILE_SIZE_BYTES = 50 * 1024 * 1024
|
133 |
LOCAL_FILE_STORE_PATH = "./Data"
|
@@ -139,6 +139,7 @@ WHISPER_MODEL: Optional[Any] = None
|
|
139 |
# --- Environment Variables & API Keys ---
|
140 |
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
|
141 |
HUGGINGFACE_TOKEN = os.environ.get("HF_TOKEN")
|
|
|
142 |
|
143 |
# --- Setup Logging ---
|
144 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(name)s - %(module)s:%(lineno)d - %(message)s')
|
@@ -255,7 +256,7 @@ def _download_file(file_identifier: str, task_id_for_file: Optional[str] = None)
|
|
255 |
name_without_ext, current_ext = os.path.splitext(effective_save_path)
|
256 |
if not current_ext:
|
257 |
content_type_header = r.headers.get('content-type', '')
|
258 |
-
content_type_val = content_type_header.split(';')
|
259 |
if content_type_val:
|
260 |
guessed_ext = mimetypes.guess_extension(content_type_val)
|
261 |
if guessed_ext: effective_save_path += guessed_ext; logger.info(f"Added guessed ext: {guessed_ext}")
|
@@ -324,10 +325,10 @@ def transcribe_audio_tool(action_input_json_str: str) -> str:
|
|
324 |
|
325 |
@lc_tool_decorator
|
326 |
def direct_multimodal_gemini_tool(action_input_json_str: str) -> str:
|
327 |
-
"""Processes an image file (URL or local path) along with a text prompt using a Gemini multimodal model (gemini-1.5-flash-
|
328 |
global google_genai_client
|
329 |
if not google_genai_client: return "Error: google-genai SDK client not initialized."
|
330 |
-
if not PIL_TESSERACT_AVAILABLE : return "Error: Pillow (PIL) library not available for image processing."
|
331 |
try:
|
332 |
data = json.loads(action_input_json_str)
|
333 |
file_identifier = data.get("file_identifier")
|
@@ -342,7 +343,7 @@ def direct_multimodal_gemini_tool(action_input_json_str: str) -> str:
|
|
342 |
except Exception as e_img_open: return f"Error opening image file {local_image_path}: {str(e_img_open)}"
|
343 |
|
344 |
model_id_for_client = f"models/{GEMINI_FLASH_MULTIMODAL_MODEL_NAME}" if not GEMINI_FLASH_MULTIMODAL_MODEL_NAME.startswith("models/") else GEMINI_FLASH_MULTIMODAL_MODEL_NAME
|
345 |
-
response = google_genai_client.models.generate_content(
|
346 |
model=model_id_for_client, contents=[pil_image, text_prompt]
|
347 |
)
|
348 |
logger.info(f"Direct Multimodal Tool: Response received from {model_id_for_client} received.")
|
@@ -361,7 +362,7 @@ You have access to the following tools. Use them if necessary.
|
|
361 |
TOOL USAGE:
|
362 |
- To use a tool, your response must include a `tool_calls` attribute in the AIMessage. Each tool call should be a dictionary with "name", "args" (a dictionary of arguments), and "id".
|
363 |
- For file tools ('read_pdf_tool', 'ocr_image_tool', 'transcribe_audio_tool', 'direct_multimodal_gemini_tool'): The `args` field must be a dictionary with a single key 'action_input_json_str' whose value is a JSON STRING. Example: {{"action_input_json_str": "{{\\"file_identifier\\": \\"file.pdf\\", \\"task_id\\": \\"123\\"}}"}}.
|
364 |
-
- '
|
365 |
- 'python_repl': `args` is like '{{"query": "python code string"}}'. Use print() for output.
|
366 |
RESPONSE FORMAT:
|
367 |
Final AIMessage should contain ONLY the answer in 'content' and NO 'tool_calls'. If using tools, 'content' can be thought process, with 'tool_calls'.
|
@@ -372,7 +373,7 @@ Goal: EXACT MATCH answer. No extra text/markdown.
|
|
372 |
Tools: {tools}
|
373 |
Process: Question -> Thought -> Action (ONE of [{tool_names}]) -> Action Input -> Observation -> Thought ... -> Final Answer: [exact answer]
|
374 |
Tool Inputs:
|
375 |
-
-
|
376 |
- python_repl: Python code string. Use print(). For Excel/CSV, use pandas: import pandas as pd; df = pd.read_excel('./Data/TASKID_filename.xlsx'); print(df.head())
|
377 |
- read_pdf_tool, ocr_image_tool, transcribe_audio_tool, direct_multimodal_gemini_tool: JSON string like '{{"file_identifier": "FILENAME_OR_URL", "task_id": "CURRENT_TASK_ID_IF_FILENAME"}}'.
|
378 |
If tool fails or info missing, Final Answer: N/A. Do NOT use unlisted tools.
|
@@ -394,7 +395,7 @@ def initialize_agent_and_tools(force_reinit=False):
|
|
394 |
google_api_key=GOOGLE_API_KEY,
|
395 |
temperature=0.0,
|
396 |
timeout=120,
|
397 |
-
convert_system_message_to_human=False
|
398 |
)
|
399 |
logger.info(f"LangChain LLM (Planner) initialized: {GEMINI_MODEL_NAME} (Using default safety settings, convert_system_message_to_human=False)")
|
400 |
except Exception as e:
|
@@ -407,8 +408,11 @@ def initialize_agent_and_tools(force_reinit=False):
|
|
407 |
if WHISPER_AVAILABLE: TOOLS.append(transcribe_audio_tool)
|
408 |
if google_genai_client and PIL_TESSERACT_AVAILABLE: TOOLS.append(direct_multimodal_gemini_tool); logger.info("Added 'direct_multimodal_gemini_tool'.")
|
409 |
else: logger.warning("'direct_multimodal_gemini_tool' NOT added (client or PIL missing).")
|
410 |
-
try:
|
411 |
-
|
|
|
|
|
|
|
412 |
try: python_repl = PythonREPLTool(name="python_repl"); python_repl.description = "Python REPL. print() for output. The input is a single string of code."; TOOLS.append(python_repl)
|
413 |
except Exception as e: logger.warning(f"PythonREPLTool init failed: {e}")
|
414 |
logger.info(f"Final tools list for agent: {[t.name for t in TOOLS]}")
|
@@ -622,7 +626,7 @@ with gr.Blocks(css=".gradio-container {max-width:1280px !important;margin:auto !
|
|
622 |
1. **Login with Hugging Face** using the button below. Your HF username will be used for submission.
|
623 |
2. Click 'Run Evaluation & Submit' to process GAIA questions (typically 20).
|
624 |
3. **Goal: 30%+ (6/20).** Agent uses Gemini Pro ({GEMINI_MODEL_NAME}) as planner. Tools include Web Search, Python, PDF, OCR, Audio/YouTube, and a new Direct Multimodal tool using Gemini Flash ({GEMINI_FLASH_MULTIMODAL_MODEL_NAME}).
|
625 |
-
4. Ensure `GOOGLE_API_KEY` and `
|
626 |
5. Check Space logs for details. LangGraph is attempted (ReAct fallback).""")
|
627 |
|
628 |
agent_status_display = gr.Markdown("**Agent Status:** Initializing...")
|
@@ -667,6 +671,7 @@ if __name__ == "__main__":
|
|
667 |
missing_vars_startup_list_global.clear()
|
668 |
if not GOOGLE_API_KEY: missing_vars_startup_list_global.append("GOOGLE_API_KEY")
|
669 |
if not HUGGINGFACE_TOKEN: missing_vars_startup_list_global.append("HUGGINGFACE_TOKEN (for GAIA API)")
|
|
|
670 |
|
671 |
try:
|
672 |
logger.info("Pre-initializing agent...")
|
|
|
48 |
from langchain.tools import BaseTool, tool as lc_tool_decorator
|
49 |
from langchain_google_genai import ChatGoogleGenerativeAI
|
50 |
from langchain.agents import AgentExecutor, create_react_agent
|
51 |
+
from langchain_community.tools.tavily_search import TavilySearchResults
|
52 |
from langchain_experimental.tools import PythonREPLTool
|
53 |
|
54 |
# LangGraph Conditional Imports
|
|
|
127 |
# --- Constants ---
|
128 |
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
129 |
GEMINI_MODEL_NAME = "gemini-2.5-pro"
|
130 |
+
GEMINI_FLASH_MULTIMODAL_MODEL_NAME = "gemini-1.5-flash-latest"
|
131 |
SCORING_API_BASE_URL = os.getenv("SCORING_API_URL", DEFAULT_API_URL)
|
132 |
MAX_FILE_SIZE_BYTES = 50 * 1024 * 1024
|
133 |
LOCAL_FILE_STORE_PATH = "./Data"
|
|
|
139 |
# --- Environment Variables & API Keys ---
|
140 |
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
|
141 |
HUGGINGFACE_TOKEN = os.environ.get("HF_TOKEN")
|
142 |
+
TAVILY_API_KEY = os.environ.get("TAVILY_API_KEY")
|
143 |
|
144 |
# --- Setup Logging ---
|
145 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(name)s - %(module)s:%(lineno)d - %(message)s')
|
|
|
256 |
name_without_ext, current_ext = os.path.splitext(effective_save_path)
|
257 |
if not current_ext:
|
258 |
content_type_header = r.headers.get('content-type', '')
|
259 |
+
content_type_val = content_type_header.split(';')[0].strip() if content_type_header else ''
|
260 |
if content_type_val:
|
261 |
guessed_ext = mimetypes.guess_extension(content_type_val)
|
262 |
if guessed_ext: effective_save_path += guessed_ext; logger.info(f"Added guessed ext: {guessed_ext}")
|
|
|
325 |
|
326 |
@lc_tool_decorator
|
327 |
def direct_multimodal_gemini_tool(action_input_json_str: str) -> str:
|
328 |
+
"""Processes an image file (URL or local path) along with a text prompt using a Gemini multimodal model (gemini-1.5-flash-latest) for tasks like image description, Q&A about the image, or text generation based on the image. Input: JSON '{\"file_identifier\": \"IMAGE_FILENAME_OR_URL\", \"text_prompt\": \"Your question or instruction related to the image.\", \"task_id\": \"TASK_ID_IF_GAIA_FILENAME_ONLY\" (optional)}'. Returns the model's text response."""
|
329 |
global google_genai_client
|
330 |
if not google_genai_client: return "Error: google-genai SDK client not initialized."
|
331 |
+
if not PIL_TESSERACT_AVAILABLE : return "Error: Pillow (PIL) library not available for image processing."
|
332 |
try:
|
333 |
data = json.loads(action_input_json_str)
|
334 |
file_identifier = data.get("file_identifier")
|
|
|
343 |
except Exception as e_img_open: return f"Error opening image file {local_image_path}: {str(e_img_open)}"
|
344 |
|
345 |
model_id_for_client = f"models/{GEMINI_FLASH_MULTIMODAL_MODEL_NAME}" if not GEMINI_FLASH_MULTIMODAL_MODEL_NAME.startswith("models/") else GEMINI_FLASH_MULTIMODAL_MODEL_NAME
|
346 |
+
response = google_genai_client.models.generate_content(
|
347 |
model=model_id_for_client, contents=[pil_image, text_prompt]
|
348 |
)
|
349 |
logger.info(f"Direct Multimodal Tool: Response received from {model_id_for_client} received.")
|
|
|
362 |
TOOL USAGE:
|
363 |
- To use a tool, your response must include a `tool_calls` attribute in the AIMessage. Each tool call should be a dictionary with "name", "args" (a dictionary of arguments), and "id".
|
364 |
- For file tools ('read_pdf_tool', 'ocr_image_tool', 'transcribe_audio_tool', 'direct_multimodal_gemini_tool'): The `args` field must be a dictionary with a single key 'action_input_json_str' whose value is a JSON STRING. Example: {{"action_input_json_str": "{{\\"file_identifier\\": \\"file.pdf\\", \\"task_id\\": \\"123\\"}}"}}.
|
365 |
+
- 'tavily_search_results_json': `args` is like '{{"query": "search query"}}'.
|
366 |
- 'python_repl': `args` is like '{{"query": "python code string"}}'. Use print() for output.
|
367 |
RESPONSE FORMAT:
|
368 |
Final AIMessage should contain ONLY the answer in 'content' and NO 'tool_calls'. If using tools, 'content' can be thought process, with 'tool_calls'.
|
|
|
373 |
Tools: {tools}
|
374 |
Process: Question -> Thought -> Action (ONE of [{tool_names}]) -> Action Input -> Observation -> Thought ... -> Final Answer: [exact answer]
|
375 |
Tool Inputs:
|
376 |
+
- tavily_search_results_json: Your search query string.
|
377 |
- python_repl: Python code string. Use print(). For Excel/CSV, use pandas: import pandas as pd; df = pd.read_excel('./Data/TASKID_filename.xlsx'); print(df.head())
|
378 |
- read_pdf_tool, ocr_image_tool, transcribe_audio_tool, direct_multimodal_gemini_tool: JSON string like '{{"file_identifier": "FILENAME_OR_URL", "task_id": "CURRENT_TASK_ID_IF_FILENAME"}}'.
|
379 |
If tool fails or info missing, Final Answer: N/A. Do NOT use unlisted tools.
|
|
|
395 |
google_api_key=GOOGLE_API_KEY,
|
396 |
temperature=0.0,
|
397 |
timeout=120,
|
398 |
+
convert_system_message_to_human=False
|
399 |
)
|
400 |
logger.info(f"LangChain LLM (Planner) initialized: {GEMINI_MODEL_NAME} (Using default safety settings, convert_system_message_to_human=False)")
|
401 |
except Exception as e:
|
|
|
408 |
if WHISPER_AVAILABLE: TOOLS.append(transcribe_audio_tool)
|
409 |
if google_genai_client and PIL_TESSERACT_AVAILABLE: TOOLS.append(direct_multimodal_gemini_tool); logger.info("Added 'direct_multimodal_gemini_tool'.")
|
410 |
else: logger.warning("'direct_multimodal_gemini_tool' NOT added (client or PIL missing).")
|
411 |
+
try:
|
412 |
+
search_tool = TavilySearchResults(max_results=3)
|
413 |
+
TOOLS.append(search_tool)
|
414 |
+
logger.info("Added 'TavilySearchResults' tool.")
|
415 |
+
except Exception as e: logger.warning(f"TavilySearchResults init failed: {e}")
|
416 |
try: python_repl = PythonREPLTool(name="python_repl"); python_repl.description = "Python REPL. print() for output. The input is a single string of code."; TOOLS.append(python_repl)
|
417 |
except Exception as e: logger.warning(f"PythonREPLTool init failed: {e}")
|
418 |
logger.info(f"Final tools list for agent: {[t.name for t in TOOLS]}")
|
|
|
626 |
1. **Login with Hugging Face** using the button below. Your HF username will be used for submission.
|
627 |
2. Click 'Run Evaluation & Submit' to process GAIA questions (typically 20).
|
628 |
3. **Goal: 30%+ (6/20).** Agent uses Gemini Pro ({GEMINI_MODEL_NAME}) as planner. Tools include Web Search, Python, PDF, OCR, Audio/YouTube, and a new Direct Multimodal tool using Gemini Flash ({GEMINI_FLASH_MULTIMODAL_MODEL_NAME}).
|
629 |
+
4. Ensure `GOOGLE_API_KEY`, `HUGGINGFACE_TOKEN`, and `TAVILY_API_KEY` are Space secrets.
|
630 |
5. Check Space logs for details. LangGraph is attempted (ReAct fallback).""")
|
631 |
|
632 |
agent_status_display = gr.Markdown("**Agent Status:** Initializing...")
|
|
|
671 |
missing_vars_startup_list_global.clear()
|
672 |
if not GOOGLE_API_KEY: missing_vars_startup_list_global.append("GOOGLE_API_KEY")
|
673 |
if not HUGGINGFACE_TOKEN: missing_vars_startup_list_global.append("HUGGINGFACE_TOKEN (for GAIA API)")
|
674 |
+
if not TAVILY_API_KEY: missing_vars_startup_list_global.append("TAVILY_API_KEY (for Tavily Search)")
|
675 |
|
676 |
try:
|
677 |
logger.info("Pre-initializing agent...")
|