Spaces:

bstraehle
/

gaia

Running

File size: 10,865 Bytes

58787c8
00aced7
adf9d94
1c7a0a7
15b8627
1c7a0a7
617431a
1c7a0a7
257d222
f06be5d
230d96d
2c6f8d9
 
 
ce7387d
27f2401
ad1b760
27f2401
2c6f8d9
81edcab
ad1b760
4c6d89e
27f2401
86f2a58
230d96d
df58f18
230d96d
 
3bd69bd
5028b6b
 
 
 
7ddb52b
f407c48
7bc23ab
948a6f4
ed46760
 
b95f33c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
617431a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b95f33c
845c4a7
 
b95f33c
617431a
845c4a7
 
b95f33c
845c4a7
 
 
617431a
a0c3d4c
58787c8
a0c3d4c
 
 
 
 
617431a
 
845c4a7
 
617431a
b3a3878
617431a
 
845c4a7
617431a
 
845c4a7
53643e1
148f2b3
b95f33c
 
 
148f2b3
 
b95f33c
617431a
148f2b3
 
b95f33c
148f2b3
 
 
ad1b760
a0c3d4c
 
 
 
 
 
 
617431a
 
b3a3878
ad1b760
 
 
 
 
 
 
 
 
 
 
 
 
 
617431a
 
148f2b3
617431a
 
ad1b760
 
 
 
 
148f2b3
ad1b760
 
148f2b3
ad1b760
845c4a7
ed46760
8439f94
35828ac
1c7a0a7
617431a
7da0809
 
eea8c7f
617431a
 
 
 
8439f94
 
 
617431a
8439f94
 
617431a
b95f33c
 
 
 
eea8c7f
69b460d
750bbf8
b95f33c
7349562
eea8c7f
8439f94
b95f33c
 
 
 
6bf14de
3981c3e
b40cc33
b95f33c
f855987
1c7a0a7
7f78cfc
617431a
 
b95f33c
 
35828ac
3981c3e
b40cc33
fe51214
f855987
35828ac
b95f33c
 
 
 
 
148f2b3
 
 
b95f33c
148f2b3
 
b95f33c
617431a
 
 
 
 
 
 
 
 
 
 
a412583
b40cc33
148f2b3
fd399dc
fa21952
ea40888
4453360
bec1a98
 
c70f203
4453360
 
ed46760
 
b05f917
a412583
148f2b3
ea40888
b05f917
4453360
7da0809
35828ac
0b498b7
617431a
 
b95f33c
617431a
b95f33c
617431a
a412583
f155629
c70f203
ab1c2b7
 
ed46760
 
27f2401
 
750bbf8
27f2401
 
1e57e78
fe96564
fa21952
948a6f4
750bbf8
 
 
1312508

import base64
import os

import cv2
from crewai import Agent, Crew, Process, Task
from crewai.tools import tool
from crewai_tools import (
    CodeInterpreterTool,
    SerperDevTool,
    WebsiteSearchTool
)
from openai import OpenAI
from openinference.instrumentation.crewai import CrewAIInstrumentor
from phoenix.otel import register

from util import get_final_answer, get_img_b64

# LLMs
#
# Model identifiers consumed by run_crew() below:
#   MANAGER_MODEL      - llm of the manager agent
#   AGENT_MODEL        - llm of every worker agent
#   FINAL_ANSWER_MODEL - passed to get_final_answer() to post-process the crew output
#   AUDIO_MODEL        - audio transcription calls in the audio tools
#   IMAGE_MODEL        - chat completion call in the image analysis tool
#   VIDEO_MODEL        - responses call in the YouTube image analysis tool

MANAGER_MODEL      = "gpt-4.1"
AGENT_MODEL        = "gpt-4.1-mini"
FINAL_ANSWER_MODEL = "gpt-4.5-preview"
AUDIO_MODEL        = "gpt-4o-transcribe"
IMAGE_MODEL        = "gpt-4.1"
VIDEO_MODEL        = "gpt-4.1-mini"

# LLM evaluation (Phoenix tracing)
#
# Everything below runs at import time.

# Required: raises KeyError on import if PHOENIX_API_KEY is not set.
PHOENIX_API_KEY = os.environ["PHOENIX_API_KEY"]

# NOTE(review): these are set before register() is called, presumably because
# phoenix reads them from the environment during registration - confirm.
os.environ["PHOENIX_CLIENT_HEADERS"] = f"api_key={PHOENIX_API_KEY}"
os.environ["PHOENIX_COLLECTOR_ENDPOINT"] = "https://app.phoenix.arize.com"

# Tracer provider for the "gaia" project; also handed to the CrewAI
# instrumentor below.
tracer_provider = register(
    auto_instrument=True,
    project_name="gaia"
)

# Attach CrewAI instrumentation to the tracer provider created above.
CrewAIInstrumentor().instrument(tracer_provider=tracer_provider)

def run_crew(question, file_path):
    """Answer a question with a manager-led crew of specialized agents.

    Builds the custom and built-in tools, one worker agent per capability,
    a manager agent that may delegate to the workers, and a single task.
    The crew's output is then passed through ``get_final_answer``.

    Args:
        question (str): Question to answer.
        file_path (str): Path of a file accompanying the question. A falsy
            value (e.g. "" or None) means there is no file. If the path ends
            with ".py", the file content is appended to the question.

    Returns:
        The value returned by ``get_final_answer`` for the (possibly
        augmented) question and the crew's initial answer.
    """
    # Custom tools
    #
    # NOTE(review): the @tool decorator presumably exposes each docstring to
    # the LLM as the tool description, so the tool docstrings are kept
    # verbatim - confirm before rewording them.

    @tool("Audio Analysis Tool")
    def audio_analysis_tool(question: str, file_path: str) -> str:
        """Answer a question about an audio file.
    
           Args:
               question (str): Question about the audio file
               file_path (str): Path of the audio file
                
           Returns:
               str: Answer to the question about the audio file
                
           Raises:
               FileNotFoundError: If the audio file does not exist
               RuntimeError: If processing fails"""
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"Audio file not found: {file_path}")

        try:
            client = OpenAI()

            # Context manager closes the file handle even if the request fails.
            with open(file_path, "rb") as audio_file:
                transcript = client.audio.transcriptions.create(
                    file=audio_file,
                    model=AUDIO_MODEL,
                    prompt=question
                )

            return transcript.text
        except Exception as e:
            raise RuntimeError(f"Failed to process audio: {str(e)}") from e

    @tool("Image Analysis Tool")
    def image_analysis_tool(question: str, file_path: str) -> str:
        """Answer a question about an image file.
    
           Args:
               question (str): Question about the image file
               file_path (str): Path of the image file
                
           Returns:
               str: Answer to the question about the image file
                
           Raises:
               FileNotFoundError: If the image file does not exist
               RuntimeError: If processing fails"""
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"Image file not found: {file_path}")

        try:
            # Get image

            img_b64 = get_img_b64(file_path)

            # OpenAI

            client = OpenAI()

            completion = client.chat.completions.create(
                messages=[{"role": "user",
                           "content": [{"type": "text", "text": question},
                                       {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img_b64}"}}]}],
                model=IMAGE_MODEL
            )

            return completion.choices[0].message.content
        except Exception as e:
            raise RuntimeError(f"Failed to process image: {str(e)}") from e

    @tool("YouTube Audio Analysis Tool")
    def youtube_audio_analysis_tool(question: str, url: str) -> str:
        """Answer an audio question about a YouTube video.
    
           Args:
               question (str): Audio question about YouTube video
               url (str): YouTube URL
                
           Returns:
               str: Answer to the audio question about YouTube video
                
           Raises:
               RuntimeError: If processing fails"""
        try:
            # YouTube (hack to deal with access issues): only a video that
            # has been saved locally under data/ is supported.

            if url.endswith("1htKBjuUWec"):
                file_path = "data/1htKBjuUWec.mp4"
            else:
                # Give the wrapped error below something to report.
                raise RuntimeError(f"Unsupported YouTube URL: {url}")

            # OpenAI

            client = OpenAI()

            with open(file_path, "rb") as audio_file:
                transcription = client.audio.transcriptions.create(
                    file=audio_file,
                    model=AUDIO_MODEL,
                    prompt=question
                )

            return transcription.text
        except Exception as e:
            raise RuntimeError(f"Failed to process audio: {str(e)}") from e

    @tool("YouTube Image Analysis Tool")
    def youtube_image_analysis_tool(question: str, url: str) -> str:
        """Answer an image question about a YouTube video.
    
           Args:
               question (str): Image question about YouTube video
               url (str): YouTube URL
                
           Returns:
               str: Answer to the image question about YouTube video
                
           Raises:
               RuntimeError: If processing fails"""
        try:
            # YouTube (hack to deal with access issues): only a video that
            # has been saved locally under data/ is supported.

            if url.endswith("L1vXCYZAYYM"):
                file_path = "data/L1vXCYZAYYM.mp4"
            else:
                raise RuntimeError(f"Unsupported YouTube URL: {url}")

            # Get video: decode every frame and keep it as a base64 JPEG.
            # NOTE(review): all decoded frames are sent in a single request;
            # frame sampling may be needed for longer videos - confirm
            # against the API's request limits.

            video = cv2.VideoCapture(file_path)

            base64_frames = []

            try:
                while video.isOpened():
                    success, frame = video.read()

                    if not success:
                        break

                    _, buffer = cv2.imencode(".jpg", frame)

                    base64_frames.append(base64.b64encode(buffer).decode("utf-8"))
            finally:
                # Release the capture even if encoding a frame raises.
                video.release()

            if not base64_frames:
                # Without frames the model would answer from the text alone.
                raise RuntimeError(f"No frames could be read from video: {file_path}")

            # OpenAI

            client = OpenAI()

            response = client.responses.create(
                input=[{"role": "user",
                        "content": [{"type": "input_text", "text": question},
                                    *[{"type": "input_image", "image_url": f"data:image/jpeg;base64,{frame}"} for frame in base64_frames]]}],
                model=VIDEO_MODEL
            )

            return response.output_text
        except Exception as e:
            raise RuntimeError(f"Failed to process video: {str(e)}") from e

    # Built-in tools

    web_search_tool = SerperDevTool()
    web_rag_tool = WebsiteSearchTool()
    python_coding_tool = CodeInterpreterTool()

    # Agents

    web_search_agent = Agent(
        role="Web Search Agent",
        goal="Search the web to help answer question \"{question}\", then scrape the most relevant web page.",
        backstory="As an expert web search assistant, you search the web to help answer the question.",
        allow_delegation=False,
        llm=AGENT_MODEL,
        max_iter=3,
        tools=[web_search_tool, web_rag_tool],
        verbose=False
    )

    audio_analysis_agent = Agent(
        role="Audio Analysis Agent",
        goal="Analyze audio to help answer question \"{question}\"",
        backstory="As an expert audio analysis assistant, you analyze the audio to help answer the question.",
        allow_delegation=False,
        llm=AGENT_MODEL,
        max_iter=3,
        tools=[audio_analysis_tool],
        verbose=False
    )

    image_analysis_agent = Agent(
        role="Image Analysis Agent",
        goal="Analyze image to help answer question \"{question}\"",
        backstory="As an expert image analysis assistant, you analyze the image to help answer the question.",
        allow_delegation=False,
        llm=AGENT_MODEL,
        max_iter=3,
        tools=[image_analysis_tool],
        verbose=False
    )

    youtube_audio_analysis_agent = Agent(
        role="YouTube Audio Analysis Agent",
        goal="Analyze YouTube video to help answer audio question \"{question}\"",
        backstory="As an expert YouTube audio analysis assistant, you analyze the video to help answer the question.",
        allow_delegation=False,
        llm=AGENT_MODEL,
        max_iter=3,
        tools=[youtube_audio_analysis_tool],
        verbose=False
    )

    youtube_image_analysis_agent = Agent(
        role="YouTube Image Analysis Agent",
        goal="Analyze YouTube video to help answer image question \"{question}\"",
        backstory="As an expert YouTube image analysis assistant, you analyze the video to help answer the question.",
        allow_delegation=False,
        llm=AGENT_MODEL,
        max_iter=3,
        tools=[youtube_image_analysis_tool],
        verbose=False
    )

    python_coding_agent = Agent(
        role="Python Coding Agent",
        goal="Write and/or execute Python code to help answer question \"{question}\"",
        backstory="As an expert Python coding assistant, you write and/or execute Python code to help answer the question.",
        allow_delegation=False,
        llm=AGENT_MODEL,
        max_iter=10,
        tools=[python_coding_tool],
        verbose=False
    )

    manager_agent = Agent(
        role="Manager Agent",
        goal="Try to answer the following question. If needed, delegate to one or more of your coworkers for help. "
             "If there is no good coworker, delegate to the Python Coding Agent to implement a tool for the task. "
             "Question: \"{question}\"",
        backstory="As an expert manager assistant, you answer the question.",
        allow_delegation=True,
        llm=MANAGER_MODEL,
        max_iter=5,
        verbose=True
    )

    # Task

    manager_task = Task(
        agent=manager_agent,
        description="Try to answer the following question. If needed, delegate to one or more of your coworkers for help. Question: \"{question}\"",
        expected_output="The answer to the question."
    )

    # Crew

    crew = Crew(
        agents=[web_search_agent,
                audio_analysis_agent,
                image_analysis_agent,
                youtube_audio_analysis_agent,
                youtube_image_analysis_agent,
                python_coding_agent],
        manager_agent=manager_agent,
        tasks=[manager_task],
        verbose=True
    )

    # Process

    if file_path:
        # Tell the agents where the file is so the tools can be called with it.
        question = f"{question} File path: {file_path}."

        # Python sources are inlined into the question as well.
        if file_path.endswith(".py"):
            with open(file_path, "r") as file:
                question = f"{question} File data:\n{file.read()}"

    initial_answer = crew.kickoff(inputs={"question": question})
    final_answer = get_final_answer(FINAL_ANSWER_MODEL, question, str(initial_answer))

    print(f"Question: {question}")
    print(f"Initial answer: {initial_answer}")
    print(f"Final answer: {final_answer}")

    return final_answer