File size: 10,865 Bytes
58787c8
00aced7
adf9d94
1c7a0a7
15b8627
1c7a0a7
617431a
1c7a0a7
257d222
f06be5d
230d96d
2c6f8d9
 
 
ce7387d
27f2401
ad1b760
27f2401
2c6f8d9
81edcab
ad1b760
4c6d89e
27f2401
86f2a58
230d96d
df58f18
230d96d
 
3bd69bd
5028b6b
 
 
 
7ddb52b
f407c48
7bc23ab
948a6f4
ed46760
 
b95f33c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
617431a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b95f33c
845c4a7
 
b95f33c
617431a
845c4a7
 
b95f33c
845c4a7
 
 
617431a
a0c3d4c
58787c8
a0c3d4c
 
 
 
 
617431a
 
845c4a7
 
617431a
b3a3878
617431a
 
845c4a7
617431a
 
845c4a7
53643e1
148f2b3
b95f33c
 
 
148f2b3
 
b95f33c
617431a
148f2b3
 
b95f33c
148f2b3
 
 
ad1b760
a0c3d4c
 
 
 
 
 
 
617431a
 
b3a3878
ad1b760
 
 
 
 
 
 
 
 
 
 
 
 
 
617431a
 
148f2b3
617431a
 
ad1b760
 
 
 
 
148f2b3
ad1b760
 
148f2b3
ad1b760
845c4a7
ed46760
8439f94
35828ac
1c7a0a7
617431a
7da0809
 
eea8c7f
617431a
 
 
 
8439f94
 
 
617431a
8439f94
 
617431a
b95f33c
 
 
 
eea8c7f
69b460d
750bbf8
b95f33c
7349562
eea8c7f
8439f94
b95f33c
 
 
 
6bf14de
3981c3e
b40cc33
b95f33c
f855987
1c7a0a7
7f78cfc
617431a
 
b95f33c
 
35828ac
3981c3e
b40cc33
fe51214
f855987
35828ac
b95f33c
 
 
 
 
148f2b3
 
 
b95f33c
148f2b3
 
b95f33c
617431a
 
 
 
 
 
 
 
 
 
 
a412583
b40cc33
148f2b3
fd399dc
fa21952
ea40888
4453360
bec1a98
 
c70f203
4453360
 
ed46760
 
b05f917
a412583
148f2b3
ea40888
b05f917
4453360
7da0809
35828ac
0b498b7
617431a
 
b95f33c
617431a
b95f33c
617431a
a412583
f155629
c70f203
ab1c2b7
 
ed46760
 
27f2401
 
750bbf8
27f2401
 
1e57e78
fe96564
fa21952
948a6f4
750bbf8
 
 
1312508
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
import base64
import os

import cv2
from crewai import Agent, Crew, Process, Task
from crewai.tools import tool
from crewai_tools import (
    CodeInterpreterTool,
    SerperDevTool,
    WebsiteSearchTool
)
from openai import OpenAI
from openinference.instrumentation.crewai import CrewAIInstrumentor
from phoenix.otel import register

from util import get_final_answer, get_img_b64

# LLMs
#
# Model names used throughout this module. Each constant is consumed by
# run_crew() or by one of the tools defined inside it.

MANAGER_MODEL      = "gpt-4.1"            # llm of the manager agent
AGENT_MODEL        = "gpt-4.1-mini"       # llm of every worker agent
FINAL_ANSWER_MODEL = "gpt-4.5-preview"    # passed to get_final_answer()
AUDIO_MODEL        = "gpt-4o-transcribe"  # audio transcription requests
IMAGE_MODEL        = "gpt-4.1"            # chat completion over a single image
VIDEO_MODEL        = "gpt-4.1-mini"       # responses request over video frames

# LLM evaluation (Phoenix tracing)

# Indexing os.environ raises KeyError at import time when the variable is
# unset, so this module cannot be imported without a Phoenix API key.
PHOENIX_API_KEY = os.environ["PHOENIX_API_KEY"]

# Both variables are set before register() is called below.
# NOTE(review): presumably phoenix.otel reads them to locate and authenticate
# against the collector - confirm against the Phoenix documentation.
os.environ["PHOENIX_CLIENT_HEADERS"] = f"api_key={PHOENIX_API_KEY}"
os.environ["PHOENIX_COLLECTOR_ENDPOINT"] = "https://app.phoenix.arize.com"

# Runs as an import side effect; the returned provider is handed to the
# CrewAI instrumentor right after.
tracer_provider = register(
    auto_instrument=True,
    project_name="gaia"
)

CrewAIInstrumentor().instrument(tracer_provider=tracer_provider)

def run_crew(question, file_path):
    """Answer a question with a manager-led crew of specialised agents.

    Builds the custom and built-in tools, one worker agent per tool family and
    a manager agent that is allowed to delegate to the workers. The crew's
    answer is then passed, together with the question, to
    ``get_final_answer`` and that result is returned.

    Args:
        question (str): Question to answer.
        file_path (str): Path of a file that belongs to the question. May be
            falsy when there is none. When set, the path is appended to the
            question; for ``.py`` files the file content is appended as well.

    Returns:
        Whatever ``get_final_answer`` returns for the crew's initial answer.
    """
    # Custom tools
    #
    # NOTE: the tool docstrings are left untouched on purpose; CrewAI's
    # ``@tool`` decorator may surface them to the LLM as the tool description.

    @tool("Audio Analysis Tool")
    def audio_analysis_tool(question: str, file_path: str) -> str:
        """Answer a question about an audio file.
    
           Args:
               question (str): Question about the audio file
               file_path (str): Path of the audio file
                
           Returns:
               str: Answer to the question about the audio file
                
           Raises:
               FileNotFoundError: If the audio file does not exist
               RuntimeError: If processing fails"""
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"Audio file not found: {file_path}")

        try:
            client = OpenAI()

            # Context manager so the file handle is closed even when the
            # request fails.
            with open(file_path, "rb") as audio_file:
                transcript = client.audio.transcriptions.create(
                    file=audio_file,
                    model=AUDIO_MODEL,
                    prompt=question
                )

            return transcript.text
        except Exception as e:
            raise RuntimeError(f"Failed to process audio: {e}") from e

    @tool("Image Analysis Tool")
    def image_analysis_tool(question: str, file_path: str) -> str:
        """Answer a question about an image file.
    
           Args:
               question (str): Question about the image file
               file_path (str): Path of the image file
                
           Returns:
               str: Answer to the question about the image file
                
           Raises:
               FileNotFoundError: If the image file does not exist
               RuntimeError: If processing fails"""
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"Image file not found: {file_path}")

        try:
            # Get image
            # NOTE(review): the data URL below always declares image/jpeg;
            # confirm that get_img_b64 yields JPEG data for every input.
            img_b64 = get_img_b64(file_path)

            # OpenAI
            client = OpenAI()

            completion = client.chat.completions.create(
                messages=[{"role": "user",
                           "content": [{"type": "text", "text": question},
                                       {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img_b64}"}}]}],
                model=IMAGE_MODEL
            )

            return completion.choices[0].message.content
        except Exception as e:
            raise RuntimeError(f"Failed to process image: {e}") from e

    @tool("YouTube Audio Analysis Tool")
    def youtube_audio_analysis_tool(question: str, url: str) -> str:
        """Answer an audio question about a YouTube video.
    
           Args:
               question (str): Audio question about YouTube video
               url (str): YouTube URL
                
           Returns:
               str: Answer to the audio question about YouTube video
                
           Raises:
               RuntimeError: If processing fails"""
        try:
            # YouTube (hack to deal with access issues): only a local copy of
            # this one video is supported, nothing is downloaded.
            if url.endswith("1htKBjuUWec"):
                file_path = "data/1htKBjuUWec.mp4"
            else:
                raise RuntimeError(f"Unsupported YouTube URL: {url}")

            # OpenAI
            client = OpenAI()

            # Context manager so the file handle is closed even when the
            # request fails.
            with open(file_path, "rb") as audio_file:
                transcription = client.audio.transcriptions.create(
                    file=audio_file,
                    model=AUDIO_MODEL,
                    prompt=question
                )

            return transcription.text
        except Exception as e:
            raise RuntimeError(f"Failed to process audio: {e}") from e

    @tool("YouTube Image Analysis Tool")
    def youtube_image_analysis_tool(question: str, url: str) -> str:
        """Answer an image question about a YouTube video.
    
           Args:
               question (str): Image question about YouTube video
               url (str): YouTube URL
                
           Returns:
               str: Answer to the image question about YouTube video
                
           Raises:
               RuntimeError: If processing fails"""
        try:
            # YouTube (hack to deal with access issues): only a local copy of
            # this one video is supported, nothing is downloaded.
            if url.endswith("L1vXCYZAYYM"):
                file_path = "data/L1vXCYZAYYM.mp4"
            else:
                raise RuntimeError(f"Unsupported YouTube URL: {url}")

            # Get video frames as base64-encoded JPEGs.
            # TODO: every decoded frame is sent to the model; consider
            # sampling frames to keep the request small.
            video = cv2.VideoCapture(file_path)
            base64_frames = []

            try:
                while video.isOpened():
                    success, frame = video.read()

                    if not success:
                        break

                    _, buffer = cv2.imencode(".jpg", frame)

                    base64_frames.append(base64.b64encode(buffer).decode("utf-8"))
            finally:
                # Release the capture even when decoding/encoding fails.
                video.release()

            # A missing or unreadable file yields no frames; asking the model
            # without any image would only produce a made-up answer.
            if not base64_frames:
                raise RuntimeError(f"No frames could be read from video: {file_path}")

            # OpenAI
            client = OpenAI()

            response = client.responses.create(
                input=[{"role": "user",
                        "content": [{"type": "input_text", "text": (question)},
                                    *[{"type": "input_image", "image_url": f"data:image/jpeg;base64,{frame}"} for frame in base64_frames]]}],
                model=VIDEO_MODEL
            )

            return response.output_text
        except Exception as e:
            raise RuntimeError(f"Failed to process video: {e}") from e

    # Built-in tools

    web_search_tool = SerperDevTool()
    web_rag_tool = WebsiteSearchTool()
    python_coding_tool = CodeInterpreterTool()

    # Agents
    #
    # Worker agents cannot delegate and are limited to a few iterations; the
    # "{question}" placeholder is filled from the inputs given to kickoff().

    web_search_agent = Agent(
        role="Web Search Agent",
        goal="Search the web to help answer question \"{question}\", then scrape the most relevant web page.",
        backstory="As an expert web search assistant, you search the web to help answer the question.",
        allow_delegation=False,
        llm=AGENT_MODEL,
        max_iter=3,
        tools=[web_search_tool, web_rag_tool],
        verbose=False
    )

    audio_analysis_agent = Agent(
        role="Audio Analysis Agent",
        goal="Analyze audio to help answer question \"{question}\"",
        backstory="As an expert audio analysis assistant, you analyze the audio to help answer the question.",
        allow_delegation=False,
        llm=AGENT_MODEL,
        max_iter=3,
        tools=[audio_analysis_tool],
        verbose=False
    )

    image_analysis_agent = Agent(
        role="Image Analysis Agent",
        goal="Analyze image to help answer question \"{question}\"",
        backstory="As an expert image analysis assistant, you analyze the image to help answer the question.",
        allow_delegation=False,
        llm=AGENT_MODEL,
        max_iter=3,
        tools=[image_analysis_tool],
        verbose=False
    )

    youtube_audio_analysis_agent = Agent(
        role="YouTube Audio Analysis Agent",
        goal="Analyze YouTube video to help answer audio question \"{question}\"",
        backstory="As an expert YouTube audio analysis assistant, you analyze the video to help answer the question.",
        allow_delegation=False,
        llm=AGENT_MODEL,
        max_iter=3,
        tools=[youtube_audio_analysis_tool],
        verbose=False
    )

    youtube_image_analysis_agent = Agent(
        role="YouTube Image Analysis Agent",
        goal="Analyze YouTube video to help answer image question \"{question}\"",
        backstory="As an expert YouTube image analysis assistant, you analyze the video to help answer the question.",
        allow_delegation=False,
        llm=AGENT_MODEL,
        max_iter=3,
        tools=[youtube_image_analysis_tool],
        verbose=False
    )

    python_coding_agent = Agent(
        role="Python Coding Agent",
        goal="Write and/or execute Python code to help answer question \"{question}\"",
        backstory="As an expert Python coding assistant, you write and/or execute Python code to help answer the question.",
        allow_delegation=False,
        llm=AGENT_MODEL,
        max_iter=10,
        tools=[python_coding_tool],
        verbose=False
    )

    # The manager has no tools of its own; it answers directly or delegates.
    manager_agent = Agent(
        role="Manager Agent",
        goal="Try to answer the following question. If needed, delegate to one or more of your coworkers for help. "
             "If there is no good coworker, delegate to the Python Coding Agent to implement a tool for the task. "
             "Question: \"{question}\"",
        backstory="As an expert manager assistant, you answer the question.",
        allow_delegation=True,
        llm=MANAGER_MODEL,
        max_iter=5,
        verbose=True
    )

    # Task

    manager_task = Task(
        agent=manager_agent,
        description="Try to answer the following question. If needed, delegate to one or more of your coworkers for help. Question: \"{question}\"",
        expected_output="The answer to the question."
    )

    # Crew

    crew = Crew(
        agents=[web_search_agent,
                audio_analysis_agent,
                image_analysis_agent,
                youtube_audio_analysis_agent,
                youtube_image_analysis_agent,
                python_coding_agent],
        manager_agent=manager_agent,
        tasks=[manager_task],
        verbose=True
    )

    # Process

    if file_path:
        # Tell the agents where the attachment lives so tools can open it.
        question = f"{question} File path: {file_path}."

        # Python sources are inlined so the agents can read the code itself.
        if file_path.endswith(".py"):
            with open(f"{file_path}", "r") as file:
                question = f"{question} File data:\n{file.read()}"

    initial_answer = crew.kickoff(inputs={"question": question})
    final_answer = get_final_answer(FINAL_ANSWER_MODEL, question, str(initial_answer))

    print(f"Question: {question}")
    print(f"Initial answer: {initial_answer}")
    print(f"Final answer: {final_answer}")

    return final_answer