韩宇 committed on
Commit
1b7e88c
1 Parent(s): ecb4abf
This view is limited to 50 files because it contains too many changes. See the raw diff for the full changeset.
Files changed (50)
  1. README.md +127 -12
  2. agent/__init__.py +0 -0
  3. agent/conclude/__init__.py +0 -0
  4. agent/conclude/conclude.py +87 -0
  5. agent/conclude/sys_prompt.prompt +13 -0
  6. agent/conclude/user_prompt.prompt +7 -0
  7. agent/conclude/webpage_conclude.py +81 -0
  8. agent/memories/__init__.py +0 -0
  9. agent/memories/video_ltm_milvus.py +238 -0
  10. agent/misc/scene.py +249 -0
  11. agent/tools/video_rewinder/rewinder.py +99 -0
  12. agent/tools/video_rewinder/rewinder_sys_prompt.prompt +7 -0
  13. agent/tools/video_rewinder/rewinder_user_prompt.prompt +1 -0
  14. agent/video_preprocessor/__init__.py +0 -0
  15. agent/video_preprocessor/sys_prompt.prompt +18 -0
  16. agent/video_preprocessor/user_prompt.prompt +4 -0
  17. agent/video_preprocessor/video_preprocess.py +254 -0
  18. agent/video_preprocessor/webpage_vp.py +252 -0
  19. agent/video_qa/__init__.py +0 -0
  20. agent/video_qa/qa.py +82 -0
  21. agent/video_qa/sys_prompt.prompt +8 -0
  22. agent/video_qa/user_prompt.prompt +1 -0
  23. agent/video_qa/webpage_qa.py +73 -0
  24. app.py +136 -0
  25. calculator_code.py +4 -0
  26. compile_container.py +18 -0
  27. configs/llms/gpt4o.yml +7 -0
  28. configs/llms/json_res.yml +6 -0
  29. configs/llms/text_encoder.yml +3 -0
  30. configs/llms/text_res.yml +6 -0
  31. configs/llms/text_res_stream.yml +7 -0
  32. configs/tools/all_tools.yml +12 -0
  33. configs/workers/conclude.yml +4 -0
  34. configs/workers/dnc_workflow.yml +18 -0
  35. configs/workers/video_preprocessor.yml +12 -0
  36. configs/workers/video_qa.yml +5 -0
  37. container.yaml +154 -0
  38. docs/images/local-ai.png +0 -0
  39. docs/images/video_understanding_workflow_diagram.png +0 -0
  40. docs/local-ai.md +87 -0
  41. omagent_core/__init__.py +0 -0
  42. omagent_core/advanced_components/__init__.py +0 -0
  43. omagent_core/advanced_components/worker/__init__.py +0 -0
  44. omagent_core/advanced_components/worker/conclude/__init__.py +0 -0
  45. omagent_core/advanced_components/worker/conqueror/__init__.py +0 -0
  46. omagent_core/advanced_components/worker/divider/__init__.py +0 -0
  47. omagent_core/advanced_components/worker/task_exit_monitor/__init__.py +0 -0
  48. omagent_core/advanced_components/worker/video_preprocess/__init__.py +0 -0
  49. omagent_core/advanced_components/workflow/cot/README.md +36 -0
  50. omagent_core/advanced_components/workflow/cot/agent/cot_reasoning/cot_reasoning.py +67 -0
README.md CHANGED
@@ -1,12 +1,127 @@
- ---
- title: OmAgent
- emoji: 🦀
- colorFrom: yellow
- colorTo: red
- sdk: gradio
- sdk_version: 5.16.2
- app_file: app.py
- pinned: false
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # Video Understanding Example
+
+ This example demonstrates how to use the framework for hour-long video understanding tasks. The example code can be found in the `examples/video_understanding` directory.
+
+ ```bash
+ cd examples/video_understanding
+ ```
+
+ ## Overview
+
+ This example implements a video understanding workflow based on the DnC workflow, which consists of the following components:
+
+ 1. **Video Preprocess Task**
+    - Preprocesses the video, transcribing its audio with speech-to-text capability
+    - Detects scene boundaries, splits the video into several chunks and extracts frames at specified intervals (see the sketch after the workflow diagram below)
+    - Each scene chunk is summarized by the MLLM with detailed information, cached and written into the vector database for Q&A retrieval
+    - Video metadata and the video file MD5 are stored for filtering
+
+ 2. **Video QA Task**
+    - Takes the user's question about the video
+    - Retrieves related information from the vector database with the question
+    - Extracts the approximate start and end time of the video segment related to the question
+    - Generates a video object from serialized data in short-term memory (STM)
+    - Builds the initial task tree with the question for the DnC task
+
+ 3. **Divide and Conquer Task**
+    - Executes the task tree with the question
+    - Detailed information can be found in the [DnC Example](./DnC.md#overview)
+
+ The system uses Redis for state management, Milvus for long-term memory storage and Conductor for workflow orchestration.
+
+ ### The whole workflow looks like the following diagram:
+
+ ![Video Understanding Workflow](docs/images/video_understanding_workflow_diagram.png)
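+ The scene splitting and frame extraction step above is built on PySceneDetect (see `agent/misc/scene.py`). Below is a minimal, illustrative sketch of that part of the pipeline, not the worker itself; it assumes `scenedetect` and `Pillow` are installed and uses a hypothetical local file `my_video.mp4`:
+
+ ```python
+ from PIL import Image
+ from scenedetect import ContentDetector, SceneManager, open_video
+
+ video = open_video("my_video.mp4")  # hypothetical local video file
+ scene_manager = SceneManager()
+ # A lower threshold makes detection more sensitive, so more scenes are found (this example defaults to 27)
+ scene_manager.add_detector(ContentDetector(threshold=27, min_scene_len=int(video.frame_rate * 1)))
+ scene_manager.detect_scenes(video)
+ scenes = scene_manager.get_scene_list(start_in_scene=True)
+
+ frame_extraction_interval = 5  # seconds between extracted frames, as in the worker config
+ for start, end in scenes:
+     video.seek(start)
+     step = int(frame_extraction_interval * video.frame_rate)
+     frames = []
+     for i in range(end.get_frames() - start.get_frames()):
+         if i % step == 0:
+             frames.append(Image.fromarray(video.read()))  # frame kept for MLLM summarization
+         else:
+             video.read(decode=False)  # skip intermediate frames without decoding
+     print(f"Scene {start.get_timecode()} - {end.get_timecode()}: {len(frames)} frames")
+ ```
+
+ In the actual workers (`agent/misc/scene.py` and `agent/video_preprocessor/video_preprocess.py`), each scene's frames and speech-to-text result are then summarized by the MLLM, embedded, and written to the Milvus-backed long-term memory.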
+
+ ## Prerequisites
+
+ - Python 3.10+
+ - Required packages installed (see requirements.txt)
+ - Access to OpenAI API or compatible endpoint (see configs/llms/*.yml)
+ - [Optional] Access to Bing API for WebSearch tool (see configs/tools/*.yml)
+ - Redis server running locally or remotely
+ - Conductor server running locally or remotely
+
+ ## Configuration
+
+ The container.yaml file is a configuration file that manages dependencies and settings for the different components of the system, including Conductor connections, Redis connections, and other service configurations. To set up your configuration:
+
+ 1. Generate the container.yaml file:
+    ```bash
+    python compile_container.py
+    ```
+    This will create a container.yaml file with default settings under `examples/video_understanding`.
+
+ 2. Configure your LLM and tool settings in `configs/llms/*.yml` and `configs/tools/*.yml`:
+    - Set your OpenAI API key or compatible endpoint through environment variables or by directly modifying the yml files
+      ```bash
+      export custom_openai_key="your_openai_api_key"
+      export custom_openai_endpoint="your_openai_endpoint"
+      ```
+    - [Optional] Set your Bing API key or compatible endpoint through environment variables or by directly modifying the yml files
+      ```bash
+      export bing_api_key="your_bing_api_key"
+      ```
+      **Note: Setting the Bing API key is not mandatory, as the WebSearch tool will fall back to DuckDuckGo search, but it is recommended for better search quality.**
+    - The default text encoder configuration uses OpenAI `text-embedding-3-large` with **3072** dimensions, so make sure the `dim` value of `MilvusLTM` in `container.yaml` matches (see the sketch after this list)
+    - Configure other model settings like temperature as needed through environment variables or by directly modifying the yml files
+
+ 3. Update settings in the generated `container.yaml`:
+    - Modify Redis connection settings:
+      - Set the host, port and credentials for your Redis instance
+      - Configure both the `redis_stream_client` and `redis_stm_client` sections
+    - Update the Conductor server URL under the conductor_config section
+    - Configure MilvusLTM in the `components` section:
+      - Set the `storage_name` and `dim` for MilvusLTM
+      - Set `dim` to **3072** if you use the default OpenAI encoder; adjust the dimension accordingly if you use another text encoder model or endpoint
+      - Adjust other settings as needed
+    - Configure hyper-parameters for the video preprocess task in `examples/video_understanding/configs/workers/video_preprocessor.yml`:
+      - `use_cache`: Whether to use the cache for the video preprocess task
+      - `scene_detect_threshold`: The threshold for scene detection, used to decide whether a scene change occurs in the video; a lower value means more scenes will be detected, default value is **27**
+      - `frame_extraction_interval`: The interval (in seconds) between frames extracted from the video, default value is **5**
+      - `kernel_size`: The kernel size for scene detection; should be an **odd** number. The default is calculated automatically from the video resolution. For hour-long videos it is recommended to leave it blank; for short videos a smaller value such as **3** or **5** makes detection more sensitive to scene changes
+      - `stt.endpoint`: The endpoint for the speech-to-text service, defaults to the OpenAI ASR service
+      - `stt.api_key`: The API key for the speech-to-text service, defaults to the OpenAI API key
+    - Adjust any other component settings as needed
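+ As a quick sanity check of step 2 and the `dim` setting in step 3, the hedged sketch below calls your configured endpoint once and prints the embedding dimension it returns. This is illustrative code using the `openai` Python SDK, not part of the example; the environment variable names follow the configs above, and the `gpt-4o` model name is assumed from `configs/llms/gpt4o.yml`:
+
+ ```python
+ import os
+ from openai import OpenAI
+
+ client = OpenAI(
+     api_key=os.environ["custom_openai_key"],
+     base_url=os.environ.get("custom_openai_endpoint") or None,  # None falls back to the default OpenAI endpoint
+ )
+
+ # The default text encoder config uses text-embedding-3-large (3072 dimensions)
+ emb = client.embeddings.create(model="text-embedding-3-large", input=["dimension check"])
+ print("embedding dim:", len(emb.data[0].embedding))  # should match the MilvusLTM `dim` in container.yaml
+
+ # One tiny chat call to confirm the key/endpoint also works for the LLM configs
+ chat = client.chat.completions.create(
+     model="gpt-4o",  # assumed from configs/llms/gpt4o.yml
+     messages=[{"role": "user", "content": "ping"}],
+     max_tokens=5,
+ )
+ print(chat.choices[0].message.content)
+ ```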
+
+ ## Running the Example
+
+ 1. Run the video understanding example via the webpage:
+
+    ```bash
+    python run_webpage.py
+    ```
+
+    First, select a video or upload a video file on the left; after the video preprocessing is completed, ask questions about the video content on the right.
+
+ 2. Run the video understanding example via the CLI:
+
+    ```bash
+    python run_cli.py
+    ```
+
+    The first time, you need to input the video file path; it will take a while to preprocess the video and store the information in the vector database.
+    After the video is preprocessed, you can input questions about the video and the system will answer them. Note that the agent may give wrong or vague answers, especially for questions related to the names of the characters in the video.
+
+ ## Troubleshooting
+
+ If you encounter issues:
+ - Verify Redis is running and accessible (see the connectivity sketch below)
+ - Try a larger `scene_detect_threshold` and `frame_extraction_interval` if you find too many scenes are detected
+ - Check that your OpenAI API key is valid
+ - Check that your Bing API key is valid if search results are not as expected
+ - Check that the `dim` value of `MilvusLTM` in `container.yaml` is set correctly; a mismatched dimension currently will not raise an error but will silently lose part of the information (more checks will be added in the future)
+ - Ensure all dependencies are installed correctly
+ - Review logs for any error messages
+ - **Open an issue on GitHub if you can't find a solution, we will do our best to help you out!**
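+
+ A hedged helper for the Redis and Milvus items above; this is illustrative code, not part of the example, and the host, port and URI are hypothetical placeholders that should match your `container.yaml`:
+
+ ```python
+ import redis
+ from pymilvus import MilvusClient
+
+ # Redis instance used by redis_stream_client / redis_stm_client
+ r = redis.Redis(host="localhost", port=6379)  # hypothetical connection settings
+ print("Redis ping:", r.ping())
+
+ # Milvus instance backing MilvusLTM
+ milvus = MilvusClient(uri="http://localhost:19530")  # hypothetical Milvus endpoint
+ print("Milvus collections:", milvus.list_collections())
+ ```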
agent/__init__.py ADDED
File without changes
agent/conclude/__init__.py ADDED
File without changes
agent/conclude/conclude.py ADDED
@@ -0,0 +1,87 @@
1
+ from pathlib import Path
2
+ from typing import List
3
+
4
+ from omagent_core.advanced_components.workflow.dnc.schemas.dnc_structure import \
5
+ TaskTree
6
+ from omagent_core.engine.worker.base import BaseWorker
7
+ from omagent_core.memories.ltms.ltm import LTM
8
+ from omagent_core.models.llms.base import BaseLLMBackend
9
+ from omagent_core.models.llms.prompt import PromptTemplate
10
+ from omagent_core.utils.logger import logging
11
+ from omagent_core.utils.registry import registry
12
+ from collections.abc import Iterator
13
+ from pydantic import Field
14
+
15
+ CURRENT_PATH = root_path = Path(__file__).parents[0]
16
+
17
+
18
+ @registry.register_worker()
19
+ class Conclude(BaseLLMBackend, BaseWorker):
20
+ prompts: List[PromptTemplate] = Field(
21
+ default=[
22
+ PromptTemplate.from_file(
23
+ CURRENT_PATH.joinpath("sys_prompt.prompt"), role="system"
24
+ ),
25
+ PromptTemplate.from_file(
26
+ CURRENT_PATH.joinpath("user_prompt.prompt"), role="user"
27
+ ),
28
+ ]
29
+ )
30
+
31
+ def _run(self, dnc_structure: dict, last_output: str, *args, **kwargs):
32
+ """A conclude node that summarizes and completes the root task.
33
+
34
+ This component acts as the final node that:
35
+ - Takes the root task and its execution results
36
+ - Generates a final conclusion/summary of the entire task execution
37
+ - Formats and presents the final output in a clear way
38
+ - Cleans up any temporary state/memory used during execution
39
+
40
+ The conclude node is responsible for providing a coherent final response that
41
+ addresses the original root task objective based on all the work done by
42
+ previous nodes.
43
+
44
+ Args:
45
+ dnc_structure (dict): The task tree containing the root task and results
46
+ last_output (str): The final output from previous task execution
47
+ *args: Additional arguments
48
+ **kwargs: Additional keyword arguments
49
+
50
+ Returns:
51
+ dict: Final response containing the conclusion/summary
52
+ """
53
+ task = TaskTree(**dnc_structure)
54
+ self.callback.info(
55
+ agent_id=self.workflow_instance_id,
56
+ progress=f"Conclude",
57
+ message=f"{task.get_current_node().task}",
58
+ )
59
+ chat_complete_res = self.simple_infer(
60
+ task=task.get_root().task,
61
+ result=str(last_output),
62
+ img_placeholders="".join(
63
+ list(self.stm(self.workflow_instance_id).get("image_cache", {}).keys())
64
+ ),
65
+ )
66
+ if isinstance(chat_complete_res, Iterator):
67
+ last_output = "Answer: "
68
+ self.callback.send_incomplete(
69
+ agent_id=self.workflow_instance_id, msg="Answer: "
70
+ )
71
+ for chunk in chat_complete_res:
72
+ if len(chunk.choices) > 0:
73
+ current_msg = chunk.choices[0].delta.content if chunk.choices[0].delta.content is not None else ''
74
+ self.callback.send_incomplete(
75
+ agent_id=self.workflow_instance_id,
76
+ msg=f"{current_msg}",
77
+ )
78
+ last_output += current_msg
79
+ self.callback.send_answer(agent_id=self.workflow_instance_id, msg="")
80
+ else:
81
+ last_output = chat_complete_res["choices"][0]["message"]["content"]
82
+ self.callback.send_answer(
83
+ agent_id=self.workflow_instance_id,
84
+ msg=f'Answer: {chat_complete_res["choices"][0]["message"]["content"]}',
85
+ )
86
+ self.stm(self.workflow_instance_id).clear()
87
+ return {"last_output": last_output}
agent/conclude/sys_prompt.prompt ADDED
@@ -0,0 +1,13 @@
1
+ As the final stage of our task processing workflow, your role is to inform the user about the final execution result of the task.
2
+ Your task includes three parts:
3
+ 1. Verify the result, ensure it is a valid result of the user's question or task.
4
+ 2. Image may be visual prompted by adding bound boxes and labels to the image, this is the important information.
5
+ 3. Generate the output message since you may get some raw data, you have to get the useful information and generate a detailed message.
6
+
7
+ The task may complete successfully or it can be failed for some reason. You just need to honestly express the situation.
8
+
9
+ *** Important Notice ***
10
+ 1. Please use the language used in the question when responding.
11
+ 2. Your response MUST be based on the results provided to you. Do not attempt to solve the problem on your own or try to correct any errors.
12
+ 3. Do not mention your source of information. Present the response as if it were your own.
13
+ 4. Handle the conversions between different units carefully.
agent/conclude/user_prompt.prompt ADDED
@@ -0,0 +1,7 @@
1
+ Now, it's your turn to complete the task.
2
+
3
+ Task (The task you need to complete.): {{task}}
4
+ result (The result from former agents.): {{result}}
5
+ images: {{img_placeholders}}
6
+
7
+ Now show your super capability as a super agent that beyond regular AIs or LLMs!
agent/conclude/webpage_conclude.py ADDED
@@ -0,0 +1,81 @@
1
+ from pathlib import Path
2
+ from typing import Iterator, List
3
+
4
+ from omagent_core.advanced_components.workflow.dnc.schemas.dnc_structure import \
5
+ TaskTree
6
+ from omagent_core.engine.worker.base import BaseWorker
7
+ from omagent_core.memories.ltms.ltm import LTM
8
+ from omagent_core.models.llms.base import BaseLLMBackend
9
+ from omagent_core.models.llms.prompt import PromptTemplate
10
+ from omagent_core.utils.logger import logging
11
+ from omagent_core.utils.registry import registry
12
+ from openai import Stream
13
+ from pydantic import Field
14
+
15
+ CURRENT_PATH = root_path = Path(__file__).parents[0]
16
+
17
+
18
+ @registry.register_worker()
19
+ class WebpageConclude(BaseLLMBackend, BaseWorker):
20
+ prompts: List[PromptTemplate] = Field(
21
+ default=[
22
+ PromptTemplate.from_file(
23
+ CURRENT_PATH.joinpath("sys_prompt.prompt"), role="system"
24
+ ),
25
+ PromptTemplate.from_file(
26
+ CURRENT_PATH.joinpath("user_prompt.prompt"), role="user"
27
+ ),
28
+ ]
29
+ )
30
+
31
+ def _run(self, dnc_structure: dict, last_output: str, *args, **kwargs):
32
+ """A conclude node that summarizes and completes the root task.
33
+
34
+ This component acts as the final node that:
35
+ - Takes the root task and its execution results
36
+ - Generates a final conclusion/summary of the entire task execution
37
+ - Formats and presents the final output in a clear way
38
+ - Cleans up any temporary state/memory used during execution
39
+
40
+ The conclude node is responsible for providing a coherent final response that
41
+ addresses the original root task objective based on all the work done by
42
+ previous nodes.
43
+
44
+ Args:
45
+ dnc_structure (dict): The task tree containing the root task and results
46
+ last_output (str): The final output from previous task execution
47
+ *args: Additional arguments
48
+ **kwargs: Additional keyword arguments
49
+
50
+ Returns:
51
+ dict: Final response containing the conclusion/summary
52
+ """
53
+ task = TaskTree(**dnc_structure)
54
+ self.callback.info(
55
+ agent_id=self.workflow_instance_id,
56
+ progress=f"Conclude",
57
+ message=f"{task.get_current_node().task}",
58
+ )
59
+ chat_complete_res = self.simple_infer(
60
+ task=task.get_root().task,
61
+ result=str(last_output),
62
+ img_placeholders="".join(
63
+ list(self.stm(self.workflow_instance_id).get("image_cache", {}).keys())
64
+ ),
65
+ )
66
+ if isinstance(chat_complete_res, Iterator):
67
+ last_output = "Answer: "
68
+ for chunk in chat_complete_res:
69
+ if len(chunk.choices) > 0:
70
+ current_msg = chunk.choices[0].delta.content if chunk.choices[0].delta.content is not None else ''
71
+ last_output += current_msg
72
+ self.callback.send_answer(agent_id=self.workflow_instance_id, msg=last_output)
73
+ else:
74
+ last_output = chat_complete_res["choices"][0]["message"]["content"]
75
+ self.callback.send_answer(
76
+ agent_id=self.workflow_instance_id,
77
+ msg=f'Answer: {chat_complete_res["choices"][0]["message"]["content"]}',
78
+ )
79
+ self.callback.send_answer(agent_id=self.workflow_instance_id, msg=f"Token usage: {self.token_usage}")
80
+ self.stm(self.workflow_instance_id).clear()
81
+ return {"last_output": last_output}
agent/memories/__init__.py ADDED
File without changes
agent/memories/video_ltm_milvus.py ADDED
@@ -0,0 +1,238 @@
1
+ import base64
2
+ import pickle
3
+ from typing import Any, Iterable, List, Optional, Tuple
4
+
5
+ from omagent_core.memories.ltms.ltm_base import LTMBase
6
+ from omagent_core.services.connectors.milvus import MilvusConnector
7
+ from omagent_core.utils.registry import registry
8
+ from pydantic import Field
9
+ from pymilvus import (Collection, CollectionSchema, DataType, FieldSchema,
10
+ utility)
11
+
12
+
13
+ @registry.register_component()
14
+ class VideoMilvusLTM(LTMBase):
15
+ milvus_ltm_client: MilvusConnector
16
+ storage_name: str = Field(default="default")
17
+ dim: int = Field(default=128)
18
+
19
+ def model_post_init(self, __context: Any) -> None:
20
+ pass
21
+
22
+ def _create_collection(self) -> None:
23
+ # Check if collection exists
24
+ if not self.milvus_ltm_client._client.has_collection(self.storage_name):
25
+ index_params = self.milvus_ltm_client._client.prepare_index_params()
26
+ # Define field schemas
27
+ key_field = FieldSchema(
28
+ name="key", dtype=DataType.VARCHAR, is_primary=True, max_length=256
29
+ )
30
+ value_field = FieldSchema(
31
+ name="value", dtype=DataType.JSON, description="Json value"
32
+ )
33
+ embedding_field = FieldSchema(
34
+ name="embedding",
35
+ dtype=DataType.FLOAT_VECTOR,
36
+ description="Embedding vector",
37
+ dim=self.dim,
38
+ )
39
+ index_params = self.milvus_ltm_client._client.prepare_index_params()
40
+
41
+ # Create collection schema
42
+ schema = CollectionSchema(
43
+ fields=[key_field, value_field, embedding_field],
44
+ description="Key-Value storage with embeddings",
45
+ )
46
+ for field in schema.fields:
47
+ if (
48
+ field.dtype == DataType.FLOAT_VECTOR
49
+ or field.dtype == DataType.BINARY_VECTOR
50
+ ):
51
+ index_params.add_index(
52
+ field_name=field.name,
53
+ index_name=field.name,
54
+ index_type="FLAT",
55
+ metric_type="COSINE",
56
+ params={"nlist": 128},
57
+ )
58
+ self.milvus_ltm_client._client.create_collection(
59
+ self.storage_name, schema=schema, index_params=index_params
60
+ )
61
+
62
+ # Create index separately after collection creation
63
+ print(f"Created storage {self.storage_name} successfully")
64
+
65
+ def __getitem__(self, key: Any) -> Any:
66
+ key_str = str(key)
67
+ expr = f'key == "{key_str}"'
68
+ res = self.milvus_ltm_client._client.query(
69
+ self.storage_name, expr, output_fields=["value"]
70
+ )
71
+ if res:
72
+ value = res[0]["value"]
73
+ # value_bytes = base64.b64decode(value_base64)
74
+ # value = pickle.loads(value_bytes)
75
+ return value
76
+ else:
77
+ raise KeyError(f"Key {key} not found")
78
+
79
+ def __setitem__(self, key: Any, value: Any) -> None:
80
+ self._create_collection()
81
+
82
+ key_str = str(key)
83
+
84
+ # Check if value is a dictionary containing 'value' and 'embedding'
85
+ if isinstance(value, dict) and "value" in value and "embedding" in value:
86
+ actual_value = value["value"]
87
+ embedding = value["embedding"]
88
+ else:
89
+ raise ValueError(
90
+ "When setting an item, value must be a dictionary containing 'value' and 'embedding' keys."
91
+ )
92
+
93
+ # Serialize the actual value and encode it to base64
94
+ # value_bytes = pickle.dumps(actual_value)
95
+ # value_base64 = base64.b64encode(value_bytes).decode('utf-8')
96
+
97
+ # Ensure the embedding is provided
98
+ if embedding is None:
99
+ raise ValueError("An embedding vector must be provided.")
100
+
101
+ # Check if the key exists and delete it if it does
102
+ if key_str in self:
103
+ self.__delitem__(key_str)
104
+
105
+ # Prepare data for insertion (as a list of dictionaries)
106
+ data = [
107
+ {
108
+ "key": key_str,
109
+ "value": actual_value,
110
+ "embedding": embedding,
111
+ }
112
+ ]
113
+
114
+ # Insert the new record
115
+ self.milvus_ltm_client._client.insert(
116
+ collection_name=self.storage_name, data=data
117
+ )
118
+
119
+ def __delitem__(self, key: Any) -> None:
120
+ key_str = str(key)
121
+ if key_str in self:
122
+ expr = f'key == "{key_str}"'
123
+ self.milvus_ltm_client._client.delete(self.storage_name, expr)
124
+ else:
125
+ raise KeyError(f"Key {key} not found")
126
+
127
+ def __contains__(self, key: Any) -> bool:
128
+ key_str = str(key)
129
+ expr = f'key == "{key_str}"'
130
+ # Adjust the query call to match the expected signature
131
+ res = self.milvus_ltm_client._client.query(
132
+ self.storage_name, # Pass the collection name as the first argument
133
+ filter=expr,
134
+ output_fields=["key"],
135
+ )
136
+ return len(res) > 0
137
+
138
+ """
139
+ def __len__(self) -> int:
140
+ milvus_ltm.collection.flush()
141
+ return self.collection.num_entities
142
+ """
143
+
144
+ def __len__(self) -> int:
145
+ expr = 'key != ""' # Expression to match all entities
146
+ # self.milvus_ltm_client._client.load(refresh=True)
147
+ results = self.milvus_ltm_client._client.query(
148
+ self.storage_name, expr, output_fields=["key"], consistency_level="Strong"
149
+ )
150
+ return len(results)
151
+
152
+ def keys(self, limit=10) -> Iterable[Any]:
153
+ expr = ""
154
+ res = self.milvus_ltm_client._client.query(
155
+ self.storage_name, expr, output_fields=["key"], limit=limit
156
+ )
157
+ return (item["key"] for item in res)
158
+
159
+ def values(self) -> Iterable[Any]:
160
+ expr = 'key != ""' # Expression to match all active entities
161
+ self.milvus_ltm_client._client.load(refresh=True)
162
+ res = self.milvus_ltm_client._client.query(
163
+ self.storage_name, expr, output_fields=["value"], consistency_level="Strong"
164
+ )
165
+ for item in res:
166
+ value_base64 = item["value"]
167
+ value_bytes = base64.b64decode(value_base64)
168
+ value = pickle.loads(value_bytes)
169
+ yield value
170
+
171
+ def items(self) -> Iterable[Tuple[Any, Any]]:
172
+ expr = 'key != ""'
173
+ res = self.milvus_ltm_client._client.query(
174
+ self.storage_name, expr, output_fields=["key", "value"]
175
+ )
176
+ for item in res:
177
+ key = item["key"]
178
+ value = item["value"]
179
+ # value_bytes = base64.b64decode(value_base64)
180
+ # value = pickle.loads(value_bytes)
181
+ yield (key, value)
182
+
183
+ def get(self, key: Any, default: Any = None) -> Any:
184
+ try:
185
+ return self[key]
186
+ except KeyError:
187
+ return default
188
+
189
+ def clear(self) -> None:
190
+ expr = (
191
+ 'key != ""' # This expression matches all records where 'key' is not empty
192
+ )
193
+ self.milvus_ltm_client._client.delete(self.storage_name, filter=expr)
194
+
195
+ def pop(self, key: Any, default: Any = None) -> Any:
196
+ try:
197
+ value = self[key]
198
+ self.__delitem__(key)
199
+ return value
200
+ except KeyError:
201
+ if default is not None:
202
+ return default
203
+ else:
204
+ raise
205
+
206
+ def update(self, other: Iterable[Tuple[Any, Any]]) -> None:
207
+ for key, value in other:
208
+ self[key] = value
209
+
210
+ def get_by_vector(
211
+ self,
212
+ embedding: List[float],
213
+ top_k: int = 10,
214
+ threshold: float = 0.0,
215
+ filter: str = "",
216
+ ) -> List[Tuple[Any, Any, float]]:
217
+ search_params = {
218
+ "metric_type": "COSINE",
219
+ "params": {"nprobe": 10, "range_filter": 1, "radius": threshold},
220
+ }
221
+ results = self.milvus_ltm_client._client.search(
222
+ self.storage_name,
223
+ data=[embedding],
224
+ anns_field="embedding",
225
+ search_params=search_params,
226
+ limit=top_k,
227
+ output_fields=["key", "value"],
228
+ consistency_level="Strong",
229
+ filter=filter,
230
+ )
231
+
232
+ items = []
233
+ for match in results[0]:
234
+ key = match.get("entity").get("key")
235
+ value = match.get("entity").get("value")
236
+ items.append((key, value))
237
+
238
+ return items
agent/misc/scene.py ADDED
@@ -0,0 +1,249 @@
1
+ from typing import Dict, List, Optional, Tuple, Union
2
+
3
+ from PIL import Image
4
+ from pydantic import BaseModel
5
+ from pydub import AudioSegment
6
+ from pydub.effects import normalize
7
+ from scenedetect import (ContentDetector, FrameTimecode, SceneManager,
8
+ VideoStream, open_video)
9
+
10
+
11
+ class Scene(BaseModel):
12
+ start: FrameTimecode
13
+ end: FrameTimecode
14
+ stt_res: Optional[Dict] = None
15
+ summary: Optional[Dict] = None
16
+
17
+ class Config:
18
+ """Configuration for this pydantic object."""
19
+
20
+ arbitrary_types_allowed = True
21
+
22
+ @classmethod
23
+ def init(cls, start: FrameTimecode, end: FrameTimecode, summary: dict = None):
24
+ return cls(start=start, end=end, summary=summary)
25
+
26
+ @property
27
+ def conversation(self):
28
+ # for self deployed whisper
29
+ if isinstance(self.stt_res, list):
30
+ output_conversation = "\n".join(
31
+ [f"{item.get('text', None)}" for item in self.stt_res]
32
+ )
33
+ else:
34
+ output_conversation = self.stt_res
35
+ return output_conversation
36
+
37
+
38
+ class VideoScenes(BaseModel):
39
+ stream: VideoStream
40
+ audio: Union[AudioSegment, None]
41
+ scenes: List[Scene]
42
+ frame_extraction_interval: int
43
+
44
+ class Config:
45
+ """Configuration for this pydantic object."""
46
+
47
+ extra = "allow"
48
+ arbitrary_types_allowed = True
49
+
50
+ @classmethod
51
+ def load(
52
+ cls,
53
+ video_path: str,
54
+ threshold: int = 27,
55
+ min_scene_len: int = 1,
56
+ frame_extraction_interval: int = 5,
57
+ show_progress: bool = False,
58
+ kernel_size: Optional[int] = None,
59
+ ):
60
+ """Load a video file.
61
+
62
+ Args:
63
+ video_path (str): The path of the video file. Only support local file.
64
+ threshold (int): The scene detection threshold.
65
+ min_scene_len (int): Once a cut is detected, this long time must pass before a new one can
66
+ be added to the scene list. Count in seconds, defaults to 1.
67
+ show_progress (bool, optional): Whether to display the progress bar when processing the video. Defaults to False.
68
+ """
69
+ video = open_video(video_path)
70
+ scene_manager = SceneManager()
71
+ weight = ContentDetector.Components(
72
+ delta_hue=1.0,
73
+ delta_sat=1.0,
74
+ delta_lum=0.0,
75
+ delta_edges=1.0,
76
+ )
77
+ if kernel_size is None:
78
+ scene_manager.add_detector(
79
+ ContentDetector(
80
+ threshold=threshold,
81
+ min_scene_len=int(video.frame_rate * min_scene_len),
82
+ weights=weight,
83
+ )
84
+ )
85
+ else:
86
+ scene_manager.add_detector(
87
+ ContentDetector(
88
+ threshold=threshold,
89
+ min_scene_len=int(video.frame_rate * min_scene_len),
90
+ weights=weight,
91
+ kernel_size=kernel_size,
92
+ )
93
+ )
94
+ scene_manager.detect_scenes(video, show_progress=show_progress)
95
+ scenes = scene_manager.get_scene_list(start_in_scene=True)
96
+
97
+ try:
98
+ audio = AudioSegment.from_file(video_path)
99
+ audio = normalize(audio)
100
+ except (IndexError, OSError):
101
+ audio = None
102
+ return cls(
103
+ stream=video,
104
+ scenes=[Scene.init(*scene) for scene in scenes],
105
+ audio=audio,
106
+ frame_extraction_interval=frame_extraction_interval,
107
+ )
108
+
109
+ def get_video_frames(
110
+ self, scene: Union[int, Scene, Tuple[FrameTimecode]], interval: int = None
111
+ ) -> Tuple[List[Image.Image], List[float]]:
112
+ """Get the frames of a scene.
113
+
114
+ Args:
115
+ scene (Union[int, Scene, Tuple[FrameTimecode]]): The scene to get frames. Can be the index of the scene, the scene object or a tuple of start and end frame timecode.
116
+ interval (int, optional): The interval of the frames to get. Defaults to None.
117
+ Raises:
118
+ ValueError: If the type of scene is not int, Scene or tuple.
119
+
120
+ Returns:
121
+ List[ndarray]: The frames of the scene.
122
+ """
123
+ if isinstance(scene, int):
124
+ scene = self.scenes[scene]
125
+ start, end = scene.start, scene.end
126
+ elif isinstance(scene, Scene):
127
+ start, end = scene.start, scene.end
128
+ elif isinstance(scene, tuple):
129
+ start, end = scene
130
+ else:
131
+ raise ValueError(
132
+ f"scene should be int, Scene or tuple, not {type(scene).__name__}"
133
+ )
134
+ self.stream.seek(start)
135
+ frames = []
136
+ time_stamps = []
137
+ if interval is None:
138
+ interval = self.frame_extraction_interval * self.stream.frame_rate
139
+ scene_len = end.get_frames() - start.get_frames()
140
+ if scene_len / 10 > interval:
141
+ interval = int(scene_len / 10) + 1
142
+ for index in range(scene_len):
143
+ if index % interval == 0:
144
+ f = self.stream.read()
145
+ frames.append(Image.fromarray(f))
146
+ time_stamps.append(self.stream.position.get_seconds())
147
+ else:
148
+ self.stream.read(decode=False)
149
+ self.stream.seek(0)
150
+ return frames, time_stamps
151
+
152
+ def get_audio_clip(
153
+ self, scene: Union[int, Scene, Tuple[FrameTimecode]]
154
+ ) -> AudioSegment:
155
+ """Get the audio clip of a scene.
156
+
157
+ Args:
158
+ scene (Union[int, Scene, Tuple[FrameTimecode]]): The scene to get audio clip. Can be the index of the scene, the scene object or a tuple of start and end frame timecode.
159
+
160
+ Raises:
161
+ ValueError: If the type of scene is not int, Scene or tuple.
162
+
163
+ Returns:
164
+ AudioSegment: The audio clip of the scene.
165
+ """
166
+ if self.audio is None:
167
+ return None
168
+ if isinstance(scene, int):
169
+ scene = self.scenes[scene]
170
+ start, end = scene.start, scene.end
171
+ elif isinstance(scene, Scene):
172
+ start, end = scene.start, scene.end
173
+ elif isinstance(scene, tuple):
174
+ start, end = scene
175
+ else:
176
+ raise ValueError(
177
+ f"scene should be int, Scene or tuple, not {type(scene).__name__}"
178
+ )
179
+
180
+ return self.audio[
181
+ int(start.get_seconds() * 1000) : int(end.get_seconds() * 1000)
182
+ ]
183
+
184
+ def __len__(self):
185
+ return len(self.scenes)
186
+
187
+ def __iter__(self):
188
+ self.index = 0
189
+ return self
190
+
191
+ def __next__(self):
192
+ if self.index >= len(self.scenes):
193
+ raise StopIteration
194
+ scene = self.scenes[self.index]
195
+ self.index += 1
196
+ return scene
197
+
198
+ def __getitem__(self, index):
199
+ return self.scenes[index]
200
+
201
+ def __setitem__(self, index, value):
202
+ self.scenes[index] = value
203
+
204
+ def to_serializable(self) -> dict:
205
+ """Convert VideoScenes to a serializable dictionary."""
206
+ scenes_data = []
207
+ for scene in self.scenes:
208
+ scenes_data.append(
209
+ {
210
+ "start_frame": scene.start.frame_num,
211
+ "end_frame": scene.end.frame_num,
212
+ "stt_res": scene.stt_res,
213
+ "summary": scene.summary,
214
+ }
215
+ )
216
+
217
+ return {
218
+ "video_path": self.stream.path,
219
+ "frame_rate": self.stream.frame_rate,
220
+ "scenes": scenes_data,
221
+ "frame_extraction_interval": self.frame_extraction_interval,
222
+ }
223
+
224
+ @classmethod
225
+ def from_serializable(cls, data: dict):
226
+ """Rebuild VideoScenes from serialized data."""
227
+ video = open_video(data["video_path"])
228
+ try:
229
+ audio = AudioSegment.from_file(data["video_path"])
230
+ audio = normalize(audio)
231
+ except Exception:
232
+ audio = None
233
+
234
+ # Rebuild scenes list
235
+ scenes = []
236
+ for scene_data in data["scenes"]:
237
+ start = FrameTimecode(scene_data["start_frame"], data["frame_rate"])
238
+ end = FrameTimecode(scene_data["end_frame"], data["frame_rate"])
239
+ scene = Scene.init(start, end)
240
+ scene.stt_res = scene_data["stt_res"]
241
+ scene.summary = scene_data["summary"]
242
+ scenes.append(scene)
243
+
244
+ return cls(
245
+ stream=video,
246
+ scenes=scenes,
247
+ audio=audio,
248
+ frame_extraction_interval=data["frame_extraction_interval"],
249
+ )
agent/tools/video_rewinder/rewinder.py ADDED
@@ -0,0 +1,99 @@
1
+ import json
2
+ import re
3
+ from pathlib import Path
4
+ from typing import List
5
+
6
+ import json_repair
7
+ from omagent_core.models.llms.base import BaseLLMBackend
8
+ from omagent_core.models.llms.prompt import PromptTemplate
9
+ from omagent_core.tool_system.base import ArgSchema, BaseTool
10
+ from omagent_core.utils.logger import logging
11
+ from omagent_core.utils.registry import registry
12
+ from pydantic import Field
13
+ from scenedetect import FrameTimecode
14
+
15
+ from ...misc.scene import VideoScenes
16
+
17
+ CURRENT_PATH = Path(__file__).parents[0]
18
+
19
+ ARGSCHEMA = {
20
+ "start_time": {
21
+ "type": "number",
22
+ "description": "Start time (in seconds) of the video to extract frames from.",
23
+ "required": True,
24
+ },
25
+ "end_time": {
26
+ "type": "number",
27
+ "description": "End time (in seconds) of the video to extract frames from.",
28
+ "required": True,
29
+ },
30
+ "number": {
31
+ "type": "number",
32
+ "description": "Number of frames of extraction. More frames means more details but more cost. Do not exceed 10.",
33
+ "required": True,
34
+ },
35
+ }
36
+
37
+
38
+ @registry.register_tool()
39
+ class Rewinder(BaseTool, BaseLLMBackend):
40
+ args_schema: ArgSchema = ArgSchema(**ARGSCHEMA)
41
+ description: str = (
42
+ "Rollback and extract frames from video which is already loaded to get more specific details for further analysis."
43
+ )
44
+ prompts: List[PromptTemplate] = Field(
45
+ default=[
46
+ PromptTemplate.from_file(
47
+ CURRENT_PATH.joinpath("rewinder_sys_prompt.prompt"),
48
+ role="system",
49
+ ),
50
+ PromptTemplate.from_file(
51
+ CURRENT_PATH.joinpath("rewinder_user_prompt.prompt"),
52
+ role="user",
53
+ ),
54
+ ]
55
+ )
56
+
57
+ def _run(
58
+ self, start_time: float = 0.0, end_time: float = None, number: int = 1
59
+ ) -> str:
60
+ if self.stm(self.workflow_instance_id).get("video", None) is None:
61
+ raise ValueError("No video is loaded.")
62
+ else:
63
+ video: VideoScenes = VideoScenes.from_serializable(
64
+ self.stm(self.workflow_instance_id)["video"]
65
+ )
66
+ if number > 10:
67
+ logging.warning("Number of frames exceeds 10. Will extract 10 frames.")
68
+ number = 10
69
+
70
+ start = FrameTimecode(timecode=start_time, fps=video.stream.frame_rate)
71
+ if end_time is None:
72
+ end = video.stream.duration
73
+ else:
74
+ end = FrameTimecode(timecode=end_time, fps=video.stream.frame_rate)
75
+
76
+ if start_time == end_time:
77
+ frames, time_stamps = video.get_video_frames(
78
+ (start, end + 1), video.stream.frame_rate
79
+ )
80
+ else:
81
+ interval = int((end.get_frames() - start.get_frames()) / number)
82
+ frames, time_stamps = video.get_video_frames((start, end), interval)
83
+
84
+ # self.stm.image_cache.clear()
85
+ payload = []
86
+ for i, (frame, time_stamp) in enumerate(zip(frames, time_stamps)):
87
+ payload.append(f"timestamp_{time_stamp}")
88
+ payload.append(frame)
89
+ res = self.infer(input_list=[{"timestamp_with_images": payload}])[0]["choices"][
90
+ 0
91
+ ]["message"]["content"]
92
+ image_contents = json_repair.loads(res)
93
+ self.stm(self.workflow_instance_id)["image_cache"] = {}
94
+ return f"extracted_frames described as: {image_contents}."
95
+
96
+ async def _arun(
97
+ self, start_time: float = 0.0, end_time: float = None, number: int = 1
98
+ ) -> str:
99
+ return self._run(start_time, end_time, number=number)
agent/tools/video_rewinder/rewinder_sys_prompt.prompt ADDED
@@ -0,0 +1,7 @@
1
+ You are an intelligent agent, your job is to describe the content of each image and provide a summary for all the images.
2
+ The format should be in JSON data as follows:
3
+ ```json{
4
+ "timestamp_x": "Description of the image at timestamp_x",
5
+ ...
6
+ "timestamp_start - timestamp_end": "Summary of all images, where start is the timestamp of the first image and end is the timestamp of the last image"
7
+ }```
agent/tools/video_rewinder/rewinder_user_prompt.prompt ADDED
@@ -0,0 +1 @@
1
+ {{timestamp_with_images}}
agent/video_preprocessor/__init__.py ADDED
File without changes
agent/video_preprocessor/sys_prompt.prompt ADDED
@@ -0,0 +1,18 @@
1
+ You are the most powerful AI agent with the ability to see images. You will help me with video content comprehension and analysis based on continuous video frame extraction and textual content of video conversations.
2
+
3
+ --- Output ---
4
+ You will be provided with a series of video frame images arranged in the order of video playback. Please help me answer the following questions and provide the output in the specified json format.
5
+
6
+ {
7
+ "time": Optional[str].The time information of current video clip in terms of periods like morning or evening, seasons like spring or autumn, or specific years and time points. Please make sure to directly obtain the information from the provided context without inference or fabrication. If the relevant information cannot be obtained, please return null.
8
+ "location": Optional[str]. Describe the location where the current event is taking place, including scene details. If the relevant information cannot be obtained, please return null.
9
+ "character": Optional[str]. Provide a detailed description of the current characters, including their names, relationships, and what they are doing, etc. If the relevant information cannot be obtained, please return null.
10
+ "events": List[str]. List and describe all the detailed events in the video content in chronological order. Please integrate the information provided by the video frames and the textual information in the audio, and do not overlook any key points.
11
+ "scene": List[str]. Give some detailed description of the scene of the video. This includes, but is not limited to, scene information, textual information, character status expressions, and events displayed in the video.
12
+ "summary": str. Provide an detailed overall description and summary of the content of this video clip. Ensure that it remains objective and does not include any speculation or fabrication. This field is mandatory.
13
+ }
14
+
15
+
16
+ *** Important Notice ***
17
+ 1. You will be provided with the video frames and speech-to-text results. You have enough information to answer the questions.
18
+ 2. Sometimes the speech-to-text results maybe empty since there are no person talking. Please analyze based on the information in the images in this situation.
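The prompt above asks the MLLM for a JSON object with `time`, `location`, `character`, `events`, `scene` and `summary` fields. As a minimal illustration of how such a response can be consumed downstream (the repository parses model output with `json_repair`), here is a hedged sketch with a fabricated sample response; it is not part of the committed code:

```python
import json_repair

# Fabricated sample response following the schema in the system prompt above
raw = (
    '{"time": null, "location": "a kitchen in daytime", "character": null, '
    '"events": ["A person chops vegetables"], "scene": ["Bright kitchen, knife and cutting board"], '
    '"summary": "A person prepares food in a kitchen."}'
)

summary = json_repair.loads(raw)  # tolerant of minor JSON formatting issues in model output
print(summary["summary"])
print(summary.get("events", []))
```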
agent/video_preprocessor/user_prompt.prompt ADDED
@@ -0,0 +1,4 @@
1
+ Now, it's your turn to complete the task.
2
+
3
+ Textual content of video conversations: {{stt_res}}
4
+ Frame images of video playback: {{img_placeholders}}
agent/video_preprocessor/video_preprocess.py ADDED
@@ -0,0 +1,254 @@
1
+ import hashlib
2
+ import pickle
3
+ import time
4
+ from pathlib import Path
5
+ from typing import List, Optional, Union
6
+
7
+ import json_repair
8
+ from omagent_core.engine.worker.base import BaseWorker
9
+ from omagent_core.models.asr.stt import STT
10
+ from omagent_core.models.encoders.openai_encoder import OpenaiTextEmbeddingV3
11
+ from omagent_core.models.llms.base import BaseLLMBackend
12
+ from omagent_core.models.llms.prompt import PromptTemplate
13
+ from omagent_core.utils.registry import registry
14
+ from pydantic import Field, field_validator
15
+ from pydub import AudioSegment
16
+ from pydub.effects import normalize
17
+ from scenedetect import open_video
18
+
19
+ from ..misc.scene import VideoScenes
20
+
21
+ CURRENT_PATH = root_path = Path(__file__).parents[0]
22
+
23
+
24
+ @registry.register_worker()
25
+ class VideoPreprocessor(BaseLLMBackend, BaseWorker):
26
+ prompts: List[PromptTemplate] = Field(
27
+ default=[
28
+ PromptTemplate.from_file(
29
+ CURRENT_PATH.joinpath("sys_prompt.prompt"), role="system"
30
+ ),
31
+ PromptTemplate.from_file(
32
+ CURRENT_PATH.joinpath("user_prompt.prompt"), role="user"
33
+ ),
34
+ ]
35
+ )
36
+ text_encoder: OpenaiTextEmbeddingV3
37
+
38
+ stt: STT
39
+ scene_detect_threshold: Union[float, int] = 27
40
+ min_scene_len: int = 1
41
+ frame_extraction_interval: int = 5
42
+ kernel_size: Optional[int] = None
43
+ show_progress: bool = True
44
+
45
+ use_cache: bool = False
46
+ cache_dir: str = "./video_cache"
47
+
48
+ @field_validator("stt", mode="before")
49
+ @classmethod
50
+ def validate_asr(cls, stt):
51
+ if isinstance(stt, STT):
52
+ return stt
53
+ elif isinstance(stt, dict):
54
+ return STT(**stt)
55
+ else:
56
+ raise ValueError("Invalid STT type.")
57
+
58
+ def calculate_md5(self, file_path):
59
+ md5_hash = hashlib.md5()
60
+ with open(file_path, "rb") as file:
61
+ for byte_block in iter(lambda: file.read(4096), b""):
62
+ md5_hash.update(byte_block)
63
+ return md5_hash.hexdigest()
64
+
65
+ def _run(self, test: str, *args, **kwargs):
66
+ """
67
+ Process video files by:
68
+ 1. Calculating MD5 hash of input video for caching
69
+ 2. Loading video from cache if available and use_cache=True
70
+ 3. Otherwise, processing video by:
71
+ - Extracting audio and video streams
72
+ - Detecting scene boundaries
73
+ - Extracting frames at specified intervals
74
+ - Generating scene summaries using LLM
75
+ - Caching results for future use
76
+
77
+ Args:
78
+ video_path (str): Path to input video file
79
+ *args: Variable length argument list
80
+ **kwargs: Arbitrary keyword arguments
81
+
82
+ Returns:
83
+ dict: Dictionary containing video_md5 and video_path
84
+ """
85
+ video_path = self.input.read_input(
86
+ workflow_instance_id=self.workflow_instance_id,
87
+ input_prompt="Please input the video path:",
88
+ )["messages"][0]["content"][0]["data"]
89
+ video_md5 = self.calculate_md5(video_path)
90
+ kwargs["video_md5"] = video_md5
91
+
92
+ cache_path = (
93
+ Path(self.cache_dir)
94
+ .joinpath(video_path.replace("/", "-"))
95
+ .joinpath("video_cache.pkl")
96
+ )
97
+ # Load video from cache if available
98
+ if self.use_cache and cache_path.exists():
99
+ with open(cache_path, "rb") as f:
100
+ loaded_scene = pickle.load(f)
101
+ try:
102
+ audio = AudioSegment.from_file(video_path)
103
+ audio = normalize(audio)
104
+ except Exception:
105
+ audio = None
106
+ video = VideoScenes(
107
+ stream=open_video(video_path),
108
+ audio=audio,
109
+ scenes=loaded_scene,
110
+ frame_extraction_interval=self.frame_extraction_interval,
111
+ )
112
+ self.callback.send_block(
113
+ agent_id=self.workflow_instance_id,
114
+ msg="Loaded video scenes from cache.\nResume the interrupted transfer for results with scene.summary of None.",
115
+ )
116
+ for index, scene in enumerate(video.scenes):
117
+ if scene.summary is None:
118
+ self.callback.send_block(
119
+ agent_id=self.workflow_instance_id,
120
+ msg=f"Resume the interrupted transfer for scene {index}.",
121
+ )
122
+ video_frames, time_stamps = video.get_video_frames(scene)
123
+ try:
124
+ chat_complete_res = self.infer(
125
+ input_list=[
126
+ {
127
+ "stt_res": scene.conversation,
128
+ "img_placeholders": "".join(
129
+ [
130
+ f"<image_{i}>"
131
+ for i in range(len(video_frames))
132
+ ]
133
+ ),
134
+ }
135
+ ],
136
+ images=video_frames,
137
+ )
138
+ scene.summary = chat_complete_res[0]["choices"][0][
139
+ "message"
140
+ ]["content"]
141
+ scene_info = scene.summary.get("scene", [])
142
+ events = scene.summary.get("events", [])
143
+ start_time = scene.start.get_seconds()
144
+ end_time = scene.end.get_seconds()
145
+ content = (
146
+ f"Time in video: {scene.summary.get('time', 'null')}\n"
147
+ f"Location: {scene.summary.get('location', 'null')}\n"
148
+ f"Character': {scene.summary.get('character', 'null')}\n"
149
+ f"Events: {events}\n"
150
+ f"Scene: {scene_info}\n"
151
+ f"Summary: {scene.summary.get('summary', '')}"
152
+ )
153
+ content_vector = self.text_encoder.infer([content])[0]
154
+ self.ltm[index] = {
155
+ "value": {
156
+ "video_md5": video_md5,
157
+ "content": content,
158
+ "start_time": start_time,
159
+ "end_time": end_time,
160
+ },
161
+ "embedding": content_vector,
162
+ }
163
+ except Exception as e:
164
+ self.callback.error(
165
+ f"Failed to resume scene {index}: {e}. Set to default."
166
+ )
167
+ scene.summary = {
168
+ "time": "",
169
+ "location": "",
170
+ "character": "",
171
+ "events": [],
172
+ "scene": [],
173
+ "summary": "",
174
+ }
175
+ self.stm(self.workflow_instance_id)["video"] = video.to_serializable()
176
+ # Cache the processed video scenes
177
+ with open(cache_path, "wb") as f:
178
+ pickle.dump(video.scenes, f)
179
+
180
+ # Process video if not loaded from cache
181
+ if not self.stm(self.workflow_instance_id).get("video", None):
182
+ video = VideoScenes.load(
183
+ video_path=video_path,
184
+ threshold=self.scene_detect_threshold,
185
+ min_scene_len=self.min_scene_len,
186
+ frame_extraction_interval=self.frame_extraction_interval,
187
+ show_progress=self.show_progress,
188
+ kernel_size=self.kernel_size,
189
+ )
190
+ self.stm(self.workflow_instance_id)["video"] = video.to_serializable()
191
+
192
+ for index, scene in enumerate(video.scenes):
193
+ print(f"Processing scene {index} / {len(video.scenes)}...")
194
+ audio_clip = video.get_audio_clip(scene)
195
+ if audio_clip is None:
196
+ scene.stt_res = {"text": ""}
197
+ else:
198
+ scene.stt_res = self.stt.infer(audio_clip)
199
+ video_frames, time_stamps = video.get_video_frames(scene)
200
+ try:
201
+ face_rec = registry.get_tool("FaceRecognition")
202
+ for frame in video_frames:
203
+ objs = face_rec.infer(frame)
204
+ face_rec.visual_prompting(frame, objs)
205
+ except Exception:
206
+ pass
207
+ try:
208
+ chat_complete_res = self.infer(
209
+ input_list=[
210
+ {
211
+ "stt_res": scene.conversation,
212
+ "img_placeholders": "".join(
213
+ [f"<image_{i}>" for i in range(len(video_frames))]
214
+ ),
215
+ }
216
+ ],
217
+ images=video_frames,
218
+ )
219
+ scene.summary = chat_complete_res[0]["choices"][0]["message"][
220
+ "content"
221
+ ]
222
+ scene_info = scene.summary.get("scene", [])
223
+ events = scene.summary.get("events", [])
224
+ start_time = scene.start.get_seconds()
225
+ end_time = scene.end.get_seconds()
226
+ content = (
227
+ f"Time in video: {scene.summary.get('time', 'null')}\n"
228
+ f"Location: {scene.summary.get('location', 'null')}\n"
229
+ f"Character': {scene.summary.get('character', 'null')}\n"
230
+ f"Events: {events}\n"
231
+ f"Scene: {scene_info}\n"
232
+ f"Summary: {scene.summary.get('summary', '')}"
233
+ )
234
+ content_vector = self.text_encoder.infer([content])[0]
235
+ self.ltm[index] = {
236
+ "value": {
237
+ "video_md5": video_md5,
238
+ "content": content,
239
+ "start_time": start_time,
240
+ "end_time": end_time,
241
+ },
242
+ "embedding": content_vector,
243
+ }
244
+ except Exception as e:
245
+ self.callback.error(f"Failed to process scene {index}: {e}")
246
+ scene.summary = None
247
+
248
+ if self.use_cache and not cache_path.exists():
249
+ cache_path.parent.mkdir(parents=True, exist_ok=True)
250
+ with open(cache_path, "wb") as f:
251
+ pickle.dump(video.scenes, f)
252
+ return {
253
+ "video_md5": video_md5
254
+ }
agent/video_preprocessor/webpage_vp.py ADDED
@@ -0,0 +1,252 @@
1
+ import hashlib
2
+ import pickle
3
+ import time
4
+ from pathlib import Path
5
+ from typing import List, Optional, Union
6
+
7
+ import json_repair
8
+ from omagent_core.engine.worker.base import BaseWorker
9
+ from omagent_core.models.asr.stt import STT
10
+ from omagent_core.models.encoders.openai_encoder import OpenaiTextEmbeddingV3
11
+ from omagent_core.models.llms.base import BaseLLMBackend
12
+ from omagent_core.models.llms.prompt import PromptTemplate
13
+ from omagent_core.utils.registry import registry
14
+ from pydantic import Field, field_validator
15
+ from pydub import AudioSegment
16
+ from pydub.effects import normalize
17
+ from scenedetect import open_video
18
+
19
+ from ..misc.scene import VideoScenes
20
+
21
+ CURRENT_PATH = root_path = Path(__file__).parents[0]
22
+
23
+
24
+ @registry.register_worker()
25
+ class WebpageVideoPreprocessor(BaseLLMBackend, BaseWorker):
26
+ prompts: List[PromptTemplate] = Field(
27
+ default=[
28
+ PromptTemplate.from_file(
29
+ CURRENT_PATH.joinpath("sys_prompt.prompt"), role="system"
30
+ ),
31
+ PromptTemplate.from_file(
32
+ CURRENT_PATH.joinpath("user_prompt.prompt"), role="user"
33
+ ),
34
+ ]
35
+ )
36
+ text_encoder: OpenaiTextEmbeddingV3
37
+
38
+ stt: STT
39
+ scene_detect_threshold: Union[float, int] = 27
40
+ min_scene_len: int = 1
41
+ frame_extraction_interval: int = 5
42
+ kernel_size: Optional[int] = None
43
+ show_progress: bool = True
44
+
45
+ use_cache: bool = False
46
+ cache_dir: str = "./video_cache"
47
+
48
+ @field_validator("stt", mode="before")
+     @classmethod
+     def validate_asr(cls, stt):
+         if isinstance(stt, STT):
+             return stt
+         elif isinstance(stt, dict):
+             return STT(**stt)
+         else:
+             raise ValueError("Invalid STT type.")
+
+     def calculate_md5(self, file_path):
+         md5_hash = hashlib.md5()
+         with open(file_path, "rb") as file:
+             for byte_block in iter(lambda: file.read(4096), b""):
+                 md5_hash.update(byte_block)
+         return md5_hash.hexdigest()
+
+     def _run(self, video_path: str, *args, **kwargs):
+         """
+         Process video files by:
+         1. Calculating MD5 hash of input video for caching
+         2. Loading video from cache if available and use_cache=True
+         3. Otherwise, processing video by:
+            - Extracting audio and video streams
+            - Detecting scene boundaries
+            - Extracting frames at specified intervals
+            - Generating scene summaries using LLM
+            - Caching results for future use
+
+         Args:
+             video_path (str): Path to input video file
+             *args: Variable length argument list
+             **kwargs: Arbitrary keyword arguments
+
+         Returns:
+             dict: Dictionary containing video_md5 and video_path
+         """
+         video_md5 = self.calculate_md5(video_path)
+         kwargs["video_md5"] = video_md5
+
+         cache_path = (
+             Path(self.cache_dir)
+             .joinpath(video_path.replace("/", "-"))
+             .joinpath("video_cache.pkl")
+         )
+         # Load video from cache if available
+         if self.use_cache and cache_path.exists():
+             with open(cache_path, "rb") as f:
+                 loaded_scene = pickle.load(f)
+             try:
+                 audio = AudioSegment.from_file(video_path)
+                 audio = normalize(audio)
+             except Exception:
+                 audio = None
+             video = VideoScenes(
+                 stream=open_video(video_path),
+                 audio=audio,
+                 scenes=loaded_scene,
+                 frame_extraction_interval=self.frame_extraction_interval,
+             )
+             self.callback.send_block(
+                 agent_id=self.workflow_instance_id,
+                 msg="Loaded video scenes from cache.\nResume the interrupted transfer for results with scene.summary of None.",
+             )
+             for index, scene in enumerate(video.scenes):
+                 if scene.summary is None:
+                     self.callback.send_block(
+                         agent_id=self.workflow_instance_id,
+                         msg=f"Resume the interrupted transfer for scene {index}.",
+                     )
+                     video_frames, time_stamps = video.get_video_frames(scene)
+                     try:
+                         chat_complete_res = self.infer(
+                             input_list=[
+                                 {
+                                     "stt_res": scene.conversation,
+                                     "img_placeholders": "".join(
+                                         [
+                                             f"<image_{i}>"
+                                             for i in range(len(video_frames))
+                                         ]
+                                     ),
+                                 }
+                             ],
+                             images=video_frames,
+                         )
+                         scene.summary = chat_complete_res[0]["choices"][0][
+                             "message"
+                         ]["content"]
+                         scene_info = scene.summary.get("scene", [])
+                         events = scene.summary.get("events", [])
+                         start_time = scene.start.get_seconds()
+                         end_time = scene.end.get_seconds()
+                         content = (
+                             f"Time in video: {scene.summary.get('time', 'null')}\n"
+                             f"Location: {scene.summary.get('location', 'null')}\n"
+                             f"Character: {scene.summary.get('character', 'null')}\n"
+                             f"Events: {events}\n"
+                             f"Scene: {scene_info}\n"
+                             f"Summary: {scene.summary.get('summary', '')}"
+                         )
+                         content_vector = self.text_encoder.infer([content])[0]
+                         self.ltm[index] = {
+                             "value": {
+                                 "video_md5": video_md5,
+                                 "content": content,
+                                 "start_time": start_time,
+                                 "end_time": end_time,
+                             },
+                             "embedding": content_vector,
+                         }
+                     except Exception as e:
+                         self.callback.error(
+                             f"Failed to resume scene {index}: {e}. Set to default."
+                         )
+                         scene.summary = {
+                             "time": "",
+                             "location": "",
+                             "character": "",
+                             "events": [],
+                             "scene": [],
+                             "summary": "",
+                         }
+             self.stm(self.workflow_instance_id)["video"] = video.to_serializable()
+             # Cache the processed video scenes
+             with open(cache_path, "wb") as f:
+                 pickle.dump(video.scenes, f)
+
+         # Process video if not loaded from cache
+         if not self.stm(self.workflow_instance_id).get("video", None):
+             video = VideoScenes.load(
+                 video_path=video_path,
+                 threshold=self.scene_detect_threshold,
+                 min_scene_len=self.min_scene_len,
+                 frame_extraction_interval=self.frame_extraction_interval,
+                 show_progress=self.show_progress,
+                 kernel_size=self.kernel_size,
+             )
+             self.stm(self.workflow_instance_id)["video"] = video.to_serializable()
+
+             for index, scene in enumerate(video.scenes):
+                 print(f"Processing scene {index} / {len(video.scenes)}...")
+                 audio_clip = video.get_audio_clip(scene)
+                 if audio_clip is None:
+                     scene.stt_res = {"text": ""}
+                 else:
+                     scene.stt_res = self.stt.infer(audio_clip)
+                 video_frames, time_stamps = video.get_video_frames(scene)
+                 try:
+                     face_rec = registry.get_tool("FaceRecognition")
+                     for frame in video_frames:
+                         objs = face_rec.infer(frame)
+                         face_rec.visual_prompting(frame, objs)
+                 except Exception:
+                     pass
+                 try:
+                     chat_complete_res = self.infer(
+                         input_list=[
+                             {
+                                 "stt_res": scene.conversation,
+                                 "img_placeholders": "".join(
+                                     [f"<image_{i}>" for i in range(len(video_frames))]
+                                 ),
+                             }
+                         ],
+                         images=video_frames,
+                     )
+                     scene.summary = chat_complete_res[0]["choices"][0]["message"][
+                         "content"
+                     ]
+                     scene_info = scene.summary.get("scene", [])
+                     events = scene.summary.get("events", [])
+                     start_time = scene.start.get_seconds()
+                     end_time = scene.end.get_seconds()
+                     content = (
+                         f"Time in video: {scene.summary.get('time', 'null')}\n"
+                         f"Location: {scene.summary.get('location', 'null')}\n"
+                         f"Character: {scene.summary.get('character', 'null')}\n"
+                         f"Events: {events}\n"
+                         f"Scene: {scene_info}\n"
+                         f"Summary: {scene.summary.get('summary', '')}"
+                     )
+                     content_vector = self.text_encoder.infer([content])[0]
+                     self.ltm[index] = {
+                         "value": {
+                             "video_md5": video_md5,
+                             "content": content,
+                             "start_time": start_time,
+                             "end_time": end_time,
+                         },
+                         "embedding": content_vector,
+                     }
+                 except Exception as e:
+                     self.callback.error(f"Failed to process scene {index}: {e}")
+                     scene.summary = None
+
+         if self.use_cache and not cache_path.exists():
+             cache_path.parent.mkdir(parents=True, exist_ok=True)
+             with open(cache_path, "wb") as f:
+                 pickle.dump(video.scenes, f)
+         return {
+             "video_md5": video_md5,
+             "video_path": video_path,
+             "instance_id": self.workflow_instance_id,
+         }
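For reference, a minimal sketch of the shape of one long-term-memory record that `_run` writes into `self.ltm` above. The hash, text and timestamps are hypothetical placeholders; the embedding length follows the `dim: 3072` setting in `container.yaml`.

```python
# Illustrative only: one LTM entry as written by the preprocessing loop above.
ltm_record = {
    "value": {
        "video_md5": "9e107d9d372bb6826bd81d3542a419d6",  # hypothetical MD5 of the input video
        "content": "Time in video: 00:01:10\nLocation: kitchen\n...",  # scene summary text
        "start_time": 70.0,  # scene start in seconds
        "end_time": 95.5,    # scene end in seconds
    },
    "embedding": [0.0] * 3072,  # placeholder vector; real values come from the text encoder
}
```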
agent/video_qa/__init__.py ADDED
File without changes
agent/video_qa/qa.py ADDED
@@ -0,0 +1,82 @@
+ import json
+ import re
+ from pathlib import Path
+ from typing import List
+
+ import json_repair
+ from omagent_core.advanced_components.workflow.dnc.schemas.dnc_structure import \
+     TaskTree
+ from omagent_core.engine.worker.base import BaseWorker
+ from omagent_core.memories.ltms.ltm import LTM
+ from omagent_core.models.encoders.openai_encoder import OpenaiTextEmbeddingV3
+ from omagent_core.models.llms.base import BaseLLMBackend
+ from omagent_core.models.llms.prompt import PromptTemplate
+ from omagent_core.utils.registry import registry
+ from pydantic import Field
+
+ from ..misc.scene import VideoScenes
+
+ CURRENT_PATH = root_path = Path(__file__).parents[0]
+
+
+ @registry.register_worker()
+ class VideoQA(BaseWorker, BaseLLMBackend):
+     prompts: List[PromptTemplate] = Field(
+         default=[
+             PromptTemplate.from_file(
+                 CURRENT_PATH.joinpath("sys_prompt.prompt"), role="system"
+             ),
+             PromptTemplate.from_file(
+                 CURRENT_PATH.joinpath("user_prompt.prompt"), role="user"
+             ),
+         ]
+     )
+     text_encoder: OpenaiTextEmbeddingV3
+
+     def _run(self, video_md5: str, *args, **kwargs):
+         self.stm(self.workflow_instance_id)["image_cache"] = {}
+         self.stm(self.workflow_instance_id)["former_results"] = {}
+         question = self.input.read_input(
+             workflow_instance_id=self.workflow_instance_id,
+             input_prompt="Please input your question:",
+         )["messages"][0]["content"][0]["data"]
+         chat_complete_res = self.simple_infer(question=question)
+         content = chat_complete_res["choices"][0]["message"]["content"]
+         content = json_repair.loads(content)
+         try:
+             start_time = (
+                 None if content.get("start_time", -1) == -1 else content.get("start_time")
+             )
+             end_time = (
+                 None if content.get("end_time", -1) == -1 else content.get("end_time")
+             )
+         except Exception as e:
+             start_time = None
+             end_time = None
+         question_vector = self.text_encoder.infer([question])[0]
+         filter_expr = ""
+         if video_md5 is not None:
+             filter_expr = f"value['video_md5']=='{video_md5}'"
+         if start_time is not None and end_time is not None:
+             filter_expr += f" and (value['start_time']>={max(0, start_time - 10)} and value['end_time']<={end_time + 10})"
+         elif start_time is not None:
+             filter_expr += f" and value['start_time']>={max(0, start_time - 10)}"
+         elif end_time is not None:
+             filter_expr += f" and value['end_time']<={end_time + 10}"
+         related_information = self.ltm.get_by_vector(
+             embedding=question_vector, top_k=5, threshold=0.2, filter=filter_expr
+         )
+         related_information = [
+             f"Time span: {each['start_time']} - {each['end_time']}\n{each['content']}"
+             for _, each in related_information
+         ]
+         video = VideoScenes.from_serializable(
+             self.stm(self.workflow_instance_id)["video"]
+         )
+         self.stm(self.workflow_instance_id)["extra"] = {
+             "video_information": "video is already loaded in the short-term memory(stm).",
+             "video_duration_seconds(s)": video.stream.duration.get_seconds(),
+             "frame_rate": video.stream.frame_rate,
+             "video_summary": "\n---\n".join(related_information),
+         }
+         return {"query": question, "last_output": None}
agent/video_qa/sys_prompt.prompt ADDED
@@ -0,0 +1,8 @@
+ You are a master of time extraction, capable of analyzing the temporal relationships in question and extracting timestamps, with the extracted times in seconds.
+ Time format in question is in the form of "HH:MM:SS" or "MM:SS".
+ ---
+ The output should be a json object as follows:
+ {
+     "start_time": start time in seconds, -1 if not found,
+     "end_time": end time in seconds, -1 if not found,
+ }
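For example, given a hypothetical question such as "What happens between 01:10 and 01:30?", this prompt should yield `{"start_time": 70, "end_time": 90}`; a question with no explicit times should produce `{"start_time": -1, "end_time": -1}`.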
agent/video_qa/user_prompt.prompt ADDED
@@ -0,0 +1 @@
+ Question: {{question}}
agent/video_qa/webpage_qa.py ADDED
@@ -0,0 +1,73 @@
+ from pathlib import Path
+ from typing import List
+
+ import json_repair
+ from pydantic import Field
+
+ from omagent_core.engine.worker.base import BaseWorker
+ from omagent_core.models.encoders.openai_encoder import OpenaiTextEmbeddingV3
+ from omagent_core.models.llms.base import BaseLLMBackend
+ from omagent_core.models.llms.prompt import PromptTemplate
+ from omagent_core.utils.registry import registry
+ from ..misc.scene import VideoScenes
+
+ CURRENT_PATH = root_path = Path(__file__).parents[0]
+
+
+ @registry.register_worker()
+ class WebpageVideoQA(BaseWorker, BaseLLMBackend):
+     prompts: List[PromptTemplate] = Field(
+         default=[
+             PromptTemplate.from_file(
+                 CURRENT_PATH.joinpath("sys_prompt.prompt"), role="system"
+             ),
+             PromptTemplate.from_file(
+                 CURRENT_PATH.joinpath("user_prompt.prompt"), role="user"
+             ),
+         ]
+     )
+     text_encoder: OpenaiTextEmbeddingV3
+
+     def _run(self, video_md5: str, video_path: str, instance_id: str, question: str, *args, **kwargs):
+         self.stm(self.workflow_instance_id)["image_cache"] = {}
+         self.stm(self.workflow_instance_id)["former_results"] = {}
+         chat_complete_res = self.simple_infer(question=question)
+         content = chat_complete_res["choices"][0]["message"]["content"]
+         content = json_repair.loads(content)
+         try:
+             start_time = (
+                 None if content.get("start_time", -1) == -1 else content.get("start_time")
+             )
+             end_time = (
+                 None if content.get("end_time", -1) == -1 else content.get("end_time")
+             )
+         except Exception as e:
+             start_time = None
+             end_time = None
+         question_vector = self.text_encoder.infer([question])[0]
+         filter_expr = ""
+         if video_md5 is not None:
+             filter_expr = f"value['video_md5']=='{video_md5}'"
+         if start_time is not None and end_time is not None:
+             filter_expr += f" and (value['start_time']>={max(0, start_time - 10)} and value['end_time']<={end_time + 10})"
+         elif start_time is not None:
+             filter_expr += f" and value['start_time']>={max(0, start_time - 10)}"
+         elif end_time is not None:
+             filter_expr += f" and value['end_time']<={end_time + 10}"
+         related_information = self.ltm.get_by_vector(
+             embedding=question_vector, top_k=5, threshold=0.2, filter=filter_expr
+         )
+         related_information = [
+             f"Time span: {each['start_time']} - {each['end_time']}\n{each['content']}"
+             for _, each in related_information
+         ]
+         video = VideoScenes.from_serializable(
+             self.stm(self.workflow_instance_id)["video"]
+         )
+         self.stm(self.workflow_instance_id)["extra"] = {
+             "video_information": "video is already loaded in the short-term memory(stm).",
+             "video_duration_seconds(s)": video.stream.duration.get_seconds(),
+             "frame_rate": video.stream.frame_rate,
+             "video_summary": "\n---\n".join(related_information),
+         }
+         return {"query": question, "last_output": None}
app.py ADDED
@@ -0,0 +1,136 @@
+ # Import required modules and components
+ import base64
+ import hashlib
+ import json
+ import os
+ from pathlib import Path
+
+ from Crypto.Cipher import AES
+
+
+ class Encrypt(object):
+
+     @staticmethod
+     def pad(s):
+         AES_BLOCK_SIZE = 16  # Bytes
+         return s + (AES_BLOCK_SIZE - len(s) % AES_BLOCK_SIZE) * \
+             chr(AES_BLOCK_SIZE - len(s) % AES_BLOCK_SIZE)
+
+     @staticmethod
+     def unpad(s):
+         return s[:-ord(s[len(s) - 1:])]
+
+     # MD5 hash using hashlib
+     @staticmethod
+     def hash_md5_encrypt(data: (str, bytes), salt=None) -> str:
+         if isinstance(data, str):
+             data = data.encode('utf-8')
+         md5 = hashlib.md5()
+         if salt:
+             if isinstance(salt, str):
+                 salt = salt.encode('utf-8')
+             md5.update(salt)
+         md5.update(data)
+         return md5.hexdigest()
+
+     @staticmethod
+     # @catch_exc()
+     def aes_decrypt(key: str, data: str) -> str:
+         '''
+         :param key: secret key
+         :param data: encrypted data (ciphertext)
+         :return: plaintext
+         '''
+         key = key.encode('utf8')
+         data = base64.b64decode(data)
+         cipher = AES.new(key, AES.MODE_ECB)
+         # Strip the padding
+         text_decrypted = Encrypt.unpad(cipher.decrypt(data))
+         text_decrypted = text_decrypted.decode('utf8')
+         return text_decrypted
+
+
+ secret = 'FwALd7BY8IUrbnrigH3YYlhGD/XvMVX7'
+ encrypt = 'sJWveD1LIxIxYGZvZMRlb+8vJjq5yJmXnqSKfHM6AhgmZWMPcFuTNbpJCHNVnjqminXZLsIbFWazoyAUNP1piKOrtBGHF8NaunP/6lp2CJKVMIrxo8z/IxN0IstwcULjaFLilf68/PFXhwZ1gv4PZmu2Z2iwSLAyVxXkmIwjFUp0TQv7xtHpwj2KH/80BgjAOGFlZ8OSwlsum9BqD68a1q3QMi1IcyG1SlUSiiKB5bREfhfXxCgOV2EOYrPPurrT/hHLuZaFSu2YjV/ZkEkumjJZu5sGUElw7dZWdNhJibMjtsA4saNBrjp6gO3/q4i1YLWKTM5HQeTjMkAHt0FH2FigXIER1xZqma94bIaDZoo='

+ env = json.loads(Encrypt.aes_decrypt(secret, encrypt))
+ for k, v in env.items():
+     os.environ.setdefault(k, v)
+ from agent.conclude.webpage_conclude import WebpageConclude
+ from agent.video_preprocessor.webpage_vp import WebpageVideoPreprocessor
+ from agent.video_qa.webpage_qa import WebpageVideoQA
+ from webpage import WebpageClient
+
+ from omagent_core.advanced_components.workflow.dnc.workflow import DnCWorkflow
+ from omagent_core.engine.workflow.conductor_workflow import ConductorWorkflow
+ from omagent_core.engine.workflow.task.simple_task import simple_task
+ from omagent_core.utils.container import container
+ from omagent_core.utils.logger import logging
+ from omagent_core.utils.registry import registry
+
+
+ def app():
+     logging.init_logger("omagent", "omagent", level="INFO")
+
+     # Set current working directory path
+     CURRENT_PATH = root_path = Path(__file__).parents[0]
+
+     # Import registered modules
+     registry.import_module(project_path=CURRENT_PATH.joinpath("agent"))
+
+     # Load container configuration from YAML file
+     container.register_stm("SharedMemSTM")
+     container.register_ltm(ltm="VideoMilvusLTM")
+     container.from_config(CURRENT_PATH.joinpath("container.yaml"))
+
+     # Initialize the webpage video understanding workflows
+     workflow = ConductorWorkflow(name="webpage_video_understanding")
+     process_workflow = ConductorWorkflow(name="webpage_process_video_understanding")
+     # 1. Video preprocess task for video preprocessing
+     video_preprocess_task = simple_task(
+         task_def_name=WebpageVideoPreprocessor,
+         task_reference_name="webpage_video_preprocess",
+         inputs={"video_path": process_workflow.input("video_path")}
+     )
+
+     # 2. Video QA task for question answering over the video
+     video_qa_task = simple_task(
+         task_def_name=WebpageVideoQA,
+         task_reference_name="webpage_video_qa",
+         inputs={
+             "video_md5": workflow.input("video_md5"),
+             "video_path": workflow.input("video_path"),
+             "instance_id": workflow.input("instance_id"),
+             "question": workflow.input("question"),
+         },
+     )
+
+     dnc_workflow = DnCWorkflow()
+     dnc_workflow.set_input(query=video_qa_task.output("query"))
+     # 3. Conclude task for task conclusion
+     conclude_task = simple_task(
+         task_def_name=WebpageConclude,
+         task_reference_name="webpage_task_conclude",
+         inputs={
+             "dnc_structure": dnc_workflow.dnc_structure,
+             "last_output": dnc_workflow.last_output,
+         },
+     )
+
+     # Configure workflow execution flow: Video Preprocess -> Video QA -> DnC Loop -> Conclude
+     process_workflow >> video_preprocess_task
+     workflow >> video_preprocess_task >> video_qa_task >> dnc_workflow >> conclude_task
+
+     # Register workflows
+     workflow.register(overwrite=True)
+     process_workflow.register(overwrite=True)
+
+     # Initialize and start app client with workflow configuration
+     cli_client = WebpageClient(
+         interactor=workflow, processor=process_workflow, config_path="webpage_configs"
+     )
+     cli_client.start_interactor()
+
+
+ if __name__ == '__main__':
+     app()
calculator_code.py ADDED
@@ -0,0 +1,4 @@
+ duration = 10.117
+ frame_rate = 9.88
+ number_of_frames = duration * frame_rate
+ print(number_of_frames)
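For the values above, this prints 10.117 × 9.88 ≈ 99.96, i.e. roughly 100 frames.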
compile_container.py ADDED
@@ -0,0 +1,18 @@
+ # Import core modules and components
+ # Import workflow related modules
+ from pathlib import Path
+
+ from omagent_core.utils.container import container
+ from omagent_core.utils.registry import registry
+
+ # Set up path and import modules
+ CURRENT_PATH = root_path = Path(__file__).parents[0]
+ registry.import_module(project_path=CURRENT_PATH.joinpath("agent"))
+
+ # Register required components
+ container.register_callback(callback="DefaultCallback")
+ container.register_input(input="AppInput")
+ container.register_stm("SharedMemSTM")
+ container.register_ltm(ltm="VideoMilvusLTM")
+ # Compile container config
+ container.compile_config(CURRENT_PATH)
configs/llms/gpt4o.yml ADDED
@@ -0,0 +1,7 @@
+ name: OpenaiGPTLLM
+ model_id: gpt-4o
+ api_key: ${env| custom_openai_key, openai_api_key}
+ endpoint: ${env| custom_openai_endpoint, https://api.openai.com/v1}
+ temperature: 0
+ vision: true
+ response_format: json_object
configs/llms/json_res.yml ADDED
@@ -0,0 +1,6 @@
+ name: OpenaiGPTLLM
+ model_id: gpt-4o
+ api_key: ${env| custom_openai_key, openai_api_key}
+ endpoint: ${env| custom_openai_endpoint, https://api.openai.com/v1}
+ temperature: 0
+ response_format: json_object
configs/llms/text_encoder.yml ADDED
@@ -0,0 +1,3 @@
+ name: OpenaiTextEmbeddingV3
+ endpoint: ${env| custom_openai_endpoint, https://api.openai.com/v1}
+ api_key: ${env| custom_openai_key, openai_api_key}
configs/llms/text_res.yml ADDED
@@ -0,0 +1,6 @@
+ name: OpenaiGPTLLM
+ model_id: gpt-4o
+ api_key: ${env| custom_openai_key, openai_api_key}
+ endpoint: ${env| custom_openai_endpoint, https://api.openai.com/v1}
+ temperature: 0
+ response_format: text
configs/llms/text_res_stream.yml ADDED
@@ -0,0 +1,7 @@
+ name: OpenaiGPTLLM
+ model_id: gpt-4o-mini
+ api_key: ${env| custom_openai_key, openai_api_key}
+ endpoint: ${env| custom_openai_endpoint, https://api.openai.com/v1}
+ temperature: 0
+ stream: true
+ response_format: text
configs/tools/all_tools.yml ADDED
@@ -0,0 +1,12 @@
+ llm: ${sub|text_res}
+ tools:
+   - Calculator
+   - CodeInterpreter
+   - ReadFileContent
+   - WriteFileContent
+   - ShellTool
+   - name: Rewinder
+     llm: ${sub|text_res}
+   - name: WebSearch
+     bing_api_key: ${env|bing_api_key, microsoft_bing_api_key}
+     llm: ${sub|text_res}
configs/workers/conclude.yml ADDED
@@ -0,0 +1,4 @@
+ name: Conclude
+ llm: ${sub|text_res}
+ output_parser:
+   name: StrParser
configs/workers/dnc_workflow.yml ADDED
@@ -0,0 +1,18 @@
+ - name: ConstructDncPayload
+ - name: StructureUpdate
+ - name: TaskConqueror
+   llm: ${sub|json_res}
+   tool_manager: ${sub|all_tools}
+   output_parser:
+     name: StrParser
+ - name: TaskDivider
+   llm: ${sub|json_res}
+   tool_manager: ${sub|all_tools}
+   output_parser:
+     name: StrParser
+ - name: TaskRescue
+   llm: ${sub|text_res}
+   tool_manager: ${sub|all_tools}
+   output_parser:
+     name: StrParser
+ - name: TaskExitMonitor
configs/workers/video_preprocessor.yml ADDED
@@ -0,0 +1,12 @@
+ name: VideoPreprocessor
+ llm: ${sub|gpt4o}
+ use_cache: true
+ scene_detect_threshold: 27
+ frame_extraction_interval: 5
+ stt:
+   name: STT
+   endpoint: ${env| custom_openai_endpoint, https://api.openai.com/v1}
+   api_key: ${env| custom_openai_key, openai_api_key}
+ output_parser:
+   name: DictParser
+ text_encoder: ${sub| text_encoder}
configs/workers/video_qa.yml ADDED
@@ -0,0 +1,5 @@
+ name: VideoQA
+ llm: ${sub|gpt4o}
+ output_parser:
+   name: StrParser
+ text_encoder: ${sub| text_encoder}
container.yaml ADDED
@@ -0,0 +1,154 @@
+ conductor_config:
+   name: Configuration
+   base_url:
+     value: http://localhost:8080
+     description: The Conductor Server API endpoint
+     env_var: CONDUCTOR_SERVER_URL
+   auth_key:
+     value: null
+     description: The authorization key
+     env_var: AUTH_KEY
+   auth_secret:
+     value: null
+     description: The authorization secret
+     env_var: CONDUCTOR_AUTH_SECRET
+   auth_token_ttl_min:
+     value: 45
+     description: The authorization token refresh interval in minutes.
+     env_var: AUTH_TOKEN_TTL_MIN
+   debug:
+     value: false
+     description: Debug mode
+     env_var: DEBUG
+ aaas_config:
+   name: AaasConfig
+   base_url:
+     value: http://localhost:30002
+     description: The aaas task server API endpoint
+     env_var: AAAS_TASK_SERVER_URL
+   token:
+     value: null
+     description: The authorization token
+     env_var: AAAS_TOKEN
+   enable:
+     value: true
+     description: Whether to enable the aaas task server
+     env_var: AAAS_ENABLE
+   domain_token:
+     value: null
+     description: The domain token
+     env_var: DOMAIN_TOKEN
+   is_prod:
+     value: false
+     description: Whether it is a production environment
+     env_var: IS_PROD
+ connectors:
+   redis_stream_client:
+     name: RedisConnector
+     id:
+       value: null
+       env_var: ID
+     host:
+       value: localhost
+       env_var: HOST
+     port:
+       value: 6379
+       env_var: PORT
+     password:
+       value: null
+       env_var: PASSWORD
+     username:
+       value: null
+       env_var: USERNAME
+     db:
+       value: 0
+       env_var: DB
+     use_lite:
+       value: true
+       env_var: USE_LITE
+   milvus_ltm_client:
+     name: MilvusConnector
+     id:
+       value: null
+       env_var: ID
+     host:
+       value: ./db.db
+       env_var: HOST
+     port:
+       value: 19530
+       env_var: PORT
+     password:
+       value: ''
+       env_var: PASSWORD
+     username:
+       value: default
+       env_var: USERNAME
+     db:
+       value: default
+       env_var: DB
+     alias:
+       value: alias
+       env_var: ALIAS
+ components:
+   AppCallback:
+     name: AppCallback
+     id:
+       value: null
+       env_var: ID
+     bot_id:
+       value: ''
+       env_var: BOT_ID
+     start_time:
+       value: 2025-02-17_15:40:53
+       env_var: START_TIME
+   AppInput:
+     name: AppInput
+     id:
+       value: null
+       env_var: ID
+   AaasCallback:
+     name: AaasCallback
+     id:
+       value: null
+       env_var: ID
+     bot_id:
+       value: ''
+       env_var: BOT_ID
+     start_time:
+       value: 2025-02-17_15:40:53
+       env_var: START_TIME
+   AaasInput:
+     name: AaasInput
+     id:
+       value: null
+       env_var: ID
+   DefaultCallback:
+     name: DefaultCallback
+     id:
+       value: null
+       env_var: ID
+     bot_id:
+       value: ''
+       env_var: BOT_ID
+     start_time:
+       value: 2025-02-17_15:40:53
+       env_var: START_TIME
+     incomplete_flag:
+       value: false
+       env_var: INCOMPLETE_FLAG
+   SharedMemSTM:
+     name: SharedMemSTM
+     id:
+       value: null
+       env_var: ID
+   VideoMilvusLTM:
+     name: VideoMilvusLTM
+     id:
+       value: null
+       env_var: ID
+     storage_name:
+       value: default
+       env_var: STORAGE_NAME
+     dim:
+       value: 3072
+       env_var: DIM
docs/images/local-ai.png ADDED
docs/images/video_understanding_workflow_diagram.png ADDED
docs/local-ai.md ADDED
@@ -0,0 +1,87 @@
+ # Local-ai
+ You can use Local-ai to run your own models locally.
+ Follow the instructions of [Local-ai](https://github.com/mudler/LocalAI) to install Local-ai.
+
+ ### Download Local-ai models
+ Download [Whisper](https://huggingface.co/ggerganov/whisper.cpp) and an [embedding model](https://huggingface.co/hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF).
+ Then move the model checkpoint files to /usr/share/local-ai/models/. **Other paths for models are not supported.**
+
+ ### Modify config files
+ Create the Local-ai config files.
+
+ Embedding model yaml:
+ ```yaml
+ name: text-embedding-ada-002
+ backend: llama-cpp
+ embeddings: true
+ parameters:
+   model: llama-3.2-1b-instruct-q4_k_m.gguf # model file name in /usr/share/local-ai/models/
+ ```
+ Whisper yaml:
+ ```yaml
+ name: whisper
+ backend: whisper
+ parameters:
+   model: ggml-model-whisper-base.en.bin # model file name in /usr/share/local-ai/models/
+ ```
+ ### Run the models
+ First run
+ ```bash
+ local-ai run <path-to-your-embedding-model-yaml>
+ ```
+ and
+ ```bash
+ local-ai run <path-to-your-whisper-yaml>
+ ```
+ once, to link each yaml file to its model.
+
+ After that, simply running
+ ```bash
+ local-ai run
+ ```
+ will load both models.
+
+ **Make sure the model names are correct, otherwise the embedding model may return empty results.**
+ ![local-ai get model names right](images/local-ai.png)
+
+ ### Modify the yaml of OmAgent
+ Modify ./configs/llms/text_encoder.yml
+ ```yaml
+ name: OpenaiTextEmbeddingV3
+ model_id: text-embedding-ada-002
+ dim: 2048
+ endpoint: ${env| custom_openai_endpoint, http://localhost:8080/v1}
+ api_key: ${env| custom_openai_key, openai_api_key} # api_key is not needed
+ ```
+ and ./configs/workers/video_preprocessor.yml
+ ```yaml
+ name: VideoPreprocessor
+ llm: ${sub|gpt4o}
+ use_cache: true
+ scene_detect_threshold: 27
+ frame_extraction_interval: 5
+ stt:
+   name: STT
+   endpoint: http://localhost:8080/v1
+   api_key: ${env| custom_openai_key, openai_api_key}
+   model_id: whisper
+ output_parser:
+   name: DictParser
+ text_encoder: ${sub| text_encoder}
+ ```
+ and set dim in ./container.yaml
+ ```yaml
+ VideoMilvusLTM:
+   name: VideoMilvusLTM
+   id:
+     value: null
+     env_var: ID
+   storage_name:
+     value: yyl_video_ltm
+     env_var: STORAGE_NAME
+   dim:
+     value: 2048
+     env_var: DIM
+ ```
+
+ Then you can use your models locally.
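As a quick sanity check (a hedged sketch, not part of the original docs), you can point the standard OpenAI Python client at the Local-ai endpoint and request an embedding; the model name must match the `name` field in the embedding yaml above.

```python
# Minimal sketch: verify the Local-ai embedding endpoint responds.
# Assumes Local-ai is serving on http://localhost:8080 as configured above.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8080/v1", api_key="not-needed")
resp = client.embeddings.create(model="text-embedding-ada-002", input="hello world")
print(len(resp.data[0].embedding))  # should match the dim value set in container.yaml
```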
omagent_core/__init__.py ADDED
File without changes
omagent_core/advanced_components/__init__.py ADDED
File without changes
omagent_core/advanced_components/worker/__init__.py ADDED
File without changes
omagent_core/advanced_components/worker/conclude/__init__.py ADDED
File without changes
omagent_core/advanced_components/worker/conqueror/__init__.py ADDED
File without changes
omagent_core/advanced_components/worker/divider/__init__.py ADDED
File without changes
omagent_core/advanced_components/worker/task_exit_monitor/__init__.py ADDED
File without changes
omagent_core/advanced_components/worker/video_preprocess/__init__.py ADDED
File without changes
omagent_core/advanced_components/workflow/cot/README.md ADDED
@@ -0,0 +1,36 @@
+ # Chain-of-Thought Operator
+ Chain of Thought (CoT) is a workflow operator that breaks down a complex problem into a series of intermediate steps or thoughts that lead to a final answer.
+
+ You can refer to the example in the `examples/cot` directory to understand how to use this operator; a minimal wiring sketch also follows the config section below.
+
+ # Inputs, Outputs and Configs
+
+ ## Inputs:
+ The inputs that the Chain of Thought (CoT) operator requires are as follows:
+ | Name | Type | Required | Description |
+ | ----------- | ----- | -------- | ----------- |
+ | query | str | true | The user's question |
+ | cot_method | str | optional | The CoT method: `few_shot` or `zero_shot` |
+ | cot_examples| list | optional | Examples used for the `few_shot` CoT method |
+ | id | int | optional | An identifier for tracking the question |
+
+ ## Outputs:
+ The outputs that the Chain of Thought (CoT) operator returns are as follows:
+ | Name | Type | Description |
+ | -------- | ----- | ---- |
+ | id | int | An identifier for tracking the question |
+ | question | str | The complete prompt string containing the query |
+ | last_output | str | The final answer generated by the agent |
+ | prompt_tokens | int | The total number of tokens in the question |
+ | completion_tokens | int | The total number of tokens in the answer |
+
+ ## Configs:
+ The config of the Chain of Thought (CoT) operator is as follows; you can copy it into your project as a cot_workflow.yml file.
+ ```yml
+ - name: CoTReasoning
+   llm: ${sub|gpt4o}
+   output_parser:
+     name: StrParser
+   concurrency: 15
+
+ ```
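For orientation, here is a minimal sketch (not the packaged `examples/cot` code) of how the operator's reasoning worker could be wired into a workflow, following the same `simple_task` pattern used in `app.py` above; the workflow name and input wiring are illustrative assumptions.

```python
# Hypothetical wiring sketch for the CoT operator, mirroring the simple_task pattern in app.py.
from omagent_core.engine.workflow.conductor_workflow import ConductorWorkflow
from omagent_core.engine.workflow.task.simple_task import simple_task

workflow = ConductorWorkflow(name="cot_example")
cot_task = simple_task(
    task_def_name="CoTReasoning",           # the worker registered via @registry.register_worker()
    task_reference_name="cot_reasoning",
    inputs={
        "id": workflow.input("id"),
        "query": workflow.input("query"),
        "cot_method": "zero_shot",          # or "few_shot" together with cot_examples
        "cot_examples": [],
    },
)
workflow >> cot_task
workflow.register(overwrite=True)
```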
omagent_core/advanced_components/workflow/cot/agent/cot_reasoning/cot_reasoning.py ADDED
@@ -0,0 +1,67 @@
+ from omagent_core.models.llms.openai_gpt import OpenaiGPTLLM
+ from omagent_core.engine.worker.base import BaseWorker
+ from omagent_core.utils.registry import registry
+ from omagent_core.models.llms.base import BaseLLMBackend
+ from omagent_core.models.llms.schemas import Message, Content
+ from omagent_core.utils.logger import logging
+ from omagent_core.models.llms.prompt.prompt import PromptTemplate
+ from omagent_core.advanced_components.workflow.cot.schemas.cot_create_examples import CoTExample
+ from pydantic import Field
+ from pathlib import Path
+ from typing import List
+
+ CURRENT_PATH = Path( __file__ ).parents[ 0 ]
+
+
+ @registry.register_worker()
+ class CoTReasoning( BaseLLMBackend, BaseWorker ):
+
+     prompts: List[ PromptTemplate ] = Field( default=[] )
+
+     def _run( self, id: int, query: str, cot_method: str, cot_examples: List[ dict ] = [], *args, **kwargs ):
+         """
+         Executes a reasoning task based on the specified Chain-of-Thought (CoT) method.
+         Args:
+             id (int): The identifier for the reasoning task.
+             query (str): The query string to be processed.
+             cot_method (str): The CoT method to use, either 'few_shot' or 'zero_shot'.
+             cot_examples (List[dict], optional): A list of examples for few-shot CoT. Defaults to an empty list.
+             *args: Additional positional arguments.
+             **kwargs: Additional keyword arguments.
+         Returns:
+             dict: A dictionary containing the task id, question, model output, prompt tokens, and completion tokens.
+         Raises:
+             ValueError: If an invalid CoT method is provided.
+         """
+
+         if cot_method == 'few_shot':
+             self.prompts = [
+                 PromptTemplate.from_file( CURRENT_PATH.joinpath( "few_shot_cot.prompt" ), role="user" ),
+             ]
+
+             assert cot_examples, "Few-shot COT requires examples."
+
+             demo = CoTExample().create_examples( cot_examples )
+
+             res = self.simple_infer( query=query, demo=demo )
+
+             body = self.llm._msg2req( [ p for prompts in self.prep_prompt( [ { "query": query, "demo": demo} ] ) for p in prompts ] )
+         elif cot_method == 'zero_shot':
+             self.prompts = [
+                 PromptTemplate.from_file( CURRENT_PATH.joinpath( "zero_shot_cot.prompt" ), role="user" ),
+             ]
+
+             res = self.simple_infer( query=query )
+
+             body = self.llm._msg2req( [ p for prompts in self.prep_prompt( [ { "query": query} ] ) for p in prompts ] )
+         else:
+             raise ValueError( f"Invalid cot_method: {cot_method}" )
+
+         # Extract the reasoning result from the response.
+         prompt_tokens = res[ 'usage' ][ 'prompt_tokens' ]
+         completion_tokens = res[ 'usage' ][ 'completion_tokens' ]
+         last_output = res[ "choices" ][ 0 ][ "message" ][ "content" ]
+         question = body.get( 'messages' )[ 0 ][ 'content' ][ 0 ][ 'text' ]
+
+         self.callback.send_answer(self.workflow_instance_id, msg=last_output)
+         return { 'id': id, 'question': question, 'last_output': last_output, 'prompt_tokens': prompt_tokens, "completion_tokens": completion_tokens}