Commit 3ed4749 · add files

Files changed:
- .gitattributes +44 -0
- .gitignore +165 -0
- README.md +14 -0
- app.py +533 -0
- infer.py +299 -0
- requirements.txt +7 -0
- visualizer.py +126 -0
.gitattributes ADDED
@@ -0,0 +1,44 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
examples/I7pTpMjqNRM_1080p_small.mp4 filter=lfs diff=lfs merge=lfs -text
examples/interface.jpg filter=lfs diff=lfs merge=lfs -text
examples/newyork.jpg filter=lfs diff=lfs merge=lfs -text
examples/puzzle.png filter=lfs diff=lfs merge=lfs -text
examples/000000001000.jpeg filter=lfs diff=lfs merge=lfs -text
examples/000000018380.jpeg filter=lfs diff=lfs merge=lfs -text
examples/bancopy.jpg filter=lfs diff=lfs merge=lfs -text
examples/beijing.jpg filter=lfs diff=lfs merge=lfs -text
simhei.ttf filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,165 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

.DS_Store
video_frames
examples
simhei.ttf
README.md ADDED
@@ -0,0 +1,14 @@
---
title: Seed1.5 VL
emoji: 🚀
colorFrom: green
colorTo: pink
sdk: gradio
sdk_version: 5.29.0
app_file: app.py
pinned: false
license: apache-2.0
short_description: Seed1.5-VL API Demo
---

Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,533 @@
# Copyright (2025) [Seed-VL-Cookbook] Bytedance Seed
import os
import re
import cv2
import json
import time
import numpy as np
import gradio as gr
from infer import SeedVLInfer, ConversationModeI18N, ConversationModeCN
from visualizer import draw_boxes_points_with_labels

infer = SeedVLInfer(model_id=os.getenv('MODEL_ID'), api_key=os.getenv('API_KEY'))

label_translations = {
    "gr_chatinterface_ofl": {
        "English": "Chatbot",
        "中文": "对话界面"
    },
    "gr_chatinterface_ol": {
        "English": "Chatbot",
        "中文": "对话界面"
    },
    "gr_tab_ol": {
        "English": "Online",
        "中文": "在线模式"
    },
    "gr_tab_ofl": {
        "English": "Offline",
        "中文": "离线模式"
    },
    "gr_thinking": {
        "English": ConversationModeI18N.D,
        "中文": ConversationModeCN.D,
    },
    "gr_temperature": {
        "English": "Temperature",
        "中文": "温度系数"
    },
    "gr_webcam_image": {
        "English": "🤳 Open Webcam",
        "中文": "🤳 打开摄像头"
    },
    "gr_webcam_images": {
        "English": "📹 Recorded Frames",
        "中文": "📹 录制的视频帧"
    },
    "gr_chatinterface_ofl.textbox.placeholder": {
        "English":
        "Ask me anything. You can also drop in images and .mp4 videos.",
        "中文": "有什么想问的?支持上传图片和.mp4视频。"
    },
    "gr_chatinterface_ol.textbox.placeholder": {
        "English": "Ask me anything...",
        "中文": "有什么想问的?"
    },
    "gr_clear_button": {
        "English": "🧹 Clear History",
        "中文": "🧹 清除历史对话"
    }
}


def add_escape(text: str):
    return text.replace('<', '\\<').replace('>', '\\>')


def remove_escape(text: str):
    return text.replace('\\<', '<').replace('\\>', '>')


def plot_boxes_points_detections(image_path, message):
    detection_pattern = r'\[\s*{.*?}\s*\]'
    detection_matches = re.finditer(detection_pattern, message, flags=re.DOTALL)
    bboxes, categories = [], []
    for match in detection_matches:
        matched_str = match.group(0)
        detections = json.loads(matched_str)
        for detection in detections:
            cat, bbox_str = detection['category'], detection['bbox']
            bbox_str = bbox_str.replace('<bbox>', '').replace('</bbox>', '').replace('</bbox', '')
            bbox = list(map(float, bbox_str.split(' ')))
            bboxes.append(bbox)
            categories.append(cat)
    if not bboxes:
        box_pattern = r'<bbox>(\d+(?:\.\d+)?)\s+(\d+(?:\.\d+)?)\s+(\d+(?:\.\d+)?)\s+(\d+(?:\.\d+)?)</bbox>'
        box_matches = re.finditer(box_pattern, message)
        bboxes = [
            [float(match.group(1)), float(match.group(2)),
             float(match.group(3)), float(match.group(4))]
            for match in box_matches
        ]

    points = []
    if not bboxes:
        point_pattern = r'<point>(\d+(?:\.\d+)?)\s+(\d+(?:\.\d+)?)</point>'
        point_matches = re.finditer(point_pattern, message)
        points = [
            [float(match.group(1)), float(match.group(2))]
            for match in point_matches
        ]

    if not bboxes and not points:
        return

    bboxes = np.array(bboxes, dtype='float') / 1000
    points = np.array(points, dtype='float') / 1000

    image = cv2.imread(image_path)
    h, w, c = image.shape
    if bboxes.size:
        bboxes[:, 0::2] *= w
        bboxes[:, 1::2] *= h
    if points.size:
        points[:, 0] *= w
        points[:, 1] *= h
    output_image = draw_boxes_points_with_labels(image, bboxes, points, categories)
    return output_image


def general_chat(inputs: dict, gr_history: list, infer_history: list,
                 if_thinking: bool, temperature: float, online: bool = False):
    if 'text' in inputs:
        inputs['text'] = remove_escape(inputs['text'])
    mode = ConversationModeI18N.D if if_thinking else ConversationModeI18N.G
    for response_text, infer_history, finished in infer(inputs=inputs,
                                                        history=infer_history,
                                                        mode=mode,
                                                        temperature=temperature,
                                                        online=online):
        if if_thinking:
            reasoning_text, response_text = response_text.split('</think>')
            reasoning_text = reasoning_text.removeprefix('<think>')
            response_message = [{
                "role": "assistant",
                "content": add_escape(reasoning_text),
                'metadata': {
                    'title': '🤔 Thinking'
                }
            }, {
                "role": "assistant",
                "content": add_escape(response_text)
            }]
        else:
            response_message = [{
                "role": "assistant",
                "content": add_escape(response_text)
            }]
        if finished and len(inputs.get('files', [])) == 1 and not inputs['files'][0].endswith('.mp4'):
            image_path = inputs['files'][0]
            response_text = infer_history[-1]['content']
            try:
                if if_thinking:
                    reasoning_text, response_text = response_text.split('</think>')
                output_image = plot_boxes_points_detections(image_path, response_text)
                if output_image is not None:
                    response_message.append({
                        "role": "assistant",
                        "content": gr.Image(output_image),
                    })
            except Exception as e:
                print(e)
        yield response_message, infer_history


def online_record_chat(text: str, gr_history: list, gr_webcam_images: list,
                       gr_counter: int, infer_history: list, if_thinking: bool,
                       temperature: float):
    if not gr_webcam_images:
        gr_webcam_images = []
    gr_webcam_images = gr_webcam_images[gr_counter:]
    inputs = {'text': text, 'files': [webp for webp, _ in gr_webcam_images]}
    yield f'received {len(gr_webcam_images)} new frames, processing...', gr_counter + len(
        gr_webcam_images), infer_history
    for response_message, infer_history in general_chat(
            inputs, gr_history, infer_history, if_thinking, temperature, online=True):
        yield response_message, gr.skip(), infer_history


with gr.Blocks() as demo:
    with gr.Column():
        gr_title = gr.Markdown('# Seed1.5-VL')
        with gr.Row():
            gr.Markdown(
                """
                <div style="display:flex; flex-direction:column; gap:10px;">
                    <a
                        href="https://github.com/ByteDance-Seed/Seed1.5-VL"
                        target="_blank"
                        style="
                            display: inline-flex;
                            align-items: center;
                            gap: 8px;
                            white-space: nowrap;
                            text-decoration: none;
                        "
                    >
                        <img
                            src="https://cdn.jsdelivr.net/gh/devicons/devicon/icons/github/github-original.svg"
                            alt="GitHub"
                            width="24"
                        >
                        Seed1.5-VL Cookbook
                    </a>
                </div>
                """
            )
            gr.Markdown(
                """
                <div style="display:flex; flex-direction:column; gap:10px;">
                    <a
                        href="https://huggingface.co/papers/2505.07062"
                        target="_blank"
                        style="
                            display: inline-flex;
                            align-items: center;
                            gap: 8px;
                            white-space: nowrap;
                            text-decoration: none;
                        "
                    >
                        <img
                            src="https://huggingface.co/front/assets/huggingface_logo-noborder.svg"
                            alt="Paper"
                            width="24"
                        >
                        Seed1.5-VL Paper
                    </a>
                </div>
                """,
            )
            gr.Markdown(' ')
            gr.Markdown(' ')
            gr.Markdown(' ')
            gr.Markdown(' ')
        with gr.Row():
            gr_lang_selector = gr.Dropdown(choices=["English", "中文"],
                                           value="English",
                                           label="🌐 English Interface/中文界面",
                                           interactive=True,
                                           min_width=400,
                                           scale=0)

    with gr.Tabs():
        with gr.Tab("Offline") as gr_tab_ofl:
            gr_infer_history = gr.State([])
            gr_thinking_hidden = gr.Checkbox(value=True, visible=False)
            gr_temperature_hidden = gr.Slider(minimum=0.0,
                                              maximum=2.0,
                                              step=0.1,
                                              value=0.0,
                                              interactive=True,
                                              visible=False)
            gr_chatinterface_ofl = gr.ChatInterface(
                fn=general_chat,
                type="messages",
                multimodal=True,
                textbox=gr.MultimodalTextbox(
                    file_count="multiple",
                    file_types=["image", ".mp4"],
                    sources=["upload"],
                    stop_btn=True,
                    placeholder=label_translations[
                        'gr_chatinterface_ofl.textbox.placeholder']['English'],
                ),
                additional_inputs=[
                    gr_infer_history, gr_thinking_hidden, gr_temperature_hidden
                ],
                additional_outputs=[gr_infer_history],
            )

            def add_escape_fn(inputs: dict):
                if inputs and 'text' in inputs:
                    inputs['text'] = add_escape(inputs['text'])
                return inputs

            gr_chatinterface_ofl.textbox.submit(
                fn=add_escape_fn,
                inputs=[gr_chatinterface_ofl.saved_input],
                outputs=[gr_chatinterface_ofl.saved_input]
            )
            gr.on(triggers=[gr_chatinterface_ofl.chatbot.clear],
                  fn=lambda: [],
                  outputs=[gr_infer_history])
            with gr.Row():
                gr_thinking_ofl = gr.Checkbox(
                    value=True,
                    label=label_translations['gr_thinking']['English'],
                )
                gr_thinking_ofl.change(lambda x: x,
                                       inputs=gr_thinking_ofl,
                                       outputs=gr_thinking_hidden)
                gr_temperature_ofl = gr.Slider(
                    minimum=0.0,
                    maximum=2.0,
                    step=0.1,
                    value=0.0,
                    label=label_translations['gr_temperature']['English'],
                    interactive=True)
                gr_temperature_ofl.change(lambda x: x,
                                          inputs=gr_temperature_ofl,
                                          outputs=gr_temperature_hidden)
                gr_clear_button_ofl = gr.Button(value=label_translations['gr_clear_button']['English'])

                def clear_history_fn():
                    return None, [], [], [], []

                gr_clear_button_ofl.click(
                    fn=clear_history_fn,
                    outputs=[
                        gr_chatinterface_ofl.conversation_id,
                        gr_chatinterface_ofl.saved_conversations,
                        gr_chatinterface_ofl.chatbot,
                        gr_chatinterface_ofl.chatbot_state,
                        gr_infer_history
                    ]
                )
            with gr.Column(visible=True) as gr_examples_en:
                gr.Examples(
                    label='7 Examples: text, image, video, multiple images/videos, visual puzzle, points grounding, open-vocabulary detection.',
                    examples=[
                        {
                            "text": "Who are you?",
                            "files": []
                        },
                        {
                            "text": "Introduce this.",
                            "files": ["examples/bancopy.jpg"]
                        },
                        {
                            "text":
                            """Find Curry's "Good Night" celebration time.""",
                            "files":
                            ["examples/I7pTpMjqNRM_1080p_small.mp4"]
                        },
                        {
                            "text": "Share your feelings.",
                            "files": [
                                "examples/newyork.jpg",
                                "examples/beijing.jpg"
                            ]
                        },
                        {
                            "text": "Look and answer.",
                            "files": ["examples/puzzle.png"]
                        },
                        {
                            "text": "Please point out all the hats on people's heads in the image, output concatenated point coordinates like <point>x y</point><point>x y</point>",
                            "files": ["examples/000000001000.jpeg"]
                        },
                        {
                            "text": """Please detect all plate, photo, kid, cup in the image, and output all objects in the JSON format, which is a list of dict like [{"category": category, "bbox": "<bbox>x1 y1 x2 y2</bbox>"}, {"category": category, "bbox": "<bbox>x1 y1 x2 y2</bbox>"}]""",
                            "files": ["examples/000000018380.jpeg"]
                        }
                    ],
                    inputs=[gr_chatinterface_ofl.textbox],
                )
            with gr.Column(visible=False) as gr_examples_cn:
                gr.Examples(
                    label='七个示例:文本,图像,视频,多个图像/视频,视觉解谜,坐标定位,开放式物体检测。',
                    examples=[
                        {
                            "text": "你是谁?",
                            "files": []
                        },
                        {
                            "text": "介绍一下。",
                            "files": ["examples/bancopy.jpg"]
                        },
                        {
                            "text": "找到库里的“晚安”庆祝时间段。",
                            "files":
                            ["examples/I7pTpMjqNRM_1080p_small.mp4"]
                        },
                        {
                            "text": "你有什么感想?",
                            "files": [
                                "examples/newyork.jpg",
                                "examples/beijing.jpg"
                            ]
                        },
                        {
                            "text": "看图回答。",
                            "files": ["examples/puzzle.png"]
                        },
                        {
                            "text": "请点出图像中所有戴在头上的帽子, 输出串联的点坐标<point>x y</point><point>x y</point>",
                            "files": ["examples/000000001000.jpeg"]
                        },
                        {
                            "text": """请检测图像中所有的盘子、照片、小孩和杯子。请以JSON格式输出一个由字典组成的列表,就像:[{"category": 类别, "bbox": "<bbox>x1 y1 x2 y2</bbox>"}, {"category": 类别, "bbox": "<bbox>x1 y1 x2 y2</bbox>"}]""",
                            "files": ["examples/000000018380.jpeg"]
                        }
                    ],
                    inputs=[gr_chatinterface_ofl.textbox],
                )
        with gr.Tab("Online") as gr_tab_ol:
            with gr.Row():
                with gr.Column(scale=1):
                    gr_infer_history_ol = gr.State([])
                    gr_thinking_hidden = gr.Checkbox(value=True, visible=False)
                    gr_temperature_hidden = gr.Slider(minimum=0.0,
                                                      maximum=2.0,
                                                      step=0.1,
                                                      value=1.0,
                                                      interactive=True,
                                                      visible=False)
                    with gr.Row():
                        with gr.Column(scale=1):
                            gr_webcam_image = gr.Image(
                                label=label_translations['gr_webcam_image']['English'],
                                sources="webcam",
                                height=250,
                                type='filepath')
                            gr_webcam_images = gr.Gallery(
                                label=label_translations['gr_webcam_images']['English'],
                                show_label=True,
                                format='webp',
                                columns=1,
                                height=250,
                                preview=True,
                                interactive=False)
                            gr_counter = gr.Number(value=0, visible=False)
                with gr.Column(scale=3):
                    gr_chatinterface_ol = gr.ChatInterface(
                        fn=online_record_chat,
                        type="messages",
                        multimodal=False,
                        textbox=gr.Textbox(
                            placeholder=label_translations[
                                'gr_chatinterface_ol.textbox.placeholder']['English'],
                            submit_btn=True,
                            stop_btn=True),
                        additional_inputs=[
                            gr_webcam_images, gr_counter,
                            gr_infer_history_ol, gr_thinking_hidden,
                            gr_temperature_hidden
                        ],
                        additional_outputs=[
                            gr_counter, gr_infer_history_ol
                        ],
                    )

                    def cache_webcam(recorded_image: str,
                                     recorded_images: list):
                        if not recorded_images:
                            recorded_images = []
                        return recorded_images + [recorded_image]

                    gr_webcam_image.stream(
                        fn=cache_webcam,
                        inputs=[gr_webcam_image, gr_webcam_images],
                        outputs=[gr_webcam_images],
                        stream_every=1,
                        concurrency_limit=30,
                    )
                    with gr.Row():
                        gr_thinking_ol = gr.Checkbox(
                            value=True,
                            label=label_translations['gr_thinking']['English'],
                        )
                        gr_thinking_ol.change(
                            lambda x: x,
                            inputs=gr_thinking_ol,
                            outputs=gr_thinking_hidden)
                        gr_temperature_ol = gr.Slider(
                            minimum=0.0,
                            maximum=2.0,
                            step=0.1,
                            value=1.0,
                            label=label_translations['gr_temperature']['English'],
                            interactive=True)
                        gr_temperature_ol.change(
                            lambda x: x,
                            inputs=gr_temperature_ol,
                            outputs=gr_temperature_hidden)
                        gr_clear_button_ol = gr.Button(value=label_translations['gr_clear_button']['English'])

                        def clear_history_fn():
                            return None, [], [], [], []

                        gr_clear_button_ol.click(
                            fn=clear_history_fn,
                            outputs=[
                                gr_chatinterface_ol.conversation_id,
                                gr_chatinterface_ol.saved_conversations,
                                gr_chatinterface_ol.chatbot,
                                gr_chatinterface_ol.chatbot_state,
                                gr_infer_history_ol
                            ]
                        )

    def update_lang(lang: str):
        return (
            gr.update(label=label_translations['gr_chatinterface_ofl'][lang]),
            gr.update(label=label_translations['gr_chatinterface_ol'][lang]),
            gr.update(placeholder=label_translations[
                'gr_chatinterface_ofl.textbox.placeholder'][lang]),
            gr.update(placeholder=label_translations[
                'gr_chatinterface_ol.textbox.placeholder'][lang]),
            gr.update(label=label_translations['gr_tab_ofl'][lang]),
            gr.update(label=label_translations['gr_tab_ol'][lang]),
            gr.update(label=label_translations['gr_thinking'][lang]),
            gr.update(label=label_translations['gr_thinking'][lang]),
            gr.update(label=label_translations['gr_temperature'][lang]),
            gr.update(label=label_translations['gr_temperature'][lang]),
            gr.update(visible=lang == 'English'),
            gr.update(visible=lang != 'English'),
            gr.update(label=label_translations['gr_webcam_image'][lang]),
            gr.update(label=label_translations['gr_webcam_images'][lang]),
            gr.update(value=label_translations['gr_clear_button'][lang]),
            gr.update(value=label_translations['gr_clear_button'][lang]),
        )

    gr_lang_selector.change(fn=update_lang,
                            inputs=[gr_lang_selector],
                            outputs=[
                                gr_chatinterface_ofl.chatbot,
                                gr_chatinterface_ol.chatbot,
                                gr_chatinterface_ofl.textbox,
                                gr_chatinterface_ol.textbox,
                                gr_tab_ofl,
                                gr_tab_ol,
                                gr_thinking_ofl,
                                gr_thinking_ol,
                                gr_temperature_ofl,
                                gr_temperature_ol,
                                gr_examples_en,
                                gr_examples_cn,
                                gr_webcam_image,
                                gr_webcam_images,
                                gr_clear_button_ofl,
                                gr_clear_button_ol,
                            ])

demo.queue(default_concurrency_limit=100, max_size=100).launch(share=True,
                                                               max_threads=100,
                                                               ssr_mode=False)
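Note on the grounding helpers above: plot_boxes_points_detections assumes the model returns coordinates normalized to a 0-1000 range inside <bbox>x1 y1 x2 y2</bbox> or <point>x y</point> tags, divides by 1000, and rescales by the image width and height. The sketch below is a minimal standalone illustration of that conversion; the reply string and image size are made-up values, not real model output.

    import re
    import numpy as np

    # Hypothetical reply using the <bbox>x1 y1 x2 y2</bbox> convention (0-1000 normalized).
    reply = 'Two cups: <bbox>120 340 260 520</bbox> <bbox>610 355 750 540</bbox>'
    w, h = 1280, 720  # assumed pixel size of the source image

    pattern = (r'<bbox>(\d+(?:\.\d+)?)\s+(\d+(?:\.\d+)?)\s+'
               r'(\d+(?:\.\d+)?)\s+(\d+(?:\.\d+)?)</bbox>')
    boxes = np.array([[float(g) for g in m.groups()]
                      for m in re.finditer(pattern, reply)]) / 1000
    boxes[:, 0::2] *= w  # x1, x2 scaled to image width
    boxes[:, 1::2] *= h  # y1, y2 scaled to image height
    print(boxes)  # pixel-space xyxy boxes, ready for draw_boxes_points_with_labels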
infer.py ADDED
@@ -0,0 +1,299 @@
# Copyright (2025) [Seed-VL-Cookbook] Bytedance Seed
import json
import time
import math
import base64
import requests

import torch
import decord
import numpy as np
from PIL import Image, ImageSequence
from torchvision.io import read_image, encode_jpeg
from torchvision.transforms.functional import resize
from torchvision.transforms import InterpolationMode


class ConversationModeI18N:
    G = "General"
    D = "Deep Thinking"


class ConversationModeCN:
    G = "常规"
    D = "深度思考"


def round_by_factor(number: int, factor: int) -> int:
    """Returns the closest integer to 'number' that is divisible by 'factor'."""
    return round(number / factor) * factor


def ceil_by_factor(number: int, factor: int) -> int:
    """Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'."""
    return math.ceil(number / factor) * factor


def floor_by_factor(number: int, factor: int) -> int:
    """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'."""
    return math.floor(number / factor) * factor


def get_resized_hw_for_Navit(
    height: int,
    width: int,
    min_pixels: int,
    max_pixels: int,
    max_ratio: int = 200,
    factor: int = 28,
):
    if max(height, width) / min(height, width) > max_ratio:
        raise ValueError(
            f"absolute aspect ratio must be smaller than {max_ratio}, got {max(height, width) / min(height, width)}"
        )
    h_bar = max(factor, round_by_factor(height, factor))
    w_bar = max(factor, round_by_factor(width, factor))
    if h_bar * w_bar > max_pixels:
        beta = math.sqrt((height * width) / max_pixels)
        h_bar = floor_by_factor(height / beta, factor)
        w_bar = floor_by_factor(width / beta, factor)
    elif h_bar * w_bar < min_pixels:
        beta = math.sqrt(min_pixels / (height * width))
        h_bar = ceil_by_factor(height * beta, factor)
        w_bar = ceil_by_factor(width * beta, factor)
    return int(h_bar), int(w_bar)


class SeedVLInfer:

    def __init__(
        self,
        model_id: str,
        api_key: str,
        base_url: str = 'https://ark.cn-beijing.volces.com/api/v3/chat/completions',
        min_pixels: int = 4 * 28 * 28,
        max_pixels: int = 5120 * 28 * 28,
        video_sampling_strategy: dict = {
            'sampling_fps': 1,
            'min_n_frames': 16,
            'max_video_length': 81920,
            'max_pixels_choices': [
                640 * 28 * 28, 512 * 28 * 28, 384 * 28 * 28, 256 * 28 * 28,
                160 * 28 * 28, 128 * 28 * 28
            ],
            'use_timestamp': True,
        },
    ):
        self.base_url = base_url
        self.api_key = api_key
        self.model_id = model_id
        self.min_pixels = min_pixels
        self.max_pixels = max_pixels
        self.sampling_fps = video_sampling_strategy.get('sampling_fps', 1)
        self.min_n_frames = video_sampling_strategy.get('min_n_frames', 16)
        self.max_video_length = video_sampling_strategy.get(
            'max_video_length', 81920)
        self.max_pixels_choices = video_sampling_strategy.get(
            'max_pixels_choices', [
                640 * 28 * 28, 512 * 28 * 28, 384 * 28 * 28, 256 * 28 * 28,
                160 * 28 * 28, 128 * 28 * 28
            ])
        self.use_timestamp = video_sampling_strategy.get('use_timestamp', True)

    def preprocess_video(self, video_path: str):
        try:
            video_reader = decord.VideoReader(video_path, num_threads=2)
            fps = video_reader.get_avg_fps()
        except decord._ffi.base.DECORDError:
            video_reader = [
                frame.convert('RGB')
                for frame in ImageSequence.Iterator(Image.open(video_path))
            ]
            fps = 1

        length = len(video_reader)
        n_frames = min(
            max(math.ceil(length / fps * self.sampling_fps),
                self.min_n_frames), length)
        frame_indices = np.linspace(0, length - 1,
                                    n_frames).round().astype(int).tolist()
        max_pixels = self.max_pixels
        for round_idx, max_pixels in enumerate(self.max_pixels_choices):
            is_last_round = round_idx == len(self.max_pixels_choices) - 1
            if len(frame_indices) * max_pixels / 28 / 28 > self.max_video_length:
                if is_last_round:
                    max_frame_num = int(self.max_video_length / max_pixels *
                                        28 * 28)
                    select_ids = np.linspace(
                        0,
                        len(frame_indices) - 1,
                        max_frame_num).round().astype(int).tolist()
                    frame_indices = [
                        frame_indices[select_id] for select_id in select_ids
                    ]
                else:
                    continue
            else:
                break

        if hasattr(video_reader, "get_batch"):
            video_clip = torch.from_numpy(
                video_reader.get_batch(frame_indices).asnumpy()).permute(
                    0, 3, 1, 2)
        else:
            video_clip_array = np.stack(
                [np.array(video_reader[i]) for i in frame_indices], axis=0)
            video_clip = torch.from_numpy(video_clip_array).permute(0, 3, 1, 2)

        height, width = video_clip.shape[-2:]
        resized_height, resized_width = get_resized_hw_for_Navit(
            height,
            width,
            min_pixels=self.min_pixels,
            max_pixels=max_pixels,
        )
        resized_video_clip = resize(video_clip,
                                    (resized_height, resized_width),
                                    interpolation=InterpolationMode.BICUBIC,
                                    antialias=True)
        if self.use_timestamp:
            resized_video_clip = [
                (round(i / fps, 1), f)
                for i, f in zip(frame_indices, resized_video_clip)
            ]
        return resized_video_clip

    def preprocess_streaming_frame(self, frame: torch.Tensor):
        height, width = frame.shape[-2:]
        resized_height, resized_width = get_resized_hw_for_Navit(
            height,
            width,
            min_pixels=self.min_pixels,
            max_pixels=self.max_pixels_choices[0],
        )
        resized_frame = resize(frame[None], (resized_height, resized_width),
                               interpolation=InterpolationMode.BICUBIC,
                               antialias=True)[0]
        return resized_frame

    def encode_image(self, image: torch.Tensor) -> str:
        if image.shape[0] == 4:
            image = image[:3]
        encoded = encode_jpeg(image)
        return base64.b64encode(encoded.numpy()).decode('utf-8')

    def construct_messages(self,
                           inputs: dict,
                           streaming_timestamp: int = None,
                           online: bool = False) -> list[dict]:
        content = []
        for i, path in enumerate(inputs.get('files', [])):
            if path.endswith('.mp4'):
                video = self.preprocess_video(video_path=path)
                for frame in video:
                    if self.use_timestamp:
                        timestamp, frame = frame
                        content.append({
                            "type": "text",
                            "text": f'[{timestamp} second]',
                        })
                    content.append({
                        "type": "image_url",
                        "image_url": {
                            "url":
                            f"data:image/jpeg;base64,{self.encode_image(frame)}",
                            "detail": "high"
                        },
                    })
            else:
                image = read_image(path)
                if online and path.endswith('.webp'):
                    streaming_timestamp = i
                if streaming_timestamp is not None:
                    image = self.preprocess_streaming_frame(frame=image)
                content.append({
                    "type": "image_url",
                    "image_url": {
                        "url":
                        f"data:image/jpeg;base64,{self.encode_image(image)}",
                        "detail": "high"
                    },
                })
                if streaming_timestamp is not None:
                    content.insert(-1, {
                        "type": "text",
                        "text": f'[{streaming_timestamp} second]',
                    })
        query = inputs.get('text', '')
        if query:
            content.append({
                "type": "text",
                "text": query,
            })
        messages = [{
            "role": "user",
            "content": content,
        }]
        return messages

    def request(self,
                messages,
                thinking: bool = True,
                temperature: float = 1.0):
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json"
        }
        payload = {
            "model": self.model_id,
            "messages": messages,
            "stream": True,
            "thinking": {
                "type": "enabled" if thinking else "disabled",
            },
            "temperature": temperature,
        }
        for _ in range(10):
            try:
                requested = requests.post(self.base_url,
                                          headers=headers,
                                          json=payload,
                                          stream=True,
                                          timeout=600)
                break
            except Exception as e:
                time.sleep(0.1)
                print(e)
        content, reasoning_content = '', ''
        for line in requested.iter_lines():
            if not line:
                continue
            if line.startswith(b'data:'):
                data = line[len("data: "):]
                if data == b"[DONE]":
                    yield content, reasoning_content, True
                    break
                delta = json.loads(data)['choices'][0]['delta']
                content += delta['content']
                reasoning_content += delta.get('reasoning_content', '')
                yield content, reasoning_content, False

    def __call__(self,
                 inputs: dict,
                 history: list[dict] = [],
                 mode: str = ConversationModeI18N.D,
                 temperature: float = 1.0,
                 online: bool = False):
        messages = self.construct_messages(inputs=inputs, online=online)
        updated_history = history + messages
        for response, reasoning, finished in self.request(
                messages=updated_history,
                thinking=mode == ConversationModeI18N.D,
                temperature=temperature):
            if mode == ConversationModeI18N.D:
                response = '<think>' + reasoning + '</think>' + response
            yield response, updated_history + [{'role': 'assistant', 'content': response}], finished
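Note: app.py drives SeedVLInfer through Gradio callbacks; the sketch below shows the same streaming call pattern outside Gradio. It assumes the MODEL_ID and API_KEY environment variables hold valid Volcengine Ark credentials and that sample.jpg is a local image; both names are placeholders, not files shipped in this commit.

    import os
    from infer import SeedVLInfer, ConversationModeI18N

    infer = SeedVLInfer(model_id=os.getenv('MODEL_ID'), api_key=os.getenv('API_KEY'))
    inputs = {'text': 'Describe this image.', 'files': ['sample.jpg']}  # placeholder file
    history = []
    for response, history, finished in infer(inputs=inputs,
                                             history=history,
                                             mode=ConversationModeI18N.D,
                                             temperature=0.0):
        if finished:
            # In Deep Thinking mode the reply is '<think>' + reasoning + '</think>' + answer.
            print(response)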
requirements.txt ADDED
@@ -0,0 +1,7 @@
supervision==0.25.1
openai==1.76.0
opencv-python==4.10.0.84
numpy==1.26.2
pillow==11.0.0
matplotlib==3.10.
decord==0.6.0
visualizer.py ADDED
@@ -0,0 +1,126 @@
import os
import cv2
import numpy as np
import supervision as sv
from PIL import Image, ImageDraw, ImageFont
from supervision.annotators.utils import resolve_color

# visualization tools based on supervision
BOUNDING_BOX_ANNOTATOR = sv.BoundingBoxAnnotator(thickness=2)


class LabelAnnotator(sv.LabelAnnotator):

    @staticmethod
    def resolve_text_background_xyxy(
        center_coordinates,
        text_wh,
        position,
    ):
        center_x, center_y = center_coordinates
        text_w, text_h = text_wh
        return center_x, center_y, center_x + text_w, center_y + text_h

    def _draw_labels(
        self,
        scene: np.ndarray,
        labels: list[str],
        label_properties: np.ndarray,
        detections,
        custom_color_lookup,
    ) -> None:
        assert len(labels) == len(label_properties) == len(detections), (
            f"Number of label properties ({len(label_properties)}), "
            f"labels ({len(labels)}) and detections ({len(detections)}) "
            "do not match."
        )

        color_lookup = (
            custom_color_lookup
            if custom_color_lookup is not None
            else self.color_lookup
        )

        font = ImageFont.truetype("simhei.ttf", int(30 * self.text_scale))

        for idx, label_property in enumerate(label_properties):
            background_color = resolve_color(
                color=self.color,
                detections=detections,
                detection_idx=idx,
                color_lookup=color_lookup,
            )
            text_color = resolve_color(
                color=self.text_color,
                detections=detections,
                detection_idx=idx,
                color_lookup=color_lookup,
            )

            box_xyxy = label_property[:4]
            text_height_padded = label_property[4]
            self.draw_rounded_rectangle(
                scene=scene,
                xyxy=box_xyxy,
                color=background_color.as_bgr(),
                border_radius=self.border_radius,
            )

            text_x = box_xyxy[0] + self.text_padding
            text_y = box_xyxy[1]

            scene_pil = Image.fromarray(cv2.cvtColor(scene, cv2.COLOR_BGR2RGB))
            draw = ImageDraw.Draw(scene_pil)
            draw.text(
                (text_x, text_y),
                labels[idx],
                font=font,
                fill=(text_color.r, text_color.g, text_color.b),
            )
            scene[:] = cv2.cvtColor(np.array(scene_pil), cv2.COLOR_RGB2BGR)


LABEL_ANNOTATOR = LabelAnnotator(text_padding=4,
                                 text_scale=0.5,
                                 text_thickness=1)

POINT_ANNOTATOR = sv.DotAnnotator(radius=6)


def draw_boxes_points_with_labels(
    cv2_image,
    boxes=None,
    points=None,
    classes=None,
    output_path=None,
):
    annotated_image = cv2.cvtColor(cv2_image, cv2.COLOR_BGR2RGB)

    if boxes is not None and boxes.size:
        detections = sv.Detections(
            xyxy=boxes,
            class_id=np.arange(len(boxes)),
            confidence=np.ones(len(boxes))
        )
        annotated_image = BOUNDING_BOX_ANNOTATOR.annotate(
            annotated_image, detections)
    if points is not None and points.size:
        points = np.concatenate([points, points], axis=1)
        detections = sv.Detections(
            xyxy=points,
            class_id=np.arange(len(points)),
            confidence=np.ones(len(points))
        )
        annotated_image = POINT_ANNOTATOR.annotate(
            annotated_image, detections,
        )
    if classes:
        annotated_image = LABEL_ANNOTATOR.annotate(
            annotated_image, detections, labels=classes
        )

    if output_path:
        cv2.imwrite(
            output_path,
            cv2.cvtColor(annotated_image, cv2.COLOR_RGB2BGR)
        )

    return annotated_image
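Note: a minimal usage sketch for draw_boxes_points_with_labels on a synthetic canvas. The image, boxes, labels, and output filename are invented for illustration, and simhei.ttf (tracked via LFS above) must be present in the working directory for label text to render.

    import numpy as np
    from visualizer import draw_boxes_points_with_labels

    # Blank 480x640 BGR canvas standing in for a real cv2.imread() result.
    image = np.zeros((480, 640, 3), dtype=np.uint8)
    boxes = np.array([[50.0, 60.0, 200.0, 220.0],
                      [300.0, 100.0, 420.0, 260.0]])  # pixel-space xyxy boxes
    labels = ['cup', 'plate']  # one label per box

    annotated = draw_boxes_points_with_labels(image,
                                              boxes=boxes,
                                              classes=labels,
                                              output_path='annotated.jpg')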