weihongliang committed · verified
Commit c15db92 · 1 Parent(s): 0fe31ca

Update app.py

Files changed (1): app.py (+5, -5)
app.py CHANGED
@@ -2011,14 +2011,14 @@ def handle_example_click(mode, img_path_display, file_list_for_state, obj_info):
 with gr.Blocks() as app:
     #gr.Markdown("# Personalized Multimodal Understanding with RC-MLLM")
     gr.Markdown("<div style='text-align: center;'><h1 style=' font-size: 28px; '>Personalized Multimodal Understanding with RC-MLLM</h1></div>")
-    gr.Markdown("**RC-MLLM** model is developed based on the Qwen2-VL model through a novel method called **RCVIT (Region-level Context-aware Visual Instruction Tuning)**, using the specially constructed **RCMU dataset** for training. Its core feature is the capability for **Region-level Context-aware Multimodal Understanding (RCMU)**. This means it can simultaneously understand both the visual content of specific regions/objects within an image and their associated textual information (utilizing bounding boxes coordinates), allowing it to respond to user instructions in a more context-aware manner. Simply put, RC-MLLM not only understands images but can also integrate the textual information linked to specific objects within the image for understanding. It achieves outstanding performance on RCMU tasks and is suitable for applications like multimodal RAG and personalized conversation.")
+    gr.Markdown("**RC-MLLM** model is developed based on the Qwen2-VL model through a novel method called **RCVIT (Region-level Context-aware Visual Instruction Tuning)**, using the specially constructed **RCMU dataset** for training. Its core feature is the capability for **Region-level Context-aware Multimodal Understanding (RCMU)**. This means it can simultaneously understand both the visual content of specific regions/objects within an image and their associated textual information (utilizing bounding boxes coordinates), allowing it to respond to user instructions in a more context-aware manner. Simply put, RC-MLLM not only understands images but can also integrate the textual information linked to specific objects within the image for understanding. It achieves outstanding performance on RCMU tasks and is suitable for applications like personalized conversation.")
 
     markdown_content = """
-    📑 [Arxiv](https://arxiv.org/abs/your-paper-id) |
-    🤗 [Checkpoint]() |
+    📑 [Region-Level Context-Aware Multimodal Understanding](https://arxiv.org/abs/2508.12263) |
+    🤗 Models:[RC-Qwen2VL-2b](https://huggingface.co/weihongliang/RC-Qwen2VL-2b/blob/main/README.md) [RC-Qwen2VL-7b](https://huggingface.co/weihongliang/RC-Qwen2VL-7b/blob/main/README.md)|
     📝 [Dataset](https://huggingface.co/your-model-name) |
-    [Github](https://github.com/your-username/your-repo) |
-    🚀 [Multimodal RAG with RC-MLLM](https://your-project-url.com)
+    [Github](https://github.com/hongliang-wei/RC-MLLM) |
+    🚀 [Celebrity Recognition and VQA Demo](https://huggingface.co/spaces/weihongliang/RCMLLM)
     """
     gr.Markdown(markdown_content)
 
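
For context on what the new links point to, here is a minimal usage sketch (not part of this commit) for the RC-Qwen2VL-2b checkpoint referenced above. It assumes the checkpoint loads with the stock Qwen2-VL classes in Hugging Face transformers, since RC-MLLM is built on Qwen2-VL; the bounding-box phrasing in the prompt is purely illustrative, as the actual region-context format is defined by the RCMU dataset and the model card.

# Minimal sketch, assuming the linked RC-Qwen2VL-2b checkpoint is compatible
# with the standard Qwen2-VL classes in transformers. The prompt below is
# hypothetical; consult the model card for the real region-context format.
import torch
from PIL import Image
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

model_id = "weihongliang/RC-Qwen2VL-2b"
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_id, torch_dtype=torch.bfloat16, device_map="auto"
)
processor = AutoProcessor.from_pretrained(model_id)

image = Image.open("example.jpg")  # hypothetical local image
messages = [{
    "role": "user",
    "content": [
        {"type": "image"},
        {"type": "text",
         "text": "Describe the object in the region [120, 40, 360, 400] "
                 "using its associated profile text."},  # illustrative prompt only
    ],
}]
prompt = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
inputs = processor(
    text=[prompt], images=[image], padding=True, return_tensors="pt"
).to(model.device)

# Generate and decode only the newly produced tokens.
output_ids = model.generate(**inputs, max_new_tokens=128)
answer = processor.batch_decode(
    output_ids[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True
)[0]
print(answer)

If the checkpoint ships custom region-encoding code, follow the loading instructions in the linked README instead.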