stevengrove committed
Commit 18d050b · verified · 1 Parent(s): 067ef85

initial commit
.gitattributes CHANGED
@@ -33,3 +33,14 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ assets/example_outputs/case_1.png filter=lfs diff=lfs merge=lfs -text
+ assets/example_outputs/case_2.png filter=lfs diff=lfs merge=lfs -text
+ assets/example_outputs/case_3.png filter=lfs diff=lfs merge=lfs -text
+ assets/example_outputs/case_4.png filter=lfs diff=lfs merge=lfs -text
+ assets/example_outputs/case_5.png filter=lfs diff=lfs merge=lfs -text
+ assets/example_outputs/case_6.png filter=lfs diff=lfs merge=lfs -text
+ assets/example_outputs/case_7.png filter=lfs diff=lfs merge=lfs -text
+ assets/framework.png filter=lfs diff=lfs merge=lfs -text
+ assets/grpo_curve.png filter=lfs diff=lfs merge=lfs -text
+ assets/inference.png filter=lfs diff=lfs merge=lfs -text
+ assets/reasoning_case_com.png filter=lfs diff=lfs merge=lfs -text
License.txt ADDED
@@ -0,0 +1,14 @@
+ Tencent is pleased to support the open source community by making MindOmni available.
+
+ Copyright (C) 2025 Tencent. All rights reserved.
+
+ MindOmni is licensed under the MIT License.
+
+
+ Terms of the MIT License:
+ --------------------------------------------------------------------
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
app.py CHANGED
@@ -1,7 +1,179 @@
  import gradio as gr

- def greet(name):
-     return "Hello " + name + "!!"

- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
- demo.launch()
+ import os
+ import argparse
+ from functools import partial
+
+ import torch
+ import random
+ import spaces
  import gradio as gr
+ from src import MindOmni
+
+ NEGATIVE_PROMPT = '''
+ low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers.
+ '''
+
+
+ def parse_args():
+     args = argparse.ArgumentParser(description='MindOmni')
+     args.add_argument('--device', type=str, default='cuda')
+     args.add_argument('--dtype', type=str, default='bf16')
+     args.add_argument('--server_name', type=str, default='127.0.0.1')
+     args.add_argument('--port', type=int, default=8080)
+     args.add_argument('--model_path', type=str,
+                       default='your_path/MindOmni')
+     args = args.parse_args()
+     return args
+
+
+ def build_model(args):
+     device = args.device
+     MindOmni_model = MindOmni.from_pretrained(args.model_path)
+     if args.dtype == "bf16":
+         dtype = torch.bfloat16
+     MindOmni_model.to(device=device, dtype=dtype)
+     MindOmni_model.eval()
+     return MindOmni_model
+
+
+ @spaces.GPU(duration=180)
+ def understand_func(
+         MindOmni_model, text, do_sample, temperature,
+         max_new_tokens, input_llm_images):
+     if input_llm_images is not None and not isinstance(input_llm_images, list):
+         input_llm_images = [input_llm_images]
+     answer = MindOmni_model.generate_text(
+         text, input_llm_images, do_sample, temperature,
+         max_new_tokens, only_understand=True)
+     return answer
+
+
+ @spaces.GPU(duration=180)
+ def generate_func(
+         MindOmni_model, text, use_cot, height, width, guidance_scale, inference_steps, seed, separate_cfg_infer, offload_model, max_input_image_size, randomize_seed, save_images, do_sample, temperature, max_new_tokens, input_llm_images, only_understand):
+     if input_llm_images is not None and not isinstance(input_llm_images, list):
+         input_llm_images = [input_llm_images]
+
+     if randomize_seed:
+         seed = random.randint(0, 10000000)
+
+     os.makedirs(os.path.dirname('/tmp/.unhold'), exist_ok=True)
+     with open('/tmp/.unhold', 'w') as f:
+         f.write('')
+     output, prompt_ = MindOmni_model.generate_image(
+         height, width, guidance_scale, inference_steps, separate_cfg_infer, offload_model, seed, max_input_image_size,
+         text, NEGATIVE_PROMPT, input_llm_images, do_sample, temperature, max_new_tokens, only_understand, use_cot=use_cot)
+     os.remove('/tmp/.unhold')
+
+     img = output[0]
+
+     if save_images:
+         # Save All Generated Images
+         from datetime import datetime
+         # Create outputs directory if it doesn't exist
+         os.makedirs('assets/outputs', exist_ok=True)
+         # Generate unique filename with timestamp
+         timestamp = datetime.now().strftime("%Y_%m_%d-%H_%M_%S")
+         output_path = os.path.join('assets/outputs', f'{timestamp}.png')
+         # Save the image
+         img.save(output_path)
+
+     return img, prompt_, seed
+
+
+ def build_gradio(args, MindOmni_model):
+     with gr.Blocks() as demo:
+         gr.Markdown("## 🪄 MindOmni Demo")
+
+         with gr.Tabs():
+             # ---------- GENERATE ----------
+             with gr.TabItem("🎨 Generate"):
+                 with gr.Row():
+                     with gr.Column(scale=1):
+                         g_prompt = gr.Textbox(label="Text prompt")
+                         g_image = gr.Image(label="Condition image (optional)", type="filepath")
+                         g_btn = gr.Button("🚀 Generate Image")
+
+                         with gr.Accordion("📚 Image Generation Args"):
+                             g_use_cot = gr.Checkbox(label="With thinking", value=False)
+                             g_do_sample = gr.Checkbox(label="Do sample", value=False)
+                             g_temperature = gr.Slider(0, 10, value=1, label="Temperature")
+                             g_max_new_tok = gr.Slider(32, 8192, value=512, label="Max new tokens")
+
+                             g_height = gr.Slider(128, 2048, value=1024, step=16, label="Height")
+                             g_width = gr.Slider(128, 2048, value=1024, step=16, label="Width")
+                             g_scale = gr.Slider(1.0, 5.0, value=3.0, step=0.1, label="Guidance Scale")
+                             g_steps = gr.Slider(1, 100, value=50, label="Inference Steps")
+                             g_seed = gr.Slider(0, 2**31 - 1, value=42, label="Seed")
+                             g_rand = gr.Checkbox(label="Randomize seed", value=False)
+                             g_max_img = gr.Slider(128, 2048, value=1024, step=16,
+                                                   label="Max input image size")
+                             g_sep_cfg = gr.Checkbox(label="Separate-CFG infer", value=True)
+                             g_offload = gr.Checkbox(label="Offload model to CPU", value=False)
+                             g_save = gr.Checkbox(label="Save generated images", value=False)
+
+                     with gr.Column(scale=1):
+                         g_out_img = gr.Image(label="Generated Image")
+                         g_prompt_out = gr.Textbox(label="MindOmni CoT Content")
+                         g_seed_out = gr.Textbox(label="Used seed")
+
+                 with gr.Accordion("🖼️ Prompt Examples: Text-only"):
+                     gr.Examples(
+                         examples=[
+                             ["Futuristic city skyline at sunset, digital art", 42, False, False, False, 1024, 1024, "assets/example_outputs/case_1.png"],
+                             ["An image of multiple apples, the quantity of apples is the solution of '2x + 6 = 16'.", 1723284, False, True, False, 512, 1024, "assets/example_outputs/case_2.png"],
+                             ["A park with benches equal to the solution of 'x^2 -2x = 8'.", 4318852, False, True, False, 512, 512, "assets/example_outputs/case_3.png"],
+                             ["An image of China's national treasure animal.", 42, False, True, False, 1024, 1024, "assets/example_outputs/case_4.png"],
+                             ["Scene in the Sydney Opera House when New York is at noon.", 42, False, True, False, 1024, 1024, "assets/example_outputs/case_5.png"],
+                             ["Generate an image of an animal with (3 + 6) lives", 7393438, False, True, False, 1024, 1024, "assets/example_outputs/case_6.png"],
+                         ],
+                         inputs=[g_prompt, g_seed, g_rand, g_use_cot, g_do_sample, g_height, g_width, g_out_img],
+                     )
+                 with gr.Accordion("🖼️ Prompt Examples: With reference image"):
+                     gr.Examples(
+                         examples=[
+                             ["An image of the animal growing up", "assets/tapdole.jpeg", 42, False, True, True, 1024, 1024, "assets/example_outputs/case_7.png"]
+                         ],
+                         inputs=[g_prompt, g_image, g_seed, g_rand, g_use_cot, g_do_sample, g_height, g_width, g_out_img],
+                     )
+
+                 g_btn.click(
+                     partial(generate_func, MindOmni_model),
+                     inputs=[g_prompt, g_use_cot, g_height, g_width, g_scale, g_steps,
+                             g_seed, g_sep_cfg, g_offload, g_max_img, g_rand, g_save,
+                             g_do_sample, g_temperature, g_max_new_tok,
+                             g_image, gr.State(False)],  # only_understand=False
+                     outputs=[g_out_img, g_prompt_out, g_seed_out])
+
+             # ---------- UNDERSTAND ----------
+             with gr.TabItem("🧠 Understand"):
+                 with gr.Row():
+                     with gr.Column(scale=1):
+                         u_prompt = gr.Textbox(label="Text prompt")
+                         u_image = gr.Image(label="Image (optional)", type="filepath")
+                         u_btn = gr.Button("🔍 Understand")
+                         with gr.Accordion("📚 Text Generation Args"):
+                             u_do_sample = gr.Checkbox(label="Do sample", value=False)
+                             u_temperature = gr.Slider(0, 10, value=1, label="Temperature")
+                             u_max_new_tok = gr.Slider(32, 8192, value=512, label="Max new tokens")
+
+                     with gr.Column(scale=1):
+                         u_answer = gr.Textbox(label="Answer", lines=8)
+
+                 u_btn.click(
+                     partial(understand_func, MindOmni_model),
+                     inputs=[u_prompt, u_do_sample,
+                             u_temperature, u_max_new_tok, u_image],
+                     outputs=u_answer)
+
+     demo.launch(server_name=args.server_name, server_port=args.port)
+
+
+ def main():
+     args = parse_args()
+     print(f'running args: {args}')
+     MindOmni_model = build_model(args)
+     build_gradio(args, MindOmni_model)


+ if __name__ == '__main__':
+     main()
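Editor's note (not part of the commit): the demo above is launched as `python app.py --model_path <local MindOmni checkpoint>`. The same generation path can also be driven programmatically, as in the sketch below; the positional argument order of `generate_image` is copied from the call inside `generate_func`, and the checkpoint path and prompt are placeholders.

    import torch
    from src import MindOmni

    model = MindOmni.from_pretrained('your_path/MindOmni')   # placeholder path, as in parse_args()
    model.to(device='cuda', dtype=torch.bfloat16)
    model.eval()

    # Same positional order as MindOmni_model.generate_image(...) in generate_func above.
    output, cot_text = model.generate_image(
        1024, 1024,             # height, width
        3.0, 50,                # guidance_scale, inference_steps
        True, False,            # separate_cfg_infer, offload_model
        42, 1024,               # seed, max_input_image_size
        'Futuristic city skyline at sunset, digital art',
        '',                     # negative prompt (app.py passes NEGATIVE_PROMPT here)
        None,                   # input_llm_images
        False, 1, 512,          # do_sample, temperature, max_new_tokens
        False,                  # only_understand
        use_cot=True)
    output[0].save('demo.png')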
assets/example_outputs/case_1.png ADDED

Git LFS Details

  • SHA256: fc5d5930caa9582f75f622e2d22fe7ff41ed2d8d324d3fb2a452cdf6fe4b3d7d
  • Pointer size: 131 Bytes
  • Size of remote file: 903 kB
assets/example_outputs/case_2.png ADDED

Git LFS Details

  • SHA256: 9a912bf42c39967cf473cf0a79bc9c5ceb5294f677a4c31abd51dcd644fc861b
  • Pointer size: 131 Bytes
  • Size of remote file: 643 kB
assets/example_outputs/case_3.png ADDED

Git LFS Details

  • SHA256: 5c9f373ff129c5553be29bcabef537390e6790c82011592494b57f2ccb64fd67
  • Pointer size: 131 Bytes
  • Size of remote file: 483 kB
assets/example_outputs/case_4.png ADDED

Git LFS Details

  • SHA256: 82f75c052fed92ffa698dcaf8669e675ece8160b6b7f6a40f54de8eaf95e00c9
  • Pointer size: 132 Bytes
  • Size of remote file: 1.38 MB
assets/example_outputs/case_5.png ADDED

Git LFS Details

  • SHA256: 3124064afc37df34f7e4544881ce56d2d7c70a52095f5eb4e1a79d5e3b68ebd3
  • Pointer size: 131 Bytes
  • Size of remote file: 851 kB
assets/example_outputs/case_6.png ADDED

Git LFS Details

  • SHA256: a7f5806c38c4b11b0cd1d4ecf4a540fa4e5f5a30a485f18574ef01cf1d29e9cd
  • Pointer size: 132 Bytes
  • Size of remote file: 1.15 MB
assets/example_outputs/case_7.png ADDED

Git LFS Details

  • SHA256: e8fdba59a52d503b50785fad20a68d968cb8085829ab0f645a61c8bf5842e89b
  • Pointer size: 132 Bytes
  • Size of remote file: 1.25 MB
assets/framework.png ADDED

Git LFS Details

  • SHA256: db7bd9d42517f5c5ca029caa2d0df470fe726deb49159c135200d0b94bc8af7e
  • Pointer size: 131 Bytes
  • Size of remote file: 628 kB
assets/grpo_curve.png ADDED

Git LFS Details

  • SHA256: 50f7a896152152c034bcbfc685c31593ce28d1f4f790f96a1c3b0d4bf5487303
  • Pointer size: 131 Bytes
  • Size of remote file: 192 kB
assets/inference.png ADDED

Git LFS Details

  • SHA256: d0d385f141ab67ef2297667d157a5ffbab15f6bfd86e1e7ad2363babd9e3ae61
  • Pointer size: 131 Bytes
  • Size of remote file: 846 kB
assets/reasoning_case_com.png ADDED

Git LFS Details

  • SHA256: 56e354f1ca5bac5c6b4aa2bc728474093bb028335cfbf1d0deacea94bec0c2f0
  • Pointer size: 132 Bytes
  • Size of remote file: 2.74 MB
assets/tapdole.jpeg ADDED
requirements.txt ADDED
@@ -0,0 +1,21 @@
+ accelerate==1.7.0
+ datasets==2.20.0
+ decord==0.6.0
+ deepspeed==0.16.5
+ diffusers==0.30.3
+ gradio==4.44.1
+ gradio_client==1.3.0
+ huggingface-hub==0.32.0
+ numpy==1.26.3
+ omegaconf==2.3.0
+ pandas==2.2.3
+ pathvalidate==3.2.1
+ peft==0.13.2
+ qwen-vl-utils==0.0.8
+ safetensors==0.4.5
+ scipy==1.13.1
+ sympy==1.13.3
+ timm==0.9.16
+ tokenizers==0.21.1
+ torch==2.4.0
+ transformers==4.51.1
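For reference, a small editor's sketch (not in the repository) that checks an installed environment against the pins above using only the standard library:

    from importlib.metadata import version, PackageNotFoundError

    def check_requirements(path='requirements.txt'):
        # Compare installed package versions against the `name==version` pins in the file.
        for line in open(path):
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            name, _, pinned = line.partition('==')
            try:
                installed = version(name)
            except PackageNotFoundError:
                print(f'{name}: not installed (expected {pinned})')
                continue
            print(f'{name}=={pinned}: ' + ('ok' if installed == pinned else f'mismatch, installed {installed}'))

    check_requirements()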
src/__init__.py ADDED
@@ -0,0 +1,5 @@
+ from .image_decoder import *  # noqa
+ from .mllm import MindOmniMLLM, MindOmniMLLM_Model
+ from .mindomni import MindOmni
+
+ __all__ = ["MindOmniMLLM", "MindOmniMLLM_Model", "MindOmni"]
src/image_decoder/__init__.py ADDED
@@ -0,0 +1,6 @@
+ from .image_pipeline import ImageDecoderPipeline
+ from .model import OmniGen
+ from .modeling_phi3 import Phi3DecoderLayer
+ from .processor import OmniGenProcessor
+
+ __all__ = ["ImageDecoderPipeline", "OmniGen", "Phi3DecoderLayer", "OmniGenProcessor"]
src/image_decoder/image_pipeline.py ADDED
@@ -0,0 +1,273 @@
+ # This code is based on OmniGen
+ from typing import List, Union
+ import gc
+
+ from PIL import Image
+ import torch
+ try:
+     import torch_npu
+ except Exception as e:
+     print(e)
+ from diffusers.models import AutoencoderKL
+ from diffusers.utils import logging
+ import torch.nn as nn
+ from .processor import OmniGenProcessor
+ from .model import OmniGen
+ from .scheduler import OmniGenScheduler
+
+
+ logger = logging.get_logger(__name__)
+
+
+ class ImageDecoderPipeline:
+     def __init__(
+         self,
+         vae: AutoencoderKL,
+         model: OmniGen,
+         connector: nn.Module,
+         processor: OmniGenProcessor,
+         device: Union[str, torch.device] = None,
+     ):
+         self.vae = vae
+         self.model = model
+         self.connector = connector
+         self.processor = processor
+         self.device = device
+
+         if device is None:
+             if torch.cuda.is_available():
+                 self.device = torch.device("cuda")
+             elif torch_npu.npu.is_available():
+                 self.device = torch.device("npu")
+             elif torch.backends.mps.is_available():
+                 self.device = torch.device("mps")
+             else:
+                 logger.info("No available GPU detected, using CPU instead; this may take a long time to generate an image!!!")
+                 self.device = torch.device("cpu")
+
+         # self.model.to(torch.bfloat16)
+         self.model.eval()
+         self.vae.eval()
+
+         self.model_cpu_offload = False
+
+     def to(self, device: Union[str, torch.device]):
+         if isinstance(device, str):
+             device = torch.device(device)
+         self.model.to(device)
+         self.vae.to(device)
+         self.device = device
+
+     def vae_encode(self, x, dtype):
+         if self.vae.config.shift_factor is not None:
+             x = self.vae.encode(x).latent_dist.sample()
+             x = (x - self.vae.config.shift_factor) * self.vae.config.scaling_factor
+         else:
+             x = self.vae.encode(x).latent_dist.sample().mul_(self.vae.config.scaling_factor)
+         x = x.to(dtype)
+         return x
+
+     def move_to_device(self, data):
+         if isinstance(data, list):
+             return [x.to(self.device) for x in data]
+         return data.to(self.device)
+
+     def enable_model_cpu_offload(self):
+         self.model_cpu_offload = True
+         self.model.to("cpu")
+         self.vae.to("cpu")
+         if torch.cuda.is_available():
+             torch.cuda.empty_cache()  # Clear VRAM
+         elif torch_npu.npu.is_available():
+             torch_npu.npu.empty_cache()  # Clear VRAM
+         gc.collect()  # Run garbage collection to free system RAM
+
+     def disable_model_cpu_offload(self):
+         self.model_cpu_offload = False
+         self.model.to(self.device)
+         self.vae.to(self.device)
+
+     @torch.no_grad()
+     def __call__(
+         self,
+         context_hidden_state: Union[str, List[str]] = None,
+         neg_context_hidden_state: Union[str, List[str]] = None,
+         height: int = 1024,
+         width: int = 1024,
+         num_inference_steps: int = 50,
+         guidance_scale: float = 3,
+         max_input_image_size: int = 1024,
+         separate_cfg_infer: bool = True,
+         offload_model: bool = False,
+         use_kv_cache: bool = True,
+         offload_kv_cache: bool = True,
+         dtype: torch.dtype = torch.bfloat16,
+         seed: int = None,
+         output_type: str = "pil",
+         tqdm_disable: bool = False,
+     ):
+         r"""
+         Function invoked when calling the pipeline for generation.
+
+         Args:
+             prompt (`str` or `List[str]`):
+                 The prompt or prompts to guide the image generation.
+             input_images (`List[str]` or `List[List[str]]`, *optional*):
+                 The list of input images. We will replace the "<|image_i|>" in the prompt with the i-th image in the list.
+             height (`int`, *optional*, defaults to 1024):
+                 The height in pixels of the generated image. The number must be a multiple of 16.
+             width (`int`, *optional*, defaults to 1024):
+                 The width in pixels of the generated image. The number must be a multiple of 16.
+             num_inference_steps (`int`, *optional*, defaults to 50):
+                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference.
+             guidance_scale (`float`, *optional*, defaults to 4.0):
+                 Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                 `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                 Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                 1`. A higher guidance scale encourages the model to generate images that are closely linked to the text `prompt`,
+                 usually at the expense of lower image quality.
+             use_img_guidance (`bool`, *optional*, defaults to True):
+                 Defined as equation 3 in [InstructPix2Pix](https://arxiv.org/pdf/2211.09800).
+             img_guidance_scale (`float`, *optional*, defaults to 1.6):
+                 Defined as equation 3 in [InstructPix2Pix](https://arxiv.org/pdf/2211.09800).
+             max_input_image_size (`int`, *optional*, defaults to 1024): the maximum size of the input image, which will be used to crop the input image to the maximum size
+             separate_cfg_infer (`bool`, *optional*, defaults to False):
+                 Perform inference on images with different guidance separately; this can save memory when generating images of large size at the expense of slower inference.
+             use_kv_cache (`bool`, *optional*, defaults to True): enable kv cache to speed up the inference
+             offload_kv_cache (`bool`, *optional*, defaults to True): offload the cached key and value to cpu, which can save memory but slow down the generation slightly
+             offload_model (`bool`, *optional*, defaults to False): offload the model to cpu, which can save memory but slow down the generation
+             use_input_image_size_as_output (bool, defaults to False): whether to use the input image size as the output image size, which can be used for single-image input, e.g., image editing task
+             seed (`int`, *optional*):
+                 A random seed for generating output.
+             dtype (`torch.dtype`, *optional*, defaults to `torch.bfloat16`):
+                 data type for the model
+             output_type (`str`, *optional*, defaults to "pil"):
+                 The type of the output image, which can be "pt" or "pil"
+         Examples:
+
+         Returns:
+             A list with the generated images.
+         """
+         # check inputs:
+         assert height % 16 == 0 and width % 16 == 0, "The height and width must be a multiple of 16."
+         if context_hidden_state is not None and not isinstance(context_hidden_state, list):
+             context_hidden_state = [context_hidden_state]
+             neg_context_hidden_state = [neg_context_hidden_state]
+
+         # set model and processor
+         if max_input_image_size != self.processor.max_image_size:
+             self.processor = OmniGenProcessor(max_image_size=max_input_image_size)
+         self.model.to(dtype)
+         if offload_model:
+             self.enable_model_cpu_offload()
+         else:
+             self.disable_model_cpu_offload()
+
+         input_data = self.processor(context_hidden_state, neg_context_hidden_state, height=height, width=width, separate_cfg_input=separate_cfg_infer)
+
+         num_prompt = len(context_hidden_state)
+         num_cfg = 1
+         latent_size_h, latent_size_w = height // 8, width // 8
+
+         if seed is not None:
+             generator = torch.Generator(device=self.device).manual_seed(seed)
+         else:
+             generator = None
+         latents = torch.randn(num_prompt, 4, latent_size_h, latent_size_w, device=self.device, generator=generator)
+         latents = torch.cat([latents] * (1 + num_cfg), 0).to(dtype)
+
+         model_kwargs = dict(cfg_scale=guidance_scale,
+                             use_kv_cache=use_kv_cache,
+                             offload_model=offload_model,
+                             )
+         # obtain the qwen feature
+         # if self.llm_processor is not None:
+         llm_input_embeds = []
+         with torch.no_grad():
+             # for separate cfg infer mode
+             for i in range(len(input_data['context_hidden_state'])):
+
+                 context_hidden_state = input_data['context_hidden_state'][i]
+                 hidden_states = self.connector[0](context_hidden_state)
+                 cache_position = torch.arange(0, hidden_states.shape[1], device=hidden_states.device)
+
+                 mask_func = self.model.llm._update_causal_mask
+                 cond_causal_mask = mask_func(
+                     input_data['connector_attention_mask'][i].to(self.device), hidden_states, cache_position, None, None)
+                 for decoder_layer in self.connector[1:]:
+                     layer_out = decoder_layer(
+                         hidden_states,
+                         attention_mask=cond_causal_mask,
+                         position_ids=input_data['connector_position_ids'][i].to(self.device),
+                     )
+                     hidden_states = layer_out[0]
+
+                 llm_input_embeds.append(hidden_states)
+
+         # import ipdb; ipdb.set_trace()
+         model_kwargs['llm_input_embeds'] = llm_input_embeds
+         model_kwargs['llm_attention_mask'] = self.move_to_device(input_data['llm_attention_mask'])
+         model_kwargs['llm_position_ids'] = self.move_to_device(input_data['llm_position_ids'])
+
+         if separate_cfg_infer:
+             func = self.model.forward_with_separate_cfg
+         else:
+             func = self.model.forward_with_cfg
+
+         if self.model_cpu_offload:
+             for name, param in self.model.named_parameters():
+                 if 'layers' in name and 'layers.0' not in name:
+                     param.data = param.data.cpu()
+                 else:
+                     param.data = param.data.to(self.device)
+             for buffer_name, buffer in self.model.named_buffers():
+                 setattr(self.model, buffer_name, buffer.to(self.device))
+         # else:
+         #     self.model.to(self.device)
+
+         scheduler = OmniGenScheduler(num_steps=num_inference_steps)
+         samples = scheduler(latents, func, model_kwargs, use_kv_cache=use_kv_cache, offload_kv_cache=offload_kv_cache, tqdm_disable=tqdm_disable)
+         samples = samples.chunk((1 + num_cfg), dim=0)[0]
+
+         if self.model_cpu_offload:
+             self.model.to('cpu')
+             if torch.cuda.is_available():
+                 torch.cuda.empty_cache()  # Clear VRAM
+             elif torch_npu.npu.is_available():
+                 torch_npu.npu.empty_cache()  # Clear VRAM
+             gc.collect()
+
+         self.vae.to(self.device)
+         samples = samples.to(torch.float32)
+         if self.vae.config.shift_factor is not None:
+             samples = samples / self.vae.config.scaling_factor + self.vae.config.shift_factor
+         else:
+             samples = samples / self.vae.config.scaling_factor
+         samples = self.vae.decode(samples).sample
+
+         if self.model_cpu_offload:
+             self.vae.to('cpu')
+             if torch.cuda.is_available():
+                 torch.cuda.empty_cache()  # Clear VRAM
+             elif torch_npu.npu.is_available():
+                 torch_npu.npu.empty_cache()  # Clear VRAM
+             gc.collect()
+
+         samples = (samples * 0.5 + 0.5).clamp(0, 1)
+
+         if output_type == "pt":
+             output_images = samples
+         else:
+             output_samples = (samples * 255).to("cpu", dtype=torch.uint8)
+             output_samples = output_samples.permute(0, 2, 3, 1).numpy()
+             output_images = []
+             for i, sample in enumerate(output_samples):
+                 output_images.append(Image.fromarray(sample))
+
+         if torch.cuda.is_available():
+             torch.cuda.empty_cache()  # Clear VRAM
+         elif torch_npu.npu.is_available():
+             torch_npu.npu.empty_cache()  # Clear VRAM
+         gc.collect()  # Run garbage collection to free system RAM
+
+         return output_images
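To make the tensor bookkeeping in `__call__` easier to follow, here is a small self-contained editor's sketch (not part of the commit): latents are sampled at 1/8 of the output resolution, duplicated for the extra classifier-free-guidance branch, and the decoded samples are mapped from [-1, 1] to 8-bit PIL images. The `torch.tanh(torch.randn(...))` tensor is only a stand-in for the real `self.vae.decode(samples).sample`.

    import torch
    from PIL import Image

    height, width, num_prompt, num_cfg = 1024, 1024, 1, 1
    generator = torch.Generator().manual_seed(42)
    latents = torch.randn(num_prompt, 4, height // 8, width // 8, generator=generator)
    latents = torch.cat([latents] * (1 + num_cfg), 0)        # (2, 4, 128, 128): conditional + CFG branch
    # ... the OmniGenScheduler would iteratively denoise `latents` here ...
    samples = latents.chunk(1 + num_cfg, dim=0)[0]           # keep the conditional branch: (1, 4, 128, 128)
    decoded = torch.tanh(torch.randn(1, 3, height, width))   # stand-in for the VAE decode
    decoded = (decoded * 0.5 + 0.5).clamp(0, 1)              # [-1, 1] -> [0, 1]
    array = (decoded * 255).to(torch.uint8).permute(0, 2, 3, 1).numpy()
    img = Image.fromarray(array[0])                          # same post-processing as output_type="pil"
    print(img.size)                                          # (1024, 1024)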
src/image_decoder/model.py ADDED
@@ -0,0 +1,395 @@
+ # The code is revised from DiT
+ import os
+ import torch
+ import torch.nn as nn
+ import numpy as np
+ import math
+ from diffusers.loaders import PeftAdapterMixin
+ from huggingface_hub import snapshot_download
+ from safetensors.torch import load_file
+
+ from .transformer import Phi3Transformer
+ from transformers import Phi3Config
+
+
+ def modulate(x, shift, scale):
+     return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
+
+
+ class TimestepEmbedder(nn.Module):
+     """
+     Embeds scalar timesteps into vector representations.
+     """
+     def __init__(self, hidden_size, frequency_embedding_size=256):
+         super().__init__()
+         self.mlp = nn.Sequential(
+             nn.Linear(frequency_embedding_size, hidden_size, bias=True),
+             nn.SiLU(),
+             nn.Linear(hidden_size, hidden_size, bias=True),
+         )
+         self.frequency_embedding_size = frequency_embedding_size
+
+     @staticmethod
+     def timestep_embedding(t, dim, max_period=10000):
+         """
+         Create sinusoidal timestep embeddings.
+         :param t: a 1-D Tensor of N indices, one per batch element.
+                   These may be fractional.
+         :param dim: the dimension of the output.
+         :param max_period: controls the minimum frequency of the embeddings.
+         :return: an (N, D) Tensor of positional embeddings.
+         """
+         # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
+         half = dim // 2
+         freqs = torch.exp(
+             -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half
+         ).to(device=t.device)
+         args = t[:, None].float() * freqs[None]
+         embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
+         if dim % 2:
+             embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
+         return embedding
+
+     def forward(self, t, dtype=torch.float32):
+         t_freq = self.timestep_embedding(t, self.frequency_embedding_size).to(dtype)
+         t_emb = self.mlp(t_freq)
+         return t_emb
+
+
+ class FinalLayer(nn.Module):
+     """
+     The final layer of DiT.
+     """
+     def __init__(self, hidden_size, patch_size, out_channels):
+         super().__init__()
+         self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
+         self.linear = nn.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True)
+         self.adaLN_modulation = nn.Sequential(
+             nn.SiLU(),
+             nn.Linear(hidden_size, 2 * hidden_size, bias=True)
+         )
+
+     def forward(self, x, c):
+         shift, scale = self.adaLN_modulation(c).chunk(2, dim=1)
+         x = modulate(self.norm_final(x), shift, scale)
+         x = self.linear(x)
+         return x
+
+
+ def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False, extra_tokens=0, interpolation_scale=1.0, base_size=1):
+     """
+     grid_size: int of the grid height and width return: pos_embed: [grid_size*grid_size, embed_dim] or
+     [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
+     """
+     if isinstance(grid_size, int):
+         grid_size = (grid_size, grid_size)
+
+     grid_h = np.arange(grid_size[0], dtype=np.float32) / (grid_size[0] / base_size) / interpolation_scale
+     grid_w = np.arange(grid_size[1], dtype=np.float32) / (grid_size[1] / base_size) / interpolation_scale
+     grid = np.meshgrid(grid_w, grid_h)  # here w goes first
+     grid = np.stack(grid, axis=0)
+
+     grid = grid.reshape([2, 1, grid_size[1], grid_size[0]])
+     pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
+     if cls_token and extra_tokens > 0:
+         pos_embed = np.concatenate([np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0)
+     return pos_embed
+
+
+ def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
+     assert embed_dim % 2 == 0
+
+     # use half of dimensions to encode grid_h
+     emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0])  # (H*W, D/2)
+     emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1])  # (H*W, D/2)
+
+     emb = np.concatenate([emb_h, emb_w], axis=1)  # (H*W, D)
+     return emb
+
+
+ def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
+     """
+     embed_dim: output dimension for each position
+     pos: a list of positions to be encoded: size (M,)
+     out: (M, D)
+     """
+     assert embed_dim % 2 == 0
+     omega = np.arange(embed_dim // 2, dtype=np.float64)
+     omega /= embed_dim / 2.
+     omega = 1. / 10000**omega  # (D/2,)
+
+     pos = pos.reshape(-1)  # (M,)
+     out = np.einsum('m,d->md', pos, omega)  # (M, D/2), outer product
+
+     emb_sin = np.sin(out)  # (M, D/2)
+     emb_cos = np.cos(out)  # (M, D/2)
+
+     emb = np.concatenate([emb_sin, emb_cos], axis=1)  # (M, D)
+     return emb
+
+
+ class PatchEmbedMR(nn.Module):
+     """ 2D Image to Patch Embedding
+     """
+     def __init__(
+         self,
+         patch_size: int = 2,
+         in_chans: int = 4,
+         embed_dim: int = 768,
+         bias: bool = True,
+     ):
+         super().__init__()
+         self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size, bias=bias)
+
+     def forward(self, x):
+         x = self.proj(x)
+         x = x.flatten(2).transpose(1, 2)  # NCHW -> NLC
+         return x
+
+
+ class OmniGen(nn.Module, PeftAdapterMixin):
+     """
+     Diffusion model with a Transformer backbone.
+     """
+     def __init__(
+         self,
+         transformer_config: Phi3Config,
+         patch_size=2,
+         in_channels=4,
+         pe_interpolation: float = 1.0,
+         pos_embed_max_size: int = 192,
+     ):
+         super().__init__()
+         self.in_channels = in_channels
+         self.out_channels = in_channels
+         self.patch_size = patch_size
+         self.pos_embed_max_size = pos_embed_max_size
+
+         hidden_size = transformer_config.hidden_size
+
+         self.x_embedder = PatchEmbedMR(patch_size, in_channels, hidden_size, bias=True)
+         self.input_x_embedder = PatchEmbedMR(patch_size, in_channels, hidden_size, bias=True)
+
+         self.time_token = TimestepEmbedder(hidden_size)
+         self.t_embedder = TimestepEmbedder(hidden_size)
+
+         self.pe_interpolation = pe_interpolation
+         pos_embed = get_2d_sincos_pos_embed(hidden_size, pos_embed_max_size, interpolation_scale=self.pe_interpolation, base_size=64)
+         self.register_buffer("pos_embed", torch.from_numpy(pos_embed).float().unsqueeze(0), persistent=True)
+
+         self.final_layer = FinalLayer(hidden_size, patch_size, self.out_channels)
+
+         self.initialize_weights()
+
+         self.llm = Phi3Transformer(config=transformer_config)
+         self.llm.config.use_cache = False
+
+     @classmethod
+     def from_pretrained(cls, model_name):
+         if not os.path.exists(model_name):
+             cache_folder = os.getenv('HF_HUB_CACHE')
+             model_name = snapshot_download(repo_id=model_name,
+                                            cache_dir=cache_folder,
+                                            ignore_patterns=['flax_model.msgpack', 'rust_model.ot', 'tf_model.h5'])
+         config = Phi3Config.from_pretrained(model_name)
+         model = cls(config)
+         if os.path.exists(os.path.join(model_name, 'model.safetensors')):
+             print("Loading safetensors")
+             ckpt = load_file(os.path.join(model_name, 'model.safetensors'))
+         else:
+             ckpt = torch.load(os.path.join(model_name, 'model.pt'), map_location='cpu')
+
+         module_keys = list(model.state_dict().keys())
+         pretrained_keys = list(ckpt.keys())
+         all_keys = module_keys + pretrained_keys
+         missing_modules = []
+         unexpected_modules = []
+         for item in all_keys:
+             if item in module_keys and item not in ckpt.keys():
+                 missing_modules.append(item)
+             if item not in module_keys and item in ckpt.keys():
+                 unexpected_modules.append(item)
+
+         print(f"loading {model.__class__.__name__} but missing modules: {missing_modules}, unexpected modules: {unexpected_modules}")
+         model.load_state_dict(ckpt, strict=False)
+         return model
+
+     def initialize_weights(self):
+         assert not hasattr(self, "llama")
+
+         # Initialize transformer layers:
+         def _basic_init(module):
+             if isinstance(module, nn.Linear):
+                 torch.nn.init.xavier_uniform_(module.weight)
+                 if module.bias is not None:
+                     nn.init.constant_(module.bias, 0)
+         self.apply(_basic_init)
+
+         # Initialize patch_embed like nn.Linear (instead of nn.Conv2d):
+         w = self.x_embedder.proj.weight.data
+         nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
+         nn.init.constant_(self.x_embedder.proj.bias, 0)
+
+         w = self.input_x_embedder.proj.weight.data
+         nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
+         nn.init.constant_(self.input_x_embedder.proj.bias, 0)
+
+         # Initialize timestep embedding MLP:
+         nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
+         nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)
+         nn.init.normal_(self.time_token.mlp[0].weight, std=0.02)
+         nn.init.normal_(self.time_token.mlp[2].weight, std=0.02)
+
+         # Zero-out output layers:
+         nn.init.constant_(self.final_layer.adaLN_modulation[-1].weight, 0)
+         nn.init.constant_(self.final_layer.adaLN_modulation[-1].bias, 0)
+         nn.init.constant_(self.final_layer.linear.weight, 0)
+         nn.init.constant_(self.final_layer.linear.bias, 0)
+
+     def unpatchify(self, x, h, w):
+         """
+         x: (N, T, patch_size**2 * C)
+         imgs: (N, H, W, C)
+         """
+         c = self.out_channels
+
+         x = x.reshape(shape=(x.shape[0], h // self.patch_size, w // self.patch_size, self.patch_size, self.patch_size, c))
+         x = torch.einsum('nhwpqc->nchpwq', x)
+         imgs = x.reshape(shape=(x.shape[0], c, h, w))
+         return imgs
+
+     def cropped_pos_embed(self, height, width):
+         """Crops positional embeddings for SD3 compatibility."""
+         if self.pos_embed_max_size is None:
+             raise ValueError("`pos_embed_max_size` must be set for cropping.")
+
+         height = height // self.patch_size
+         width = width // self.patch_size
+         if height > self.pos_embed_max_size:
+             raise ValueError(
+                 f"Height ({height}) cannot be greater than `pos_embed_max_size`: {self.pos_embed_max_size}."
+             )
+         if width > self.pos_embed_max_size:
+             raise ValueError(
+                 f"Width ({width}) cannot be greater than `pos_embed_max_size`: {self.pos_embed_max_size}."
+             )
+
+         top = (self.pos_embed_max_size - height) // 2
+         left = (self.pos_embed_max_size - width) // 2
+         spatial_pos_embed = self.pos_embed.reshape(1, self.pos_embed_max_size, self.pos_embed_max_size, -1)
+         spatial_pos_embed = spatial_pos_embed[:, top: top + height, left: left + width, :]
+         spatial_pos_embed = spatial_pos_embed.reshape(1, -1, spatial_pos_embed.shape[-1])
+         return spatial_pos_embed
+
+     def patch_multiple_resolutions(self, latents, padding_latent=None, is_input_images=False):
+         if isinstance(latents, list):
+             return_list = False
+             if padding_latent is None:
+                 padding_latent = [None] * len(latents)
+                 return_list = True
+             patched_latents, num_tokens, shapes = [], [], []
+             for latent, padding in zip(latents, padding_latent):
+                 height, width = latent.shape[-2:]
+                 if is_input_images:
+                     latent = self.input_x_embedder(latent)
+                 else:
+                     latent = self.x_embedder(latent)
+                 pos_embed = self.cropped_pos_embed(height, width)
+                 latent = latent + pos_embed
+                 if padding is not None:
+                     latent = torch.cat([latent, padding], dim=-2)
+                 patched_latents.append(latent)
+
+                 num_tokens.append(pos_embed.size(1))
+                 shapes.append([height, width])
+             if not return_list:
+                 latents = torch.cat(patched_latents, dim=0)
+             else:
+                 latents = patched_latents
+         else:
+             height, width = latents.shape[-2:]
+             if is_input_images:
+                 latents = self.input_x_embedder(latents)
+             else:
+                 latents = self.x_embedder(latents)
+             pos_embed = self.cropped_pos_embed(height, width)
+             latents = latents + pos_embed
+             num_tokens = latents.size(1)
+             shapes = [height, width]
+         return latents, num_tokens, shapes
+
+     def forward(self, x, timestep, padding_latent=None, past_key_values=None, return_past_key_values=True, offload_model: bool = False,
+                 llm_input_embeds=None, llm_attention_mask=None, llm_position_ids=None, use_dist=False):
+         input_is_list = isinstance(x, list)
+         x, num_tokens, shapes = self.patch_multiple_resolutions(x, padding_latent)
+         time_token = self.time_token(timestep, dtype=x[0].dtype).unsqueeze(1)
+
+         if llm_input_embeds is not None:
+             condition_embeds_llm = llm_input_embeds
+             input_emb = torch.cat([condition_embeds_llm, time_token, x], dim=1)
+             attention_mask = llm_attention_mask
+             position_ids = llm_position_ids
+         else:
+             input_emb = torch.cat([time_token, x], dim=1)
+             attention_mask = llm_attention_mask
+             position_ids = llm_position_ids
+
+         output = self.llm(inputs_embeds=input_emb, attention_mask=attention_mask, position_ids=position_ids, past_key_values=past_key_values, offload_model=offload_model, output_hidden_states=True)
+         output, past_key_values, all_hidden_states = output.last_hidden_state, output.past_key_values, output.hidden_states
+         if not use_dist:
+             all_states_noise = None
+         if input_is_list:
+             image_embedding = output[:, -max(num_tokens):]
+             time_emb = self.t_embedder(timestep, dtype=x.dtype)
+             x = self.final_layer(image_embedding, time_emb)
+             latents = []
+             if use_dist:
+                 all_states = torch.stack([hidden_states[:, -max(num_tokens):] for hidden_states in all_hidden_states], dim=1)  # b l s d
+                 all_states_noise = []
+             for i in range(x.size(0)):
+                 latent = x[i: i + 1, :num_tokens[i]]
+                 latent = self.unpatchify(latent, shapes[i][0], shapes[i][1])
+                 latents.append(latent)
+                 if use_dist:
+                     all_states_noise.append(all_states[i, :, :num_tokens[i]])
+         else:
+             image_embedding = output[:, -num_tokens:]
+             time_emb = self.t_embedder(timestep, dtype=x.dtype)
+             x = self.final_layer(image_embedding, time_emb)
+             latents = self.unpatchify(x, shapes[0], shapes[1])
+             if use_dist:
+                 all_states_noise = torch.stack([hidden_states[:, -num_tokens:] for hidden_states in all_hidden_states], dim=1)  # b l s d
+
+         if return_past_key_values:
+             return latents, past_key_values, all_states_noise
+         return latents, all_states_noise
+
+     @torch.no_grad()
+     def forward_with_separate_cfg(self, x, timestep, cfg_scale, past_key_values, use_kv_cache, offload_model,
+                                   llm_input_embeds=None, llm_attention_mask=None, llm_position_ids=None, llm_padded_input_ids=None, llm_image_sizes=None):
+         self.llm.config.use_cache = use_kv_cache
+         if past_key_values is None:
+             past_key_values = [None] * len(llm_attention_mask)
+
+         x = torch.split(x, len(x) // len(llm_attention_mask), dim=0)
+         timestep = timestep.to(x[0].dtype)
+         timestep = torch.split(timestep, len(timestep) // len(llm_input_embeds), dim=0)
+
+         model_out, pask_key_values = [], []
+         for i in range(len(llm_input_embeds)):
+             if llm_input_embeds is not None:
+                 temp_out, temp_pask_key_values, _ = self.forward(x[i], timestep[i], past_key_values=past_key_values[i], return_past_key_values=True, offload_model=offload_model,
+                                                                  llm_input_embeds=llm_input_embeds[i], llm_attention_mask=llm_attention_mask[i], llm_position_ids=llm_position_ids[i])
+             else:
+                 temp_out, temp_pask_key_values, _ = self.forward(x[i], timestep[i], past_key_values=past_key_values[i], return_past_key_values=True, offload_model=offload_model)
+             model_out.append(temp_out)
+             pask_key_values.append(temp_pask_key_values)
+
+         if len(model_out) == 2:
+             cond, uncond = model_out
+             cond = uncond + cfg_scale * (cond - uncond)
+             model_out = [cond, cond]
+         else:
+             return model_out[0]
+
+         return torch.cat(model_out, dim=0), pask_key_values
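Two pieces of numerics from the model above can be checked in isolation; the following is an editor's sketch (not in the repository). It reproduces the sinusoidal embedding formula from `TimestepEmbedder.timestep_embedding` and the classifier-free-guidance mix applied at the end of `forward_with_separate_cfg`.

    import math
    import torch

    def timestep_embedding(t, dim, max_period=10000):
        # Same formula as TimestepEmbedder.timestep_embedding above.
        half = dim // 2
        freqs = torch.exp(-math.log(max_period) * torch.arange(0, half, dtype=torch.float32) / half)
        args = t[:, None].float() * freqs[None]
        return torch.cat([torch.cos(args), torch.sin(args)], dim=-1)

    emb = timestep_embedding(torch.tensor([0.0, 500.0, 999.0]), 256)
    print(emb.shape)        # torch.Size([3, 256])

    # CFG mix from forward_with_separate_cfg: guided = uncond + cfg_scale * (cond - uncond).
    cond, uncond, cfg_scale = torch.randn(1, 4, 8, 8), torch.randn(1, 4, 8, 8), 3.0
    guided = uncond + cfg_scale * (cond - uncond)
    print(torch.allclose(uncond + 1.0 * (cond - uncond), cond))   # True: scale 1 recovers the conditional output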
src/image_decoder/modeling_phi3.py ADDED
@@ -0,0 +1,1611 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """PyTorch Phi-3 model."""
17
+
18
+ import math
19
+ import warnings
20
+ from typing import List, Optional, Tuple, Union
21
+
22
+ import torch
23
+ import torch.utils.checkpoint
24
+ from torch import nn
25
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
26
+
27
+ from transformers.activations import ACT2FN
28
+ from transformers.cache_utils import Cache, DynamicCache, StaticCache
29
+ from transformers.generation import GenerationMixin
30
+ from transformers.modeling_attn_mask_utils import AttentionMaskConverter
31
+ from transformers.modeling_outputs import (
32
+ BaseModelOutputWithPast,
33
+ CausalLMOutputWithPast,
34
+ SequenceClassifierOutputWithPast,
35
+ TokenClassifierOutput,
36
+ )
37
+ from transformers.modeling_utils import PreTrainedModel
38
+ from transformers.utils import (
39
+ add_code_sample_docstrings,
40
+ add_start_docstrings,
41
+ add_start_docstrings_to_model_forward,
42
+ is_flash_attn_2_available,
43
+ is_flash_attn_greater_or_equal_2_10,
44
+ is_torchdynamo_compiling,
45
+ logging,
46
+ replace_return_docstrings,
47
+ )
48
+ from transformers import Phi3Config
49
+
50
+
51
+ if is_flash_attn_2_available():
52
+ from transformers.modeling_flash_attention_utils import _flash_attention_forward
53
+
54
+ logger = logging.get_logger(__name__)
55
+
56
+ _CHECKPOINT_FOR_DOC = "microsoft/Phi-3-mini-4k-instruct"
57
+ _CONFIG_FOR_DOC = "Phi3Config"
58
+
59
+
60
+ # Copied from transformers.models.llama.modeling_llama._prepare_4d_causal_attention_mask_with_cache_position
61
+ def _prepare_4d_causal_attention_mask_with_cache_position(
62
+ attention_mask: torch.Tensor,
63
+ sequence_length: int,
64
+ target_length: int,
65
+ dtype: torch.dtype,
66
+ device: torch.device,
67
+ min_dtype: float,
68
+ cache_position: torch.Tensor,
69
+ batch_size: int,
70
+ ):
71
+ """
72
+ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
73
+ `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
74
+
75
+ Args:
76
+ attention_mask (`torch.Tensor`):
77
+ A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`.
78
+ sequence_length (`int`):
79
+ The sequence length being processed.
80
+ target_length (`int`):
81
+ The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet.
82
+ dtype (`torch.dtype`):
83
+ The dtype to use for the 4D attention mask.
84
+ device (`torch.device`):
85
+ The device to plcae the 4D attention mask on.
86
+ min_dtype (`float`):
87
+ The minimum value representable with the dtype `dtype`.
88
+ cache_position (`torch.Tensor`):
89
+ Indices depicting the position of the input sequence tokens in the sequence.
90
+ batch_size (`torch.Tensor`):
91
+ Batch size.
92
+ """
93
+ if attention_mask is not None and attention_mask.dim() == 4:
94
+ # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
95
+ causal_mask = attention_mask
96
+ else:
97
+ causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device)
98
+ if sequence_length != 1:
99
+ causal_mask = torch.triu(causal_mask, diagonal=1)
100
+ causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
101
+ causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
102
+ if attention_mask is not None:
103
+ causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
104
+ mask_length = attention_mask.shape[-1]
105
+ padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
106
+ padding_mask = padding_mask == 0
107
+ causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
108
+ padding_mask, min_dtype
109
+ )
110
+
111
+ return causal_mask
112
+
113
+
114
+ # Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Phi3
115
+ class Phi3RMSNorm(nn.Module):
116
+ def __init__(self, hidden_size, eps=1e-6):
117
+ """
118
+ Phi3RMSNorm is equivalent to T5LayerNorm
119
+ """
120
+ super().__init__()
121
+ self.weight = nn.Parameter(torch.ones(hidden_size))
122
+ self.variance_epsilon = eps
123
+
124
+ def forward(self, hidden_states):
125
+ input_dtype = hidden_states.dtype
126
+ hidden_states = hidden_states.to(torch.float32)
127
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
128
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
129
+ return self.weight * hidden_states.to(input_dtype)
130
+
131
+ def extra_repr(self):
132
+ return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
133
+
134
+
135
+ # Copied from transformers.models.gemma.modeling_gemma.GemmaRotaryEmbedding with gemma->phi3, Gemma->Phi3
136
+ class Phi3RotaryEmbedding(nn.Module):
137
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
138
+ super().__init__()
139
+
140
+ self.dim = dim
141
+ self.max_position_embeddings = max_position_embeddings
142
+ self.base = base
143
+
144
+ inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float() / self.dim))
145
+ self.register_buffer("inv_freq", tensor=inv_freq, persistent=False)
146
+
147
+ @torch.no_grad()
148
+ def forward(self, x, position_ids, seq_len=None):
149
+ # x: [bs, num_attention_heads, seq_len, head_size]
150
+ self.inv_freq.to(x.device)
151
+ inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
152
+ position_ids_expanded = position_ids[:, None, :].float()
153
+ # Force float32 since bfloat16 loses precision on long contexts
154
+ # See https://github.com/huggingface/transformers/pull/29285
155
+ device_type = x.device.type
156
+ device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
157
+ with torch.autocast(device_type=device_type, enabled=False):
158
+ freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
159
+ emb = torch.cat((freqs, freqs), dim=-1)
160
+ cos = emb.cos()
161
+ sin = emb.sin()
162
+ return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
163
+
164
+
165
+ class Phi3SuScaledRotaryEmbedding(Phi3RotaryEmbedding):
166
+ def __init__(self, dim, config, device=None):
167
+ warnings.warn(
168
+ "The class Phi3SuScaledRotaryEmbedding is deprecated and will be removed in version 5 of Transformers. Please"
169
+ " use Phi3LongRoPEScaledRotaryEmbedding instead.",
170
+ FutureWarning,
171
+ )
172
+ super().__init__(dim, config.max_position_embeddings, config.rope_theta, device)
173
+
174
+ self.short_factor = config.rope_scaling["short_factor"]
175
+ self.long_factor = config.rope_scaling["long_factor"]
176
+ self.original_max_position_embeddings = config.original_max_position_embeddings
177
+
178
+ @torch.no_grad()
179
+ def forward(self, x, position_ids, seq_len=None):
180
+ seq_len = torch.max(position_ids) + 1
181
+ if seq_len > self.original_max_position_embeddings:
182
+ ext_factors = torch.tensor(self.long_factor, dtype=torch.float32, device=x.device)
183
+ else:
184
+ ext_factors = torch.tensor(self.short_factor, dtype=torch.float32, device=x.device)
185
+ inv_freq_shape = torch.arange(0, self.dim, 2, dtype=torch.int64, device=x.device).float() / self.dim
186
+ self.inv_freq = 1.0 / (ext_factors * self.base**inv_freq_shape)
187
+ inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
188
+ position_ids_expanded = position_ids[:, None, :].float()
189
+ # Force float32 since bfloat16 loses precision on long contexts
190
+ # See https://github.com/huggingface/transformers/pull/29285
191
+ device_type = x.device.type
192
+ device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
193
+ with torch.autocast(device_type=device_type, enabled=False):
194
+ freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
195
+ emb = torch.cat((freqs, freqs), dim=-1)
196
+ scale = self.max_position_embeddings / self.original_max_position_embeddings
197
+ if scale <= 1.0:
198
+ scaling_factor = 1.0
199
+ else:
200
+ scaling_factor = math.sqrt(1 + math.log(scale) / math.log(self.original_max_position_embeddings))
201
+ cos = emb.cos() * scaling_factor
202
+ sin = emb.sin() * scaling_factor
203
+ return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
204
+
205
+
206
+ class Phi3YarnScaledRotaryEmbedding(Phi3RotaryEmbedding):
207
+ def __init__(self, dim, config, device=None):
208
+ warnings.warn(
209
+ "The class Phi3YarnScaledRotaryEmbedding is deprecated and will be removed in version 5 of Transformers",
210
+ FutureWarning,
211
+ )
212
+ super().__init__(dim, config.max_position_embeddings, config.rope_theta, device)
213
+
214
+ self.short_factor = config.rope_scaling["short_factor"]
215
+ self.long_factor = config.rope_scaling["long_factor"]
216
+ self.original_max_position_embeddings = config.original_max_position_embeddings
217
+
218
+ @torch.no_grad()
219
+ def forward(self, x, position_ids, seq_len=None):
220
+ seq_len = torch.max(position_ids) + 1
221
+ if seq_len > self.original_max_position_embeddings:
222
+ ext_factors = torch.tensor(self.long_factor, dtype=torch.float32, device=x.device)
223
+ else:
224
+ ext_factors = torch.tensor(self.short_factor, dtype=torch.float32, device=x.device)
225
+
226
+ inv_freq_shape = torch.arange(0, self.dim, 2, dtype=torch.int64, device=x.device).float() / self.dim
227
+ self.inv_freq = 1.0 / (ext_factors * self.base**inv_freq_shape)
228
+
229
+ inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
230
+ position_ids_expanded = position_ids[:, None, :].float()
231
+
232
+ # Force float32 since bfloat16 loses precision on long contexts
233
+ # See https://github.com/huggingface/transformers/pull/29285
234
+ device_type = x.device.type
235
+ device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
236
+ with torch.autocast(device_type=device_type, enabled=False):
237
+ freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
238
+ emb = torch.cat((freqs, freqs), dim=-1)
239
+
240
+ scale = self.max_position_embeddings / self.original_max_position_embeddings
241
+ if scale <= 1.0:
242
+ scaling_factor = 1.0
243
+ else:
244
+ scaling_factor = 0.1 * math.log(scale) + 1.0
245
+
246
+ cos = emb.cos() * scaling_factor
247
+ sin = emb.sin() * scaling_factor
248
+ return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
249
+
250
+
251
+ class Phi3LongRoPEScaledRotaryEmbedding(Phi3RotaryEmbedding):
252
+ def __init__(self, dim, config, device=None):
253
+ super().__init__(dim, config.max_position_embeddings, config.rope_theta, device)
254
+
255
+ self.short_factor = config.rope_scaling["short_factor"]
256
+ self.long_factor = config.rope_scaling["long_factor"]
257
+ self.original_max_position_embeddings = config.original_max_position_embeddings
258
+
259
+ @torch.no_grad()
260
+ def forward(self, x, position_ids, seq_len=None):
261
+ seq_len = seq_len or torch.max(position_ids) + 1
262
+ if seq_len > self.original_max_position_embeddings:
263
+ ext_factors = torch.tensor(self.long_factor, dtype=torch.float32, device=x.device)
264
+ else:
265
+ ext_factors = torch.tensor(self.short_factor, dtype=torch.float32, device=x.device)
266
+
267
+ inv_freq_shape = torch.arange(0, self.dim, 2, dtype=torch.int64, device=x.device).float() / self.dim
268
+ self.inv_freq = 1.0 / (ext_factors * self.base**inv_freq_shape)
269
+
270
+ inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
271
+ position_ids_expanded = position_ids[:, None, :].float()
272
+
273
+ # Force float32 since bfloat16 loses precision on long contexts
274
+ # See https://github.com/huggingface/transformers/pull/29285
275
+ device_type = x.device.type
276
+ device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
277
+ with torch.autocast(device_type=device_type, enabled=False):
278
+ freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
279
+ emb = torch.cat((freqs, freqs), dim=-1)
280
+
281
+ scale = self.max_position_embeddings / self.original_max_position_embeddings
282
+ if scale <= 1.0:
283
+ scaling_factor = 1.0
284
+ else:
285
+ scaling_factor = math.sqrt(1 + math.log(scale) / math.log(self.original_max_position_embeddings))
286
+
287
+ cos = emb.cos() * scaling_factor
288
+ sin = emb.sin() * scaling_factor
289
+ return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
290
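The long-context branch above scales cos/sin by sqrt(1 + ln(scale) / ln(original_max_position_embeddings)) once the sequence exceeds the original training length. A minimal sketch of that factor, using illustrative context lengths (4k original, 128k extended; assumptions, not values read from this repo's config):

```python
import math

# Illustrative context lengths only (not read from the repo's config).
original_max_position_embeddings = 4096
max_position_embeddings = 131072

scale = max_position_embeddings / original_max_position_embeddings  # 32.0
scaling_factor = 1.0 if scale <= 1.0 else math.sqrt(
    1 + math.log(scale) / math.log(original_max_position_embeddings)
)
print(f"scale={scale:.0f}, scaling_factor={scaling_factor:.4f}")  # scale=32, scaling_factor~1.1902
```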
+
291
+
292
+ # Copied from transformers.models.llama.modeling_llama.rotate_half
293
+ def rotate_half(x):
294
+ """Rotates half the hidden dims of the input."""
295
+ x1 = x[..., : x.shape[-1] // 2]
296
+ x2 = x[..., x.shape[-1] // 2 :]
297
+ return torch.cat((-x2, x1), dim=-1)
298
+
299
+
300
+ # Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
301
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
302
+ """Applies Rotary Position Embedding to the query and key tensors.
303
+
304
+ Args:
305
+ q (`torch.Tensor`): The query tensor.
306
+ k (`torch.Tensor`): The key tensor.
307
+ cos (`torch.Tensor`): The cosine part of the rotary embedding.
308
+ sin (`torch.Tensor`): The sine part of the rotary embedding.
309
+ position_ids (`torch.Tensor`, *optional*):
310
+ Deprecated and unused.
311
+ unsqueeze_dim (`int`, *optional*, defaults to 1):
312
+ The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
313
+ sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
314
+ that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
315
+ k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
316
+ cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
317
+ the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
318
+ Returns:
319
+ `tuple(torch.Tensor)` comprising the query and key tensors rotated using the Rotary Position Embedding.
320
+ """
321
+ cos = cos.unsqueeze(unsqueeze_dim)
322
+ sin = sin.unsqueeze(unsqueeze_dim)
323
+ q_embed = (q * cos) + (rotate_half(q) * sin)
324
+ k_embed = (k * cos) + (rotate_half(k) * sin)
325
+ return q_embed, k_embed
326
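As a quick, self-contained sanity check of `rotate_half` and `apply_rotary_pos_emb` (toy shapes chosen for illustration; the two helpers are repeated here so the sketch runs on its own):

```python
import torch

def rotate_half(x):
    # Same helper as above: split the last dim in half and rotate.
    x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)

def apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim=1):
    cos, sin = cos.unsqueeze(unsqueeze_dim), sin.unsqueeze(unsqueeze_dim)
    return (q * cos) + (rotate_half(q) * sin), (k * cos) + (rotate_half(k) * sin)

batch, heads, seq_len, head_dim = 1, 2, 4, 8  # toy sizes
q = torch.randn(batch, heads, seq_len, head_dim)
k = torch.randn(batch, heads, seq_len, head_dim)

# Plain (unscaled) RoPE frequencies for the toy head_dim.
inv_freq = 1.0 / (10000.0 ** (torch.arange(0, head_dim, 2).float() / head_dim))
freqs = torch.outer(torch.arange(seq_len).float(), inv_freq)  # (seq_len, head_dim // 2)
emb = torch.cat((freqs, freqs), dim=-1)                       # (seq_len, head_dim)
cos, sin = emb.cos()[None], emb.sin()[None]                   # (1, seq_len, head_dim)

q_rot, k_rot = apply_rotary_pos_emb(q, k, cos, sin)
print(q_rot.shape, k_rot.shape)  # torch.Size([1, 2, 4, 8]) torch.Size([1, 2, 4, 8])
```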
+
327
+
328
+ class Phi3MLP(nn.Module):
329
+ def __init__(self, config):
330
+ super().__init__()
331
+
332
+ self.config = config
333
+ self.gate_up_proj = nn.Linear(config.hidden_size, 2 * config.intermediate_size, bias=False)
334
+ self.down_proj = nn.Linear(config.intermediate_size, config.hidden_size, bias=False)
335
+
336
+ self.activation_fn = ACT2FN[config.hidden_act]
337
+
338
+ def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
339
+ up_states = self.gate_up_proj(hidden_states)
340
+
341
+ gate, up_states = up_states.chunk(2, dim=-1)
342
+ up_states = up_states * self.activation_fn(gate)
343
+
344
+ return self.down_proj(up_states)
345
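`Phi3MLP` fuses the gate and up projections into a single `gate_up_proj` matmul and splits the result with `chunk`. A standalone sketch of the same pattern, with toy sizes and SiLU assumed for `hidden_act` (Phi-3 configs typically use `"silu"`, but that is an assumption here):

```python
import torch
import torch.nn as nn

hidden_size, intermediate_size = 16, 32       # toy sizes for illustration
gate_up_proj = nn.Linear(hidden_size, 2 * intermediate_size, bias=False)
down_proj = nn.Linear(intermediate_size, hidden_size, bias=False)
act = nn.SiLU()                               # assumed activation

x = torch.randn(2, 5, hidden_size)            # (batch, seq_len, hidden)
gate, up = gate_up_proj(x).chunk(2, dim=-1)   # one matmul, two halves
y = down_proj(up * act(gate))
print(y.shape)                                # torch.Size([2, 5, 16])
```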
+
346
+
347
+ # Copied from transformers.models.llama.modeling_llama.repeat_kv with llama->phi
348
+ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
349
+ """
350
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
351
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
352
+ """
353
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
354
+ if n_rep == 1:
355
+ return hidden_states
356
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
357
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
358
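`repeat_kv` broadcasts the key/value heads so grouped-query attention can reuse the standard multi-head math. A quick shape check with toy sizes (2 KV heads repeated 4 times, i.e. 8 query heads assumed):

```python
import torch

batch, num_kv_heads, seq_len, head_dim = 1, 2, 4, 8
n_rep = 4  # num_attention_heads // num_key_value_heads in the real model

kv = torch.randn(batch, num_kv_heads, seq_len, head_dim)
expanded = kv[:, :, None, :, :].expand(batch, num_kv_heads, n_rep, seq_len, head_dim)
repeated = expanded.reshape(batch, num_kv_heads * n_rep, seq_len, head_dim)

print(repeated.shape)                               # torch.Size([1, 8, 4, 8])
print(torch.equal(repeated[0, 0], repeated[0, 3]))  # True: heads 0..3 all share KV head 0
```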
+
359
+
360
+ class Phi3Attention(nn.Module):
361
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
362
+
363
+ def __init__(self, config: Phi3Config, layer_idx: Optional[int] = None):
364
+ super().__init__()
365
+ self.config = config
366
+ self.layer_idx = layer_idx
367
+ if layer_idx is None:
368
+ logger.warning_once(
369
+ f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
370
+ "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
371
+ "when creating this class."
372
+ )
373
+
374
+ self.attention_dropout = config.attention_dropout
375
+ self.hidden_size = config.hidden_size
376
+ self.num_heads = config.num_attention_heads
377
+ self.head_dim = self.hidden_size // self.num_heads
378
+ self.num_key_value_heads = config.num_key_value_heads
379
+ self.num_key_value_groups = self.num_heads // self.num_key_value_heads
380
+ self.max_position_embeddings = config.max_position_embeddings
381
+ self.original_max_position_embeddings = config.original_max_position_embeddings
382
+ self.rope_theta = config.rope_theta
383
+ self.rope_scaling = config.rope_scaling
384
+ self.is_causal = True
385
+
386
+ if (self.head_dim * self.num_heads) != self.hidden_size:
387
+ raise ValueError(
388
+ f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
389
+ f" and `num_heads`: {self.num_heads})."
390
+ )
391
+
392
+ op_size = self.num_heads * self.head_dim + 2 * (self.num_key_value_heads * self.head_dim)
393
+ self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
394
+ self.qkv_proj = nn.Linear(self.hidden_size, op_size, bias=False)
395
+ self._init_rope()
396
+
397
+ def _init_rope(self):
398
+ if self.rope_scaling is None:
399
+ self.rotary_emb = Phi3RotaryEmbedding(
400
+ self.head_dim,
401
+ max_position_embeddings=self.max_position_embeddings,
402
+ base=self.rope_theta,
403
+ )
404
+ else:
405
+ scaling_type = self.config.rope_scaling["type"]
406
+ if scaling_type == "longrope":
407
+ self.rotary_emb = Phi3LongRoPEScaledRotaryEmbedding(self.head_dim, self.config)
408
+ else:
409
+ raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
410
+
411
+ def forward(
412
+ self,
413
+ hidden_states: torch.Tensor,
414
+ attention_mask: Optional[torch.Tensor] = None,
415
+ position_ids: Optional[torch.LongTensor] = None,
416
+ past_key_value: Optional[Cache] = None,
417
+ output_attentions: bool = False,
418
+ use_cache: bool = False,
419
+ cache_position: Optional[torch.LongTensor] = None,
420
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
421
+ logger.warning_once("You are not running the flash-attention implementation, expect numerical differences.")
422
+
423
+ bsz, q_len, _ = hidden_states.size()
424
+
425
+ qkv = self.qkv_proj(hidden_states)
426
+ query_pos = self.num_heads * self.head_dim
427
+ query_states = qkv[..., :query_pos]
428
+ key_states = qkv[..., query_pos : query_pos + self.num_key_value_heads * self.head_dim]
429
+ value_states = qkv[..., query_pos + self.num_key_value_heads * self.head_dim :]
430
+
431
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
432
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
433
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
434
+
435
+ kv_seq_len = key_states.shape[-2]
436
+ if past_key_value is not None:
437
+ if self.layer_idx is None:
438
+ raise ValueError(
439
+ f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
440
+ "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
441
+ "with a layer index."
442
+ )
443
+ kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
444
+ cos, sin = self.rotary_emb(value_states, position_ids, seq_len=kv_seq_len)
445
+
446
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
447
+
448
+ if past_key_value is not None:
449
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models
450
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
451
+
452
+ # repeat k/v heads if n_kv_heads < n_heads
453
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
454
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
455
+
456
+ attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
457
+
458
+ if attention_mask is not None:
459
+ causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
460
+ attn_weights += causal_mask
461
+
462
+ # upcast attention to fp32
463
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(value_states.dtype)
464
+ attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
465
+
466
+ attn_output = torch.matmul(attn_weights, value_states)
467
+
468
+ if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
469
+ raise ValueError(
470
+ f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
471
+ f" {attn_output.size()}"
472
+ )
473
+
474
+ attn_output = attn_output.transpose(1, 2).contiguous()
475
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
476
+
477
+ attn_output = self.o_proj(attn_output)
478
+
479
+ if not output_attentions:
480
+ attn_weights = None
481
+
482
+ return attn_output, attn_weights, past_key_value
483
+
484
+
485
+ class Phi3FlashAttention2(Phi3Attention):
486
+ """
487
+ Phi-3 flash attention module. This module inherits from `Phi3Attention` as the weights of the module stay
488
+ untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
489
+ flash attention and deal with padding tokens in case the input contains any of them.
490
+ """
491
+
492
+ # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__
493
+ def __init__(self, *args, **kwargs):
494
+ super().__init__(*args, **kwargs)
495
+
496
+ # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
497
+ # flash_attn<2.1 generates a top-left aligned causal mask, while what is needed here is bottom-right alignment, which became the default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
498
+ # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
499
+ self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
500
+
501
+ def forward(
502
+ self,
503
+ hidden_states: torch.Tensor,
504
+ attention_mask: Optional[torch.LongTensor] = None,
505
+ position_ids: Optional[torch.LongTensor] = None,
506
+ past_key_value: Optional[Cache] = None,
507
+ output_attentions: bool = False,
508
+ use_cache: bool = False,
509
+ cache_position: Optional[torch.LongTensor] = None,
510
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
511
+ # Phi3FlashAttention2 attention does not support output_attentions
512
+
513
+ output_attentions = False
514
+
515
+ bsz, q_len, _ = hidden_states.size()
516
+
517
+ qkv = self.qkv_proj(hidden_states)
518
+ query_pos = self.num_heads * self.head_dim
519
+ query_states = qkv[..., :query_pos]
520
+ key_states = qkv[..., query_pos : query_pos + self.num_key_value_heads * self.head_dim]
521
+ value_states = qkv[..., query_pos + self.num_key_value_heads * self.head_dim :]
522
+
523
+ # Flash attention requires the input to have the shape
524
+ # batch_size x seq_length x num_heads x head_dim
525
+ # therefore we just need to keep the original shape
526
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
527
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
528
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
529
+
530
+ kv_seq_len = key_states.shape[-2]
531
+ if past_key_value is not None:
532
+ if self.layer_idx is None:
533
+ raise ValueError(
534
+ f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
535
+ "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
536
+ "with a layer index."
537
+ )
538
+ kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
539
+
540
+ # Because the input can be padded, the absolute sequence length depends on the max position id.
541
+ rotary_seq_len = (
542
+ max(kv_seq_len, position_ids[:, -1].max().item() + 1) if position_ids is not None else kv_seq_len
543
+ )
544
+
545
+ cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len, position_ids=position_ids)
546
+
547
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
548
+
549
+ if past_key_value is not None:
550
+ # Activate cache slicing only if the config has a `sliding_window` attribute
551
+ cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0
552
+ if (
553
+ getattr(self.config, "sliding_window", None) is not None
554
+ and kv_seq_len > self.config.sliding_window
555
+ and cache_has_contents
556
+ ):
557
+ slicing_tokens = 1 - self.config.sliding_window
558
+
559
+ past_key = past_key_value[self.layer_idx][0]
560
+ past_value = past_key_value[self.layer_idx][1]
561
+
562
+ past_key = past_key[:, :, slicing_tokens:, :].contiguous()
563
+ past_value = past_value[:, :, slicing_tokens:, :].contiguous()
564
+
565
+ if past_key.shape[-2] != self.config.sliding_window - 1:
566
+ raise ValueError(
567
+ f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got"
568
+ f" {past_key.shape}"
569
+ )
570
+
571
+ if attention_mask is not None:
572
+ attention_mask = attention_mask[:, slicing_tokens:]
573
+ attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1)
574
+
575
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models
576
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
577
+
578
+ # repeat k/v heads if n_kv_heads < n_heads
579
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
580
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
581
+
582
+ attn_dropout = self.attention_dropout if self.training else 0.0
583
+
584
+ # In PEFT, usually we cast the layer norms in float32 for training stability reasons
585
+ # therefore the input hidden states get silently cast to float32. Hence, we need to
586
+ # cast them back to the correct dtype just to be sure everything works as expected.
587
+ # This might slow down training & inference, so it is recommended not to cast the LayerNorms
588
+ # in fp32.
589
+
590
+ if query_states.dtype == torch.float32:
591
+ if torch.is_autocast_enabled():
592
+ target_dtype = torch.get_autocast_gpu_dtype()
593
+ # Handle the case where the model is quantized
594
+ elif hasattr(self.config, "_pre_quantization_dtype"):
595
+ target_dtype = self.config._pre_quantization_dtype
596
+ else:
597
+ target_dtype = self.qkv_proj.weight.dtype
598
+
599
+ logger.warning_once(
600
+ f"The input hidden states seems to be silently casted in float32, this might be related to"
601
+ f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
602
+ f" {target_dtype}."
603
+ )
604
+
605
+ query_states = query_states.to(target_dtype)
606
+ key_states = key_states.to(target_dtype)
607
+ value_states = value_states.to(target_dtype)
608
+
609
+ # Reshape to the expected shape for Flash Attention
610
+ query_states = query_states.transpose(1, 2)
611
+ key_states = key_states.transpose(1, 2)
612
+ value_states = value_states.transpose(1, 2)
613
+
614
+ attn_output = _flash_attention_forward(
615
+ query_states,
616
+ key_states,
617
+ value_states,
618
+ attention_mask,
619
+ q_len,
620
+ position_ids=position_ids,
621
+ dropout=attn_dropout,
622
+ sliding_window=getattr(self.config, "sliding_window", None),
623
+ use_top_left_mask=self._flash_attn_uses_top_left_mask,
624
+ is_causal=self.is_causal,
625
+ )
626
+
627
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
628
+ attn_output = self.o_proj(attn_output)
629
+
630
+ if not output_attentions:
631
+ attn_weights = None
632
+
633
+ return attn_output, attn_weights, past_key_value
634
+
635
+
636
+ # copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Phi3
637
+ # TODO @Arthur no longer copied from LLama after static cache
638
+ class Phi3SdpaAttention(Phi3Attention):
639
+ """
640
+ Phi3 attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
641
+ `Phi3Attention` as the weights of the module stay untouched. The only changes are on the forward pass to adapt to
642
+ SDPA API.
643
+ """
644
+
645
+ # Adapted from Phi3Attention.forward
646
+ def forward(
647
+ self,
648
+ hidden_states: torch.Tensor,
649
+ attention_mask: Optional[torch.Tensor] = None,
650
+ position_ids: Optional[torch.LongTensor] = None,
651
+ past_key_value: Optional[Cache] = None,
652
+ output_attentions: bool = False,
653
+ use_cache: bool = False,
654
+ cache_position: Optional[torch.LongTensor] = None,
655
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
656
+ if output_attentions:
657
+ # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
658
+ logger.warning_once(
659
+ "Phi3Model is using Phi3SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
660
+ 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
661
+ )
662
+ return super().forward(
663
+ hidden_states=hidden_states,
664
+ attention_mask=attention_mask,
665
+ position_ids=position_ids,
666
+ past_key_value=past_key_value,
667
+ output_attentions=output_attentions,
668
+ use_cache=use_cache,
669
+ )
670
+
671
+ bsz, q_len, _ = hidden_states.size()
672
+
673
+ qkv = self.qkv_proj(hidden_states)
674
+ query_pos = self.num_heads * self.head_dim
675
+ query_states = qkv[..., :query_pos]
676
+ key_states = qkv[..., query_pos : query_pos + self.num_key_value_heads * self.head_dim]
677
+ value_states = qkv[..., query_pos + self.num_key_value_heads * self.head_dim :]
678
+
679
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
680
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
681
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
682
+
683
+ kv_seq_len = key_states.shape[-2]
684
+ if past_key_value is not None:
685
+ kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
686
+ cos, sin = self.rotary_emb(value_states, position_ids, seq_len=kv_seq_len)
687
+
688
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
689
+
690
+ if past_key_value is not None:
691
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models
692
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
693
+
694
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
695
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
696
+
697
+ causal_mask = attention_mask
698
+ if attention_mask is not None:
699
+ causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
700
+
701
+ # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
702
+ # Reference: https://github.com/pytorch/pytorch/issues/112577.
703
+ if query_states.device.type == "cuda" and attention_mask is not None:
704
+ query_states = query_states.contiguous()
705
+ key_states = key_states.contiguous()
706
+ value_states = value_states.contiguous()
707
+
708
+ # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
709
+ # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
710
+ # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1.
711
+ is_causal = True if causal_mask is None and q_len > 1 else False
712
+
713
+ attn_output = torch.nn.functional.scaled_dot_product_attention(
714
+ query_states,
715
+ key_states,
716
+ value_states,
717
+ attn_mask=causal_mask,
718
+ dropout_p=self.attention_dropout if self.training else 0.0,
719
+ is_causal=is_causal,
720
+ )
721
+
722
+ attn_output = attn_output.transpose(1, 2).contiguous()
723
+ attn_output = attn_output.view(bsz, q_len, self.hidden_size)
724
+
725
+ attn_output = self.o_proj(attn_output)
726
+
727
+ return attn_output, None, past_key_value
728
+
729
+
730
+ PHI3_ATTENTION_CLASSES = {
731
+ "eager": Phi3Attention,
732
+ "flash_attention_2": Phi3FlashAttention2,
733
+ "sdpa": Phi3SdpaAttention,
734
+ }
735
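The `PHI3_ATTENTION_CLASSES` dictionary is what `Phi3DecoderLayer` indexes with `config._attn_implementation` below. A hedged usage sketch of picking a backend when loading a checkpoint (the model id matches the docstring example elsewhere in this file; `flash_attention_2` additionally requires the flash-attn package and a supported GPU):

```python
from transformers import AutoModelForCausalLM

# Illustrative only: the chosen string is routed through the dispatch dict above.
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/phi-3-mini-4k-instruct",
    attn_implementation="sdpa",   # or "eager" / "flash_attention_2"
    torch_dtype="auto",
)
```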
+
736
+
737
+ class Phi3DecoderLayer(nn.Module):
738
+ def __init__(self, config: Phi3Config, layer_idx: int):
739
+ super().__init__()
740
+
741
+ self.config = config
742
+ self.self_attn = PHI3_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx=layer_idx)
743
+
744
+ self.mlp = Phi3MLP(config)
745
+ self.input_layernorm = Phi3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
746
+
747
+ self.resid_attn_dropout = nn.Dropout(config.resid_pdrop)
748
+ self.resid_mlp_dropout = nn.Dropout(config.resid_pdrop)
749
+ self.post_attention_layernorm = Phi3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
750
+
751
+ def forward(
752
+ self,
753
+ hidden_states: torch.Tensor,
754
+ attention_mask: Optional[torch.Tensor] = None,
755
+ position_ids: Optional[torch.LongTensor] = None,
756
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
757
+ output_attentions: Optional[bool] = False,
758
+ use_cache: Optional[bool] = False,
759
+ cache_position: Optional[torch.LongTensor] = None,
760
+ **kwargs,
761
+ ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
762
+ """
763
+ Args:
764
+ hidden_states (`torch.FloatTensor`):
765
+ input to the layer of shape `(batch, seq_len, embed_dim)`
766
+ attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
767
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
768
+ position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
769
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range
770
+ `[0, config.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
771
+ output_attentions (`bool`, *optional*):
772
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
773
+ returned tensors for more detail.
774
+ use_cache (`bool`, *optional*):
775
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
776
+ (see `past_key_values`).
777
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
778
+ cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
779
+ Indices depicting the position of the input sequence tokens in the sequence
780
+ kwargs (`dict`, *optional*):
781
+ Arbitrary kwargs to be ignored, used for FSDP and other methods that inject code
782
+ into the model
783
+ """
784
+
785
+ residual = hidden_states
786
+
787
+ hidden_states = self.input_layernorm(hidden_states)
788
+
789
+ # Self Attention
790
+ attn_outputs, self_attn_weights, present_key_value = self.self_attn(
791
+ hidden_states=hidden_states,
792
+ attention_mask=attention_mask,
793
+ position_ids=position_ids,
794
+ past_key_value=past_key_value,
795
+ output_attentions=output_attentions,
796
+ use_cache=use_cache,
797
+ cache_position=cache_position,
798
+ )
799
+
800
+ hidden_states = residual + self.resid_attn_dropout(attn_outputs)
801
+
802
+ residual = hidden_states
803
+ hidden_states = self.post_attention_layernorm(hidden_states)
804
+ hidden_states = self.mlp(hidden_states)
805
+ hidden_states = residual + self.resid_mlp_dropout(hidden_states)
806
+
807
+ outputs = (hidden_states,)
808
+
809
+ if output_attentions:
810
+ outputs += (self_attn_weights,)
811
+
812
+ if use_cache:
813
+ outputs += (present_key_value,)
814
+
815
+ return outputs
816
+
817
+
818
+ PHI3_START_DOCSTRING = r"""
819
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
820
+ library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
821
+ etc.)
822
+
823
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
824
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
825
+ and behavior.
826
+
827
+ Parameters:
828
+ config ([`Phi3Config`]):
829
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
830
+ load the weights associated with the model, only the configuration. Check out the
831
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
832
+ """
833
+
834
+
835
+ @add_start_docstrings(
836
+ "The bare Phi-3 model outputting raw hidden-states without any specific head on top.",
837
+ PHI3_START_DOCSTRING,
838
+ )
839
+ class Phi3PreTrainedModel(PreTrainedModel):
840
+ config_class = Phi3Config
841
+ base_model_prefix = "model"
842
+ supports_gradient_checkpointing = True
843
+ _no_split_modules = ["Phi3DecoderLayer"]
844
+ _skip_keys_device_placement = "past_key_values"
845
+ _supports_flash_attn_2 = True
846
+ _supports_sdpa = True
847
+ _supports_cache_class = True
848
+
849
+ _version = "0.0.5"
850
+
851
+ def _init_weights(self, module):
852
+ std = self.config.initializer_range
853
+ if isinstance(module, nn.Linear):
854
+ module.weight.data.normal_(mean=0.0, std=std)
855
+ if module.bias is not None:
856
+ module.bias.data.zero_()
857
+ elif isinstance(module, nn.Embedding):
858
+ module.weight.data.normal_(mean=0.0, std=std)
859
+ if module.padding_idx is not None:
860
+ module.weight.data[module.padding_idx].zero_()
861
+
862
+
863
+ PHI3_INPUTS_DOCSTRING = r"""
864
+ Args:
865
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
866
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
867
+ it.
868
+
869
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
870
+ [`PreTrainedTokenizer.__call__`] for details.
871
+
872
+ [What are input IDs?](../glossary#input-ids)
873
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
874
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
875
+
876
+ - 1 for tokens that are **not masked**,
877
+ - 0 for tokens that are **masked**.
878
+
879
+ [What are attention masks?](../glossary#attention-mask)
880
+
881
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
882
+ [`PreTrainedTokenizer.__call__`] for details.
883
+
884
+ If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
885
+ `past_key_values`).
886
+
887
+ If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
888
+ and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
889
+ information on the default strategy.
890
+
891
+ - 1 indicates the head is **not masked**,
892
+ - 0 indicates the head is **masked**.
893
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
894
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
895
+ config.n_positions - 1]`.
896
+
897
+ [What are position IDs?](../glossary#position-ids)
898
+ past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
899
+ Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
900
+ blocks) that can be used to speed up sequential decoding. This typically consists of the `past_key_values`
901
+ returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
902
+
903
+ Two formats are allowed:
904
+ - a [`~cache_utils.Cache`] instance, see our
905
+ [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache);
906
+ - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
907
+ shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
908
+ cache format.
909
+
910
+ The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
911
+ legacy cache format will be returned.
912
+
913
+ If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
914
+ have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
915
+ of shape `(batch_size, sequence_length)`.
916
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
917
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
918
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
919
+ model's internal embedding lookup matrix.
920
+ use_cache (`bool`, *optional*):
921
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
922
+ `past_key_values`).
923
+ output_attentions (`bool`, *optional*):
924
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
925
+ tensors for more detail.
926
+ output_hidden_states (`bool`, *optional*):
927
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
928
+ more detail.
929
+ return_dict (`bool`, *optional*):
930
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
931
+ cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
932
+ Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
933
+ this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
934
+ the complete sequence length.
935
+ """
936
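Both `past_key_values` formats mentioned above (a `Cache` instance or the legacy tuple of per-layer `(key, value)` tuples) can be converted with `DynamicCache`; a small sketch with toy shapes:

```python
import torch
from transformers.cache_utils import DynamicCache

# Legacy format: one (key, value) pair per layer, each of shape
# (batch, num_heads, seq_len, head_dim). Toy sizes for illustration.
legacy = tuple((torch.randn(1, 4, 3, 8), torch.randn(1, 4, 3, 8)) for _ in range(2))

cache = DynamicCache.from_legacy_cache(legacy)  # -> Cache instance
print(cache.get_seq_length())                   # 3

round_trip = cache.to_legacy_cache()            # back to tuple of tuples
print(len(round_trip), round_trip[0][0].shape)  # 2 torch.Size([1, 4, 3, 8])
```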
+
937
+
938
+ @add_start_docstrings(
939
+ "The bare Phi-3 model outputting raw hidden-states without any specific head on top.",
940
+ PHI3_START_DOCSTRING,
941
+ )
942
+ class Phi3Model(Phi3PreTrainedModel):
943
+ """
944
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Phi3DecoderLayer`]
945
+
946
+ Args:
947
+ config: Phi3Config
948
+ """
949
+
950
+ def __init__(self, config: Phi3Config):
951
+ super().__init__(config)
952
+ self.padding_idx = config.pad_token_id
953
+ self.vocab_size = config.vocab_size
954
+
955
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
956
+ self.embed_dropout = nn.Dropout(config.embd_pdrop)
957
+ self.layers = nn.ModuleList(
958
+ [Phi3DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
959
+ )
960
+ self._attn_implementation = config._attn_implementation
961
+ self.norm = Phi3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
962
+
963
+ self.gradient_checkpointing = False
964
+ # Initialize weights and apply final processing
965
+ self.post_init()
966
+
967
+ def get_input_embeddings(self):
968
+ return self.embed_tokens
969
+
970
+ def set_input_embeddings(self, value):
971
+ self.embed_tokens = value
972
+
973
+ @add_start_docstrings_to_model_forward(PHI3_INPUTS_DOCSTRING)
974
+ def forward(
975
+ self,
976
+ input_ids: torch.LongTensor = None,
977
+ attention_mask: Optional[torch.Tensor] = None,
978
+ position_ids: Optional[torch.LongTensor] = None,
979
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
980
+ inputs_embeds: Optional[torch.FloatTensor] = None,
981
+ use_cache: Optional[bool] = None,
982
+ output_attentions: Optional[bool] = None,
983
+ output_hidden_states: Optional[bool] = None,
984
+ return_dict: Optional[bool] = None,
985
+ cache_position: Optional[torch.LongTensor] = None,
986
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
987
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
988
+ output_hidden_states = (
989
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
990
+ )
991
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
992
+
993
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
994
+
995
+ if (input_ids is None) ^ (inputs_embeds is not None):
996
+ raise ValueError(
997
+ "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
998
+ )
999
+
1000
+ if self.gradient_checkpointing and self.training:
1001
+ if use_cache:
1002
+ logger.warning_once(
1003
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
1004
+ )
1005
+ use_cache = False
1006
+
1007
+ # kept for BC (non `Cache` `past_key_values` inputs)
1008
+ return_legacy_cache = False
1009
+ if use_cache and not isinstance(past_key_values, Cache):
1010
+ return_legacy_cache = True
1011
+ if past_key_values is None:
1012
+ past_key_values = DynamicCache()
1013
+ else:
1014
+ past_key_values = DynamicCache.from_legacy_cache(past_key_values)
1015
+ logger.warning_once(
1016
+ "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
1017
+ "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
1018
+ "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
1019
+ )
1020
+
1021
+ if inputs_embeds is None:
1022
+ inputs_embeds = self.embed_tokens(input_ids)
1023
+
1024
+ if cache_position is None:
1025
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
1026
+ cache_position = torch.arange(
1027
+ past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
1028
+ )
1029
+ if position_ids is None:
1030
+ position_ids = cache_position.unsqueeze(0)
1031
+
1032
+ causal_mask = self._update_causal_mask(
1033
+ attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
1034
+ )
1035
+
1036
+ hidden_states = inputs_embeds
1037
+
1038
+ # decoder layers
1039
+ all_hidden_states = () if output_hidden_states else None
1040
+ all_self_attns = () if output_attentions else None
1041
+ next_decoder_cache = None
1042
+
1043
+ for decoder_layer in self.layers:
1044
+ if output_hidden_states:
1045
+ all_hidden_states += (hidden_states,)
1046
+
1047
+ if self.gradient_checkpointing and self.training:
1048
+ layer_outputs = self._gradient_checkpointing_func(
1049
+ decoder_layer.__call__,
1050
+ hidden_states,
1051
+ causal_mask,
1052
+ position_ids,
1053
+ past_key_values,
1054
+ output_attentions,
1055
+ use_cache,
1056
+ cache_position,
1057
+ )
1058
+ else:
1059
+ layer_outputs = decoder_layer(
1060
+ hidden_states,
1061
+ attention_mask=causal_mask,
1062
+ position_ids=position_ids,
1063
+ past_key_value=past_key_values,
1064
+ output_attentions=output_attentions,
1065
+ use_cache=use_cache,
1066
+ cache_position=cache_position,
1067
+ )
1068
+
1069
+ hidden_states = layer_outputs[0]
1070
+
1071
+ if use_cache:
1072
+ next_decoder_cache = layer_outputs[2 if output_attentions else 1]
1073
+
1074
+ if output_attentions:
1075
+ all_self_attns += (layer_outputs[1],)
1076
+
1077
+ hidden_states = self.norm(hidden_states)
1078
+
1079
+ # add hidden states from the last decoder layer
1080
+ if output_hidden_states:
1081
+ all_hidden_states += (hidden_states,)
1082
+
1083
+ next_cache = next_decoder_cache if use_cache else None
1084
+ if return_legacy_cache:
1085
+ next_cache = next_cache.to_legacy_cache()
1086
+
1087
+ if not return_dict:
1088
+ return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
1089
+ return BaseModelOutputWithPast(
1090
+ last_hidden_state=hidden_states,
1091
+ past_key_values=next_cache,
1092
+ hidden_states=all_hidden_states,
1093
+ attentions=all_self_attns,
1094
+ )
1095
+
1096
+ # Copied from transformers.models.llama.modeling_llama.LlamaModel._update_causal_mask
1097
+ def _update_causal_mask(
1098
+ self,
1099
+ attention_mask: torch.Tensor,
1100
+ input_tensor: torch.Tensor,
1101
+ cache_position: torch.Tensor,
1102
+ past_key_values: Cache,
1103
+ output_attentions: bool,
1104
+ ):
1105
+ if self.config._attn_implementation == "flash_attention_2":
1106
+ if attention_mask is not None and 0.0 in attention_mask:
1107
+ return attention_mask
1108
+ return None
1109
+
1110
+ # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
1111
+ # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
1112
+ # to infer the attention mask.
1113
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
1114
+ using_static_cache = isinstance(past_key_values, StaticCache)
1115
+
1116
+ # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
1117
+ if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions:
1118
+ if AttentionMaskConverter._ignore_causal_mask_sdpa(
1119
+ attention_mask,
1120
+ inputs_embeds=input_tensor,
1121
+ past_key_values_length=past_seen_tokens,
1122
+ is_training=self.training,
1123
+ ):
1124
+ return None
1125
+
1126
+ dtype, device = input_tensor.dtype, input_tensor.device
1127
+ min_dtype = torch.finfo(dtype).min
1128
+ sequence_length = input_tensor.shape[1]
1129
+ if using_static_cache:
1130
+ target_length = past_key_values.get_max_length()
1131
+ else:
1132
+ target_length = (
1133
+ attention_mask.shape[-1]
1134
+ if isinstance(attention_mask, torch.Tensor)
1135
+ else past_seen_tokens + sequence_length + 1
1136
+ )
1137
+
1138
+ # In case the provided `attention_mask` is 2D, we generate a causal mask here (4D).
1139
+ causal_mask = _prepare_4d_causal_attention_mask_with_cache_position(
1140
+ attention_mask,
1141
+ sequence_length=sequence_length,
1142
+ target_length=target_length,
1143
+ dtype=dtype,
1144
+ device=device,
1145
+ min_dtype=min_dtype,
1146
+ cache_position=cache_position,
1147
+ batch_size=input_tensor.shape[0],
1148
+ )
1149
+
1150
+ if (
1151
+ self.config._attn_implementation == "sdpa"
1152
+ and attention_mask is not None
1153
+ and attention_mask.device.type == "cuda"
1154
+ and not output_attentions
1155
+ ):
1156
+ # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
1157
+ # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
1158
+ # Details: https://github.com/pytorch/pytorch/issues/110213
1159
+ causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
1160
+
1161
+ return causal_mask
1162
+
1163
+
1164
+ class Phi3ForCausalLM(Phi3PreTrainedModel, GenerationMixin):
1165
+ _tied_weights_keys = ["lm_head.weight"]
1166
+
1167
+ # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.__init__ with Llama->Phi3
1168
+ def __init__(self, config):
1169
+ super().__init__(config)
1170
+ self.model = Phi3Model(config)
1171
+ self.vocab_size = config.vocab_size
1172
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
1173
+
1174
+ # Initialize weights and apply final processing
1175
+ self.post_init()
1176
+
1177
+ # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.get_input_embeddings
1178
+ def get_input_embeddings(self):
1179
+ return self.model.embed_tokens
1180
+
1181
+ # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.set_input_embeddings
1182
+ def set_input_embeddings(self, value):
1183
+ self.model.embed_tokens = value
1184
+
1185
+ # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.get_output_embeddings
1186
+ def get_output_embeddings(self):
1187
+ return self.lm_head
1188
+
1189
+ # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.set_output_embeddings
1190
+ def set_output_embeddings(self, new_embeddings):
1191
+ self.lm_head = new_embeddings
1192
+
1193
+ # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.set_decoder
1194
+ def set_decoder(self, decoder):
1195
+ self.model = decoder
1196
+
1197
+ # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.get_decoder
1198
+ def get_decoder(self):
1199
+ return self.model
1200
+
1201
+ # Ignore copy
1202
+ @add_start_docstrings_to_model_forward(PHI3_INPUTS_DOCSTRING)
1203
+ @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
1204
+ def forward(
1205
+ self,
1206
+ input_ids: torch.LongTensor = None,
1207
+ attention_mask: Optional[torch.Tensor] = None,
1208
+ position_ids: Optional[torch.LongTensor] = None,
1209
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
1210
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1211
+ labels: Optional[torch.LongTensor] = None,
1212
+ use_cache: Optional[bool] = None,
1213
+ output_attentions: Optional[bool] = None,
1214
+ output_hidden_states: Optional[bool] = None,
1215
+ return_dict: Optional[bool] = None,
1216
+ cache_position: Optional[torch.LongTensor] = None,
1217
+ num_logits_to_keep: int = 0,
1218
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
1219
+ r"""
1220
+ Args:
1221
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1222
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
1223
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
1224
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
1225
+
1226
+ num_logits_to_keep (`int`, *optional*):
1227
+ Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all
1228
+ `input_ids` (special case). Only the last token's logits are needed for generation, and calculating them only for that
1229
+ token can save memory, which becomes quite significant for long sequences or a large vocabulary size.
1230
+
1231
+ Returns:
1232
+
1233
+ Example:
1234
+
1235
+ ```python
1236
+ >>> from transformers import AutoTokenizer, Phi3ForCausalLM
1237
+
1238
+ >>> model = Phi3ForCausalLM.from_pretrained("microsoft/phi-3-mini-4k-instruct")
1239
+ >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-3-mini-4k-instruct")
1240
+
1241
+ >>> prompt = "This is an example script ."
1242
+ >>> inputs = tokenizer(prompt, return_tensors="pt")
1243
+
1244
+ >>> # Generate
1245
+ >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
1246
+ >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
1247
+ 'This is an example script .\n Certainly! Below is a sample script that demonstrates a simple task, such as calculating the sum'
1248
+ ```"""
1249
+ if (
1250
+ use_cache
1251
+ and self.config.rope_scaling
1252
+ and cache_position is not None
1253
+ and cache_position[0] == self.config.original_max_position_embeddings
1254
+ ):
1255
+ logger.warning(
1256
+ f"If you are not using the generate method, you may encounter nonsensical outputs after the {self.config.original_max_position_embeddings}th token, as the KV cache needs to be recomputed."
1257
+ )
1258
+
1259
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1260
+ output_hidden_states = (
1261
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1262
+ )
1263
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1264
+
1265
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
1266
+ outputs = self.model(
1267
+ input_ids=input_ids,
1268
+ attention_mask=attention_mask,
1269
+ position_ids=position_ids,
1270
+ past_key_values=past_key_values,
1271
+ inputs_embeds=inputs_embeds,
1272
+ use_cache=use_cache,
1273
+ output_attentions=output_attentions,
1274
+ output_hidden_states=output_hidden_states,
1275
+ return_dict=return_dict,
1276
+ )
1277
+
1278
+ hidden_states = outputs[0]
1279
+ if labels is None and not is_torchdynamo_compiling():
1280
+ logger.warning_once(
1281
+ "Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)"
1282
+ )
1283
+ # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
1284
+ # TODO: remove the float() operation in v4.46
1285
+ logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]).float()
1286
+
1287
+ loss = None
1288
+ if labels is not None:
1289
+ # Upcast to float if we need to compute the loss to avoid potential precision issues
1290
+ logits = logits.float()
1291
+ # Shift so that tokens < n predict n
1292
+ shift_logits = logits[..., :-1, :].contiguous()
1293
+ shift_labels = labels[..., 1:].contiguous()
1294
+ # Flatten the tokens
1295
+ loss_fct = CrossEntropyLoss()
1296
+ shift_logits = shift_logits.view(-1, self.config.vocab_size)
1297
+ shift_labels = shift_labels.view(-1)
1298
+ # Enable model parallelism
1299
+ shift_labels = shift_labels.to(shift_logits.device)
1300
+ loss = loss_fct(shift_logits, shift_labels)
1301
+
1302
+ if not return_dict:
1303
+ output = (logits,) + outputs[1:]
1304
+ return (loss,) + output if loss is not None else output
1305
+
1306
+ return CausalLMOutputWithPast(
1307
+ loss=loss,
1308
+ logits=logits,
1309
+ past_key_values=outputs.past_key_values,
1310
+ hidden_states=outputs.hidden_states,
1311
+ attentions=outputs.attentions,
1312
+ )
1313
+
1314
+ def prepare_inputs_for_generation(
1315
+ self,
1316
+ input_ids,
1317
+ past_key_values=None,
1318
+ attention_mask=None,
1319
+ inputs_embeds=None,
1320
+ cache_position=None,
1321
+ position_ids=None,
1322
+ use_cache=True,
1323
+ num_logits_to_keep=None,
1324
+ **kwargs,
1325
+ ):
1326
+ # The first time the input length crosses the long/short factor switching point, force the cache to be recomputed.
1327
+ # This slows down generation at that single token position, but it is better than the current failure mode.
1328
+ if (
1329
+ past_key_values
1330
+ and self.config.rope_scaling
1331
+ and input_ids.shape[1] >= self.config.original_max_position_embeddings + 1
1332
+ ):
1333
+ past_length = cache_position[0]
1334
+ if past_length <= self.config.original_max_position_embeddings:
1335
+ past_key_values = None
1336
+
1337
+ # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens
1338
+ # Exception 1: when passing input_embeds, input_ids may be missing entries
1339
+ # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here
1340
+ if past_key_values is not None:
1341
+ if inputs_embeds is not None: # Exception 1
1342
+ input_ids = input_ids[:, -cache_position.shape[0] :]
1343
+ elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2)
1344
+ input_ids = input_ids[:, cache_position]
1345
+
1346
+ if attention_mask is not None and position_ids is None:
1347
+ # create position_ids on the fly for batch generation
1348
+ position_ids = attention_mask.long().cumsum(-1) - 1
1349
+ position_ids.masked_fill_(attention_mask == 0, 1)
1350
+ if past_key_values:
1351
+ position_ids = position_ids[:, -input_ids.shape[1] :]
1352
+
1353
+ # This `clone` call is needed to avoid recapturing cuda graphs with `torch.compile`'s `mode="reduce-overhead"`, as otherwise the input `position_ids` would have varying strides during decoding. Here, simply using `.contiguous()` is not sufficient, as in the batch size = 1 case `position_ids` is already contiguous but with a varying stride, which retriggers a capture.
1354
+ position_ids = position_ids.clone(memory_format=torch.contiguous_format)
1355
+
1356
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
1357
+ if inputs_embeds is not None and cache_position[0] == 0:
1358
+ model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None}
1359
+ else:
1360
+ # The clone here is for the same reason as for `position_ids`.
1361
+ model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None}
1362
+
1363
+ if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2:
1364
+ if model_inputs["inputs_embeds"] is not None:
1365
+ batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape
1366
+ device = model_inputs["inputs_embeds"].device
1367
+ else:
1368
+ batch_size, sequence_length = model_inputs["input_ids"].shape
1369
+ device = model_inputs["input_ids"].device
1370
+
1371
+ dtype = self.lm_head.weight.dtype
1372
+ min_dtype = torch.finfo(dtype).min
1373
+
1374
+ attention_mask = _prepare_4d_causal_attention_mask_with_cache_position(
1375
+ attention_mask,
1376
+ sequence_length=sequence_length,
1377
+ target_length=past_key_values.get_max_length(),
1378
+ dtype=dtype,
1379
+ device=device,
1380
+ min_dtype=min_dtype,
1381
+ cache_position=cache_position,
1382
+ batch_size=batch_size,
1383
+ )
1384
+
1385
+ if num_logits_to_keep is not None:
1386
+ model_inputs["num_logits_to_keep"] = num_logits_to_keep
1387
+
1388
+ model_inputs.update(
1389
+ {
1390
+ "position_ids": position_ids,
1391
+ "cache_position": cache_position,
1392
+ "past_key_values": past_key_values,
1393
+ "use_cache": use_cache,
1394
+ "attention_mask": attention_mask,
1395
+ }
1396
+ )
1397
+ return model_inputs
1398
+
1399
+
1400
+ @add_start_docstrings(
1401
+ """
1402
+ The [`Phi3Model`] with a sequence classification head on top (linear layer).
1403
+
1404
+ [`Phi3ForSequenceClassification`] uses the last token in order to do the classification, as other causal models
1405
+ (e.g. GPT-2) do.
1406
+
1407
+ Since it does classification on the last token, it needs to know the position of the last token. If a
1408
+ `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
1409
+ no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
1410
+ padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
1411
+ each row of the batch).
1412
+ """,
1413
+ PHI3_START_DOCSTRING,
1414
+ )
1415
+ # Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->Phi3, LLAMA->PHI3, self.transformer->self.model, transformer_outputs->model_outputs
1416
+ class Phi3ForSequenceClassification(Phi3PreTrainedModel):
1417
+ def __init__(self, config):
1418
+ super().__init__(config)
1419
+ self.num_labels = config.num_labels
1420
+ self.model = Phi3Model(config)
1421
+ self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
1422
+
1423
+ # Initialize weights and apply final processing
1424
+ self.post_init()
1425
+
1426
+ def get_input_embeddings(self):
1427
+ return self.model.embed_tokens
1428
+
1429
+ def set_input_embeddings(self, value):
1430
+ self.model.embed_tokens = value
1431
+
1432
+ @add_start_docstrings_to_model_forward(PHI3_INPUTS_DOCSTRING)
1433
+ def forward(
1434
+ self,
1435
+ input_ids: Optional[torch.LongTensor] = None,
1436
+ attention_mask: Optional[torch.Tensor] = None,
1437
+ position_ids: Optional[torch.LongTensor] = None,
1438
+ past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
1439
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1440
+ labels: Optional[torch.LongTensor] = None,
1441
+ use_cache: Optional[bool] = None,
1442
+ output_attentions: Optional[bool] = None,
1443
+ output_hidden_states: Optional[bool] = None,
1444
+ return_dict: Optional[bool] = None,
1445
+ ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
1446
+ r"""
1447
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1448
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
1449
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
1450
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
1451
+ """
1452
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1453
+
1454
+ model_outputs = self.model(
1455
+ input_ids,
1456
+ attention_mask=attention_mask,
1457
+ position_ids=position_ids,
1458
+ past_key_values=past_key_values,
1459
+ inputs_embeds=inputs_embeds,
1460
+ use_cache=use_cache,
1461
+ output_attentions=output_attentions,
1462
+ output_hidden_states=output_hidden_states,
1463
+ return_dict=return_dict,
1464
+ )
1465
+ hidden_states = model_outputs[0]
1466
+ logits = self.score(hidden_states)
1467
+
1468
+ if input_ids is not None:
1469
+ batch_size = input_ids.shape[0]
1470
+ else:
1471
+ batch_size = inputs_embeds.shape[0]
1472
+
1473
+ if self.config.pad_token_id is None and batch_size != 1:
1474
+ raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
1475
+ if self.config.pad_token_id is None:
1476
+ sequence_lengths = -1
1477
+ else:
1478
+ if input_ids is not None:
1479
+ # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility
1480
+ sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1
1481
+ sequence_lengths = sequence_lengths % input_ids.shape[-1]
1482
+ sequence_lengths = sequence_lengths.to(logits.device)
1483
+ else:
1484
+ sequence_lengths = -1
1485
+
1486
+ pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]
1487
+
1488
+ loss = None
1489
+ if labels is not None:
1490
+ labels = labels.to(logits.device)
1491
+ if self.config.problem_type is None:
1492
+ if self.num_labels == 1:
1493
+ self.config.problem_type = "regression"
1494
+ elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
1495
+ self.config.problem_type = "single_label_classification"
1496
+ else:
1497
+ self.config.problem_type = "multi_label_classification"
1498
+
1499
+ if self.config.problem_type == "regression":
1500
+ loss_fct = MSELoss()
1501
+ if self.num_labels == 1:
1502
+ loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
1503
+ else:
1504
+ loss = loss_fct(pooled_logits, labels)
1505
+ elif self.config.problem_type == "single_label_classification":
1506
+ loss_fct = CrossEntropyLoss()
1507
+ loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
1508
+ elif self.config.problem_type == "multi_label_classification":
1509
+ loss_fct = BCEWithLogitsLoss()
1510
+ loss = loss_fct(pooled_logits, labels)
1511
+ if not return_dict:
1512
+ output = (pooled_logits,) + model_outputs[1:]
1513
+ return ((loss,) + output) if loss is not None else output
1514
+
1515
+ return SequenceClassifierOutputWithPast(
1516
+ loss=loss,
1517
+ logits=pooled_logits,
1518
+ past_key_values=model_outputs.past_key_values,
1519
+ hidden_states=model_outputs.hidden_states,
1520
+ attentions=model_outputs.attentions,
1521
+ )
1522
+
1523
+
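The pooling above selects the logits of the last non-padding token in each row. As a quick illustration of that index trick in isolation (toy tensors, with `0` assumed to be the pad id for this sketch), the ONNX-friendly `argmax`/modulo computation works like this:

```python
import torch

# Toy batch; pad_token_id = 0 is an assumption for this sketch.
pad_token_id = 0
input_ids = torch.tensor([[5, 7, 9, 0, 0],
                          [3, 4, 6, 8, 2]])          # second row has no padding
logits = torch.randn(2, 5, 4)                        # (batch, seq_len, num_labels)

# First pad position minus one gives the last real token;
# the modulo maps the fully unpadded row (-1) back to the final position.
sequence_lengths = torch.eq(input_ids, pad_token_id).int().argmax(-1) - 1
sequence_lengths = sequence_lengths % input_ids.shape[-1]

pooled_logits = logits[torch.arange(input_ids.shape[0]), sequence_lengths]
print(sequence_lengths.tolist())  # [2, 4]
print(pooled_logits.shape)        # torch.Size([2, 4])
```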
1524
+ @add_start_docstrings(
1525
+ """
1526
+ [`Phi3Model`] with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
1527
+ Named-Entity-Recognition (NER) tasks.
1528
+ """,
1529
+ PHI3_START_DOCSTRING,
1530
+ )
1531
+ # Copied from transformers.models.mpt.modeling_mpt.MptForTokenClassification with Mpt->Phi3,MPT->PHI3,self.transformer->self.model,transformer_outputs->model_outputs
1532
+ class Phi3ForTokenClassification(Phi3PreTrainedModel):
1533
+ def __init__(self, config: Phi3Config):
1534
+ super().__init__(config)
1535
+ self.num_labels = config.num_labels
1536
+
1537
+ self.model = Phi3Model(config)
1538
+ if hasattr(config, "classifier_dropout") and config.classifier_dropout is not None:
1539
+ classifier_dropout = config.classifier_dropout
1540
+ elif hasattr(config, "hidden_dropout") and config.hidden_dropout is not None:
1541
+ classifier_dropout = config.hidden_dropout
1542
+ else:
1543
+ classifier_dropout = 0.1
1544
+ self.dropout = nn.Dropout(classifier_dropout)
1545
+ self.classifier = nn.Linear(config.hidden_size, config.num_labels)
1546
+
1547
+ # Initialize weights and apply final processing
1548
+ self.post_init()
1549
+
1550
+ @add_start_docstrings_to_model_forward(PHI3_INPUTS_DOCSTRING)
1551
+ @add_code_sample_docstrings(
1552
+ checkpoint=_CHECKPOINT_FOR_DOC,
1553
+ output_type=TokenClassifierOutput,
1554
+ config_class=_CONFIG_FOR_DOC,
1555
+ )
1556
+ def forward(
1557
+ self,
1558
+ input_ids: Optional[torch.LongTensor] = None,
1559
+ past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
1560
+ attention_mask: Optional[torch.Tensor] = None,
1561
+ inputs_embeds: Optional[torch.Tensor] = None,
1562
+ labels: Optional[torch.Tensor] = None,
1563
+ use_cache: Optional[bool] = None,
1564
+ output_attentions: Optional[bool] = None,
1565
+ output_hidden_states: Optional[bool] = None,
1566
+ return_dict: Optional[bool] = None,
1567
+ **deprecated_arguments,
1568
+ ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
1569
+ r"""
1570
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1571
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
1572
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
1573
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
1574
+ """
1575
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1576
+
1577
+ model_outputs = self.model(
1578
+ input_ids,
1579
+ past_key_values=past_key_values,
1580
+ attention_mask=attention_mask,
1581
+ inputs_embeds=inputs_embeds,
1582
+ use_cache=use_cache,
1583
+ output_attentions=output_attentions,
1584
+ output_hidden_states=output_hidden_states,
1585
+ return_dict=return_dict,
1586
+ )
1587
+
1588
+ hidden_states = model_outputs[0]
1589
+ hidden_states = self.dropout(hidden_states)
1590
+ logits = self.classifier(hidden_states)
1591
+
1592
+ loss = None
1593
+ if labels is not None:
1594
+ # move labels to correct device to enable model parallelism
1595
+ labels = labels.to(logits.device)
1596
+ batch_size, seq_length = labels.shape
1597
+ loss_fct = CrossEntropyLoss()
1598
+ loss = loss_fct(
1599
+ logits.view(batch_size * seq_length, self.num_labels), labels.view(batch_size * seq_length)
1600
+ )
1601
+
1602
+ if not return_dict:
1603
+ output = (logits,) + model_outputs[2:]
1604
+ return ((loss,) + output) if loss is not None else output
1605
+
1606
+ return TokenClassifierOutput(
1607
+ loss=loss,
1608
+ logits=logits,
1609
+ hidden_states=model_outputs.hidden_states,
1610
+ attentions=model_outputs.attentions,
1611
+ )
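For completeness, the token-classification head above reduces to dropout, a per-token linear projection, and a cross-entropy over the flattened tokens; a self-contained sketch with made-up sizes:

```python
import torch
import torch.nn as nn

# Made-up sizes for illustration only.
hidden_size, num_labels = 16, 3
batch_size, seq_length = 2, 5

hidden_states = torch.randn(batch_size, seq_length, hidden_size)  # decoder output
labels = torch.randint(0, num_labels, (batch_size, seq_length))

dropout = nn.Dropout(0.1)
classifier = nn.Linear(hidden_size, num_labels)

logits = classifier(dropout(hidden_states))            # (batch, seq, num_labels)
loss = nn.CrossEntropyLoss()(                           # flatten tokens, as above
    logits.view(batch_size * seq_length, num_labels),
    labels.view(batch_size * seq_length),
)
print(logits.shape, loss.item())
```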
src/image_decoder/processor.py ADDED
@@ -0,0 +1,221 @@
1
+ import os
2
+ from typing import Dict, List
3
+
4
+ import torch
5
+ from PIL import Image
6
+ from torchvision import transforms
7
+ from transformers import AutoTokenizer
8
+ from huggingface_hub import snapshot_download
9
+ import numpy as np
10
+
11
+
12
+ def crop_arr(pil_image, max_image_size):
13
+ while min(*pil_image.size) >= 2 * max_image_size:
14
+ pil_image = pil_image.resize(
15
+ tuple(x // 2 for x in pil_image.size), resample=Image.BOX
16
+ )
17
+
18
+ if max(*pil_image.size) > max_image_size:
19
+ scale = max_image_size / max(*pil_image.size)
20
+ pil_image = pil_image.resize(
21
+ tuple(round(x * scale) for x in pil_image.size), resample=Image.BICUBIC
22
+ )
23
+
24
+ if min(*pil_image.size) < 16:
25
+ scale = 16 / min(*pil_image.size)
26
+ pil_image = pil_image.resize(
27
+ tuple(round(x * scale) for x in pil_image.size), resample=Image.BICUBIC
28
+ )
29
+
30
+ arr = np.array(pil_image)
31
+ crop_y1 = (arr.shape[0] % 16) // 2
32
+ crop_y2 = arr.shape[0] % 16 - crop_y1
33
+
34
+ crop_x1 = (arr.shape[1] % 16) // 2
35
+ crop_x2 = arr.shape[1] % 16 - crop_x1
36
+
37
+ arr = arr[crop_y1:arr.shape[0] - crop_y2, crop_x1:arr.shape[1] - crop_x2]
38
+ return Image.fromarray(arr)
39
+
40
+
41
+ class OmniGenProcessor:
42
+ def __init__(self, max_image_size: int = 1024):
43
+ self.max_image_size = max_image_size
44
+
45
+ self.image_transform = transforms.Compose([
46
+ transforms.Lambda(lambda pil_image: crop_arr(pil_image, max_image_size)),
47
+ transforms.ToTensor(),
48
+ transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True)
49
+ ])
50
+
51
+ self.collator = OmniGenCollator()
52
+ self.separate_collator = OmniGenSeparateCollator()
53
+
54
+ @classmethod
55
+ def from_pretrained(cls, model_name):
56
+ if not os.path.exists(model_name):
57
+ cache_folder = os.getenv('HF_HUB_CACHE')
58
+ model_name = snapshot_download(repo_id=model_name,
59
+ cache_dir=cache_folder,
60
+ allow_patterns="*.json")
61
+ text_tokenizer = AutoTokenizer.from_pretrained(model_name)
62
+
63
+ return cls(text_tokenizer)
64
+
65
+ def process_image(self, image):
66
+ image = Image.open(image).convert('RGB')
67
+ return self.image_transform(image)
68
+
69
+ def __call__(self,
70
+ context_hidden_state: List[torch.Tensor],
71
+ neg_context_hidden_state: List[torch.Tensor],
72
+ height: int = 1024,
73
+ width: int = 1024,
74
+ separate_cfg_input: bool = False,
75
+ ) -> Dict:
76
+
77
+ input_data = []
78
+ for i in range(len(context_hidden_state)):
79
+ cur_context_hidden_state = context_hidden_state[i]
80
+ cur_neg_context_hidden_state = neg_context_hidden_state[i]
81
+
82
+ input_data.append((cur_context_hidden_state, cur_neg_context_hidden_state, [height, width]))
83
+
84
+ if separate_cfg_input:
85
+ return self.separate_collator(input_data)
86
+ return self.collator(input_data)
87
+
88
+
89
+ class OmniGenCollator:
90
+ def __init__(self, pad_token_id=2, llm_pad_token_id=151643, hidden_size=3072):
91
+ self.llm_pad_token_id = llm_pad_token_id
92
+ self.pad_token_id = pad_token_id
93
+ self.hidden_size = hidden_size
94
+
95
+ def create_position(self, attention_mask, num_tokens_for_output_images):
96
+ position_ids = []
97
+ text_length = attention_mask.size(-1)
98
+ img_length = max(num_tokens_for_output_images)
99
+ for mask in attention_mask:
100
+ temp_l = torch.sum(mask)
101
+ temp_position = [0] * (text_length - temp_l) + [i for i in range(temp_l + img_length + 1)] # we add a time embedding into the sequence, so add one more token
102
+ position_ids.append(temp_position)
103
+ return torch.LongTensor(position_ids)
104
+
105
+ def create_connector_position(self, llm_2d_attention_mask):
106
+ position_ids = []
107
+ text_length = llm_2d_attention_mask.size(-1)
108
+ # img_length = max(num_tokens_for_output_images)
109
+ for batch_idx, mask in enumerate(llm_2d_attention_mask):
110
+ temp_l = torch.sum(llm_2d_attention_mask[batch_idx])
111
+ # temp_position = [0]*(text_length-temp_l) + [i for i in range(temp_l+img_length+1)] # we add a time embedding into the sequence, so add one more token
112
+ temp_position = [0] * (text_length - temp_l) + [i for i in range(temp_l)] # only condition for mllm like qwen
113
+ position_ids.append(temp_position)
114
+ return torch.LongTensor(position_ids)
115
+
116
+ def create_mask(self, attention_mask, num_tokens_for_output_images):
117
+ extended_mask = []
118
+ padding_images = []
119
+ text_length = attention_mask.size(-1)
120
+ img_length = max(num_tokens_for_output_images)
121
+ seq_len = text_length + img_length + 1 # we add a time embedding into the sequence, so add one more token
122
+ inx = 0
123
+ for mask in attention_mask:
124
+ temp_l = torch.sum(mask)
125
+ pad_l = text_length - temp_l
126
+
127
+ temp_mask = torch.tril(torch.ones(size=(temp_l + 1, temp_l + 1)))
128
+
129
+ image_mask = torch.zeros(size=(temp_l + 1, img_length))
130
+ temp_mask = torch.cat([temp_mask, image_mask], dim=-1)
131
+
132
+ image_mask = torch.ones(size=(img_length, temp_l + img_length + 1))
133
+ temp_mask = torch.cat([temp_mask, image_mask], dim=0)
134
+
135
+ if pad_l > 0:
136
+ pad_mask = torch.zeros(size=(temp_l + 1 + img_length, pad_l))
137
+ temp_mask = torch.cat([pad_mask, temp_mask], dim=-1)
138
+
139
+ pad_mask = torch.ones(size=(pad_l, seq_len))
140
+ temp_mask = torch.cat([pad_mask, temp_mask], dim=0)
141
+
142
+ true_img_length = num_tokens_for_output_images[inx]
143
+ pad_img_length = img_length - true_img_length
144
+ if pad_img_length > 0:
145
+ temp_mask[:, -pad_img_length:] = 0
146
+ temp_padding_imgs = torch.zeros(size=(1, pad_img_length, self.hidden_size))
147
+ else:
148
+ temp_padding_imgs = None
149
+
150
+ extended_mask.append(temp_mask.unsqueeze(0))
151
+ padding_images.append(temp_padding_imgs)
152
+ inx += 1
153
+ return torch.cat(extended_mask, dim=0), padding_images
154
+
155
+ def adjust_attention_for_input_images(self, attention_mask, image_sizes):
156
+ for b_inx in image_sizes.keys():
157
+ for start_inx, end_inx in image_sizes[b_inx]:
158
+ attention_mask[b_inx][start_inx:end_inx, start_inx:end_inx] = 1
159
+
160
+ return attention_mask
161
+
162
+ def pad_input(self, context_hidden_state):
163
+ # pad_token_id = self.llm_pad_token_id # 151642 <|endoftext|> in qwen2.5vl
164
+ max_l = max([x.shape[1] for x in context_hidden_state])
165
+ attention_mask = []
166
+
167
+ for i in range(len(context_hidden_state)):
168
+ temp_hidden = context_hidden_state[i]
169
+ temp_l = temp_hidden.shape[1]
170
+ pad_l = max_l - temp_l
171
+ if pad_l == 0:
172
+ attention_mask.append([1] * max_l)
173
+ else:
174
+ attention_mask.append([0] * pad_l + [1] * temp_l)
175
+
176
+ return torch.LongTensor(attention_mask)
177
+
178
+ def process_mllm_input(self, context_hidden_state, target_img_size):
179
+ num_tokens_for_output_images = []
180
+ for img_size in target_img_size:
181
+ num_tokens_for_output_images.append(img_size[0] * img_size[1] // 16 // 16)
182
+
183
+ llm_2d_attention_mask = self.pad_input(context_hidden_state)
184
+ connector_position_ids = self.create_connector_position(llm_2d_attention_mask)
185
+ llm_position_ids = self.create_position(llm_2d_attention_mask, num_tokens_for_output_images)
186
+ llm_attention_mask, _ = self.create_mask(llm_2d_attention_mask, num_tokens_for_output_images)
187
+
188
+ return llm_2d_attention_mask, connector_position_ids, llm_attention_mask, llm_position_ids
189
+
190
+
191
+ class OmniGenSeparateCollator(OmniGenCollator):
192
+ def __call__(self, features):
193
+ context_hidden_state = [f[0] for f in features]
194
+ neg_context_hidden_state = [f[1] for f in features]
195
+ target_img_size = [f[2] for f in features]
196
+
197
+ all_context_hidden_state, all_connector_attention_mask, all_connector_position_ids, all_llm_attention_mask, all_llm_position_ids = [], [], [], [], []
198
+ connector_attention_mask, connector_position_ids, llm_attention_mask, llm_position_ids = self.process_mllm_input(context_hidden_state, target_img_size)
199
+
200
+ all_context_hidden_state.append(context_hidden_state[0])
201
+ all_connector_attention_mask.append(connector_attention_mask)
202
+ all_connector_position_ids.append(connector_position_ids)
203
+ all_llm_attention_mask.append(llm_attention_mask)
204
+ all_llm_position_ids.append(llm_position_ids)
205
+
206
+ if neg_context_hidden_state[0] is not None:
207
+ connector_attention_mask, connector_position_ids, llm_attention_mask, llm_position_ids = self.process_mllm_input(neg_context_hidden_state, target_img_size)
208
+ all_context_hidden_state.append(neg_context_hidden_state[0])
209
+ all_connector_attention_mask.append(connector_attention_mask)
210
+ all_connector_position_ids.append(connector_position_ids)
211
+ all_llm_attention_mask.append(llm_attention_mask)
212
+ all_llm_position_ids.append(llm_position_ids)
213
+
214
+ data = {
215
+ "context_hidden_state": all_context_hidden_state,
216
+ "connector_attention_mask": all_connector_attention_mask,
217
+ "connector_position_ids": all_connector_position_ids,
218
+ "llm_attention_mask": all_llm_attention_mask,
219
+ "llm_position_ids": all_llm_position_ids,
220
+ }
221
+ return data
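The `create_mask` logic above builds a block attention pattern: the MLLM condition tokens (plus one extra time-embedding slot) attend causally to each other, while the output-image tokens attend to everything. A toy reconstruction of that geometry, with made-up lengths, makes the layout easier to see:

```python
import torch

# Made-up lengths: 4 condition tokens, 6 image tokens, plus 1 time-embedding token.
temp_l, img_length = 4, 6
seq_len = temp_l + img_length + 1

# Condition tokens (and the time token) see each other causally...
mask = torch.tril(torch.ones(temp_l + 1, temp_l + 1))
# ...but never the image tokens,
mask = torch.cat([mask, torch.zeros(temp_l + 1, img_length)], dim=-1)
# while every image token attends to all condition tokens and all image tokens.
mask = torch.cat([mask, torch.ones(img_length, seq_len)], dim=0)

print(mask.shape)   # torch.Size([11, 11])
print(mask.int())
```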
src/image_decoder/scheduler.py ADDED
@@ -0,0 +1,194 @@
1
+ from tqdm import tqdm
2
+ from typing import Optional, Dict, Any, Tuple, List
3
+ import gc
4
+
5
+ import torch
6
+ try:
7
+ import torch_npu
8
+ except Exception as e:
9
+ print(e)
10
+ from transformers.cache_utils import DynamicCache
11
+
12
+
13
+ class OmniGenCache(DynamicCache):
14
+ def __init__(self, num_tokens_for_img: int, offload_kv_cache: bool = False) -> None:
15
+ # if not torch.cuda.is_available():
16
+ # # print("No available GPU, offload_kv_cache will be set to False, which will result in large memory usage and long runtime when multiple images are given as input!")
17
+ # # offload_kv_cache = False
18
+ # raise RuntimeError("OffloadedCache can only be used with a GPU. If there is no GPU, you need to set use_kv_cache=False, which will result in longer inference time!")
19
+ super().__init__()
20
+ self.original_device = []
21
+ self.prefetch_stream = torch.cuda.Stream() if torch.cuda.is_available() else torch_npu.npu.Stream()
22
+ self.num_tokens_for_img = num_tokens_for_img
23
+ self.offload_kv_cache = offload_kv_cache
24
+
25
+ def prefetch_layer(self, layer_idx: int):
26
+ "Starts prefetching the next layer cache"
27
+ if layer_idx < len(self):
28
+ if torch.cuda.is_available():
29
+ with torch.cuda.stream(self.prefetch_stream):
30
+ # Prefetch next layer tensors to GPU
31
+ device = self.original_device[layer_idx]
32
+ self.key_cache[layer_idx] = self.key_cache[layer_idx].to(device, non_blocking=True)
33
+ self.value_cache[layer_idx] = self.value_cache[layer_idx].to(device, non_blocking=True)
34
+ else:
35
+ with torch_npu.npu.stream(self.prefetch_stream):
36
+ # Prefetch next layer tensors to GPU
37
+ device = self.original_device[layer_idx]
38
+ self.key_cache[layer_idx] = self.key_cache[layer_idx].to(device, non_blocking=True)
39
+ self.value_cache[layer_idx] = self.value_cache[layer_idx].to(device, non_blocking=True)
40
+
41
+ def evict_previous_layer(self, layer_idx: int):
42
+ "Moves the previous layer cache to the CPU"
43
+ if len(self) > 2:
44
+ # We do it on the default stream so it occurs after all earlier computations on these tensors are done
45
+ if layer_idx == 0:
46
+ prev_layer_idx = -1
47
+ else:
48
+ prev_layer_idx = (layer_idx - 1) % len(self)
49
+ self.key_cache[prev_layer_idx] = self.key_cache[prev_layer_idx].to("cpu", non_blocking=True)
50
+ self.value_cache[prev_layer_idx] = self.value_cache[prev_layer_idx].to("cpu", non_blocking=True)
51
+
52
+ def __getitem__(self, layer_idx: int) -> List[Tuple[torch.Tensor]]:
53
+ "Gets the cache for this layer to the device. Prefetches the next and evicts the previous layer."
54
+ if layer_idx < len(self):
55
+ if self.offload_kv_cache:
56
+ # Evict the previous layer if necessary
57
+ if torch.cuda.is_available():
58
+ torch.cuda.current_stream().synchronize()
59
+ else:
60
+ torch_npu.npu.current_stream().synchronize()
61
+ self.evict_previous_layer(layer_idx)
62
+ # Load current layer cache to its original device if not already there
63
+ # self.prefetch_stream.synchronize(original_device)
64
+ if torch.cuda.is_available():
65
+ torch.cuda.synchronize(self.prefetch_stream)
66
+ else:
67
+ torch_npu.npu.synchronize(self.prefetch_stream)
68
+ key_tensor = self.key_cache[layer_idx]
69
+ value_tensor = self.value_cache[layer_idx]
70
+
71
+ # Prefetch the next layer
72
+ self.prefetch_layer((layer_idx + 1) % len(self))
73
+ else:
74
+ key_tensor = self.key_cache[layer_idx]
75
+ value_tensor = self.value_cache[layer_idx]
76
+ return (key_tensor, value_tensor)
77
+ else:
78
+ raise KeyError(f"Cache only has {len(self)} layers, attempted to access layer with index {layer_idx}")
79
+
80
+ def update(
81
+ self,
82
+ key_states: torch.Tensor,
83
+ value_states: torch.Tensor,
84
+ layer_idx: int,
85
+ cache_kwargs: Optional[Dict[str, Any]] = None,
86
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
87
+ """
88
+ Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`.
89
+ Parameters:
90
+ key_states (`torch.Tensor`):
91
+ The new key states to cache.
92
+ value_states (`torch.Tensor`):
93
+ The new value states to cache.
94
+ layer_idx (`int`):
95
+ The index of the layer to cache the states for.
96
+ cache_kwargs (`Dict[str, Any]`, `optional`):
97
+ Additional arguments for the cache subclass. No additional arguments are used in `OffloadedCache`.
98
+ Return:
99
+ A tuple containing the updated key and value states.
100
+ """
101
+ # Update the cache
102
+ if len(self.key_cache) < layer_idx:
103
+ raise ValueError("OffloadedCache does not support model usage where layers are skipped. Use DynamicCache.")
104
+ elif len(self.key_cache) == layer_idx:
105
+ # only cache the states for condition tokens
106
+ key_states = key_states[..., :-(self.num_tokens_for_img + 1), :]
107
+ value_states = value_states[..., :-(self.num_tokens_for_img + 1), :]
108
+
109
+ # Update the number of seen tokens
110
+ if layer_idx == 0:
111
+ self._seen_tokens += key_states.shape[-2]
112
+
113
+ self.key_cache.append(key_states)
114
+ self.value_cache.append(value_states)
115
+ self.original_device.append(key_states.device)
116
+ if self.offload_kv_cache:
117
+ self.evict_previous_layer(layer_idx)
118
+ return self.key_cache[layer_idx], self.value_cache[layer_idx]
119
+ else:
120
+ # only cache the states for condition tokens
121
+ key_tensor, value_tensor = self[layer_idx]
122
+ k = torch.cat([key_tensor, key_states], dim=-2)
123
+ v = torch.cat([value_tensor, value_states], dim=-2)
124
+ return k, v
125
+
126
+
127
+ class OmniGenScheduler:
128
+ def __init__(self, num_steps: int = 50, time_shifting_factor: int = 1):
129
+ self.num_steps = num_steps
130
+ self.time_shift = time_shifting_factor
131
+
132
+ t = torch.linspace(0, 1, num_steps + 1)
133
+ t = t / (t + time_shifting_factor - time_shifting_factor * t)
134
+ self.sigma = t
135
+
136
+ def crop_kv_cache(self, past_key_values, num_tokens_for_img):
137
+ # return
138
+ crop_past_key_values = ()
139
+ for layer_idx in range(len(past_key_values)):
140
+ key_states, value_states = past_key_values[layer_idx][:2]
141
+ crop_past_key_values += ((key_states[..., :-(num_tokens_for_img + 1), :], value_states[..., :-(num_tokens_for_img + 1), :], ),)
142
+ # return crop_past_key_values
143
+ return DynamicCache.from_legacy_cache(crop_past_key_values)
144
+
145
+ def crop_position_ids_for_cache(self, position_ids, num_tokens_for_img):
146
+ if isinstance(position_ids, list):
147
+ for i in range(len(position_ids)):
148
+ position_ids[i] = position_ids[i][:, -(num_tokens_for_img + 1):]
149
+ else:
150
+ position_ids = position_ids[:, -(num_tokens_for_img + 1):]
151
+ return position_ids
152
+
153
+ def crop_attention_mask_for_cache(self, attention_mask, num_tokens_for_img):
154
+ if isinstance(attention_mask, list):
155
+ return [x[..., -(num_tokens_for_img + 1):, :] for x in attention_mask]
156
+ return attention_mask[..., -(num_tokens_for_img + 1):, :]
157
+
158
+ def crop_cache(self, cache, num_tokens_for_img):
159
+ for i in range(len(cache.key_cache)):
160
+ cache.key_cache[i] = cache.key_cache[i][..., :-(num_tokens_for_img + 1), :]
161
+ cache.value_cache[i] = cache.value_cache[i][..., :-(num_tokens_for_img + 1), :]
162
+
163
+ return cache
164
+
165
+ def __call__(self, z, func, model_kwargs, use_kv_cache: bool = True, offload_kv_cache: bool = True, tqdm_disable: bool = False):
166
+
167
+ num_tokens_for_img = z.size(-1) * z.size(-2) // 4
168
+ if isinstance(model_kwargs['llm_input_embeds'], list):
169
+ cache = [OmniGenCache(num_tokens_for_img, offload_kv_cache) for _ in range(len(model_kwargs['llm_input_embeds']))] if use_kv_cache else None
170
+ else:
171
+ cache = OmniGenCache(num_tokens_for_img, offload_kv_cache) if use_kv_cache else None
172
+ for i in tqdm(range(self.num_steps), disable=tqdm_disable):
173
+ timesteps = torch.zeros(size=(len(z), )).to(z.device) + self.sigma[i]
174
+ pred, cache = func(z, timesteps, past_key_values=cache, **model_kwargs)
175
+ sigma_next = self.sigma[i + 1]
176
+ sigma = self.sigma[i]
177
+ z = z + (sigma_next - sigma) * pred
178
+ if i == 0 and use_kv_cache:
179
+ num_tokens_for_img = z.size(-1) * z.size(-2) // 4
180
+ if isinstance(cache, list):
181
+ model_kwargs['llm_input_embeds'] = [None] * len(cache)
182
+ else:
183
+ model_kwargs['llm_input_embeds'] = None
184
+
185
+ model_kwargs['llm_position_ids'] = self.crop_position_ids_for_cache(model_kwargs['llm_position_ids'], num_tokens_for_img)
186
+ model_kwargs['llm_attention_mask'] = self.crop_attention_mask_for_cache(model_kwargs['llm_attention_mask'], num_tokens_for_img)
187
+
188
+ del cache
189
+ if torch.cuda.is_available():
190
+ torch.cuda.empty_cache()
191
+ else:
192
+ torch_npu.npu.empty_cache()
193
+ gc.collect()
194
+ return z
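`OmniGenScheduler` is a plain Euler integrator over a time-shifted sigma schedule: `t` is warped by `t / (t + s - s*t)` and the latent is advanced by `(sigma_next - sigma) * pred`. A minimal sketch of that loop, with a dummy velocity function standing in for the image decoder, looks like this:

```python
import torch

num_steps, time_shifting_factor = 4, 1

# Same schedule as OmniGenScheduler.__init__.
t = torch.linspace(0, 1, num_steps + 1)
sigma = t / (t + time_shifting_factor - time_shifting_factor * t)

def dummy_velocity(z, timestep):
    # Stand-in for the image decoder's prediction; not the real model.
    return -z

z = torch.randn(1, 4, 8, 8)  # toy latent
for i in range(num_steps):
    pred = dummy_velocity(z, sigma[i])
    z = z + (sigma[i + 1] - sigma[i]) * pred  # Euler step

print(z.shape)
```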
src/image_decoder/transformer.py ADDED
@@ -0,0 +1,179 @@
1
+ from typing import List, Optional, Tuple, Union
2
+
3
+ import torch
4
+
5
+ from transformers.modeling_outputs import BaseModelOutputWithPast
6
+ from .modeling_phi3 import Phi3Model
7
+ from transformers.cache_utils import Cache, DynamicCache
8
+ from transformers.utils import logging
9
+
10
+ logger = logging.get_logger(__name__)
11
+
12
+
13
+ class Phi3Transformer(Phi3Model):
14
+ """
15
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Phi3DecoderLayer`]
16
+ We only modified the attention mask
17
+ Args:
18
+ config: Phi3Config
19
+ """
20
+ def prefetch_layer(self, layer_idx: int, device: torch.device):
21
+ "Starts prefetching the next layer cache"
22
+ with torch.cuda.stream(self.prefetch_stream):
23
+ # Prefetch next layer tensors to GPU
24
+ for name, param in self.layers[layer_idx].named_parameters():
25
+ param.data = param.data.to(device, non_blocking=True)
26
+
27
+ def evict_previous_layer(self, layer_idx: int):
28
+ "Moves the previous layer cache to the CPU"
29
+ prev_layer_idx = layer_idx - 1
30
+ for name, param in self.layers[prev_layer_idx].named_parameters():
31
+ param.data = param.data.to("cpu", non_blocking=True)
32
+
33
+ def get_offload_layer(self, layer_idx: int, device: torch.device):
34
+ # init stream
35
+ if not hasattr(self, "prefetch_stream"):
36
+ self.prefetch_stream = torch.cuda.Stream()
37
+
38
+ # delete previous layer
39
+ torch.cuda.current_stream().synchronize()
40
+ self.evict_previous_layer(layer_idx)
41
+
42
+ # make sure the current layer is ready
43
+ torch.cuda.synchronize(self.prefetch_stream)
44
+
45
+ # load next layer
46
+ self.prefetch_layer((layer_idx + 1) % len(self.layers), device)
47
+
48
+ def forward(
49
+ self,
50
+ input_ids: torch.LongTensor = None,
51
+ attention_mask: Optional[torch.Tensor] = None,
52
+ position_ids: Optional[torch.LongTensor] = None,
53
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
54
+ inputs_embeds: Optional[torch.FloatTensor] = None,
55
+ use_cache: Optional[bool] = None,
56
+ output_attentions: Optional[bool] = None,
57
+ output_hidden_states: Optional[bool] = None,
58
+ return_dict: Optional[bool] = None,
59
+ cache_position: Optional[torch.LongTensor] = None,
60
+ offload_model: Optional[bool] = False,
61
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
62
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
63
+ output_hidden_states = (
64
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
65
+ )
66
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
67
+
68
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
69
+
70
+ if (input_ids is None) ^ (inputs_embeds is not None):
71
+ raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
72
+
73
+ if self.gradient_checkpointing and self.training:
74
+ if use_cache:
75
+ logger.warning_once(
76
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
77
+ )
78
+ use_cache = False
79
+
80
+ # kept for BC (non `Cache` `past_key_values` inputs)
81
+ return_legacy_cache = False
82
+ if use_cache and not isinstance(past_key_values, Cache):
83
+ return_legacy_cache = True
84
+ if past_key_values is None:
85
+ past_key_values = DynamicCache()
86
+ else:
87
+ past_key_values = DynamicCache.from_legacy_cache(past_key_values)
88
+ logger.warning_once(
89
+ "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
90
+ "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
91
+ "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
92
+ )
93
+
94
+ # if inputs_embeds is None:
95
+ # inputs_embeds = self.embed_tokens(input_ids)
96
+
97
+ # if cache_position is None:
98
+ # past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
99
+ # cache_position = torch.arange(
100
+ # past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
101
+ # )
102
+ # if position_ids is None:
103
+ # position_ids = cache_position.unsqueeze(0)
104
+
105
+ if attention_mask is not None and attention_mask.dim() == 3:
106
+ dtype = inputs_embeds.dtype
107
+ min_dtype = torch.finfo(dtype).min
108
+ attention_mask = (1 - attention_mask) * min_dtype
109
+ attention_mask = attention_mask.unsqueeze(1).to(inputs_embeds.dtype)
110
+ else:
111
+ raise ValueError("A 3-D attention_mask is required, but it was missing or had the wrong shape")
112
+ # causal_mask = self._update_causal_mask(
113
+ # attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
114
+ # )
115
+
116
+ hidden_states = inputs_embeds
117
+
118
+ # decoder layers
119
+ all_hidden_states = () if output_hidden_states else None
120
+ all_self_attns = () if output_attentions else None
121
+ next_decoder_cache = None
122
+
123
+ layer_idx = -1
124
+ for decoder_layer in self.layers:
125
+ layer_idx += 1
126
+
127
+ if output_hidden_states:
128
+ all_hidden_states += (hidden_states,)
129
+
130
+ if self.gradient_checkpointing and self.training:
131
+ layer_outputs = self._gradient_checkpointing_func(
132
+ decoder_layer.__call__,
133
+ hidden_states,
134
+ attention_mask,
135
+ position_ids,
136
+ past_key_values,
137
+ output_attentions,
138
+ use_cache,
139
+ cache_position,
140
+ )
141
+ else:
142
+ if offload_model and not self.training:
143
+ self.get_offload_layer(layer_idx, device=inputs_embeds.device)
144
+ layer_outputs = decoder_layer(
145
+ hidden_states,
146
+ attention_mask=attention_mask,
147
+ position_ids=position_ids,
148
+ past_key_value=past_key_values,
149
+ output_attentions=output_attentions,
150
+ use_cache=use_cache,
151
+ cache_position=cache_position,
152
+ )
153
+
154
+ hidden_states = layer_outputs[0]
155
+
156
+ if use_cache:
157
+ next_decoder_cache = layer_outputs[2 if output_attentions else 1]
158
+
159
+ if output_attentions:
160
+ all_self_attns += (layer_outputs[1],)
161
+
162
+ hidden_states = self.norm(hidden_states)
163
+
164
+ # add hidden states from the last decoder layer
165
+ if output_hidden_states:
166
+ all_hidden_states += (hidden_states,)
167
+
168
+ next_cache = next_decoder_cache if use_cache else None
169
+ if return_legacy_cache:
170
+ next_cache = next_cache.to_legacy_cache()
171
+
172
+ if not return_dict:
173
+ return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
174
+ return BaseModelOutputWithPast(
175
+ last_hidden_state=hidden_states,
176
+ past_key_values=next_cache,
177
+ hidden_states=all_hidden_states,
178
+ attentions=all_self_attns,
179
+ )
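The `forward` above expects a dense 3-D 0/1 attention mask and converts it into an additive bias before the decoder layers run; the conversion on its own (with a toy lower-triangular mask) is:

```python
import torch

# Toy 0/1 mask of shape (batch, seq, seq); fp16 chosen to show the large negative fill.
attention_mask = torch.tril(torch.ones(1, 5, 5))
dtype = torch.float16

min_dtype = torch.finfo(dtype).min
# Allowed positions become 0, masked positions become the dtype minimum,
# and a head dimension is inserted: (batch, 1, seq, seq).
bias = ((1 - attention_mask) * min_dtype).unsqueeze(1).to(dtype)

print(bias.shape)         # torch.Size([1, 1, 5, 5])
print(bias[0, 0, 0, -1])  # tensor(-65504., dtype=torch.float16)
```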
src/mindomni.py ADDED
@@ -0,0 +1,219 @@
1
+ from .mllm import MindOmniMLLM
2
+ from .image_decoder import OmniGen
3
+ import torch.nn as nn
4
+ from .image_decoder import Phi3DecoderLayer, ImageDecoderPipeline, OmniGenProcessor
5
+ import os
6
+ import torch
7
+ from safetensors.torch import load_file
8
+ from typing import Union
9
+ from diffusers.utils import logging
10
+ from diffusers.models import AutoencoderKL
11
+ from transformers import AutoProcessor
12
+ import re
13
+ from qwen_vl_utils import process_vision_info
14
+ try:
15
+ import torch_npu
16
+ except Exception as e:
17
+ print(e)
18
+
19
+ logger = logging.get_logger(__name__)
20
+
21
+
22
+ class MindOmniConnector(nn.Module):
23
+ def __init__(self, pre_config, post_config, layer_num: int = 2):
24
+ super().__init__()
25
+ connector_decoder = nn.ModuleList(
26
+ [Phi3DecoderLayer(post_config, layer_idx) for layer_idx in range(layer_num)]
27
+ )
28
+ self.connector = nn.ModuleList(
29
+ [nn.Linear(pre_config.hidden_size, post_config.hidden_size)] # qwen2.5vl-7b: 3584
30
+ )
31
+ self.connector.extend(connector_decoder)
32
+
33
+
34
+ class MindOmni:
35
+ def __init__(self, mllm, image_decoder, connector, vae, processor, mllm_processor, device: Union[str, torch.device] = None):
36
+ self.mllm = mllm
37
+ self.image_decoder = image_decoder
38
+ self.connector = connector
39
+ self.vae = vae
40
+ self.processor = processor
41
+ self.mllm_processor = mllm_processor
42
+
43
+ self.vae.to(torch.float32)
44
+ self.device = device
45
+ if device is None:
46
+ if torch.cuda.is_available():
47
+ self.device = torch.device("cuda")
48
+ elif torch_npu.npu.is_available():
49
+ self.device = torch.device("npu")
50
+ elif torch.backends.mps.is_available():
51
+ self.device = torch.device("mps")
52
+ else:
53
+ logger.info("No available GPU detected; falling back to CPU, which may make image generation very slow.")
54
+ self.device = torch.device("cpu")
55
+
56
+ @classmethod
57
+ def from_pretrained(cls, model_path):
58
+ mllm = MindOmniMLLM.from_pretrained(os.path.join(model_path, 'mllm'))
59
+ image_decoder = OmniGen.from_pretrained(os.path.join(model_path, 'image_decoder'))
60
+ connector = MindOmniConnector(mllm.config, image_decoder.llm.config, 2).connector
61
+ connector_state = load_file(os.path.join(model_path, 'connector.safetensors'))
62
+ connector.load_state_dict(connector_state)
63
+ vae = AutoencoderKL.from_pretrained(os.path.join(model_path, "vae"))
64
+ processor = OmniGenProcessor.from_pretrained(os.path.join(model_path, 'image_decoder'))
65
+ mllm_processor = AutoProcessor.from_pretrained(os.path.join(model_path, 'mllm'))
66
+ logger.info("Preparing MindOmni")
67
+ return cls(mllm, image_decoder, connector, vae, processor, mllm_processor)
68
+
69
+ def to(self, device: Union[str, torch.device] = None, dtype: Union[str, torch.dtype] = None):
70
+ if device is not None:
71
+ if isinstance(device, str):
72
+ device = torch.device(device)
73
+ self.mllm.to(device)
74
+ self.image_decoder.to(device)
75
+ self.connector.to(device)
76
+ self.vae.to(device)
77
+ self.device = device
78
+ if dtype is not None:
79
+ self.mllm.to(dtype)
80
+ self.image_decoder.to(dtype)
81
+ self.connector.to(dtype)
82
+
83
+ def eval(self):
84
+ self.mllm.eval()
85
+ self.image_decoder.eval()
86
+ self.connector.eval()
87
+ self.vae.eval()
88
+
89
+ @torch.no_grad()
90
+ def get_mllm_hidden_state(self, user_input, input_images, do_sample, temperature, max_new_tokens, only_understand=False, use_cot=False):
91
+ input_llm_images = input_images
92
+ processor = self.mllm_processor
93
+ model = self.mllm
94
+ if only_understand or not use_cot:
95
+ system_prompt = (
96
+ "You are a helpful assistant."
97
+ )
98
+ else:
99
+ system_prompt = (
100
+ "You are a helpful assistant. When the user requests an image, the assistant "
101
+ "first thinks about the reasoning process in the mind and then provides the user with concise prompt as the answer. "
102
+ "The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., "
103
+ "<think> reasoning process here </think><answer> answer here </answer>."
104
+ )
105
+
106
+ messages = [
107
+ {
108
+ "role": "system",
109
+ "content": [
110
+ {"type": "text", "text": system_prompt},
111
+ ],
112
+ },
113
+ {
114
+ "role": "user",
115
+ "content": [
116
+ {"type": "text", "text": "Generate an image according to the following instructions\n"},
117
+ {"type": "text", "text": user_input},
118
+ ],
119
+ }
120
+ ]
121
+
122
+ if input_llm_images is not None:
123
+ if only_understand:
124
+ assert len(input_llm_images) == 1, "only support single image when multimodal understanding"
125
+ messages[1]['content'][0] = {"type": "image", "image": input_llm_images[0]}
126
+ else:
127
+ user_input = f'<img><|image_1|></img> {user_input}'
128
+ messages[1]['content'][1] = {"type": "text", "text": user_input}
129
+ image_tags = re.findall(r'<\|image_\d+\|>', messages[1]['content'][1]['text'])
130
+ image_ids = [int(s.split("|")[1].split("_")[-1]) for s in image_tags]
131
+ pattern = r"<img><\|image_\d+\|></img>"
132
+ prompt_chunks = [chunk for chunk in re.split(pattern, messages[1]['content'][1]['text'])]
133
+ assert len(prompt_chunks) == len(input_llm_images) + 1
134
+ new_content = []
135
+ for idx, per_prompt in enumerate(prompt_chunks):
136
+ if idx != len(prompt_chunks) - 1:
137
+ item_text = {"type": "text", "text": per_prompt}
138
+ # resized_height, resized_width = input_images_shape[image_ids[idx] - 1]
139
+ image_path = input_llm_images[image_ids[idx] - 1]
140
+ # item_vit = {"type": "image", "image": image_path, "resized_height": resized_height, "resized_width": resized_width}
141
+ item_vit = {"type": "image", "image": image_path}
142
+ item_tag = {"type": "text", "text": f"<img>{image_tags[idx]}</img>"}
143
+ new_content.append(item_text)
144
+ new_content.append(item_vit)
145
+ new_content.append(item_tag)
146
+ else:
147
+ item_text = {"type": "text", "text": per_prompt}
148
+ new_content.append(item_text)
149
+ messages[1]['content'] = messages[1]['content'][:1] + new_content
150
+
151
+ text = processor.apply_chat_template(
152
+ messages, tokenize=False, add_generation_prompt=True
153
+ )
154
+ image_inputs, video_inputs = process_vision_info(messages)
155
+ inputs = processor(
156
+ text=[text],
157
+ images=image_inputs,
158
+ videos=video_inputs,
159
+ padding=True,
160
+ return_tensors="pt",
161
+ )
162
+ inputs = inputs.to(self.device)
163
+
164
+ if use_cot:
165
+ # Inference: Generation of the output
166
+ temperature = temperature if do_sample else None
167
+ generated_dict = model.generate(**inputs, do_sample=do_sample, temperature=temperature, max_new_tokens=max_new_tokens, output_hidden_states=True, return_dict_in_generate=True)
168
+ generated_ids_trimmed = [
169
+ out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_dict.sequences)
170
+ ]
171
+ output_hidden_state = [hidden_state[-1] for hidden_state in generated_dict.hidden_states]
172
+ context_hidden_state = torch.cat(output_hidden_state, dim=1)
173
+
174
+ output_text = processor.batch_decode(
175
+ generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
176
+ )
177
+
178
+ prompt_ = output_text[0]
179
+
180
+ assistant_content = [
181
+ {
182
+ "role": "assistant",
183
+ "content": [
184
+ {"type": "text", "text": prompt_},
185
+ ],
186
+ }
187
+ ]
188
+
189
+ messages += assistant_content
190
+ else:
191
+ prompt_ = user_input
192
+ context_hidden_state = model(**inputs, output_hidden_states=True).hidden_states[-1]
193
+ return messages, prompt_, context_hidden_state
194
+
195
+ def generate_image(self, height, width, guidance_scale, inference_steps, separate_cfg_infer, offload_model, seed, max_input_image_size,
196
+ text, NEGATIVE_PROMPT, input_llm_images, do_sample, temperature, max_new_tokens, only_understand, use_cot=False):
197
+ gen_pipe = ImageDecoderPipeline(self.vae, self.image_decoder, self.connector, self.processor)
198
+ message, prompt_, context_hidden_state = self.get_mllm_hidden_state(text, input_llm_images, do_sample, temperature, max_new_tokens, only_understand, use_cot=use_cot)
199
+ neg_message, neg_prompt_, neg_context_hidden_state = self.get_mllm_hidden_state(NEGATIVE_PROMPT, None, do_sample, temperature, max_new_tokens, only_understand, use_cot=False)
200
+ print(message)
201
+ output = gen_pipe(
202
+ context_hidden_state=context_hidden_state,
203
+ neg_context_hidden_state=neg_context_hidden_state,
204
+ height=height,
205
+ width=width,
206
+ guidance_scale=guidance_scale,
207
+ num_inference_steps=inference_steps,
208
+ separate_cfg_infer=separate_cfg_infer,
209
+ use_kv_cache=True,
210
+ offload_kv_cache=True,
211
+ offload_model=offload_model,
212
+ seed=seed,
213
+ max_input_image_size=max_input_image_size,
214
+ )
215
+ return output, prompt_
216
+
217
+ def generate_text(self, text, input_llm_images, do_sample, temperature, max_new_tokens, only_understand):
218
+ _, answer, _ = self.get_mllm_hidden_state(text, input_llm_images, do_sample, temperature, max_new_tokens, only_understand=True, use_cot=True)
219
+ return answer
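Putting the pieces together, a minimal end-to-end sketch of this wrapper (the checkpoint path, prompt, and every numeric setting below are placeholders chosen for illustration, not documented defaults):

```python
import torch
from src import MindOmni

model = MindOmni.from_pretrained("path/to/MindOmni")   # placeholder checkpoint path
model.to(device="cuda", dtype=torch.bfloat16)
model.eval()

images, cot_prompt = model.generate_image(
    height=1024, width=1024, guidance_scale=3.0, inference_steps=50,
    separate_cfg_infer=True, offload_model=False, seed=0,
    max_input_image_size=1024,
    text="A cat wearing sunglasses on the beach",
    NEGATIVE_PROMPT="low quality, blurry",
    input_llm_images=None, do_sample=False, temperature=1.0,
    max_new_tokens=512, only_understand=False, use_cot=True,
)
print(cot_prompt)   # reasoning / refined prompt returned by the MLLM
```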
src/mllm.py ADDED
@@ -0,0 +1,245 @@
1
+ import torch
2
+ from transformers import Qwen2_5_VLForConditionalGeneration, Qwen2_5_VLModel
3
+ from typing import List, Optional, Tuple, Union
4
+ from transformers.modeling_outputs import BaseModelOutputWithPast
5
+ from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import logger
6
+ from transformers.cache_utils import DynamicCache
7
+
8
+
9
+ class MindOmniMLLM_Model(Qwen2_5_VLModel):
10
+
11
+ def forward(
12
+ self,
13
+ input_ids: torch.LongTensor = None,
14
+ attention_mask: Optional[torch.Tensor] = None,
15
+ position_ids: Optional[torch.LongTensor] = None,
16
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
17
+ inputs_embeds: Optional[torch.FloatTensor] = None,
18
+ use_cache: Optional[bool] = None,
19
+ output_attentions: Optional[bool] = None,
20
+ output_hidden_states: Optional[bool] = None,
21
+ return_dict: Optional[bool] = None,
22
+ cache_position: Optional[torch.LongTensor] = None,
23
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
24
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
25
+ output_hidden_states = (
26
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
27
+ )
28
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
29
+
30
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
31
+
32
+ if (input_ids is None) ^ (inputs_embeds is not None):
33
+ raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
34
+
35
+ if self.gradient_checkpointing and self.training:
36
+ if use_cache:
37
+ logger.warning_once(
38
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
39
+ )
40
+ use_cache = False
41
+
42
+ # torch.jit.trace() doesn't support cache objects in the output
43
+ if use_cache and past_key_values is None and not torch.jit.is_tracing():
44
+ past_key_values = DynamicCache()
45
+
46
+ if inputs_embeds is None:
47
+ inputs_embeds = self.embed_tokens(input_ids)
48
+
49
+ if cache_position is None:
50
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
51
+ cache_position = torch.arange(
52
+ past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
53
+ )
54
+
55
+ # the hard coded `3` is for temporal, height and width.
56
+ if position_ids is None:
57
+ position_ids = cache_position.view(1, 1, -1).expand(3, inputs_embeds.shape[0], -1)
58
+ elif position_ids.dim() == 2:
59
+ position_ids = position_ids[None, ...].expand(3, position_ids.shape[0], -1)
60
+
61
+ causal_mask = self._update_causal_mask(
62
+ attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
63
+ )
64
+ hidden_states = inputs_embeds
65
+
66
+ # create position embeddings to be shared across the decoder layers
67
+ position_embeddings = self.rotary_emb(hidden_states, position_ids)
68
+
69
+ # decoder layers
70
+ all_hidden_states = () if output_hidden_states else None
71
+ all_self_attns = () if output_attentions else None
72
+ next_decoder_cache = None
73
+
74
+ for decoder_layer in self.layers:
75
+ if output_hidden_states:
76
+ all_hidden_states += (hidden_states,)
77
+
78
+ if self.gradient_checkpointing and self.training:
79
+ layer_outputs = self._gradient_checkpointing_func(
80
+ decoder_layer.__call__,
81
+ hidden_states,
82
+ causal_mask,
83
+ position_ids,
84
+ past_key_values,
85
+ output_attentions,
86
+ use_cache,
87
+ cache_position,
88
+ position_embeddings,
89
+ )
90
+ else:
91
+ layer_outputs = decoder_layer(
92
+ hidden_states,
93
+ attention_mask=causal_mask,
94
+ position_ids=position_ids,
95
+ past_key_value=past_key_values,
96
+ output_attentions=output_attentions,
97
+ use_cache=use_cache,
98
+ cache_position=cache_position,
99
+ position_embeddings=position_embeddings,
100
+ )
101
+
102
+ hidden_states = layer_outputs[0]
103
+
104
+ if use_cache:
105
+ next_decoder_cache = layer_outputs[2 if output_attentions else 1]
106
+
107
+ if output_attentions:
108
+ all_self_attns += (layer_outputs[1],)
109
+
110
+ # add hidden states from the last decoder layer before the self.norm
111
+ # import ipdb; ipdb.set_trace()
112
+ if output_hidden_states:
113
+ all_hidden_states += (hidden_states,)
114
+
115
+ hidden_states = self.norm(hidden_states)
116
+
117
+ next_cache = next_decoder_cache if use_cache else None
118
+
119
+ if not return_dict:
120
+ return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
121
+ return BaseModelOutputWithPast(
122
+ last_hidden_state=hidden_states,
123
+ past_key_values=next_cache,
124
+ hidden_states=all_hidden_states,
125
+ attentions=all_self_attns,
126
+ )
127
+
128
+
129
+ class MindOmniMLLM(Qwen2_5_VLForConditionalGeneration):
130
+
131
+ def __init__(self, config):
132
+ super().__init__(config)
133
+ self.model = MindOmniMLLM_Model(config)
134
+
135
+ # @staticmethod
136
+ # def _update_model_kwargs_for_generation(
137
+ # outputs, model_kwargs, past_key_values_field="past_key_values"
138
+ # ):
139
+ # if past_key_values_field in outputs:
140
+ # model_kwargs[past_key_values_field] = outputs[past_key_values_field]
141
+
142
+ # if "attention_mask" in model_kwargs:
143
+ # bs, _ = model_kwargs["attention_mask"].shape
144
+ # new_mask = torch.ones(bs, 1, dtype=model_kwargs["attention_mask"].dtype,
145
+ # device=model_kwargs["attention_mask"].device)
146
+ # model_kwargs["attention_mask"] = torch.cat(
147
+ # [model_kwargs["attention_mask"], new_mask], dim=-1
148
+ # )
149
+ # return model_kwargs
150
+
151
+ # @staticmethod
152
+ # def _sample_token(
153
+ # logits: torch.Tensor,
154
+ # do_sample: bool,
155
+ # logits_processors: LogitsProcessorList,
156
+ # temperature: float,
157
+ # top_p: float,
158
+ # ):
159
+ # """do sample / greedy"""
160
+ # logits = logits_processors(None, logits)
161
+ # if do_sample:
162
+ # # temperature scaling
163
+ # if temperature != 1.0 and temperature > 0:
164
+ # logits = logits / temperature
165
+ # # nucleus
166
+ # if top_p < 1.0:
167
+ # logits = TopPLogitsWarper(top_p=top_p)(None, logits)
168
+ # probs = nn.functional.softmax(logits, dim=-1, dtype=torch.float32)
169
+ # next_token = torch.multinomial(probs, num_samples=1)
170
+ # else: # greedy
171
+ # next_token = torch.argmax(logits, dim=-1, keepdim=True)
172
+ # return next_token
173
+
174
+ # @torch.no_grad()
175
+ # def generate(
176
+ # self,
177
+ # pixel_values: Optional[torch.FloatTensor] = None,
178
+ # input_ids: Optional[torch.LongTensor] = None,
179
+ # attention_mask: Optional[torch.LongTensor] = None,
180
+ # max_new_tokens: int = 64,
181
+ # do_sample: bool = False,
182
+ # temperature: float = 1.0,
183
+ # top_p: float = 0.95,
184
+ # device: Union[str, torch.device] = "cuda",
185
+ # ) -> torch.LongTensor:
186
+
187
+ # assert input_ids is not None
188
+ # eos_token_id = self.config.eos_token_id
189
+
190
+ # generated = [input_ids]
191
+
192
+ # input_ids = input_ids.to(device)
193
+ # if pixel_values is not None:
194
+ # pixel_values = pixel_values.to(device)
195
+ # if attention_mask is None:
196
+ # attention_mask = torch.ones_like(input_ids, dtype=torch.long)
197
+
198
+ # logits_processors = LogitsProcessorList()
199
+ # if temperature != 1.0 and do_sample:
200
+ # logits_processors.append(TemperatureLogitsWarper(temperature))
201
+ # if top_p < 1.0 and do_sample:
202
+ # logits_processors.append(TopPLogitsWarper(top_p=top_p))
203
+
204
+ # # ---- inference loop ---- #
205
+ # model_kwargs = {
206
+ # "attention_mask": attention_mask,
207
+ # "use_cache": True,
208
+ # "past_key_values": None,
209
+ # "cache_position": torch.arange(attention_mask.shape[-1]).to(attention_mask)
210
+ # }
211
+
212
+ # for _ in range(max_new_tokens):
213
+ # model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
214
+
215
+ # outputs = self(
216
+ # input_ids=input_ids,
217
+ # use_cache=True,
218
+ # **model_kwargs,
219
+ # )
220
+
221
+ # next_token = self._sample_token(
222
+ # outputs.logits[:, -1, :],
223
+ # do_sample=do_sample,
224
+ # logits_processors=logits_processors,
225
+ # temperature=temperature,
226
+ # top_p=top_p,
227
+ # ) # (bs, 1)
228
+
229
+ # # append the newly generated token
230
+ # input_ids = next_token
231
+ # generated.append(next_token)
232
+
233
+ # # update the kv cache / attention_mask
234
+ # model_kwargs = self._update_model_kwargs_for_generation(
235
+ # outputs, model_kwargs
236
+ # )
237
+
238
+ # # termination check: stop once every sequence in the batch has produced eos
239
+ # if eos_token_id is not None:
240
+ # if (next_token == eos_token_id).all():
241
+ # break
242
+
243
+ # generated_ids = torch.cat(generated, dim=1)
244
+
245
+ # return generated_ids
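One detail worth isolating from the `forward` above: when no explicit `position_ids` are given, the flat `cache_position` is broadcast into the three rotary axes (temporal, height, width) used by Qwen2.5-VL's multimodal RoPE. In isolation the expansion is just:

```python
import torch

batch_size, seq_len = 2, 6
cache_position = torch.arange(seq_len)

# One copy of the positions per rotary axis (temporal, height, width),
# shared across the batch, as in MindOmniMLLM_Model.forward.
position_ids = cache_position.view(1, 1, -1).expand(3, batch_size, -1)
print(position_ids.shape)  # torch.Size([3, 2, 6])
```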