Junfeng5 committed on
Commit 0d7e8be · verified · 1 Parent(s): c1f6040

Upload 31 files

.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ baklava.png filter=lfs diff=lfs merge=lfs -text
+ Liquid_icon.png filter=lfs diff=lfs merge=lfs -text
Liquid_icon.png ADDED

Git LFS Details

  • SHA256: 7d65c5aa3ed6ebc4d9327b3962690cda4ada81b9359daf5dcbe9528f0635f0b6
  • Pointer size: 131 Bytes
  • Size of remote file: 113 kB
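
For reference, what Git actually stores in the repository for an LFS-tracked file like this one is a small text pointer; a sketch of its format (the size field records the exact byte count, roughly 113 kB here):

    version https://git-lfs.github.com/spec/v1
    oid sha256:7d65c5aa3ed6ebc4d9327b3962690cda4ada81b9359daf5dcbe9528f0635f0b6
    size <exact byte count, about 113 kB>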
README.md CHANGED
@@ -1,14 +0,0 @@
- ---
- title: Liquid Demo
- emoji: 💬
- colorFrom: yellow
- colorTo: purple
- sdk: gradio
- sdk_version: 5.0.1
- app_file: app.py
- pinned: false
- license: mit
- short_description: A unified understanding and generation multimodal model
- ---
-
- An example chatbot using [Gradio](https://gradio.app), [`huggingface_hub`](https://huggingface.co/docs/huggingface_hub/v0.22.2/en/index), and the [Hugging Face Inference API](https://huggingface.co/docs/api-inference/index).
app.py CHANGED
@@ -1,64 +1,356 @@
  import gradio as gr
- from huggingface_hub import InferenceClient

  """
- For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
  """
- client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")


- def respond(
- message,
- history: list[tuple[str, str]],
- system_message,
- max_tokens,
- temperature,
- top_p,
- ):
- messages = [{"role": "system", "content": system_message}]

- for val in history:
- if val[0]:
- messages.append({"role": "user", "content": val[0]})
- if val[1]:
- messages.append({"role": "assistant", "content": val[1]})

- messages.append({"role": "user", "content": message})

- response = ""

- for message in client.chat_completion(
- messages,
- max_tokens=max_tokens,
- stream=True,
- temperature=temperature,
- top_p=top_p,
- ):
- token = message.choices[0].delta.content

- response += token
- yield response


- """
- For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
- """
- demo = gr.ChatInterface(
- respond,
- additional_inputs=[
- gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
- gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
- gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
- gr.Slider(
- minimum=0.1,
- maximum=1.0,
- value=0.95,
- step=0.05,
- label="Top-p (nucleus sampling)",
- ),
- ],
- )
-
-
- if __name__ == "__main__":
- demo.launch()
1
+ import time
2
+ from threading import Thread
3
+
4
  import gradio as gr
5
+ import torch
6
+ import PIL
7
+ from PIL import Image
8
+ from transformers import AutoConfig, AutoModelForCausalLM
+ from transformers import AutoTokenizer
+ import os
12
+ from tqdm import tqdm
13
+ from chameleon.inference.image_tokenizer import ImageTokenizer
14
+ from helpers import sample, expand2square, tokenizer_image_token
15
+
16
+ # from transformers import AutoProcessor, LlavaForConditionalGeneration
17
+ from transformers import TextIteratorStreamer
18
+ from conversation import conv_templates
19
+ import spaces
20
+
21
+
22
+ import os
23
+ os.system("pip uninstall -y gradio")
24
+ os.system("pip install gradio==4.44.1")
25
+ os.system("pip install gradio_client==1.3.0")
26
+
27
 
28
+ IMAGE_TOKEN_INDEX=-200
29
+ PLACEHOLDER = """
30
+ <div style="padding: 30px; text-align: center; display: flex; flex-direction: column; align-items: center;">
31
+ <img src='file/Liquid_icon.png' style="width: 80%; max-width: 600px; height: auto; opacity: 0.5;">
32
+ <h1 style="font-size: 20px; margin-bottom: 1px; opacity: 0.55;">Liquid-7B</h1>
33
+ </div>
34
  """
35
+
36
+ CSS ="""
37
+ .contain { display: flex; flex-direction: column; }
38
+ #component-0 { height: 100%; }
39
+ #chatbot { flex-grow: 1; }
40
  """
 
41
 
42
 
43
+ title_html = """
44
+ <div style="display: flex; flex-direction: column; align-items: center; gap: 10px;">
45
+ <h1 style="margin: 0; line-height: 1; text-align: center;"> Liquid: Language Models are Scalable Multi-modal <br> Generators via Unified Understanding and Generation</h1>
46
+ </div>
47
+ """
48
 
49
+ links_html = f"""
50
+ <center><font size=3><a href='https://foundationvision.github.io/Liquid/'>Liquid</a> has been open-sourced on <a href='https://huggingface.co/Junfeng5/Liquid_V1_7B'>😊 Huggingface</a> and <a href='https://github.com/FoundationVision/Liquid'>🌟 GitHub</a>. If you find Liquid useful, a like❤️ or a star🌟 would be appreciated.</font></center>
51
+ """
 
 
52
 
53
+ introduction = f"""
54
+ Liquid explores the potential of a single LLM as a multimodal generator and its scaling laws. It achieves the level of diffusion models in visual generation and discovers the mutual enhancement between understanding and generation. More details can be found on the project <a href='https://foundationvision.github.io/Liquid/'> homepage</a> and in the <a href='https://arxiv.org/abs/2412.04332'> paper</a>. """
55
 
 
56
 
57
 
58
+ model_id = 'Junfeng5/Liquid_V1_7B'
59
+ tokenizer = AutoTokenizer.from_pretrained(model_id,padding_side='left')
60
+ vqllm = AutoModelForCausalLM.from_pretrained(
61
+ model_id,
62
+ attn_implementation='flash_attention_2',
63
+ torch_dtype=torch.bfloat16,
64
+ load_in_8bit=True,
65
+ max_memory={0: "40GiB" },
66
+ ) # .to("cuda:0")
67
 
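
A side note on the 8-bit load above: newer transformers releases prefer routing load_in_8bit through a BitsAndBytesConfig. A minimal equivalent sketch, assuming bitsandbytes is installed (not part of this commit):

    import torch
    from transformers import AutoModelForCausalLM, BitsAndBytesConfig

    vqllm = AutoModelForCausalLM.from_pretrained(
        'Junfeng5/Liquid_V1_7B',
        attn_implementation='flash_attention_2',
        torch_dtype=torch.bfloat16,
        quantization_config=BitsAndBytesConfig(load_in_8bit=True),  # replaces the bare load_in_8bit flag
    )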
68
+ stop_flag = False
69
 
70
+ ori_vocabe_size = len(tokenizer)
71
+
72
+ vqgan_cfg_path = "chameleon/vqgan.yaml"
73
+ vqgan_ckpt_path = "chameleon/vqgan.ckpt"
74
+ image_tokenizer = ImageTokenizer( cfg_path=vqgan_cfg_path, ckpt_path=vqgan_ckpt_path, device="cuda:0",)
75
+
76
+ @spaces.GPU
77
+ def bot_streaming_I2T(message, history):
78
+ print(message)
79
+ global stop_flag
80
+ stop_flag = True
81
+ time.sleep(0.2)
82
+ stop_flag = False
83
+ torch.cuda.empty_cache()
84
+ if message["files"]:
85
+ # message["files"][-1] is a Dict or just a string
86
+ if type(message["files"][-1]) == dict:
87
+ image = message["files"][-1]["path"]
88
+ else:
89
+ image = message["files"][-1]
90
+ else:
91
+ # if there's no image uploaded for this turn, look for images in the past turns
92
+ # kept inside tuples, take the last one
93
+ for hist in history:
94
+ if type(hist[0]) == tuple:
95
+ image = hist[0][0]
96
+ try:
97
+ if image is None:
98
+ # Handle the case where image is None
99
+ gr.Error("You need to upload an image for LLaVA to work.")
100
+ except NameError:
101
+ # Handle the case where 'image' is not defined at all
102
+ gr.Error("You need to upload an image for LLaVA to work.")
103
+
104
+ qs = message['text']
105
+ qs = '<boi><image><eoi>' + '\n' + qs
106
+ conv = conv_templates['gemma'].copy()
107
+ conv.append_message(conv.roles[0], qs)
108
+ conv.append_message(conv.roles[1], None)
109
+ prompt = conv.get_prompt()
110
+
111
+
112
+ print(prompt)
113
+ image = Image.open(image).convert('RGB')
114
+ pad_image = expand2square(image, (122, 116, 104) )
115
+ input_image = pad_image.resize((512,512), PIL.Image.LANCZOS)
116
+ with torch.no_grad():
117
+ vq_code = image_tokenizer.img_tokens_from_pil(input_image)
118
+ vqcode = vq_code.cpu()
119
+ vqcode = vqcode+ len(tokenizer)
120
+
121
+ text_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt')
122
+ num_images = (text_ids == IMAGE_TOKEN_INDEX).sum()
123
+ image_token_indices = [-1] + torch.where(text_ids == IMAGE_TOKEN_INDEX)[0].tolist() + [text_ids.shape[0]]
124
+ cur_input_ids = []
125
+ for i in range(num_images + 1):
126
+ cur_input_ids.append(text_ids[image_token_indices[i]+1:image_token_indices[i+1]])
127
+ if i < num_images:
128
+ cur_input_ids.append( vqcode )
129
+ input_ids = torch.cat(cur_input_ids, dim=0)
130
+ # input_embeddings = vqllm.embed_tokens(input_ids)
131
+ inputs = {
132
+ "input_ids":input_ids.unsqueeze(0).to("cuda:0"),
133
+ "max_new_tokens":1024,
134
+ "bos_token_id":tokenizer.bos_token_id, # Begin of sequence token
135
+ "eos_token_id":tokenizer.eos_token_id, # End of sequence token
136
+ "pad_token_id":tokenizer.pad_token_id, # Pad token
137
+ }
138
+ streamer = TextIteratorStreamer(tokenizer, **{"skip_special_tokens": False, "skip_prompt": True})
139
+
140
+ # Run the generation in a separate thread, so that we can fetch the generated text in a non-blocking way.
141
+ generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024)
142
+ thread = Thread(target=vqllm.generate, kwargs=generation_kwargs)
143
+ thread.start()
144
+ generated_text = ""
145
+ for new_text in streamer:
146
+ generated_text += new_text
147
+ time.sleep(0.06)
148
+ yield generated_text
149
+
150
+
151
+
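
The splicing in bot_streaming_I2T above is the heart of the image-to-text path: the uploaded image becomes 1024 VQGAN codes, the codes are shifted past the text vocabulary, and they are dropped in where the image placeholder (id -200) sat. A toy, runnable sketch of that splice (ids, sizes, and the vocabulary offset are made up):

    import torch

    text_ids = torch.tensor([2, 7, -200, 8, 9])   # -200 marks the image slot
    vq_code = torch.tensor([5, 11, 3])             # stand-in for the 1024 VQGAN codes
    text_vocab_size = 32000                        # stand-in for len(tokenizer)
    shifted = vq_code + text_vocab_size            # image ids live after the text vocabulary

    slot = (text_ids == -200).nonzero(as_tuple=True)[0].item()
    input_ids = torch.cat([text_ids[:slot], shifted, text_ids[slot + 1:]])
    print(input_ids)  # [2, 7, 32005, 32011, 32003, 8, 9]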
152
+ def show_gallery(images):
153
+ gallery = gr.Gallery(images, label="Gallery", columns=4, height="auto",preview=True,scale=0.05)  # lay the generated images out in a gallery grid
154
+ return gallery
155
+
156
+ @spaces.GPU
157
+ def bot_streaming_T2I(message, history,guidance_scale, temperature, top_K, top_P):
158
+
159
+ global stop_flag
160
+ stop_flag = True
161
+ time.sleep(0.2)
162
+ stop_flag = False
163
+
164
+ text_inputs = [message]*4 # generate 4 samples once
165
+ uncondition_text_inputs = ['<unconditional><boi>']*len(text_inputs)
166
+ for i in range(len(text_inputs)):
167
+ text_inputs[i] = text_inputs[i]+' Generate an image based on this description.<boi>'
168
+
169
+ ori_batchsize = len(text_inputs)
170
+
171
+ if guidance_scale>1:
172
+ model_inputs = tokenizer(text_inputs+uncondition_text_inputs, return_tensors="pt",padding=True).to("cuda:0")
173
+ else:
174
+ model_inputs = tokenizer(text_inputs, return_tensors="pt",padding=True).to("cuda:0")
175
+ with torch.no_grad():
176
+ sampling_kwargs={'temperature': temperature, 'top_k': top_K, 'top_p': top_P, 'sample_logits': True}
177
+ input_ids = model_inputs['input_ids']
178
+ cur_len = input_ids.shape[1]
179
+ model_kwargs = {'attention_mask':model_inputs['attention_mask'] , 'use_cache': True}
180
+ model_kwargs["cache_position"] = torch.arange(cur_len, device=input_ids.device)
181
+
182
+ pred_tokens = []
183
+ for i in tqdm(range(1024)):
184
+ if stop_flag:
185
+ print("generation is stoped")
186
+ del sampling_kwargs
187
+ del model_inputs
188
+ del outputs
189
+ torch.cuda.empty_cache()
190
+ break
191
+ model_inputs = vqllm.prepare_inputs_for_generation(input_ids, **model_kwargs)
192
+
193
+ if i > 0 and guidance_scale>1:
194
+ outputs = vqllm(
195
+ **model_inputs,
196
+ return_dict=True,
197
+ output_attentions=False,
198
+ output_hidden_states=False,
199
+ )
200
+ else:
201
+ outputs = vqllm(
202
+ **model_inputs,
203
+ return_dict=True,
204
+ output_attentions=False,
205
+ output_hidden_states=False,
206
+ )
207
+
208
+ next_token_logits = outputs.logits[:, -1:, :]
209
+
210
+ if guidance_scale>1:
211
+ cond_logits, uncond_logits = torch.split(next_token_logits, len(next_token_logits) // 2, dim=0)
212
+ cfg_logits = uncond_logits + (cond_logits - uncond_logits) * guidance_scale
213
+ half_next_token, _ = sample(cfg_logits, **sampling_kwargs)
214
+ pred_tokens.append(half_next_token)
215
+ next_token = torch.cat([half_next_token,half_next_token])
216
+
217
+
218
+ else:
219
+ next_token, next_prob = sample(next_token_logits, **sampling_kwargs)
220
+ pred_tokens.append(next_token)
221
+
222
+ # update generated ids, model inputs, and length for next step
223
+ input_ids = torch.cat([input_ids, next_token], dim=-1)
224
+ model_kwargs = vqllm._update_model_kwargs_for_generation(
225
+ outputs,
226
+ model_kwargs,
227
+ is_encoder_decoder=vqllm.config.is_encoder_decoder,
228
+ )
229
+
230
+ del sampling_kwargs
231
+ del model_inputs
232
+ del outputs
233
+ image_vq_id = torch.cat(pred_tokens,dim=1)-ori_vocabe_size
234
+ image_vq_id = torch.clamp(image_vq_id, min=0, max=8191)
235
+
236
+ generated_image_list = []
237
+ for index, generate_id in enumerate(image_vq_id):
238
+ rec_img = image_tokenizer.pil_from_img_toks(generate_id)
239
+ generated_image_list.append(rec_img)
240
+ # rec_img.save('{}/{}.jpg'.format(image_save_pth,str(idx)))
241
+
242
+ torch.cuda.empty_cache()
243
+ # yield gr.Image(value=generated_image_list[0], label="Generated Image", show_download_button=True)
244
+ yield show_gallery(generated_image_list)
245
+
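
The guidance step inside bot_streaming_T2I runs the conditional and unconditional prompts in one batch and then pushes the conditional logits away from the unconditional ones. A small numeric sketch of that formula (the values are invented):

    import torch

    cond_logits = torch.tensor([[2.0, 0.5, -1.0]])
    uncond_logits = torch.tensor([[1.0, 1.0, 0.0]])
    guidance_scale = 5.0

    cfg_logits = uncond_logits + (cond_logits - uncond_logits) * guidance_scale
    print(cfg_logits)  # [[6.0, -1.5, -5.0]]

With guidance_scale equal to 1 the formula reduces to the conditional logits; larger values sharpen adherence to the text prompt at the cost of diversity.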
246
+ @spaces.GPU
247
+ def bot_streaming_T2T(message, history,temperature):
248
+ print(message)
249
+ global stop_flag
250
+ stop_flag = True
251
+ time.sleep(0.2)
252
+ stop_flag = False
253
+ torch.cuda.empty_cache()
254
+ qs = message
255
+ conv = conv_templates['gemma'].copy()
256
+ conv.append_message(conv.roles[0], qs)
257
+ conv.append_message(conv.roles[1], None)
258
+ prompt = conv.get_prompt()
259
+
260
+ print(prompt)
261
+ with torch.no_grad():
262
+ inputs = tokenizer([prompt], return_tensors="pt").to('cuda')
263
+ streamer = TextIteratorStreamer(tokenizer, **{"skip_special_tokens": False, "skip_prompt": True})
264
+
265
+ # Run the generation in a separate thread, so that we can fetch the generated text in a non-blocking way.
266
+ generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024)
267
+ thread = Thread(target=vqllm.generate, kwargs=generation_kwargs)
268
+ thread.start()
269
+ generated_text = ""
270
+ for new_text in streamer:
271
+ generated_text += new_text
272
+ yield generated_text
273
+
274
+
275
+ chatbot_T2I=gr.Chatbot(placeholder=PLACEHOLDER,height=600)
276
+ chat_input_T2I = gr.Textbox(placeholder="Enter text prompts...", show_label=False)
277
+
278
+ chatbot_I2T=gr.Chatbot(placeholder=PLACEHOLDER, scale=1)
279
+ chat_input_I2T = gr.MultimodalTextbox(interactive=True, file_types=["image"], placeholder="Enter message or upload file...", show_label=False)
280
+
281
+ chatbot_T2T=gr.Chatbot(placeholder=PLACEHOLDER, scale=1)
282
+ chat_input_T2T = gr.Textbox(placeholder="Enter text prompts...", show_label=False)
283
+
284
+
285
+ with gr.Blocks(fill_height=True) as demo:
286
+
287
+ gr.Markdown(title_html)
288
+ gr.Markdown(links_html)
289
+ gr.Markdown(introduction)
290
+
291
+ with gr.Tab("Text To Image"):
292
+
293
+ description="Enter a text prompt or simply try one of the examples below to generate 4 images at once. Click to display the full image. You can configure hyperparameters for image generation in the Advanced Settings. "
294
+ gr.Markdown(description)
295
+ with gr.Accordion("⚙️ Advanced Settings", open=False):
296
+ with gr.Row():
297
+ guidance_scale = gr.Slider(1.0, 20.0, value=7.0, label="Guidance Scale")
298
+ temperature = gr.Slider(0.0, 1.0, value=0.9, label="temperature")
299
+ top_K = gr.Slider(1, 8192, value=4096, label="Top K")
300
+ top_P = gr.Slider(0.0, 1.0, value=0.99, label="Top P")
301
+
302
+ aaa = gr.ChatInterface(
303
+ fn=bot_streaming_T2I,
304
+ examples=[
305
+ ["young blue dragon with horn lightning in the style of dd fantasy full body",5.0, 0.9,4096,0.99],
306
+ ["A majestic Goddes of beauty, charming dressed in a regal, jeweled gown and ornate crown, her golden hair cascading down her back, in the style of Pino Daeni",5.0, 0.9,4096,0.99],
307
+ ["A highly realistic, closeup photograph of a beautiful 35 year old redread woman writing in her journal, sitting on her balcony wearing warm, stylish outfits. Shot on a Canon EOS R5, the image boasts sharp focus and intricate details. The heartwarming scene conveys love, connection, and the crisp winter atmosphere, dramatic lighting.",5.0, 0.9,4096,0.99],
308
+ ["Portrait of an asian woman. She has pink violet hair style with modern complex hairdressing. The background is dark with cyberpunk neon lights. Inspired by Cyberpunk 2077 and Blade Runner. Ultra realistic picture. To capture the image, you will use a fullframe DSLR or mirrorless camera with a highresolution sensor, an aperture of f2.8 or wider, and a shutter speed of 1500 second or faster. You will use natural light and reflectors to create a balanced and welllit image, and will experiment with different angles and compositions to create the most i",5.0, 0.9,4096,0.99],
309
+ ["female character fantasy world, for fantasy story, protagonist, interesting and detailed clothes, beautiful, medieval fantasy cinematic shot photo taken by canon, photo taken by fuji, photo taken by kodak incredibly detailed, sharpen, details professional lighting , film lighting 350mm lightroom cinematography, hyper realism, cinematic, film quality",5.0, 0.9,4096,0.99],
310
+ ["strawberries splashing, swirling liquid, realism, octane render, raytracing",5.0, 0.9,4096,0.99],
311
+ ["hedgehog face, floating in space, wearing space suit no helmet, cinematic, 50mm f1.8, unreal engine 5",5.0, 0.9,4096,0.99],
312
+ ["artificial intelligence, revolution, publishing, writer, hyperrealistic",5.0, 0.9,4096,0.99],
313
+ ["A pig dressed as a mason, by Bill Gekas",5.0, 0.9,4096,0.99],
314
+ ],
315
+ stop_btn="Stop Generation",
316
+ additional_inputs = [guidance_scale, temperature, top_K, top_P],
317
+ additional_inputs_accordion="⚙️ Advanced Settings",
318
+ multimodal=False,
319
+ textbox=chat_input_T2I,
320
+ chatbot=chatbot_T2I,
321
+ fill_height=True,
322
+ )
323
+
324
+
325
+
326
+
327
+ with gr.Tab("Image To Text"):
328
+ bbb = gr.ChatInterface(
329
+ fn=bot_streaming_I2T,
330
+ examples=[ {"text": "How to make this pastry?", "files": ["./baklava.png"]}],
331
+ description="Upload an image and start chatting about it, or simply try one of the examples below. If you don't upload an image, you will receive an error.",
332
+ stop_btn="Stop Generation",
333
+ multimodal=True,
334
+ textbox=chat_input_I2T,
335
+ chatbot=chatbot_I2T,
336
+ )
337
+
338
+ with gr.Tab("Text To Text"):
339
+
340
+ with gr.Accordion("⚙️ Advanced Settings", open=False):
341
+ with gr.Row():
342
+ texttemperature = gr.Slider(0.0, 1.0, value=0.9, label="texttemperature")
343
+
344
+ gr.ChatInterface(
345
+ fn=bot_streaming_T2T,
346
+ examples=[["a dog", 0.9]],
347
+ description="Chat with Liquid without images.",
348
+ stop_btn="Stop Generation",
349
+ additional_inputs = [texttemperature],
350
+ additional_inputs_accordion="⚙️ Advanced Settings",
351
+ multimodal=False,
352
+ textbox=chat_input_T2T,
353
+ chatbot=chatbot_T2T,
354
+ )
355
+ demo.queue(api_open=False)
356
+ demo.launch(allowed_paths=["./"], share=False )
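
The app pins Gradio at runtime through the os.system calls near the top of the file; for a reproducible environment, a hypothetical requirements pin (inferred from the imports and model settings, not shipped in this commit) might look like:

    gradio==4.44.1
    gradio_client==1.3.0
    torch
    transformers
    accelerate          # assumed, needed for load_in_8bit
    bitsandbytes        # assumed, needed for load_in_8bit
    flash-attn          # assumed, needed for attn_implementation='flash_attention_2'
    spaces
    tqdm
    Pillow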
baklava.png ADDED

Git LFS Details

  • SHA256: 7839e93dd753e5356176bf70d38c43bc56355099d8891ead7aaa342029369268
  • Pointer size: 132 Bytes
  • Size of remote file: 2.04 MB
chameleon/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Chameleon License found in the
4
+ # LICENSE file in the root directory of this source tree.
chameleon/download_data.py ADDED
@@ -0,0 +1,88 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # This software may be used and distributed according to the terms of the Chameleon License Agreement.
3
+
4
+ import hashlib
5
+ from pathlib import Path
6
+ import subprocess
7
+ import sys
8
+
9
+
10
+ def download_file(url: str, output_path: Path):
11
+ print(f"Downloading {output_path}")
12
+ subprocess.check_call(["wget", "--continue", url, "-O", str(output_path)])
13
+
14
+
15
+ def validate_checksum(folder: Path):
16
+ chks_parts = (folder / "checklist.chk").read_text().split()
17
+ for expected_checksum, file in zip(chks_parts[::2], chks_parts[1::2]):
18
+ file_path = folder / file
19
+ checksum = hashlib.md5(file_path.read_bytes()).hexdigest()
20
+ if checksum != expected_checksum:
21
+ print(f"Checksum mismatch for {file_path}")
22
+ sys.exit(1)
23
+
24
+
25
+ def download_tokenizer(presigned_url: str, target_folder: Path):
26
+ tokenizer_folder = target_folder / "tokenizer"
27
+ tokenizer_folder.mkdir(parents=True, exist_ok=True)
28
+
29
+ for filename in [
30
+ "text_tokenizer.json",
31
+ "vqgan.ckpt",
32
+ "vqgan.yaml",
33
+ "checklist.chk",
34
+ ]:
35
+ download_file(
36
+ presigned_url.replace("*", f"tokenizer/{filename}"),
37
+ tokenizer_folder / filename,
38
+ )
39
+
40
+ validate_checksum(tokenizer_folder)
41
+
42
+
43
+ def download_model(presigned_url: str, target_folder: Path, model: str):
44
+ model_folder = target_folder / "models" / model
45
+ model_folder.mkdir(parents=True, exist_ok=True)
46
+
47
+ download_filenames = ["params.json", "consolidate_params.json", "checklist.chk"]
48
+
49
+ if model == "7b":
50
+ download_filenames += ["consolidated.pth"]
51
+ elif model == "30b":
52
+ download_filenames += [f"consolidated.{i:02}.pth" for i in range(4)]
53
+ else:
54
+ print(f"Unknown model: {model}")
55
+ sys.exit(1)
56
+
57
+ for filename in download_filenames:
58
+ download_file(
59
+ presigned_url.replace("*", f"{model}/{filename}"),
60
+ model_folder / filename,
61
+ )
62
+
63
+ validate_checksum(model_folder)
64
+
65
+
66
+ def main():
67
+ presigned_url = (
68
+ sys.argv[1] if len(sys.argv) > 1 else input("Enter the URL from email: ")
69
+ )
70
+
71
+ target_folder = Path("./data")
72
+ target_folder.mkdir(parents=True, exist_ok=True)
73
+
74
+ download_tokenizer(presigned_url, target_folder)
75
+
76
+ model_size = input(
77
+ "Enter the list of models to download without spaces (7B,30B), or press Enter for all: "
78
+ )
79
+ if not model_size:
80
+ model_size = "7B,30B"
81
+
82
+ for model in model_size.split(","):
83
+ model = model.strip().lower()
84
+ download_model(presigned_url, target_folder, model)
85
+
86
+
87
+ if __name__ == "__main__":
88
+ main()
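
validate_checksum above reads checklist.chk as whitespace-separated (md5, filename) pairs. A hypothetical example of the file it expects (the hashes here are placeholders, not the real ones):

    d41d8cd98f00b204e9800998ecf8427e  text_tokenizer.json
    9e107d9d372bb6826bd81d3542a419d6  vqgan.yaml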
chameleon/inference/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Chameleon License found in the
4
+ # LICENSE file in the root directory of this source tree.
chameleon/inference/alignment.py ADDED
@@ -0,0 +1,79 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Chameleon License found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from abc import ABC, abstractmethod
7
+
8
+ import torch
9
+
10
+
11
+ class PromptAlignment(ABC):
12
+ @abstractmethod
13
+ def start_index(self, input_ids: list[list[int]]) -> int:
14
+ ...
15
+
16
+ @abstractmethod
17
+ def prepare_inputs(self, input_ids: list[list[int]]) -> torch.Tensor:
18
+ ...
19
+
20
+ @abstractmethod
21
+ def postprocess_inputs(
22
+ self, inputs: torch.Tensor, original_inputs: torch.Tensor
23
+ ) -> torch.Tensor:
24
+ ...
25
+
26
+
27
+ class AlignPromptRight(PromptAlignment):
28
+ def __init__(self, pad_id: int):
29
+ self.pad_id = pad_id
30
+
31
+ def start_index(self, input_ids: list[list[int]]) -> int:
32
+ return max(len(sublist) for sublist in input_ids)
33
+
34
+ def prepare_inputs(self, input_ids: list[list[int]]) -> torch.LongTensor:
35
+ max_length = max(len(sublist) for sublist in input_ids)
36
+ return torch.tensor(
37
+ [
38
+ ([self.pad_id] * (max_length - len(sublist))) + sublist
39
+ for sublist in input_ids
40
+ ],
41
+ requires_grad=False,
42
+ )
43
+
44
+ def postprocess_inputs(
45
+ self,
46
+ inputs: torch.Tensor,
47
+ original_inputs: torch.Tensor,
48
+ ) -> torch.Tensor:
49
+ return inputs
50
+
51
+
52
+ class AlignPromptLeft(PromptAlignment):
53
+ def __init__(self, pad_id: int = -1):
54
+ self.pad_id = pad_id
55
+
56
+ def start_index(self, input_ids: list[list[int]]) -> int:
57
+ return min(len(sublist) for sublist in input_ids)
58
+
59
+ def prepare_inputs(self, input_ids: list[list[int]]) -> torch.Tensor:
60
+ max_length = max(len(sublist) for sublist in input_ids)
61
+ return torch.tensor(
62
+ [
63
+ sublist + ([self.pad_id] * (max_length - len(sublist)))
64
+ for sublist in input_ids
65
+ ],
66
+ requires_grad=False,
67
+ )
68
+
69
+ def postprocess_inputs(
70
+ self,
71
+ inputs: torch.Tensor,
72
+ original_inputs: torch.Tensor,
73
+ ) -> torch.Tensor:
74
+ max_init_len = original_inputs.shape[1]
75
+ if inputs.shape[1] <= max_init_len:
76
+ original_inputs_limited = original_inputs[:, : inputs.shape[1]]
77
+ mask = original_inputs_limited != self.pad_id
78
+ inputs[mask] = original_inputs_limited[mask]
79
+ return inputs
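
The two alignment strategies differ only in where padding goes and where generation starts. A quick worked sketch with a ragged two-prompt batch (toy ids):

    from chameleon.inference.alignment import AlignPromptLeft, AlignPromptRight

    batch = [[5, 6], [7, 8, 9]]

    right = AlignPromptRight(pad_id=0)
    print(right.prepare_inputs(batch))  # [[0, 5, 6], [7, 8, 9]], prompts end flush right
    print(right.start_index(batch))     # 3, generation starts after the longest prompt

    left = AlignPromptLeft(pad_id=-1)
    print(left.prepare_inputs(batch))   # [[5, 6, -1], [7, 8, 9]], padding trails
    print(left.start_index(batch))      # 2, the shortest prompt length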
chameleon/inference/chameleon.py ADDED
@@ -0,0 +1,673 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Chameleon License found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import base64
7
+ import io
8
+ import json
9
+ import math
10
+ import queue
11
+ import threading
12
+ from dataclasses import dataclass, field
13
+ from enum import Enum
14
+ from multiprocessing import managers, queues, synchronize
15
+ from typing import Literal, Union
16
+
17
+ import PIL
18
+ import torch
19
+ import torch.distributed as dist
20
+ import torch.multiprocessing as mp
21
+ from PIL.Image import Image
22
+ from tokenizers import Tokenizer
23
+ from transformers import (
24
+ LogitsProcessor,
25
+ RepetitionPenaltyLogitsProcessor,
26
+ TemperatureLogitsWarper,
27
+ TopPLogitsWarper,
28
+ enable_full_determinism,
29
+ )
30
+
31
+ from chameleon.inference import loader
32
+ from chameleon.inference.alignment import AlignPromptRight
33
+ from chameleon.inference.generation import ChameleonGenerator
34
+ from chameleon.inference.image_tokenizer import ImageTokenizer
35
+ from chameleon.inference.logits_processor import (
36
+ AllowOnlyTokensLogitsProcessor,
37
+ DisallowTokensAtOrAfterIndexLogitsProcessor,
38
+ InBatchInstructCFGLogitsProcessor,
39
+ )
40
+ from chameleon.inference.model_adapter import ChameleonModelAdapter
41
+ from chameleon.inference.stopping_criteria import (
42
+ MaxLengthCriteria,
43
+ StopOnEOSAfterBatchIndex,
44
+ )
45
+ from chameleon.inference.token_selector import (
46
+ ArgmaxTokenSelector,
47
+ MultinomialTokenSelector,
48
+ ReplicatedInputTokenSelector,
49
+ )
50
+ from chameleon.inference.transformer import Transformer
51
+ from chameleon.inference.utils import DynamicGenerator, advance, random_unused_port
52
+ from chameleon.inference.vocab import VocabInfo, VocabTranslation
53
+
54
+
55
+ @dataclass
56
+ class Options:
57
+ @dataclass
58
+ class Text:
59
+ repetition_penalty: float = 1.2
60
+ temp: float = 0.7
61
+ top_p: float = 0.9
62
+ greedy: bool = False
63
+
64
+ @dataclass
65
+ class Image:
66
+ @dataclass
67
+ class CFG:
68
+ guidance_scale_text: float = 3.0
69
+ guidance_scale_image: float = 1.2
70
+
71
+ cfg: CFG = field(default_factory=CFG)
72
+ temp: float = 0.7
73
+ top_p: float = 0.9
74
+ greedy: bool = False
75
+
76
+ max_seq_len: int = 4096
77
+ max_gen_len: int = 4096
78
+ seed: int | None = None
79
+ txt: Text | bool = True
80
+ img: Image | bool = False
81
+ extra_eos_tokens: list[int | str] = field(default_factory=lambda: ["<racm3:break>"])
82
+
83
+ def __post_init__(self):
84
+ if self.txt == True:
85
+ self.txt = Options.Text()
86
+ if self.img == True:
87
+ self.img = Options.Image()
88
+
89
+
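
A small usage sketch of Options (illustrative values): __post_init__ expands the boolean txt/img switches into their dataclass counterparts, so image generation is enabled simply by passing img=True.

    from chameleon.inference.chameleon import Options

    text_opts = Options()                        # txt -> Options.Text(), img stays False
    image_opts = Options(txt=False, img=True)    # img -> Options.Image() with its CFG sub-config
    print(image_opts.img.cfg.guidance_scale_text)  # 3.0 by default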
90
+ class TokenManager:
91
+ def __init__(
92
+ self,
93
+ tokenizer_path: str,
94
+ vqgan_cfg_path: str,
95
+ vqgan_ckpt_path: str,
96
+ device: str | None = None,
97
+ ):
98
+ self.tokenizer = Tokenizer.from_file(tokenizer_path)
99
+ self.vocab = VocabInfo(json.load(open(tokenizer_path))["model"]["vocab"])
100
+ self.translation = VocabTranslation(self.vocab, device=device)
101
+ self.image_tokenizer = ImageTokenizer(
102
+ cfg_path=vqgan_cfg_path, ckpt_path=vqgan_ckpt_path, device=device
103
+ )
104
+
105
+ def pil_from_bpe_tokens(self, bpe_tokens: torch.Tensor) -> PIL.Image:
106
+ image_tensor = self.translation.convert_bpe2img(bpe_tokens)
107
+ if image_tensor.shape[0] < 1024:
108
+ padding = (
109
+ torch.ones(
110
+ [1024 - image_tensor.shape[0]],
111
+ dtype=int,
112
+ device=image_tensor.device,
113
+ )
114
+ * image_tensor[0]
115
+ )
116
+ image_tensor = torch.cat((image_tensor, padding)).unsqueeze(0)
117
+
118
+ return self.image_tokenizer.pil_from_img_toks(image_tensor)
119
+
120
+ def png_from_bpe_tokens(self, bpe_tokens: torch.Tensor) -> bytes:
121
+ pil = self.pil_from_bpe_tokens(bpe_tokens)
122
+ img_io = io.BytesIO()
123
+ pil.save(img_io, format="PNG")
124
+ return img_io.getvalue()
125
+
126
+ def tokenize_text(self, text: str) -> list[int]:
127
+ return self.tokenizer.encode(text).ids
128
+
129
+ def tokenize_image(self, img: Image) -> list[int]:
130
+ return (
131
+ [self.vocab.begin_image]
132
+ + self.translation.convert_img2bp2(
133
+ self.image_tokenizer.img_tokens_from_pil(img)
134
+ ).tolist()
135
+ + [self.vocab.end_image]
136
+ )
137
+
138
+ def tokenize_b64img(self, b64img: str) -> list[int]:
139
+ image_data = base64.b64decode(b64img)
140
+ image_file = io.BytesIO(image_data)
141
+ return self.tokenize_image(PIL.Image.open(image_file))
142
+
143
+ def tokens_from_ui(self, inputs: list[dict]) -> list[int]:
144
+ tokens = [self.vocab.bos_id]
145
+ for input_ in inputs:
146
+ if input_["type"] == "text":
147
+ tokens += self.tokenize_text(input_["value"])
148
+ elif input_["type"] == "image":
149
+ if type(input_["value"]) == str:
150
+ if input_["value"].startswith("data:"):
151
+ # Value Format: 'data:image/[^;]+;base64,[A-Za-z0-9+/]+={0,2}'
152
+ tokens += self.tokenize_b64img(input_["value"].split(",", 1)[1])
153
+ elif input_["value"].startswith("file:"):
154
+ tokens += self.tokenize_image(
155
+ PIL.Image.open(input_["value"].split(":", 1)[1])
156
+ )
157
+ else:
158
+ raise ValueError("Unknown image format.")
159
+ elif type(input_["value"]) == Image:
160
+ tokens += self.tokenize_image(input_["value"])
161
+ else:
162
+ raise ValueError("Unknown image type.")
163
+ elif input_["type"] == "sentinel":
164
+ tokens += [
165
+ {
166
+ "<START-OF-IMAGE>": self.vocab.begin_image,
167
+ "<END-OF-TURN>": self.vocab.eot_id,
168
+ }[input_["value"]]
169
+ ]
170
+ elif input_["type"] == "ids":
171
+ tokens += input_["value"]
172
+ else:
173
+ raise ValueError("Unknown input type.")
174
+ return tokens
175
+
176
+ def decode_text(self, ids: torch.LongTensor | list[list[int]]) -> list[str]:
177
+ if isinstance(ids, torch.Tensor):
178
+ ids = ids.tolist()
179
+
180
+ for row, values in enumerate(ids):
181
+ try:
182
+ ids[row] = values[: values.index(self.vocab.eos_id)]
183
+ except ValueError:
184
+ pass
185
+
186
+ return self.tokenizer.decode_batch(ids)
187
+
188
+ def decode_image(self, ids: torch.LongTensor) -> list[PIL.Image]:
189
+ return [self.pil_from_bpe_tokens(sample) for sample in ids]
190
+
191
+
192
+ @dataclass
193
+ class DecodePiece:
194
+ token: ChameleonGenerator.Token
195
+ next_decoder: type["Decoder"] | None
196
+
197
+
198
+ class Decoder:
199
+ def __init__(
200
+ self,
201
+ model: Transformer,
202
+ vocab: VocabInfo,
203
+ options: Options,
204
+ input_ids: list[int],
205
+ ): ...
206
+
207
+ def __next__(self) -> DecodePiece: ...
208
+
209
+
210
+ class TextDecoder(Decoder):
211
+ def __init__(
212
+ self,
213
+ model: Transformer,
214
+ vocab: VocabInfo,
215
+ options: Options,
216
+ input_ids: list[list[int]],
217
+ ):
218
+ self.vocab = vocab
219
+ self.options = options
220
+ assert vocab.eos_id is not None
221
+
222
+ prompt_lens = [len(inp) for inp in input_ids]
223
+ max_prompt_len = max(prompt_lens)
224
+ max_seq_len = min(options.max_seq_len, max_prompt_len + options.max_gen_len)
225
+
226
+ self.eos_ids = [vocab.eos_id]
227
+ for extra_eos_token in options.extra_eos_tokens:
228
+ if isinstance(extra_eos_token, str):
229
+ extra_eos_token = vocab.name2val[extra_eos_token]
230
+ assert isinstance(extra_eos_token, int)
231
+ self.eos_ids.append(extra_eos_token)
232
+
233
+ stopping_criteria = [
234
+ MaxLengthCriteria(max_seq_len),
235
+ ] + [StopOnEOSAfterBatchIndex(eos_id, [max_prompt_len] * len(prompt_lens)) for eos_id in self.eos_ids]
236
+
237
+ self.gen = ChameleonGenerator(
238
+ model=ChameleonModelAdapter(model, max_seq_len=max_seq_len),
239
+ input_ids=input_ids,
240
+ stopping_criteria=stopping_criteria,
241
+ logits_processors=self._logits_processors(),
242
+ alignment=AlignPromptRight(vocab.pad_id),
243
+ token_selector=(
244
+ ArgmaxTokenSelector()
245
+ if options.txt.greedy
246
+ else MultinomialTokenSelector()
247
+ ),
248
+ )
249
+ advance(self.gen, max_prompt_len)
250
+
251
+ def _allowed_tokens(self) -> list[int]:
252
+ allowed_tokens = [self.vocab.eos_id]
253
+ if self.options.txt:
254
+ allowed_tokens += self.vocab.text_tokens
255
+ if self.options.img:
256
+ allowed_tokens += [self.vocab.begin_image]
257
+ return allowed_tokens
258
+
259
+ def _logits_processors(self) -> list[LogitsProcessor]:
260
+ logits_processors = [
261
+ AllowOnlyTokensLogitsProcessor(self._allowed_tokens()),
262
+ ]
263
+ if isinstance(self.options.img, Options.Image):
264
+ logits_processors += [
265
+ DisallowTokensAtOrAfterIndexLogitsProcessor(
266
+ [self.vocab.begin_image],
267
+ self.options.max_seq_len - 1026,
268
+ ),
269
+ ]
270
+ if isinstance(self.options.txt, Options.Text):
271
+ logits_processors += [
272
+ RepetitionPenaltyLogitsProcessor(self.options.txt.repetition_penalty),
273
+ TemperatureLogitsWarper(self.options.txt.temp),
274
+ TopPLogitsWarper(self.options.txt.top_p),
275
+ ]
276
+ return logits_processors
277
+
278
+ def __next__(self) -> DecodePiece:
279
+ tok = next(self.gen)
280
+ next_decoder = None
281
+ if (
282
+ self.vocab.begin_image not in self.eos_ids
283
+ and (tok.id == self.vocab.begin_image).all()
284
+ ):
285
+ next_decoder = ImageDecoder
286
+ return DecodePiece(tok, next_decoder)
287
+
288
+
289
+ class ImageDecoder(Decoder):
290
+ def __init__(
291
+ self,
292
+ model: Transformer,
293
+ vocab: VocabInfo,
294
+ options: Options,
295
+ input_ids: list[list[int]],
296
+ ):
297
+ assert isinstance(options.img, Options.Image)
298
+ self.vocab = vocab
299
+ self.options = options
300
+ self.batch_size = len(input_ids)
301
+ logits_processors = [
302
+ InBatchInstructCFGLogitsProcessor(
303
+ options.img.cfg.guidance_scale_text,
304
+ options.img.cfg.guidance_scale_image,
305
+ ),
306
+ AllowOnlyTokensLogitsProcessor(vocab.image_tokens),
307
+ TemperatureLogitsWarper(options.img.temp),
308
+ TopPLogitsWarper(options.img.top_p),
309
+ ]
310
+
311
+ for inp in input_ids:
312
+ if inp[-1] != self.vocab.begin_image:
313
+ inp.append(self.vocab.begin_image)
314
+
315
+ max_prompt_len = max(len(inp) for inp in input_ids)
316
+ self.gen = ChameleonGenerator(
317
+ model=ChameleonModelAdapter(model, max_seq_len=max_prompt_len + 1024),
318
+ input_ids=self._split_inputs_for_cfg(input_ids),
319
+ logits_processors=logits_processors,
320
+ alignment=AlignPromptRight(vocab.pad_id),
321
+ token_selector=ReplicatedInputTokenSelector(
322
+ (
323
+ ArgmaxTokenSelector()
324
+ if options.img.greedy
325
+ else MultinomialTokenSelector()
326
+ ),
327
+ n=3,
328
+ ),
329
+ )
330
+ advance(self.gen, max_prompt_len)
331
+ self.gen_count = 0
332
+
333
+ def _split_inputs_for_cfg(self, input_ids: list[list[int]]) -> list[list[int]]:
334
+ image_conditioned_allowed = set(self.vocab.image_tokens) | {
335
+ self.vocab.bos_id,
336
+ self.vocab.begin_image,
337
+ self.vocab.end_image,
338
+ }
339
+
340
+ full_conditioned = input_ids
341
+
342
+ image_conditioned = [
343
+ [id for id in sample if id in image_conditioned_allowed]
344
+ for sample in input_ids
345
+ ]
346
+
347
+ unconditioned = [
348
+ [
349
+ self.vocab.bos_id,
350
+ self.vocab.begin_image,
351
+ ]
352
+ ] * self.batch_size
353
+
354
+ return full_conditioned + image_conditioned + unconditioned
355
+
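
The split above triples the batch for in-batch instruct CFG: every prompt contributes a fully conditioned row, an image-token-only row, and an unconditioned row, which is why the generator wraps its selector in ReplicatedInputTokenSelector(n=3) and __next__ keeps only the first chunk of each sampled token. Roughly, for one text prompt:

    # input_ids          = [[bos, t1, t2, begin_image]]      (t1, t2 are text tokens)
    # full_conditioned   -> [bos, t1, t2, begin_image]
    # image_conditioned  -> [bos, begin_image]               (text tokens filtered out)
    # unconditioned      -> [bos, begin_image]
    # rows fed to the model: the three groups concatenated, in that order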
356
+ def __next__(self) -> DecodePiece:
357
+ if self.gen_count == 1024:
358
+ id = torch.tensor([self.vocab.end_image] * self.batch_size)
359
+ logits = torch.full(
360
+ (self.batch_size, len(self.vocab.all_tokens)), -math.inf
361
+ )
362
+ logits[:, self.vocab.end_image] = 0
363
+ return DecodePiece(
364
+ ChameleonGenerator.Token(id=id, logits=logits),
365
+ TextDecoder,
366
+ )
367
+
368
+ tok = next(self.gen)
369
+ tok.id = tok.id.chunk(3)[0]
370
+ self.gen_count += 1
371
+ return DecodePiece(tok, None)
372
+
373
+
374
+ class Generator(Decoder):
375
+ def __init__(
376
+ self,
377
+ model: Transformer,
378
+ vocab: VocabInfo,
379
+ options: Options,
380
+ input_ids: list[list[int]],
381
+ ):
382
+ if options.seed is not None:
383
+ enable_full_determinism(options.seed, warn_only=True)
384
+
385
+ self.model = model
386
+ self.vocab = vocab
387
+ self.input_ids = input_ids[:]
388
+ self.generated_token_ids: list[torch.LongTensor] = []
389
+ self.options = options
390
+ if not self.options.txt:
391
+ self.dyngen = DynamicGenerator(
392
+ ImageDecoder(model, vocab, options, input_ids)
393
+ )
394
+ else:
395
+ self.dyngen = DynamicGenerator(
396
+ TextDecoder(model, vocab, options, input_ids)
397
+ )
398
+
399
+ def __iter__(self):
400
+ return self
401
+
402
+ def __next__(self) -> ChameleonGenerator.Token:
403
+ piece = next(self.dyngen)
404
+ self.generated_token_ids.append(piece.token.id)
405
+ if piece.next_decoder is not None:
406
+ if not self.options.txt:
407
+ raise StopIteration
408
+
409
+ self.input_ids = [
410
+ old_list + generated
411
+ for old_list, generated in zip(
412
+ self.input_ids, torch.stack(self.generated_token_ids).T.tolist()
413
+ )
414
+ ]
415
+ self.generated_token_ids = []
416
+ self.dyngen.gen = piece.next_decoder(
417
+ self.model,
418
+ self.vocab,
419
+ self.options,
420
+ self.input_ids,
421
+ )
422
+ return piece.token
423
+
424
+
425
+ class DistributedMode(Enum):
426
+ AUTO = 0
427
+ THREAD = 1
428
+ PROCESS = 2
429
+
430
+
431
+ @dataclass
432
+ class _DistributedContext:
433
+ req_q: Union[queue.Queue, queues.Queue]
434
+ res_q: Union[queue.Queue, queues.Queue]
435
+ active_key: Union[dict[int, Literal[True]], managers.DictProxy]
436
+ active_key_lock: Union[threading.Lock, synchronize.Lock]
437
+ ready_barrier: Union[threading.Barrier, synchronize.Barrier]
438
+ worker_launcher: Union[type[threading.Thread], type[mp.Process]]
439
+
440
+ @staticmethod
441
+ def make_for_threading(world_size: int):
442
+ return _DistributedContext(
443
+ req_q=queue.Queue(),
444
+ res_q=queue.Queue(),
445
+ active_key={},
446
+ active_key_lock=threading.Lock(),
447
+ ready_barrier=threading.Barrier(world_size + 1),
448
+ worker_launcher=threading.Thread,
449
+ )
450
+
451
+ @staticmethod
452
+ def make_for_multiprocessing(world_size: int):
453
+ local_mp = mp.get_context("spawn")
454
+ return _DistributedContext(
455
+ req_q=local_mp.Queue(),
456
+ res_q=local_mp.Queue(),
457
+ active_key=local_mp.Manager().dict(),
458
+ active_key_lock=local_mp.Lock(),
459
+ ready_barrier=local_mp.Barrier(world_size + 1),
460
+ worker_launcher=local_mp.Process,
461
+ )
462
+
463
+ @staticmethod
464
+ def make(mode: DistributedMode, world_size: int):
465
+ if mode == DistributedMode.AUTO:
466
+ mode = DistributedMode.PROCESS
467
+
468
+ if mode == DistributedMode.THREAD:
469
+ return _DistributedContext.make_for_threading(world_size)
470
+ elif mode == DistributedMode.PROCESS:
471
+ return _DistributedContext.make_for_multiprocessing(world_size)
472
+ else:
473
+ raise ValueError("Unknown DistributedMode")
474
+
475
+
476
+ def _worker_impl(
477
+ init_method: str,
478
+ model: Transformer | str,
479
+ world_size: int,
480
+ rank: int,
481
+ vocab: VocabInfo,
482
+ dctx: _DistributedContext,
483
+ ):
484
+ dist.init_process_group(
485
+ "nccl",
486
+ init_method=init_method,
487
+ world_size=world_size,
488
+ rank=rank,
489
+ )
490
+
491
+ torch.set_default_device(f"cuda:{rank}")
492
+ torch.cuda.set_device(rank)
493
+ if isinstance(model, str):
494
+ model = loader.load_model(model, rank=rank)
495
+ dctx.ready_barrier.wait()
496
+
497
+ is_coord = rank == 0
498
+
499
+ while True:
500
+ req = [Options(), [], 0, False]
501
+ if is_coord:
502
+ req = dctx.req_q.get()
503
+
504
+ dist.broadcast_object_list(req, src=0)
505
+ options, input_ids, key, shutdown = req
506
+ if shutdown:
507
+ break
508
+
509
+ for token in Generator(
510
+ model=model,
511
+ vocab=vocab,
512
+ options=options,
513
+ input_ids=input_ids,
514
+ ):
515
+ if is_coord:
516
+ dctx.res_q.put((key, token))
517
+
518
+ to_continue = [True]
519
+ if is_coord:
520
+ with dctx.active_key_lock:
521
+ to_continue = [key in dctx.active_key]
522
+ dist.broadcast_object_list(to_continue, src=0)
523
+ if not to_continue[0]:
524
+ break
525
+
526
+ if is_coord:
527
+ dctx.res_q.put((key, None))
528
+
529
+
530
+ class ChameleonInferenceModel:
531
+ def __init__(
532
+ self,
533
+ model: Transformer | str,
534
+ tokenizer_path: str,
535
+ vqgan_cfg_path: str,
536
+ vqgan_ckpt_path: str,
537
+ *,
538
+ options: Options | None = None,
539
+ distributed_mode: DistributedMode = DistributedMode.AUTO,
540
+ ):
541
+ self.options = options or Options()
542
+ self.next_key = 0
543
+
544
+ self.token_manager = TokenManager(
545
+ tokenizer_path=tokenizer_path,
546
+ vqgan_cfg_path=vqgan_cfg_path,
547
+ vqgan_ckpt_path=vqgan_ckpt_path,
548
+ device="cuda",
549
+ )
550
+ self.vocab = self.token_manager.vocab
551
+
552
+ world_size = 1
553
+ if isinstance(model, str):
554
+ world_size = loader.detect_shard_count(model)
555
+ self.dctx = _DistributedContext.make(distributed_mode, world_size)
556
+
557
+ init_method = f"tcp://0.0.0.0:{random_unused_port()}"
558
+ self.workers = [
559
+ self.dctx.worker_launcher(
560
+ target=_worker_impl,
561
+ args=(init_method, model, world_size, i, self.vocab, self.dctx),
562
+ daemon=True,
563
+ )
564
+ for i in range(world_size)
565
+ ]
566
+ for w in self.workers:
567
+ w.start()
568
+ self.dctx.ready_barrier.wait()
569
+
570
+ def __del__(self):
571
+ try:
572
+ with self.dctx.active_key_lock:
573
+ self.dctx.active_key.clear()
574
+ self.dctx.req_q.put([None, None, None, True])
575
+ for w in self.workers:
576
+ w.join()
577
+ except FileNotFoundError:
578
+ pass
579
+
580
+ def stream(
581
+ self,
582
+ *,
583
+ input_ids: list[int] | None = None,
584
+ prompt_text: str | None = None,
585
+ prompt_ui: list[dict] | None = None,
586
+ batch_input_ids: list[list[int]] | None = None,
587
+ batch_prompt_text: list[str] | None = None,
588
+ batch_prompt_ui: list[list[dict]] | None = None,
589
+ options: Options | None = None,
590
+ ):
591
+ # NOTE: Not thread-safe! Only one instance of generate may be run at a time.
592
+
593
+ if (
594
+ sum(
595
+ x is not None
596
+ for x in [
597
+ input_ids,
598
+ prompt_text,
599
+ prompt_ui,
600
+ batch_input_ids,
601
+ batch_prompt_text,
602
+ batch_prompt_ui,
603
+ ]
604
+ )
605
+ != 1
606
+ ):
607
+ raise ValueError(
608
+ "Must specify exactly one of: input_ids, prompt_text, prompt_ui, batch_input_ids, batch_prompt_text, batch_prompt_ui"
609
+ )
610
+
611
+ options = options or self.options
612
+
613
+ if prompt_text is not None:
614
+ batch_prompt_text = [prompt_text]
615
+ if prompt_ui is not None:
616
+ batch_prompt_ui = [prompt_ui]
617
+ if input_ids is not None:
618
+ batch_input_ids = [input_ids]
619
+ if batch_prompt_text is not None:
620
+ batch_prompt_ui = [
621
+ [{"type": "text", "value": prompt_text}]
622
+ for prompt_text in batch_prompt_text
623
+ ]
624
+ if batch_prompt_ui is not None:
625
+ batch_input_ids = [
626
+ self.token_manager.tokens_from_ui(prompt_ui)
627
+ for prompt_ui in batch_prompt_ui
628
+ ]
629
+
630
+ assert batch_input_ids
631
+
632
+ if not options.txt and not options.img:
633
+ raise ValueError("Must specify at least one modality.")
634
+ if options.txt and options.img and len(batch_input_ids) > 1:
635
+ raise ValueError(
636
+ "Batch generation only supported for one modality at a time."
637
+ )
638
+
639
+ req_key = self.next_key
640
+ self.next_key += 1
641
+
642
+ with self.dctx.active_key_lock:
643
+ self.dctx.active_key[req_key] = True
644
+
645
+ self.dctx.req_q.put([options, batch_input_ids, req_key, False])
646
+
647
+ try:
648
+ while key_token := self.dctx.res_q.get():
649
+ key, token = key_token
650
+ if key != req_key:
651
+ # Residual from prior calls to generation. Skip.
652
+ continue
653
+ if token is None:
654
+ break
655
+ yield token
656
+ finally:
657
+ with self.dctx.active_key_lock:
658
+ del self.dctx.active_key[req_key]
659
+
660
+ def step(self, *args, **kwargs) -> ChameleonGenerator.Token:
661
+ return next(self.stream(*args, **kwargs))
662
+
663
+ def generate(self, *args, **kwargs) -> torch.LongTensor:
664
+ tokens = [t.id for t in self.stream(*args, **kwargs)]
665
+ if not tokens:
666
+ return torch.LongTensor()
667
+ return torch.stack(tokens).T
668
+
669
+ def decode_text(self, ids: torch.LongTensor | list[list[int]]) -> list[str]:
670
+ return self.token_manager.decode_text(ids)
671
+
672
+ def decode_image(self, ids: torch.LongTensor) -> list[PIL.Image]:
673
+ return self.token_manager.decode_image(ids)
chameleon/inference/cudagraph.py ADDED
@@ -0,0 +1,85 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Chameleon License found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import functools
7
+ from typing import Any, Callable, TypeVar
8
+
9
+ import torch
10
+
11
+ T = TypeVar("T")
12
+ FN = Callable[..., T] # type: ignore
13
+
14
+
15
+ class CUDAGraphWrapper:
16
+ def __init__(
17
+ self,
18
+ fn: FN[T],
19
+ warmup_iter: int = 1,
20
+ debug_dump_path: str | None = None,
21
+ ):
22
+ self.fn = fn
23
+ self.warmup_iter = warmup_iter
24
+ self.debug_dump_path = debug_dump_path
25
+ self.graph: torch.cuda.CUDAGraph | None = None
26
+ self.result: T | None = None
27
+
28
+ def __call__(self, *args, **kwargs) -> Any: # type: ignore
29
+ if self.warmup_iter > 0:
30
+ self.warmup_iter -= 1
31
+ return self.fn(*args, **kwargs)
32
+
33
+ if self.graph is None:
34
+ self.graph = torch.cuda.CUDAGraph()
35
+ if self.debug_dump_path is not None:
36
+ self.graph.enable_debug_mode()
37
+ recording_kwargs = {}
38
+ if "capture_error_mode" in torch.cuda.graph.__init__.__annotations__:
39
+ # In PyTorch 2.1+ and nightlies from late Aug 2023,
40
+ # we can do this to maybe avoid watchdog-related crashes
41
+ recording_kwargs["capture_error_mode"] = "thread_local"
42
+ with torch.cuda.graph(self.graph, **recording_kwargs):
43
+ self.result = self.fn(*args, **kwargs)
44
+ torch.cuda.synchronize()
45
+ if self.debug_dump_path is not None:
46
+ self.graph.debug_dump(self.debug_dump_path)
47
+
48
+ assert self.graph is not None
49
+ self.graph.replay()
50
+ return self.result
51
+
52
+
53
+ def cudagraph_wrap(
54
+ *args,
55
+ warmup_iter: int = 1,
56
+ debug_dump_path: str | None = None,
57
+ ) -> Callable[[FN[T]], FN[T]]:
58
+ def wrapper(fn: FN[T]) -> FN[T]:
59
+ graph_wrapper = CUDAGraphWrapper(
60
+ fn, warmup_iter=warmup_iter, debug_dump_path=debug_dump_path
61
+ )
62
+
63
+ @functools.wraps(fn)
64
+ def call_wrapper(*inner_args, **inner_kwargs):
65
+ return graph_wrapper(*inner_args, **inner_kwargs)
66
+
67
+ return call_wrapper
68
+
69
+ # @cudagraph_wrap
70
+ # def fn(...):
71
+ # ...
72
+ #
73
+ # - or -
74
+ #
75
+ # fast_fn = cudagraph_wrap(slow_fn, warmup_iter=2)
76
+ if len(args) == 1 and callable(args[0]):
77
+ return wrapper(args[0])
78
+
79
+ # @cudagraph_wrap(warmup_iter=3)
80
+ # def fn(...):
81
+ # ...
82
+ def decorator(fn: FN[T]) -> FN[T]:
83
+ return wrapper(fn)
84
+
85
+ return decorator
chameleon/inference/examples/batch.py ADDED
@@ -0,0 +1,23 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ # This source code is licensed under the Chameleon License found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from chameleon.inference.chameleon import ChameleonInferenceModel
7
+
8
+
9
+ def main():
10
+ model = ChameleonInferenceModel(
11
+ "./data/models/7b/",
12
+ "./data/tokenizer/text_tokenizer.json",
13
+ "./data/tokenizer/vqgan.yaml",
14
+ "./data/tokenizer/vqgan.ckpt",
15
+ )
16
+
17
+ batch_tokens = model.generate(batch_prompt_text=["All your base", "import asyncio"])
18
+ for text in model.decode_text(batch_tokens):
19
+ print(text)
20
+
21
+
22
+ if __name__ == "__main__":
23
+ main()
chameleon/inference/examples/multimodal_input.py ADDED
@@ -0,0 +1,28 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Chameleon License found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from chameleon.inference.chameleon import ChameleonInferenceModel
7
+
8
+
9
+ def main():
10
+ model = ChameleonInferenceModel(
11
+ "./data/models/7b/",
12
+ "./data/tokenizer/text_tokenizer.json",
13
+ "./data/tokenizer/vqgan.yaml",
14
+ "./data/tokenizer/vqgan.ckpt",
15
+ )
16
+
17
+ tokens = model.generate(
18
+ prompt_ui=[
19
+ {"type": "image", "value": "file:/path/to/image.jpeg"},
20
+ {"type": "text", "value": "What do you see?"},
21
+ {"type": "sentinel", "value": "<END-OF-TURN>"},
22
+ ]
23
+ )
24
+ print(model.decode_text(tokens)[0])
25
+
26
+
27
+ if __name__ == "__main__":
28
+ main()
chameleon/inference/examples/simple.py ADDED
@@ -0,0 +1,22 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Chameleon License found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from chameleon.inference.chameleon import ChameleonInferenceModel
7
+
8
+
9
+ def main():
10
+ model = ChameleonInferenceModel(
11
+ "./data/models/7b/",
12
+ "./data/tokenizer/text_tokenizer.json",
13
+ "./data/tokenizer/vqgan.yaml",
14
+ "./data/tokenizer/vqgan.ckpt",
15
+ )
16
+
17
+ tokens = model.generate(prompt_text="All your base")
18
+ print(model.decode_text(tokens)[0])
19
+
20
+
21
+ if __name__ == "__main__":
22
+ main()
chameleon/inference/examples/streaming.py ADDED
@@ -0,0 +1,22 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Chameleon License found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from chameleon.inference.chameleon import ChameleonInferenceModel
7
+
8
+
9
+ def main():
10
+ model = ChameleonInferenceModel(
11
+ "./data/models/7b/",
12
+ "./data/tokenizer/text_tokenizer.json",
13
+ "./data/tokenizer/vqgan.yaml",
14
+ "./data/tokenizer/vqgan.ckpt",
15
+ )
16
+
17
+ for tokens in model.stream(prompt_text="All your base"):
18
+ print(model.decode_text(tokens.id.view(-1, 1))[0], end="")
19
+
20
+
21
+ if __name__ == "__main__":
22
+ main()
chameleon/inference/examples/streaming_batch.py ADDED
@@ -0,0 +1,24 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Chameleon License found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from chameleon.inference.chameleon import ChameleonInferenceModel
7
+
8
+
9
+ def main():
10
+ model = ChameleonInferenceModel(
11
+ "./data/models/7b/",
12
+ "./data/tokenizer/text_tokenizer.json",
13
+ "./data/tokenizer/vqgan.yaml",
14
+ "./data/tokenizer/vqgan.ckpt",
15
+ )
16
+
17
+ for i, batch_tokens in enumerate(
18
+ model.stream(batch_prompt_text=["All your base", "import asyncio"])
19
+ ):
20
+ print(model.decode_text(batch_tokens.id.view(-1, 1)))
21
+
22
+
23
+ if __name__ == "__main__":
24
+ main()
chameleon/inference/generation.py ADDED
@@ -0,0 +1,162 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Chameleon License found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from dataclasses import dataclass
7
+
8
+ import torch
9
+ from transformers import (
10
+ LogitsProcessor,
11
+ LogitsProcessorList,
12
+ )
13
+ from transformers.generation.streamers import BaseStreamer
14
+
15
+ from chameleon.inference.alignment import AlignPromptLeft, PromptAlignment
16
+ from chameleon.inference.model_adapter import ModelAdapter
17
+ from chameleon.inference.stopping_criteria import StoppingCriteria, StoppingCriteriaList
18
+ from chameleon.inference.token_selector import MultinomialTokenSelector, TokenSelector
19
+
20
+
21
+ class ChameleonGenerator:
22
+ @dataclass
23
+ class Token:
24
+ id: torch.LongTensor
25
+ logits: torch.Tensor | None
26
+
27
+ def __init__(
28
+ self,
29
+ model: ModelAdapter,
30
+ input_ids: list[list[int]],
31
+ stopping_criteria: StoppingCriteriaList | list[StoppingCriteria] | None = None,
32
+ logits_processors: LogitsProcessorList | list[LogitsProcessor] | None = None,
33
+ probability_processors: LogitsProcessorList
34
+ | list[LogitsProcessor]
35
+ | None = None,
36
+ token_selector: TokenSelector | None = None,
37
+ alignment: PromptAlignment = AlignPromptLeft(),
38
+ ):
39
+ assert model.supports_alignment(alignment)
40
+
41
+ self.model = model
42
+
43
+ self.stopping_criteria = stopping_criteria
44
+ self.logits_processors = logits_processors
45
+ self.probability_processors = probability_processors
46
+ self.token_selector: TokenSelector = (
47
+ token_selector or MultinomialTokenSelector()
48
+ )
49
+
50
+ self.alignment = alignment
51
+
52
+ self.model.initialize(input_ids)
53
+
54
+ self._inputs = self.alignment.prepare_inputs(
55
+ input_ids
56
+ ) # inputs.shape = [batch, seq-len]
57
+
58
+ self._idx = 0
59
+ self._start_idx = self.alignment.start_index(input_ids)
60
+
61
+ self._original_inputs = self._inputs.clone()
62
+ self._inputs = self._inputs[:, : self._start_idx]
63
+
64
+ def __iter__(self):
65
+ return self
66
+
67
+ @torch.inference_mode()
68
+ def __next__(self) -> Token:
69
+ # Are we done?
70
+ if self.stopping_criteria(self._inputs, None):
71
+ raise StopIteration
72
+
73
+ # Emit initial tokens.
74
+ # Model is not run for these.
75
+ # If you want the logits, you can do a separate forward pass outside generation.
76
+ if self._idx < self._start_idx:
77
+ idx, self._idx = self._idx, self._idx + 1
78
+ return ChameleonGenerator.Token(id=self._inputs[:, idx], logits=None)
79
+
80
+ # Run the model for the next token.
81
+ self._inputs = self._inputs.contiguous()
82
+ outputs = self.model(self._inputs) # outputs.shape = [batch, seq-len, vocab]
83
+
84
+ # Pull out and process the logits.
85
+ logits = outputs[:, -1, :] # logits.shape = [batch, vocab]
86
+ logits = self.logits_processors(self._inputs, logits)
87
+ probs = logits.softmax(dim=1) # probs.shape = [batch, vocab]
88
+ probs = self.probability_processors(self._inputs, probs)
89
+
90
+ # Select a token and add it to the inputs.
91
+ next_tokens = self.token_selector(
92
+ self._inputs, probs
93
+ ) # next_tokens.shape = [batch]
94
+ self._inputs = torch.cat([self._inputs, next_tokens[:, None]], dim=1)
95
+
96
+ # Run alignment specific postprocessing.
97
+ self._inputs = self.alignment.postprocess_inputs(
98
+ self._inputs, self._original_inputs
99
+ )
100
+
101
+ # Return the next step result.
102
+ return ChameleonGenerator.Token(id=self._inputs[:, -1], logits=logits)
103
+
104
+ @property
105
+ def stopping_criteria(self) -> StoppingCriteriaList:
106
+ return self._stopping_criteria
107
+
108
+ @stopping_criteria.setter
109
+ def stopping_criteria(
110
+ self, value: StoppingCriteriaList | list[StoppingCriteria] | None
111
+ ):
112
+ self._stopping_criteria = StoppingCriteriaList(value or [])
113
+
114
+ @property
115
+ def logits_processors(self) -> LogitsProcessorList:
116
+ return self._logits_processors
117
+
118
+ @logits_processors.setter
119
+ def logits_processors(
120
+ self, value: LogitsProcessorList | list[LogitsProcessor] | None
121
+ ):
122
+ self._logits_processors = LogitsProcessorList(value or [])
123
+
124
+ @property
125
+ def probability_processors(self) -> LogitsProcessorList:
126
+ return self._probability_processors
127
+
128
+ @probability_processors.setter
129
+ def probability_processors(
130
+ self, value: LogitsProcessorList | list[LogitsProcessor] | None
131
+ ):
132
+ self._probability_processors = LogitsProcessorList(value or [])
133
+
134
+
135
+ def run_generation(
136
+ model: torch.nn.Module,
137
+ input_ids: list[list[int]],
138
+ stopping_criteria: StoppingCriteriaList | list[StoppingCriteria],
139
+ logits_processors: LogitsProcessorList | list[LogitsProcessor] | None = None,
140
+ probability_processors: LogitsProcessorList | list[LogitsProcessor] | None = None,
141
+ token_selector: TokenSelector | None = None,
142
+ alignment: PromptAlignment = AlignPromptLeft(),
143
+ streamer: BaseStreamer | None = None,
144
+ ) -> torch.LongTensor:
145
+ result = torch.empty((len(input_ids), 0), dtype=int)
146
+ for tok in ChameleonGenerator(
147
+ model=model,
148
+ input_ids=input_ids,
149
+ stopping_criteria=stopping_criteria,
150
+ logits_processors=logits_processors,
151
+ probability_processors=probability_processors,
152
+ token_selector=token_selector,
153
+ alignment=alignment,
154
+ ):
155
+ if streamer is not None:
156
+ streamer.put(tok.id)
157
+ result = torch.cat([result, tok.id.view(-1, 1)], dim=1)
158
+
159
+ if streamer is not None:
160
+ streamer.end()
161
+
162
+ return result
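A minimal usage sketch of the generator above. The `adapter` handle and the prompt token ids are hypothetical stand-ins: in the real pipeline the model comes from loader.load_model wrapped in a ChameleonModelAdapter (see model_adapter.py below), and prompts come from the tokenizer.

# Hedged sketch -- `adapter` and the prompt ids are placeholders.
from chameleon.inference.generation import run_generation
from chameleon.inference.logits_processor import TopPProbabilityProcessor
from chameleon.inference.stopping_criteria import MaxLengthCriteria, StopOnEOS
from chameleon.inference.token_selector import MultinomialTokenSelector

out = run_generation(
    model=adapter,                                   # a ModelAdapter instance
    input_ids=[[0, 101, 2025], [0, 7, 9]],           # one tokenized prompt per batch entry
    stopping_criteria=[MaxLengthCriteria(4096), StopOnEOS(eos_id=1)],
    probability_processors=[TopPProbabilityProcessor(top_p=0.9)],
    token_selector=MultinomialTokenSelector(),
)
# out is a LongTensor of shape [batch, seq-len]; the prompt tokens are re-emitted
# by the generator before any new tokens are sampled.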
chameleon/inference/image_tokenizer.py ADDED
@@ -0,0 +1,127 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates
2
+ #
3
+ # This source code is licensed under the Chameleon License found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import numpy as np
7
+ import PIL
8
+ import torch
9
+ import yaml
10
+ from PIL import Image
11
+
12
+ from chameleon.inference.vqgan import VQModel
13
+
14
+
15
+ class ImageTokenizer:
16
+ def __init__(
17
+ self,
18
+ cfg_path: str,
19
+ ckpt_path: str,
20
+ device: str,
21
+ ):
22
+ with open(cfg_path) as f:
23
+ config = yaml.safe_load(f)
24
+
25
+ params = config["model"]["params"]
26
+ if "lossconfig" in params:
27
+ del params["lossconfig"]
28
+ params["ckpt_path"] = ckpt_path
29
+
30
+ self._vq_model = VQModel(**params)
31
+ self._vq_model.eval()
32
+
33
+ if device is None:
34
+ devices = {p.device for p in self._vq_model.parameters()}
35
+ assert len(devices) == 1
36
+ device = devices.pop()
37
+ else:
38
+ self._vq_model.to(device)
39
+ self._device = device
40
+
41
+ dtypes = {p.dtype for p in self._vq_model.parameters()}
42
+ assert len(dtypes) == 1
43
+ self._dtype = dtypes.pop()
44
+
45
+ def _whiten_transparency(self, img: PIL.Image) -> PIL.Image:
46
+ # Check if it's already in RGB format.
47
+ if img.mode == "RGB":
48
+ return img
49
+
50
+ vals_rgba = np.array(img.convert("RGBA"))
51
+
52
+ # If there is no transparency layer, simply convert and return.
53
+ if not (vals_rgba[:, :, 3] < 255).any():
54
+ return img.convert("RGB")
55
+
56
+ # There is a transparency layer, blend it with a white background.
57
+
58
+ # Calculate the alpha proportion for blending.
59
+ alpha = vals_rgba[:, :, 3] / 255.0
60
+ # Blend with white background.
61
+ vals_rgb = (1 - alpha[:, :, np.newaxis]) * 255 + alpha[
62
+ :, :, np.newaxis
63
+ ] * vals_rgba[:, :, :3]
64
+ return PIL.Image.fromarray(vals_rgb.astype("uint8"), "RGB")
65
+
66
+ def _vqgan_input_from(self, img: PIL.Image, target_image_size=512) -> torch.Tensor:
67
+ # Resize with aspect ratio preservation.
68
+ s = min(img.size)
69
+ scale = target_image_size / s
70
+ new_size = (round(scale * img.size[0]), round(scale * img.size[1]))
71
+ img = img.resize(new_size, PIL.Image.LANCZOS)
72
+
73
+ # Center crop.
74
+ x0 = (img.width - target_image_size) // 2
75
+ y0 = (img.height - target_image_size) // 2
76
+ img = img.crop((x0, y0, x0 + target_image_size, y0 + target_image_size))
77
+
78
+ # Convert to tensor.
79
+ np_img = np.array(img) / 255.0 # Normalize to [0, 1]
80
+ np_img = np_img * 2 - 1 # Scale to [-1, 1]
81
+ tensor_img = (
82
+ torch.from_numpy(np_img).permute(2, 0, 1).float()
83
+ ) # (Channels, Height, Width) format.
84
+
85
+ # Add batch dimension.
86
+ return tensor_img.unsqueeze(0)
87
+
88
+ def img_tokens_from_pil(self, image: PIL.Image) -> list[int]:
89
+ image = self._whiten_transparency(image)
90
+ vqgan_input = self._vqgan_input_from(image).to(self._device).to(self._dtype)
91
+ _, _, [_, _, img_toks] = self._vq_model.encode(vqgan_input)
92
+ return img_toks
93
+
94
+ def _pil_from_chw_tensor(self, chw_tensor: torch.Tensor) -> PIL.Image:
95
+ # Ensure detachment and move tensor to CPU.
96
+ detached_chw_tensor = chw_tensor.detach().cpu()
97
+
98
+ # Normalize tensor to [0, 1] range from [-1, 1] range.
99
+ normalized_chw_tensor = (
100
+ torch.clamp(detached_chw_tensor, -1.0, 1.0) + 1.0
101
+ ) / 2.0
102
+
103
+ # Permute CHW tensor to HWC format and convert to NumPy array.
104
+ hwc_array = normalized_chw_tensor.permute(1, 2, 0).numpy()
105
+
106
+ # Convert to an 8-bit unsigned integer format.
107
+ image_array_uint8 = (hwc_array * 255).astype(np.uint8)
108
+
109
+ # Convert NumPy array to PIL Image.
110
+ pil_image = Image.fromarray(image_array_uint8)
111
+
112
+ # Convert image to RGB if it is not already.
113
+ if pil_image.mode != "RGB":
114
+ pil_image = pil_image.convert("RGB")
115
+
116
+ return pil_image
117
+
118
+ def pil_from_img_toks(self, img_tensor: torch.Tensor, height=32, width=32) -> PIL.Image:
119
+ emb_dim = self._vq_model.quantize.embedding.weight.shape[-1]
120
+ # import pdb;pdb.set_trace()
121
+ codebook_entry = self._vq_model.quantize.get_codebook_entry(
122
+ img_tensor, (1, height, width, emb_dim)
123
+ )
124
+ # import pdb;pdb.set_trace()
125
+ pixels = self._vq_model.decode(codebook_entry)
126
+ # import pdb;pdb.set_trace()
127
+ return self._pil_from_chw_tensor(pixels[0])
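A short round-trip sketch using the tokenizer above. The vqgan.yaml/vqgan.ckpt paths are the ones used in examples/streaming_batch.py; the input image path is a placeholder, a CUDA device is assumed, and the default 512px crop yields a 32x32 code grid.

import torch
from PIL import Image
from chameleon.inference.image_tokenizer import ImageTokenizer

tokenizer = ImageTokenizer(
    cfg_path="./data/tokenizer/vqgan.yaml",
    ckpt_path="./data/tokenizer/vqgan.ckpt",
    device="cuda",
)
toks = tokenizer.img_tokens_from_pil(Image.open("input.png"))  # VQ code indices (typically 32x32)
recon = tokenizer.pil_from_img_toks(toks.reshape(-1))          # decode back to a PIL image
recon.save("reconstruction.png")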
chameleon/inference/loader.py ADDED
@@ -0,0 +1,71 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Chameleon License found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import glob
7
+ import inspect
8
+ import json
9
+ from pathlib import Path
10
+
11
+ import torch
12
+
13
+ from chameleon.inference.transformer import ModelArgs, Transformer
14
+
15
+
16
+ def _convert(model_args: ModelArgs, consolidated_path: Path) -> Transformer:
17
+ old_default_dtype = torch.get_default_dtype()
18
+ torch.set_default_dtype(torch.bfloat16)
19
+
20
+ model = Transformer(model_args)
21
+
22
+ transfer_results = model.load_state_dict(
23
+ torch.load(str(consolidated_path)),
24
+ strict=False,
25
+ )
26
+
27
+ # TODO: More generally, assert missing or unexpected keys are buffers.
28
+ assert transfer_results.missing_keys == []
29
+ assert transfer_results.unexpected_keys == ["rope.freqs"]
30
+
31
+ model.eval()
32
+
33
+ torch.set_default_dtype(old_default_dtype)
34
+ return model
35
+
36
+
37
+ def _get_checkpoint_path(src_dir: Path, rank: int | None) -> Path:
38
+ base_path = src_dir / "consolidated.pth"
39
+ if not rank and base_path.exists():
40
+ return base_path
41
+
42
+ alt_path = src_dir / f"consolidated.{rank:02}.pth"
43
+ if alt_path.exists():
44
+ return alt_path
45
+
46
+ raise ValueError("Consolidated checkpoint not found.")
47
+
48
+
49
+ def load_model(path: str, rank: int | None = None) -> Transformer:
50
+ src_dir = Path(path)
51
+
52
+ with open(src_dir / "params.json", "r") as f:
53
+ params = json.loads(f.read())
54
+ with open(src_dir / "consolidate_params.json", "r") as f:
55
+ consolidate_params = json.loads(f.read())
56
+ params = {**params, **params["model"], **consolidate_params}
57
+
58
+ known_param = inspect.signature(ModelArgs.__init__).parameters
59
+ filtered_params = {k: v for k, v in params.items() if k in known_param}
60
+
61
+ return _convert(
62
+ ModelArgs(**filtered_params),
63
+ _get_checkpoint_path(src_dir, rank),
64
+ )
65
+
66
+
67
+ def detect_shard_count(path: str) -> int:
68
+ src_dir = Path(path)
69
+ if (src_dir / "consolidated.pth").exists():
70
+ return 1
71
+ return len(glob.glob(str(src_dir / "consolidated.*.pth")))
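A small sketch of loading a consolidated checkpoint with the helpers above. The directory is the example path from examples/streaming_batch.py and is assumed to contain params.json, consolidate_params.json and consolidated.pth.

from chameleon.inference.loader import detect_shard_count, load_model

path = "./data/models/7b/"
print(detect_shard_count(path))   # 1 when a single consolidated.pth exists
model = load_model(path)          # builds a bfloat16 Transformer and loads the weights
# For sharded checkpoints, pass rank=r to pick up consolidated.{r:02}.pth instead.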
chameleon/inference/logits_processor.py ADDED
@@ -0,0 +1,336 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Chameleon License found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import math
7
+
8
+ import torch
9
+ from transformers import LogitsProcessor
10
+
11
+
12
+ class TopPProbabilityProcessor(LogitsProcessor):
13
+ # Modified version of TopPLogitsWarper to act on probabilities.
14
+ # Changes:
15
+ # * filter_value changed from -inf to 0
16
+ # * removed softmax
17
+ # * renormalize L1
18
+
19
+ def __init__(
20
+ self,
21
+ top_p: float,
22
+ min_tokens_to_keep: int = 1,
23
+ ):
24
+ top_p = float(top_p)
25
+ if top_p < 0 or top_p > 1.0:
26
+ raise ValueError(f"`top_p` has to be a float > 0 and < 1, but is {top_p}")
27
+ if not isinstance(min_tokens_to_keep, int) or (min_tokens_to_keep < 1):
28
+ raise ValueError(
29
+ f"`min_tokens_to_keep` has to be a positive integer, but is {min_tokens_to_keep}"
30
+ )
31
+
32
+ self.top_p = top_p
33
+ self.min_tokens_to_keep = min_tokens_to_keep
34
+
35
+ def __call__(
36
+ self, input_ids: torch.LongTensor, probs: torch.FloatTensor
37
+ ) -> torch.FloatTensor:
38
+ # input_ids.shape=[batch, seq-len]
39
+ # probs.shape=[batch, vocab]
40
+ sorted_probs, sorted_indices = torch.sort(probs, descending=False)
41
+ cumulative_probs = sorted_probs.cumsum(dim=-1)
42
+
43
+ # Remove tokens with cumulative top_p above the threshold (token with 0 are kept)
44
+ sorted_indices_to_remove = cumulative_probs <= (1 - self.top_p)
45
+ # Keep at least min_tokens_to_keep
46
+ sorted_indices_to_remove[..., -self.min_tokens_to_keep :] = 0
47
+
48
+ # scatter sorted tensors to original indexing
49
+ indices_to_remove = sorted_indices_to_remove.scatter(
50
+ 1, sorted_indices, sorted_indices_to_remove
51
+ )
52
+ probs = probs.masked_fill(indices_to_remove, 0.0)
53
+ probs = probs / probs.sum(dim=-1, keepdim=True)
54
+ return probs
55
+
56
+
57
+ class DisallowTokensInIndexRangeLogitsProcessor(LogitsProcessor):
58
+ def __init__(
59
+ self, token_ids: list[int], start_index: int, end_index: int | None = None
60
+ ):
61
+ self.token_ids = torch.tensor(token_ids)
62
+ self.start_index = start_index
63
+ self.end_index = end_index if end_index is not None else math.inf
64
+
65
+ def __call__(
66
+ self, input_ids: torch.LongTensor, logits: torch.FloatTensor
67
+ ) -> torch.FloatTensor:
68
+ current_index = input_ids.shape[1]
69
+ if self.start_index <= current_index < self.end_index:
70
+ logits[:, self.token_ids] = -math.inf
71
+ return logits
72
+
73
+
74
+ class DisallowTokensLogitsProcessor(DisallowTokensInIndexRangeLogitsProcessor):
75
+ def __init__(self, token_ids: list[int]):
76
+ super().__init__(token_ids, 0)
77
+
78
+
79
+ class DisallowTokensAtIndexLogitsProcessor(DisallowTokensInIndexRangeLogitsProcessor):
80
+ def __init__(self, token_ids: list[int], index: int):
81
+ super().__init__(token_ids, index, index + 1)
82
+
83
+
84
+ class DisallowTokensAfterIndexLogitsProcessor(
85
+ DisallowTokensInIndexRangeLogitsProcessor
86
+ ):
87
+ def __init__(self, token_ids: list[int], index: int):
88
+ super().__init__(token_ids, index + 1)
89
+
90
+
91
+ class DisallowTokensAtOrAfterIndexLogitsProcessor(
92
+ DisallowTokensInIndexRangeLogitsProcessor
93
+ ):
94
+ def __init__(self, token_ids: list[int], index: int):
95
+ super().__init__(token_ids, index)
96
+
97
+
98
+ class DisallowTokensInBatchIndexRangeLogitsProcessor(LogitsProcessor):
99
+ def __init__(
100
+ self,
101
+ token_ids: list[int],
102
+ start_indices: list[int],
103
+ end_indices: list[int] | None = None,
104
+ ):
105
+ self.token_ids = torch.tensor(token_ids)
106
+ self.start_indices = torch.tensor(start_indices)
107
+ self.end_indices = (
108
+ torch.tensor(end_indices)
109
+ if end_indices is not None
110
+ else torch.full_like(self.start_indices, math.inf, dtype=torch.float)
111
+ )
112
+
113
+ def __call__(
114
+ self, input_ids: torch.LongTensor, logits: torch.FloatTensor
115
+ ) -> torch.FloatTensor:
116
+ # input_ids.shape = [batch, seq_len]
117
+ # logits.shape = [batch, vocab]
118
+ current_index = input_ids.shape[1]
119
+ mask = (self.start_indices <= current_index) & (
120
+ current_index < self.end_indices
121
+ )
122
+ # The following will fail if the mask is all False.
123
+ # logits[mask, self.token_ids] = -math.inf
124
+ logits[torch.where(mask)[0].unsqueeze(1), self.token_ids] = -math.inf
125
+ return logits
126
+
127
+
128
+ class DisallowTokensAtBatchIndexLogitsProcessor(
129
+ DisallowTokensInBatchIndexRangeLogitsProcessor
130
+ ):
131
+ def __init__(self, token_ids: list[int], batch_index: list[int]):
132
+ super().__init__(token_ids, batch_index, [i + 1 for i in batch_index])
133
+
134
+
135
+ class AllowOnlyTokensInIndexRangeLogitsProcessor(LogitsProcessor):
136
+ def __init__(
137
+ self, token_ids: list[int], start_index: int, end_index: int | None = None
138
+ ):
139
+ self.token_ids = torch.tensor(token_ids)
140
+ self.start_index = start_index
141
+ self.end_index = end_index if end_index is not None else math.inf
142
+
143
+ def __call__(
144
+ self, input_ids: torch.LongTensor, logits: torch.FloatTensor
145
+ ) -> torch.FloatTensor:
146
+ current_index = input_ids.shape[1]
147
+ if self.start_index <= current_index < self.end_index:
148
+ replacement = torch.full_like(logits, -math.inf)
149
+ replacement[:, self.token_ids] = logits[:, self.token_ids]
150
+ logits[:] = replacement
151
+ return logits
152
+
153
+
154
+ class AllowOnlyTokensLogitsProcessor(AllowOnlyTokensInIndexRangeLogitsProcessor):
155
+ def __init__(self, token_ids: list[int]):
156
+ super().__init__(token_ids, 0)
157
+
158
+
159
+ class AllowOnlyTokensAtIndexLogitsProcessor(AllowOnlyTokensInIndexRangeLogitsProcessor):
160
+ def __init__(self, token_ids: list[int], index: int):
161
+ super().__init__(token_ids, index, index + 1)
162
+
163
+
164
+ class AllowOnlyTokensAfterIndexLogitsProcessor(
165
+ AllowOnlyTokensInIndexRangeLogitsProcessor
166
+ ):
167
+ def __init__(self, token_ids: list[int], index: int):
168
+ super().__init__(token_ids, index + 1)
169
+
170
+
171
+ class AllowOnlyTokensAtOrAfterIndexLogitsProcessor(
172
+ AllowOnlyTokensInIndexRangeLogitsProcessor
173
+ ):
174
+ def __init__(self, token_ids: list[int], index: int):
175
+ super().__init__(token_ids, index)
176
+
177
+
178
+ class AllowOnlyTokensInBatchIndexRangeLogitsProcessor(LogitsProcessor):
179
+ def __init__(
180
+ self,
181
+ token_ids: list[int],
182
+ start_indices: list[int],
183
+ end_indices: list[int] | None = None,
184
+ ):
185
+ self.token_ids = torch.tensor(token_ids)
186
+ self.start_indices = torch.tensor(start_indices)
187
+ self.end_indices = (
188
+ torch.tensor(end_indices)
189
+ if end_indices is not None
190
+ else torch.full_like(self.start_indices, math.inf, dtype=torch.float)
191
+ )
192
+
193
+ def __call__(
194
+ self, input_ids: torch.LongTensor, logits: torch.FloatTensor
195
+ ) -> torch.FloatTensor:
196
+ # input_ids.shape = [batch, seq_len]
197
+ # logits.shape = [batch, vocab]
198
+ current_index = input_ids.shape[1]
199
+ mask = (self.start_indices <= current_index) & (
200
+ current_index < self.end_indices
201
+ )
202
+
203
+ valid_batch_indices = torch.where(mask)[0].unsqueeze(1)
204
+ full_mask = torch.full_like(logits, -math.inf)
205
+ full_mask[valid_batch_indices, self.token_ids] = logits[
206
+ valid_batch_indices, self.token_ids
207
+ ]
208
+
209
+ logits[:] = torch.where(full_mask != -math.inf, full_mask, logits)
210
+ return logits
211
+
212
+
213
+ class AllowOnlyTokensAtRelativeOffsetLogitsProcessor(LogitsProcessor):
214
+ def __init__(
215
+ self, trigger_token_id: int, subsequent_token_ids: list[int], offset: int
216
+ ):
217
+ self.trigger_token_id = trigger_token_id
218
+ self.subsequent_token_ids = torch.tensor(subsequent_token_ids)
219
+ self.offset = offset
220
+
221
+ def __call__(
222
+ self, input_ids: torch.LongTensor, logits: torch.FloatTensor
223
+ ) -> torch.FloatTensor:
224
+ # input_ids.shape=[batch, seq_len]
225
+ # logits.shape=[batch, vocab]
226
+ if input_ids.shape[1] < self.offset:
227
+ return logits
228
+
229
+ trigger_positions = (
230
+ input_ids[:, -self.offset] == self.trigger_token_id
231
+ ).unsqueeze(-1)
232
+
233
+ disallowed_tokens_mask = torch.ones_like(logits, dtype=bool)
234
+ disallowed_tokens_mask[:, self.subsequent_token_ids] = False
235
+
236
+ return logits.masked_fill_(
237
+ disallowed_tokens_mask & trigger_positions,
238
+ -math.inf,
239
+ )
240
+
241
+
242
+ class AllowOnlyTokensInRelativeWindowLogitsProcessor(LogitsProcessor):
243
+ def __init__(self, trigger_token_id: int, allowed_token_ids: list[int], width: int):
244
+ self.trigger_token_id = trigger_token_id
245
+ self.allowed_token_ids = torch.tensor(allowed_token_ids).unsqueeze(
246
+ 0
247
+ ) # shape: [1, num_allowed_tokens]
248
+ self.width = width
249
+
250
+ def __call__(
251
+ self, input_ids: torch.LongTensor, logits: torch.FloatTensor
252
+ ) -> torch.FloatTensor:
253
+ # input_ids.shape=[batch, seq_len]
254
+ # logits.shape=[batch, vocab]
255
+ width = min(self.width, input_ids.shape[1])
256
+ trigger_positions = (
257
+ (input_ids[:, -width:] == self.trigger_token_id).any(dim=1).unsqueeze(-1)
258
+ )
259
+
260
+ disallowed_tokens_mask = torch.ones_like(logits, dtype=bool)
261
+ disallowed_tokens_mask[:, self.allowed_token_ids] = False
262
+
263
+ return logits.masked_fill_(
264
+ disallowed_tokens_mask & trigger_positions,
265
+ -math.inf,
266
+ )
267
+
268
+
269
+ class CFGLogitsProcessor(LogitsProcessor):
270
+ def __init__(
271
+ self,
272
+ guidance_scale: float,
273
+ unconditional_ids: torch.LongTensor,
274
+ model,
275
+ ):
276
+ self.guidance_scale = guidance_scale
277
+ self.unconditional_ids = unconditional_ids
278
+ self.model = model
279
+
280
+ def __call__(
281
+ self, input_ids: torch.LongTensor, logits: torch.FloatTensor
282
+ ) -> torch.FloatTensor:
283
+ conditioned_logits = logits
284
+
285
+ self.unconditional_ids = torch.cat(
286
+ [self.unconditional_ids, input_ids[:, -1:]], dim=1
287
+ )
288
+ unconditioned_outputs = self.model(self.unconditional_ids)
289
+ unconditioned_logits = unconditioned_outputs[:, -1, :]
290
+ return (
291
+ self.guidance_scale * (conditioned_logits - unconditioned_logits)
292
+ + unconditioned_logits
293
+ )
294
+
295
+
296
+ class InBatchCFGLogitsProcessor(LogitsProcessor):
297
+ def __init__(self, guidance_scale: float):
298
+ self.guidance_scale = guidance_scale
299
+
300
+ def __call__(
301
+ self, input_ids: torch.LongTensor, logits: torch.FloatTensor
302
+ ) -> torch.FloatTensor:
303
+ # input_ids.shape=[2*batch, seq-len]
304
+ # logits.shape=[2*batch, vocab]
305
+ conditioned_logits, unconditioned_logits = torch.chunk(logits, chunks=2, dim=0)
306
+ mixed_logits = unconditioned_logits + self.guidance_scale * (
307
+ conditioned_logits - unconditioned_logits
308
+ )
309
+ return mixed_logits.repeat(2, 1)
310
+
311
+
312
+ class InBatchInstructCFGLogitsProcessor(LogitsProcessor):
313
+ # See https://arxiv.org/abs/2211.09800
314
+
315
+ def __init__(self, guidance_scale_text: float, guidance_scale_image: float):
316
+ self.guidance_scale_text = guidance_scale_text
317
+ self.guidance_scale_image = guidance_scale_image
318
+
319
+ def __call__(
320
+ self, input_ids: torch.LongTensor, logits: torch.FloatTensor
321
+ ) -> torch.FloatTensor:
322
+ # input_ids.shape=[3*batch, seq-len]
323
+ # logits.shape=[3*batch, vocab]
324
+ (
325
+ full_conditioned_logits,
326
+ image_conditioned_logits,
327
+ unconditioned_logits,
328
+ ) = logits.chunk(3)
329
+ mixed_logits = (
330
+ unconditioned_logits
331
+ + self.guidance_scale_image
332
+ * (image_conditioned_logits - unconditioned_logits)
333
+ + self.guidance_scale_text
334
+ * (full_conditioned_logits - image_conditioned_logits)
335
+ )
336
+ return mixed_logits.repeat(3, 1)
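A tiny, self-contained check of the in-batch classifier-free-guidance mixing above, run on dummy logits (the numbers are arbitrary):

import torch
from chameleon.inference.logits_processor import InBatchCFGLogitsProcessor

proc = InBatchCFGLogitsProcessor(guidance_scale=3.0)
input_ids = torch.zeros(2, 4, dtype=torch.long)   # [conditioned row; unconditioned row]
logits = torch.tensor([[2.0, 0.0, 1.0],           # conditioned
                       [1.0, 1.0, 1.0]])          # unconditioned
mixed = proc(input_ids, logits)
# uncond + 3.0 * (cond - uncond) = [4.0, -2.0, 1.0], repeated for both halves of the batch
print(mixed)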
chameleon/inference/model_adapter.py ADDED
@@ -0,0 +1,118 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Chameleon License found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import math
7
+ from abc import ABC, abstractmethod
8
+
9
+ import torch
10
+
11
+ from chameleon.inference import transformer
12
+ from chameleon.inference.alignment import (
13
+ AlignPromptLeft,
14
+ AlignPromptRight,
15
+ PromptAlignment,
16
+ )
17
+ from chameleon.inference.cudagraph import cudagraph_wrap
18
+
19
+
20
+ class ModelAdapter(ABC):
21
+ @abstractmethod
22
+ def initialize(self, prompt_tokens: list[list[int]]):
23
+ ...
24
+
25
+ @abstractmethod
26
+ def supports_alignment(self, alignment: PromptAlignment) -> bool:
27
+ ...
28
+
29
+ @abstractmethod
30
+ @torch.inference_mode()
31
+ def __call__(self, inputs: torch.LongTensor) -> torch.FloatTensor:
32
+ ...
33
+
34
+
35
+ class ChameleonModelAdapter(ModelAdapter):
36
+ """Adapter for Chameleon-style model that handles state, such as cache."""
37
+
38
+ def __init__(
39
+ self,
40
+ model: transformer.Transformer,
41
+ max_seq_len: int,
42
+ dtype: torch.dtype | None = None,
43
+ ):
44
+ super().__init__()
45
+ self._args = model.args
46
+ self._model = model
47
+ self._max_seq_len = max_seq_len
48
+ self._dtype = dtype or next(model.parameters()).data.dtype
49
+
50
+ def initialize(self, prompt_tokens: list[list[int]]):
51
+ self._prompt_lengths = [len(toks) for toks in prompt_tokens]
52
+ batch_size = len(prompt_tokens)
53
+
54
+ self._cache = transformer.make_cache(
55
+ args=self._args,
56
+ length=batch_size * self._max_seq_len,
57
+ dtype=self._dtype,
58
+ )
59
+
60
+ self._local_inputs = torch.zeros([batch_size], dtype=int, device="cuda")
61
+
62
+ self._forward = cudagraph_wrap(self._model.forward_with_attn_bias)
63
+
64
+ self._first_pass = True
65
+
66
+ def supports_alignment(self, alignment: PromptAlignment) -> bool:
67
+ return isinstance(alignment, AlignPromptLeft) or isinstance(
68
+ alignment, AlignPromptRight
69
+ )
70
+
71
+ def __call__(self, inputs: torch.LongTensor) -> torch.FloatTensor:
72
+ # inputs.shape=[batch, seq-len]
73
+ batch_size, seq_len = inputs.shape
74
+
75
+ if self._first_pass:
76
+ attn_seqlen = [min(pl, seq_len) for pl in self._prompt_lengths]
77
+ self._bias = transformer.AttnBias.from_seqlens(
78
+ q_seqlen=attn_seqlen,
79
+ kv_seqlen=attn_seqlen,
80
+ kv_padding=self._max_seq_len,
81
+ )
82
+
83
+ mask = torch.zeros_like(inputs, dtype=torch.bool)
84
+ for i, k in enumerate(self._prompt_lengths):
85
+ mask[i, -k:] = True
86
+
87
+ flat_outputs: torch.Tensor = self._forward( # type: ignore
88
+ token_values=inputs[mask],
89
+ attn_bias=self._bias,
90
+ cache=self._cache,
91
+ )
92
+ self._local_outputs = torch.full(
93
+ (inputs.shape[0], inputs.shape[1], flat_outputs.shape[-1]),
94
+ -math.inf,
95
+ )
96
+ self._local_outputs[mask] = flat_outputs
97
+
98
+ self._vocab_size = self._local_outputs.shape[-1]
99
+
100
+ self._bias.q_seqinfo.seqstart.copy_(
101
+ torch.arange(batch_size + 1, dtype=torch.int)
102
+ )
103
+ self._bias.q_seqinfo.max_seqlen = 1
104
+ self._bias.q_seqinfo.seqstart_py = self._bias.q_seqinfo.seqstart.tolist()
105
+
106
+ self._first_pass = False
107
+
108
+ else:
109
+ self._local_inputs.copy_(inputs[:, -1]) # type: ignore
110
+
111
+ self._local_outputs = self._forward( # type: ignore
112
+ token_values=self._local_inputs,
113
+ attn_bias=self._bias,
114
+ cache=self._cache,
115
+ )
116
+
117
+ self._bias.k_seqinfo.seqlen.add_(1)
118
+ return self._local_outputs.view(batch_size, -1, self._vocab_size)
chameleon/inference/stopping_criteria.py ADDED
@@ -0,0 +1,55 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Chameleon License found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import torch
7
+
8
+
9
+ class StoppingCriteria:
10
+ def __call__(
11
+ self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs
12
+ ) -> bool:
13
+ raise NotImplementedError("StoppingCriteria needs to be subclassed")
14
+
15
+
16
+ class StoppingCriteriaList(list):
17
+ def __call__(
18
+ self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs
19
+ ) -> bool:
20
+ return any(criteria(input_ids, scores, **kwargs) for criteria in self)
21
+
22
+
23
+ class MaxLengthCriteria(StoppingCriteria):
24
+ def __init__(self, max_length: int):
25
+ self.max_length = max_length
26
+
27
+ def __call__(
28
+ self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs
29
+ ) -> bool:
30
+ cur_len = input_ids.shape[-1]
31
+ return cur_len >= self.max_length
32
+
33
+
34
+ class StopOnEOS(StoppingCriteria):
35
+ def __init__(self, eos_id: int):
36
+ self._eos_id = eos_id
37
+
38
+ def __call__(self, input_ids: torch.LongTensor, _: torch.FloatTensor) -> bool:
39
+ # input_ids.shape=[batch, seq_len]
40
+ return (input_ids == self._eos_id).sum(dim=1).all()
41
+
42
+
43
+ class StopOnEOSAfterBatchIndex(StoppingCriteria):
44
+ def __init__(self, eos_id: int, batch_index: list[int]):
45
+ self._eos_id = eos_id
46
+ self.batch_index = torch.tensor(batch_index, dtype=torch.long).unsqueeze(1)
47
+
48
+ def __call__(self, input_ids: torch.LongTensor, _: torch.FloatTensor) -> bool:
49
+ # input_ids.shape=[batch, seq_len]
50
+ eos_mask = input_ids == self._eos_id
51
+ consider_eos_mask = (
52
+ torch.arange(input_ids.shape[1]).unsqueeze(0) >= self.batch_index
53
+ )
54
+ valid_eos = eos_mask & consider_eos_mask
55
+ return valid_eos.sum(dim=1).all()
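The criteria compose through the list wrapper above; a quick illustration on toy token ids (the EOS id 2 is a placeholder):

import torch
from chameleon.inference.stopping_criteria import (
    MaxLengthCriteria,
    StopOnEOS,
    StoppingCriteriaList,
)

stop = StoppingCriteriaList([MaxLengthCriteria(max_length=8), StopOnEOS(eos_id=2)])
print(stop(torch.tensor([[5, 6, 2], [7, 8, 9]]), None))  # False: second row has no EOS yet
print(stop(torch.tensor([[5, 6, 2], [7, 2, 9]]), None))  # True: every row now contains EOS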
chameleon/inference/token_selector.py ADDED
@@ -0,0 +1,47 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Chameleon License found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import torch
7
+
8
+
9
+ class TokenSelector:
10
+ def __call__(
11
+ self, input_ids: torch.LongTensor, probs: torch.FloatTensor
12
+ ) -> torch.FloatTensor:
13
+ # input_ids.shape=[batch, seq_len]
14
+ # probs.shape=[batch, vocab]
15
+ ...
16
+
17
+
18
+ class ArgmaxTokenSelector(TokenSelector):
19
+ def __call__(
20
+ self, _: torch.LongTensor, probs: torch.FloatTensor
21
+ ) -> torch.LongTensor:
22
+ # probs.shape=[batch, vocab]
23
+ return probs.argmax(dim=1)
24
+
25
+
26
+ class MultinomialTokenSelector(TokenSelector):
27
+ def __call__(
28
+ self, _: torch.LongTensor, probs: torch.FloatTensor
29
+ ) -> torch.LongTensor:
30
+ # probs.shape=[batch, vocab]
31
+ return probs.multinomial(num_samples=1).squeeze(1)
32
+
33
+
34
+ class ReplicatedInputTokenSelector(TokenSelector):
35
+ def __init__(self, token_selector: TokenSelector, n: int):
36
+ self.token_selector = token_selector
37
+ self.n = n
38
+
39
+ def __call__(
40
+ self, input_ids: torch.LongTensor, probs: torch.FloatTensor
41
+ ) -> torch.LongTensor:
42
+ # input_ids.shape=[n*batch, seq_len]
43
+ # probs.shape=[n*batch, vocab]
44
+ primary_input_ids = torch.chunk(input_ids, chunks=self.n, dim=0)[0]
45
+ primary_probs = torch.chunk(probs, chunks=self.n, dim=0)[0]
46
+ tokens = self.token_selector(primary_input_ids, primary_probs)
47
+ return tokens.repeat(self.n)
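ReplicatedInputTokenSelector samples from the primary copy of a replicated batch (as used for in-batch CFG) and repeats the choice for the replicas. A toy check with argmax selection:

import torch
from chameleon.inference.token_selector import ArgmaxTokenSelector, ReplicatedInputTokenSelector

selector = ReplicatedInputTokenSelector(ArgmaxTokenSelector(), n=2)
input_ids = torch.zeros(4, 3, dtype=torch.long)   # two copies of a batch of two
probs = torch.tensor([[0.1, 0.9], [0.8, 0.2],     # primary copy, used for selection
                      [0.5, 0.5], [0.5, 0.5]])    # replicated copy, ignored
print(selector(input_ids, probs))                 # tensor([1, 0, 1, 0])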
chameleon/inference/transformer.py ADDED
@@ -0,0 +1,421 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Chameleon License found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from dataclasses import dataclass
7
+
8
+ import torch
9
+ from torch import distributed as dist
10
+ from torch import nn
11
+ from torch.nn import functional as F
12
+ from xformers.ops import RMSNorm, fmha, rope_padded
13
+ from xformers.ops.fmha.attn_bias import (
14
+ BlockDiagonalCausalWithOffsetPaddedKeysMask as AttnBias,
15
+ )
16
+
17
+
18
+ @dataclass
19
+ class ModelArgs:
20
+ model_parallel_size: int = 1
21
+ dim: int = 512
22
+ n_layers: int = 8
23
+ n_heads: int = 8
24
+ n_kv_heads: int | None = None
25
+ vocab_size: int = -1
26
+ ffn_dim_multiplier: float | None = None
27
+ multiple_of: int = 256
28
+ norm_eps: float = 1e-5
29
+ rope_theta: float = 10000.0
30
+ qk_normalization: bool = False
31
+ swin_norm: bool = False
32
+
33
+
34
+ LayerCache = tuple[torch.Tensor, torch.Tensor]
35
+
36
+
37
+ class Attention(nn.Module):
38
+ def __init__(
39
+ self,
40
+ model_parallel_size: int,
41
+ dim: int,
42
+ head_dim: int,
43
+ n_heads: int,
44
+ n_kv_heads: int,
45
+ rope_theta: float,
46
+ qk_normalization: bool = False,
47
+ ):
48
+ super().__init__()
49
+
50
+ self.model_parallel_size = model_parallel_size
51
+
52
+ self.head_dim = head_dim
53
+ self.rope_theta = rope_theta
54
+
55
+ self.n_local_heads = n_heads // model_parallel_size
56
+ self.n_local_kv_heads = n_kv_heads // model_parallel_size
57
+
58
+ self.wqkv = nn.Linear(
59
+ dim,
60
+ (self.n_local_heads + 2 * self.n_local_kv_heads) * head_dim,
61
+ bias=False,
62
+ dtype=torch.bfloat16,
63
+ )
64
+ self.wo = nn.Linear(
65
+ self.n_local_heads * head_dim,
66
+ dim,
67
+ bias=False,
68
+ dtype=torch.bfloat16,
69
+ )
70
+
71
+ self.qk_normalization = qk_normalization
72
+ if qk_normalization:
73
+ self.q_normalization = torch.nn.LayerNorm(head_dim)
74
+ self.k_normalization = torch.nn.LayerNorm(head_dim)
75
+
76
+ self._register_load_state_dict_pre_hook(self.load_hook)
77
+
78
+ # This adapter makes sure we can load vanilla
79
+ # Llama checkpoints where wq, wk, and wv are
80
+ # not fused in a single parameter
81
+ def load_hook(
82
+ self,
83
+ state_dict,
84
+ prefix,
85
+ local_metadata,
86
+ strict,
87
+ missing_keys,
88
+ unexpected_keys,
89
+ error_msgs,
90
+ ):
91
+ if prefix + "wq.weight" in state_dict:
92
+ wq = state_dict.pop(prefix + "wq.weight")
93
+ wk = state_dict.pop(prefix + "wk.weight")
94
+ wv = state_dict.pop(prefix + "wv.weight")
95
+ state_dict[prefix + "wqkv.weight"] = torch.cat([wq, wk, wv])
96
+
97
+ def forward(
98
+ self,
99
+ x: torch.Tensor,
100
+ cache: LayerCache,
101
+ attn_bias: AttnBias,
102
+ group: dist.ProcessGroup | None = None,
103
+ ) -> torch.Tensor:
104
+ # x.shape is (sum(seq_lens), dim)
105
+ #
106
+ # Since we support heterogenous sequence
107
+ # lengths, the hidden states are all
108
+ # concatenated together along the usual
109
+ # sequence dimension. The attention below
110
+ # finds out where sequences start & end
111
+ # using the provided attention bias.
112
+ xqkv = self.wqkv(x)
113
+ xq = xqkv[:, : (self.n_local_heads * self.head_dim)]
114
+ xkv = xqkv[:, (self.n_local_heads * self.head_dim) :]
115
+ xk, xv = xkv.chunk(2, 1)
116
+
117
+ if self.qk_normalization:
118
+ xq = xq.view(-1, self.n_local_heads, self.head_dim)
119
+ xq = self.q_normalization(xq)
120
+ xq = xq.view(-1, self.n_local_heads * self.head_dim)
121
+
122
+ xk = xk.view(-1, self.n_local_kv_heads, self.head_dim)
123
+ xk = self.k_normalization(xk)
124
+ xk = xk.view(-1, self.n_local_kv_heads * self.head_dim)
125
+
126
+ output_shape = xq.shape
127
+ xq = xq.view(1, xq.shape[0], self.n_local_heads, self.head_dim)
128
+ xk = xk.view(1, xk.shape[0], self.n_local_kv_heads, self.head_dim)
129
+ xv = xv.view(1, xv.shape[0], self.n_local_kv_heads, self.head_dim)
130
+ cache_k, cache_v = cache
131
+
132
+ xq = rope_padded(
133
+ xq=xq,
134
+ xk=xk,
135
+ xv=xv,
136
+ cache_k=cache_k,
137
+ cache_v=cache_v,
138
+ attn_bias=attn_bias,
139
+ theta=self.rope_theta,
140
+ )
141
+
142
+ # Handle GQA
143
+ # Q shape: [B, M, Hkv, Hq // Hkv, K]
144
+ heads_per_group = self.n_local_heads // self.n_local_kv_heads
145
+ cache_k = cache_k.unsqueeze(3).expand(-1, -1, -1, heads_per_group, -1)
146
+ cache_v = cache_v.unsqueeze(3).expand(-1, -1, -1, heads_per_group, -1)
147
+ xq = xq.reshape(
148
+ [*xq.shape[:2], self.n_local_kv_heads, heads_per_group, xq.shape[-1]]
149
+ )
150
+
151
+ # rope_padded() updated the caches, so we
152
+ # call attention directly
153
+ output = fmha.memory_efficient_attention_forward(
154
+ xq, cache_k, cache_v, attn_bias
155
+ )
156
+
157
+ output = self.wo(output.reshape(output_shape))
158
+ if self.model_parallel_size > 1:
159
+ dist.all_reduce(output, group=group)
160
+
161
+ return output
162
+
163
+
164
+ class FeedForward(nn.Module):
165
+ def __init__(
166
+ self,
167
+ model_parallel_size: int,
168
+ dim: int,
169
+ hidden_dim: int,
170
+ multiple_of: int,
171
+ ffn_dim_multiplier: float | None,
172
+ ):
173
+ super().__init__()
174
+
175
+ self.model_parallel_size = model_parallel_size
176
+
177
+ hidden_dim = int(2 * hidden_dim / 3)
178
+ if ffn_dim_multiplier is not None:
179
+ hidden_dim = int(ffn_dim_multiplier * hidden_dim)
180
+ hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
181
+ assert hidden_dim % model_parallel_size == 0
182
+
183
+ self.w13 = nn.Linear(
184
+ dim,
185
+ 2 * hidden_dim // model_parallel_size,
186
+ bias=False,
187
+ )
188
+ self.w2 = nn.Linear(
189
+ hidden_dim // model_parallel_size,
190
+ dim,
191
+ bias=False,
192
+ )
193
+ self._register_load_state_dict_pre_hook(self.load_hook)
194
+
195
+ # This adapter makes sure we can load vanilla
196
+ # Llama checkpoints where w1 and w3 are not
197
+ # fused in a single parameter
198
+ def load_hook(
199
+ self,
200
+ state_dict,
201
+ prefix,
202
+ local_metadata,
203
+ strict,
204
+ missing_keys,
205
+ unexpected_keys,
206
+ error_msgs,
207
+ ):
208
+ if prefix + "w1.weight" in state_dict:
209
+ w1 = state_dict.pop(prefix + "w1.weight")
210
+ w3 = state_dict.pop(prefix + "w3.weight")
211
+ state_dict[prefix + "w13.weight"] = torch.cat([w1, w3])
212
+
213
+ def forward(
214
+ self, x: torch.Tensor, group: dist.ProcessGroup | None = None
215
+ ) -> torch.Tensor:
216
+ x13 = self.w13(x)
217
+ x1, x3 = x13.chunk(2, -1)
218
+ output = self.w2(F.silu(x1) * x3)
219
+ if self.model_parallel_size > 1:
220
+ dist.all_reduce(output, group=group)
221
+ return output
222
+
223
+
224
+ class TransformerBlock(nn.Module):
225
+ def __init__(self, args: ModelArgs):
226
+ super().__init__()
227
+
228
+ assert args.dim % args.n_heads == 0
229
+ head_dim = args.dim // args.n_heads
230
+ if args.n_kv_heads is not None:
231
+ n_kv_heads = args.n_kv_heads
232
+ else:
233
+ n_kv_heads = args.n_heads
234
+
235
+ model_parallel_size = args.model_parallel_size
236
+ assert args.n_heads % n_kv_heads == 0
237
+ assert args.n_heads % model_parallel_size == 0
238
+ assert n_kv_heads % model_parallel_size == 0
239
+
240
+ self.attention = Attention(
241
+ model_parallel_size=model_parallel_size,
242
+ dim=args.dim,
243
+ head_dim=head_dim,
244
+ n_heads=args.n_heads,
245
+ n_kv_heads=n_kv_heads,
246
+ rope_theta=args.rope_theta,
247
+ qk_normalization=args.qk_normalization,
248
+ )
249
+ self.feed_forward = FeedForward(
250
+ model_parallel_size=model_parallel_size,
251
+ dim=args.dim,
252
+ hidden_dim=4 * args.dim,
253
+ multiple_of=args.multiple_of,
254
+ ffn_dim_multiplier=args.ffn_dim_multiplier,
255
+ )
256
+ self.attention_norm = RMSNorm(args.dim, eps=args.norm_eps)
257
+ self.ffn_norm = RMSNorm(args.dim, eps=args.norm_eps)
258
+ self.swin_norm = args.swin_norm
259
+
260
+ def forward(
261
+ self,
262
+ x: torch.Tensor,
263
+ cache: LayerCache,
264
+ attn_bias: AttnBias,
265
+ group: dist.ProcessGroup | None = None,
266
+ ) -> torch.Tensor:
267
+ if self.swin_norm:
268
+ h = x + self.attention_norm(
269
+ self.attention.forward(
270
+ x,
271
+ cache,
272
+ attn_bias,
273
+ group=group,
274
+ )
275
+ )
276
+ out = h + self.ffn_norm(self.feed_forward(h, group=group))
277
+ else:
278
+ h = x + self.attention.forward(
279
+ self.attention_norm(x),
280
+ cache,
281
+ attn_bias,
282
+ group=group,
283
+ )
284
+ out = h + self.feed_forward(self.ffn_norm(h), group=group)
285
+ return out
286
+
287
+
288
+ class Transformer(nn.Module):
289
+ def __init__(self, args: ModelArgs):
290
+ super().__init__()
291
+ self.args = args
292
+
293
+ self.model_parallel_size = args.model_parallel_size
294
+ assert args.dim % self.model_parallel_size == 0
295
+ assert args.vocab_size > 0
296
+ assert args.vocab_size % self.model_parallel_size == 0
297
+
298
+ self.tok_embeddings = nn.Embedding(
299
+ num_embeddings=args.vocab_size,
300
+ embedding_dim=args.dim // self.model_parallel_size,
301
+ )
302
+
303
+ self.layers = nn.ModuleList()
304
+ for _ in range(args.n_layers):
305
+ self.layers.append(TransformerBlock(args))
306
+
307
+ self.norm = RMSNorm(args.dim, eps=args.norm_eps)
308
+
309
+ self.output = nn.Linear(
310
+ args.dim,
311
+ args.vocab_size // self.model_parallel_size,
312
+ bias=False,
313
+ )
314
+
315
+ @torch.no_grad()
316
+ def forward_with_attn_bias(
317
+ self,
318
+ token_values: torch.Tensor,
319
+ attn_bias: AttnBias,
320
+ cache: list[LayerCache],
321
+ group: dist.ProcessGroup | None = None,
322
+ ) -> torch.Tensor:
323
+ h = self.tok_embeddings(token_values)
324
+ if self.model_parallel_size > 1:
325
+ gather = [torch.empty_like(h) for _ in range(self.model_parallel_size)]
326
+ dist.all_gather(gather, h, group=group)
327
+ h = torch.cat(gather, dim=-1)
328
+
329
+ for i, layer in enumerate(self.layers):
330
+ h = layer(h, cache[i], attn_bias, group=group)
331
+
332
+ logits = self.output(self.norm(h))
333
+ if self.model_parallel_size > 1:
334
+ gather = [torch.empty_like(logits) for _ in range(self.model_parallel_size)]
335
+ dist.all_gather(gather, logits, group=group)
336
+ logits = torch.cat(gather, dim=-1)
337
+ return logits.float()
338
+
339
+ def forward(
340
+ self,
341
+ token_values: torch.Tensor,
342
+ token_lengths: torch.Tensor,
343
+ start_pos: torch.Tensor,
344
+ cache: list[LayerCache],
345
+ kv_padding: int,
346
+ group: dist.ProcessGroup | None = None,
347
+ ) -> torch.Tensor:
348
+ attn_bias = AttnBias.from_seqlens(
349
+ q_seqlen=token_lengths.tolist(),
350
+ kv_seqlen=(start_pos + token_lengths).tolist(),
351
+ kv_padding=kv_padding,
352
+ )
353
+ return self.forward_with_attn_bias(token_values, attn_bias, cache, group=group)
354
+
355
+
356
+ def make_cache(
357
+ args: ModelArgs,
358
+ length: int,
359
+ device: str | torch.device | None = None,
360
+ n_layers: int | None = None,
361
+ dtype: torch.dtype | None = None,
362
+ ) -> list[LayerCache]:
363
+ """
364
+ Allocate a cache to be used with the Transformer module.
365
+
366
+ Args:
367
+ args (ModelArgs): the model configuration.
368
+ length (int): per layer cache size.
369
+ It is usually budgeted as ``max_batch * max_seq``
370
+ device (torch.device, optional): the device on which
371
+ the cache should be allocated.
372
+ n_layers (int, optional): the number of layers to
373
+ allocate a cache for (defaults to the model
374
+ settings).
375
+ dtype (torch.dtype, optional): the dtype to use for
376
+ cache entries (defaults to the default dtype).
377
+
378
+ Returns:
379
+ The cache object to pass to ``Transformer.forward``.
380
+ """
381
+
382
+ head_dim = args.dim // args.n_heads
383
+ n_kv_heads = args.n_kv_heads
384
+ if n_kv_heads is None:
385
+ n_kv_heads = args.n_heads
386
+ n_local_kv_heads = n_kv_heads // args.model_parallel_size
387
+
388
+ if n_layers is None:
389
+ n_layers = args.n_layers
390
+
391
+ shape = (1, length, n_local_kv_heads, head_dim)
392
+ return [
393
+ (
394
+ torch.zeros(shape, device=device, dtype=dtype),
395
+ torch.zeros(shape, device=device, dtype=dtype),
396
+ )
397
+ for _ in range(n_layers)
398
+ ]
399
+
400
+
401
+ def cache_prefix(cache: list[LayerCache], length: int) -> list[LayerCache]:
402
+ """
403
+ Take a prefix view of a larger cache.
404
+
405
+ The original cache object remains of identical size and valid
406
+ after the shrunken alias has been used. This function is useful
407
+ when a cache was allocated for a larger batch size than what is
408
+ necessary.
409
+
410
+ Args:
411
+ cache: the cache to take a view in.
412
+ length (int): the desired length
413
+
414
+ Returns:
415
+ A view in the input cache object.
416
+ """
417
+
418
+ if len(cache) > 0:
419
+ assert cache[0][0].shape[1] >= length
420
+
421
+ return [(ck[:, :length], cv[:, :length]) for ck, cv in cache]
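A small sketch of the cache helpers above (xformers must be installed for the module import; the sizes are arbitrary):

import torch
from chameleon.inference.transformer import ModelArgs, cache_prefix, make_cache

args = ModelArgs()                                               # defaults: 8 layers, dim 512, 8 heads
cache = make_cache(args, length=4 * 2048, dtype=torch.bfloat16)  # budget: max_batch * max_seq
print(len(cache), cache[0][0].shape)                             # 8 layers, each key/value (1, 8192, kv_heads, head_dim)

small = cache_prefix(cache, length=2 * 2048)                     # prefix view for a smaller batch, no reallocation
print(small[0][0].shape)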
chameleon/inference/utils.py ADDED
@@ -0,0 +1,34 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Chameleon License found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import socket
7
+ from typing import Generator, Generic, Iterator, TypeVar
8
+
9
+ T = TypeVar("T")
10
+
11
+
12
+ class DynamicGenerator(Generic[T]):
13
+ def __init__(self, gen: Generator[T, None, None]):
14
+ self.gen = gen
15
+
16
+ def __iter__(self) -> Iterator[T]:
17
+ return self
18
+
19
+ def __next__(self) -> T:
20
+ return next(self.gen)
21
+
22
+
23
+ def advance(iterator: Iterator[T], steps: int):
24
+ try:
25
+ for _ in range(steps):
26
+ next(iterator)
27
+ except StopIteration:
28
+ pass
29
+
30
+
31
+ def random_unused_port():
32
+ with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
33
+ s.bind(("", 0))
34
+ return s.getsockname()[1]
chameleon/inference/vocab.py ADDED
@@ -0,0 +1,122 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Chameleon License found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from functools import cached_property
7
+
8
+ import torch
9
+
10
+
11
+ class VocabInfo:
12
+ def __init__(self, vocab_map: dict[str, int]):
13
+ self.name2val = vocab_map
14
+
15
+ self.bos_id = vocab_map.get("<s>")
16
+ self.eos_id = vocab_map.get("</s>")
17
+ self.boi_id = vocab_map.get("<racm3:break>")
18
+ self.eoi_id = vocab_map.get("<eoss>")
19
+ self.pad_id = vocab_map.get("<pad>")
20
+ self.eot_id = vocab_map.get("<reserved08706>")
21
+
22
+ @property
23
+ def begin_sequence(self) -> int:
24
+ return self.bos_id
25
+
26
+ @property
27
+ def end_sequence(self) -> int:
28
+ return self.eos_id
29
+
30
+ @property
31
+ def begin_image(self) -> int:
32
+ return self.boi_id
33
+
34
+ @property
35
+ def end_image(self) -> int:
36
+ return self.eoi_id
37
+
38
+ @property
39
+ def padding(self) -> int:
40
+ return self.pad_id
41
+
42
+ @property
43
+ def end_turn(self) -> int:
44
+ return self.eot_id
45
+
46
+ @cached_property
47
+ def val2name(self) -> dict[int, str]:
48
+ return {v: k for k, v in self.name2val.items()}
49
+
50
+ @cached_property
51
+ def all_tokens(self) -> list[int]:
52
+ return sorted(self.name2val.values())
53
+
54
+ @cached_property
55
+ def image_tokens(self) -> list[int]:
56
+ return sorted(
57
+ [val for name, val in self.name2val.items() if name.startswith("IMGIMG")]
58
+ )
59
+
60
+ @cached_property
61
+ def special_tokens(self) -> list[int]:
62
+ return sorted(
63
+ [
64
+ val
65
+ for name, val in self.name2val.items()
66
+ if name.startswith("<") and name != "<"
67
+ ]
68
+ )
69
+
70
+ @cached_property
71
+ def text_tokens(self) -> list[int]:
72
+ return sorted(
73
+ set(self.all_tokens) - set(self.image_tokens) - set(self.special_tokens)
74
+ )
75
+
76
+
77
+ class VocabTranslation:
78
+ def __init__(self, vocab_info: VocabInfo, device: str | None = None):
79
+ self._vocab = vocab_info
80
+ self._device = device
81
+
82
+ @cached_property
83
+ def bpe2img(self) -> dict[int, int]:
84
+ img_tkn_chr_mapping = {chr(ord("A") + i): str(i) for i in range(10)}
85
+
86
+ def remap(old_name: str) -> str:
87
+ return "".join(
88
+ img_tkn_chr_mapping.get(c, c) for c in old_name[len("IMGIMG") : -1]
89
+ )
90
+
91
+ return {
92
+ tok: int(remap(self._vocab.val2name[tok]))
93
+ for tok in self._vocab.image_tokens
94
+ }
95
+
96
+ @cached_property
97
+ def img2bpe(self) -> dict[int, int]:
98
+ return {v: k for k, v in self.bpe2img.items()}
99
+
100
+ @cached_property
101
+ def bpe2img_search_tensors(self) -> tuple[torch.Tensor, torch.Tensor]:
102
+ sorted_bpe = torch.tensor(sorted(self.bpe2img.keys()), device=self._device)
103
+ sorted_img = torch.tensor(sorted(self.bpe2img.values()), device=self._device)
104
+ return sorted_bpe, sorted_img
105
+
106
+ @cached_property
107
+ def img2bpe_mapping_tensor(self) -> torch.LongTensor:
108
+ mapping = torch.zeros(
109
+ max(self.img2bpe.keys()) + 1,
110
+ dtype=torch.int,
111
+ device=self._device,
112
+ )
113
+ for k, v in self.img2bpe.items():
114
+ mapping[k] = v
115
+ return mapping
116
+
117
+ def convert_bpe2img(self, bpe_batch: torch.Tensor) -> torch.Tensor:
118
+ bpe_tok, img_tok = self.bpe2img_search_tensors
119
+ return img_tok[torch.searchsorted(bpe_tok, bpe_batch)]
120
+
121
+ def convert_img2bp2(self, img_batch: torch.Tensor) -> torch.Tensor:
122
+ return self.img2bpe_mapping_tensor[img_batch]
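The BPE-to-image-code translation relies on the IMGIMG naming scheme: letters A..J encode digits 0..9 and the trailing character is dropped. A toy vocabulary makes the mapping concrete:

from chameleon.inference.vocab import VocabInfo, VocabTranslation

vocab = VocabInfo({"<s>": 0, "</s>": 1, "IMGIMGBAZ": 5, "IMGIMGBBZ": 6})
trans = VocabTranslation(vocab)
print(trans.bpe2img)   # {5: 10, 6: 11} -- "BA" -> "10", "BB" -> "11"
print(trans.img2bpe)   # {10: 5, 11: 6}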
chameleon/inference/vqgan.py ADDED
@@ -0,0 +1,675 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ # This source code is licensed under the Chameleon License found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ """
7
+ Contents of this file are taken from https://github.com/CompVis/taming-transformers/blob/3ba01b241669f5ade541ce990f7650a3b8f65318/taming/models/vqgan.py
8
+ [with minimal dependencies]
9
+
10
+ This implementation is inference-only -- training steps and optimizer components
11
+ introduce significant additional dependencies
12
+ """
13
+
14
+ import numpy as np
15
+ import torch
16
+ import torch.nn as nn
17
+ import torch.nn.functional as F
18
+
19
+
20
+ class VectorQuantizer2(nn.Module):
21
+ """
22
+ Improved version over VectorQuantizer, can be used as a drop-in replacement. Mostly
23
+ avoids costly matrix multiplications and allows for post-hoc remapping of indices.
24
+ """
25
+
26
+ # NOTE: due to a bug the beta term was applied to the wrong term. For
27
+ # backwards compatibility we use the buggy version by default, but you can
28
+ # specify legacy=False to fix it.
29
+ def __init__(
30
+ self,
31
+ n_e,
32
+ e_dim,
33
+ beta,
34
+ remap=None,
35
+ unknown_index="random",
36
+ sane_index_shape=False,
37
+ legacy=True,
38
+ ):
39
+ super().__init__()
40
+ self.n_e = n_e
41
+ self.e_dim = e_dim
42
+ self.beta = beta
43
+ self.legacy = legacy
44
+
45
+ self.embedding = nn.Embedding(self.n_e, self.e_dim)
46
+ self.embedding.weight.data.uniform_(-1.0 / self.n_e, 1.0 / self.n_e)
47
+
48
+ self.remap = remap
49
+ if self.remap is not None:
50
+ self.register_buffer("used", torch.tensor(np.load(self.remap)))
51
+ self.re_embed = self.used.shape[0]
52
+ self.unknown_index = unknown_index # "random" or "extra" or integer
53
+ if self.unknown_index == "extra":
54
+ self.unknown_index = self.re_embed
55
+ self.re_embed = self.re_embed + 1
56
+ print(
57
+ f"Remapping {self.n_e} indices to {self.re_embed} indices. "
58
+ f"Using {self.unknown_index} for unknown indices."
59
+ )
60
+ else:
61
+ self.re_embed = n_e
62
+
63
+ self.sane_index_shape = sane_index_shape
64
+
65
+ def remap_to_used(self, inds):
66
+ ishape = inds.shape
67
+ assert len(ishape) > 1
68
+ inds = inds.reshape(ishape[0], -1)
69
+ used = self.used.to(inds)
70
+ match = (inds[:, :, None] == used[None, None, ...]).long()
71
+ new = match.argmax(-1)
72
+ unknown = match.sum(2) < 1
73
+ if self.unknown_index == "random":
74
+ new[unknown] = torch.randint(0, self.re_embed, size=new[unknown].shape).to(
75
+ device=new.device
76
+ )
77
+ else:
78
+ new[unknown] = self.unknown_index
79
+ return new.reshape(ishape)
80
+
81
+ def unmap_to_all(self, inds):
82
+ ishape = inds.shape
83
+ assert len(ishape) > 1
84
+ inds = inds.reshape(ishape[0], -1)
85
+ used = self.used.to(inds)
86
+ if self.re_embed > self.used.shape[0]: # extra token
87
+ inds[inds >= self.used.shape[0]] = 0 # simply set to zero
88
+ back = torch.gather(used[None, :][inds.shape[0] * [0], :], 1, inds)
89
+ return back.reshape(ishape)
90
+
91
+ def forward(self, z, temp=None, rescale_logits=False, return_logits=False):
92
+ assert temp is None or temp == 1.0, "Only for interface compatible with Gumbel"
93
+ assert rescale_logits is False, "Only for interface compatible with Gumbel"
94
+ assert return_logits is False, "Only for interface compatible with Gumbel"
95
+ # reshape z -> (batch, height, width, channel) and flatten
96
+ z = z.permute(0, 2, 3, 1).contiguous()
97
+ z_flattened = z.view(-1, self.e_dim)
98
+ # distances from z to embeddings e_j (z - e)^2 = z^2 + e^2 - 2 e * z
99
+
100
+ d = (
101
+ torch.sum(z_flattened**2, dim=1, keepdim=True)
102
+ + torch.sum(self.embedding.weight**2, dim=1)
103
+ - 2
104
+ * torch.einsum(
105
+ "bd,dn->bn", z_flattened, self.embedding.weight.transpose(0, 1)
106
+ )
107
+ )
108
+
109
+ min_encoding_indices = torch.argmin(d, dim=1)
110
+ z_q = self.embedding(min_encoding_indices).view(z.shape)
111
+ perplexity = None
112
+ min_encodings = None
113
+
114
+ # compute loss for embedding
115
+ if not self.legacy:
116
+ loss = self.beta * torch.mean((z_q.detach() - z) ** 2) + torch.mean(
117
+ (z_q - z.detach()) ** 2
118
+ )
119
+ else:
120
+ loss = torch.mean((z_q.detach() - z) ** 2) + self.beta * torch.mean(
121
+ (z_q - z.detach()) ** 2
122
+ )
123
+
124
+ # preserve gradients
125
+ z_q = z + (z_q - z).detach()
126
+
127
+ # reshape back to match original input shape
128
+ z_q = z_q.permute(0, 3, 1, 2).contiguous()
129
+
130
+ if self.remap is not None:
131
+ min_encoding_indices = min_encoding_indices.reshape(
132
+ z.shape[0], -1
133
+ ) # add batch axis
134
+ min_encoding_indices = self.remap_to_used(min_encoding_indices)
135
+ min_encoding_indices = min_encoding_indices.reshape(-1, 1) # flatten
136
+
137
+ if self.sane_index_shape:
138
+ min_encoding_indices = min_encoding_indices.reshape(
139
+ z_q.shape[0], z_q.shape[2], z_q.shape[3]
140
+ )
141
+
142
+ return z_q, loss, (perplexity, min_encodings, min_encoding_indices)
143
+
144
+ def get_codebook_entry(self, indices, shape):
145
+ # shape specifying (batch, height, width, channel)
146
+ if self.remap is not None:
147
+ indices = indices.reshape(shape[0], -1) # add batch axis
148
+ indices = self.unmap_to_all(indices)
149
+ indices = indices.reshape(-1) # flatten again
150
+
151
+ # get quantized latent vectors
152
+ z_q = self.embedding(indices)
153
+
154
+ if shape is not None:
155
+ z_q = z_q.view(shape)
156
+ # reshape back to match original input shape
157
+ z_q = z_q.permute(0, 3, 1, 2).contiguous()
158
+
159
+ return z_q
160
+
161
+
162
+ # Alias
163
+ VectorQuantizer = VectorQuantizer2
164
+
165
+
166
+ def nonlinearity(x):
167
+ # swish
168
+ return x * torch.sigmoid(x)
169
+
170
+
171
+ def Normalize(in_channels, num_groups=32):
172
+ return torch.nn.GroupNorm(
173
+ num_groups=num_groups, num_channels=in_channels, eps=1e-6, affine=True
174
+ )
175
+
176
+
177
+ class Upsample(nn.Module):
178
+ def __init__(self, in_channels, with_conv):
179
+ super().__init__()
180
+ self.with_conv = with_conv
181
+ if self.with_conv:
182
+ self.conv = torch.nn.Conv2d(
183
+ in_channels, in_channels, kernel_size=3, stride=1, padding=1
184
+ )
185
+
186
+ def forward(self, x):
187
+ x = F.interpolate(x, scale_factor=2.0, mode="nearest")
188
+ if self.with_conv:
189
+ x = self.conv(x)
190
+ return x
191
+
192
+
193
+ class Downsample(nn.Module):
194
+ def __init__(self, in_channels, with_conv):
195
+ super().__init__()
196
+ self.with_conv = with_conv
197
+ if self.with_conv:
198
+ # no asymmetric padding in torch conv, must do it ourselves
199
+ self.conv = torch.nn.Conv2d(
200
+ in_channels, in_channels, kernel_size=3, stride=2, padding=0
201
+ )
202
+
203
+ def forward(self, x):
204
+ if self.with_conv:
205
+ pad = (0, 1, 0, 1)
206
+ x = F.pad(x, pad, mode="constant", value=0)
207
+ x = self.conv(x)
208
+ else:
209
+ x = F.avg_pool2d(x, kernel_size=2, stride=2)
210
+ return x
211
+
212
+
213
+ class ResnetBlock(nn.Module):
214
+ def __init__(
215
+ self,
216
+ *,
217
+ in_channels,
218
+ out_channels=None,
219
+ conv_shortcut=False,
220
+ dropout,
221
+ temb_channels=512,
222
+ ):
223
+ super().__init__()
224
+ self.in_channels = in_channels
225
+ out_channels = in_channels if out_channels is None else out_channels
226
+ self.out_channels = out_channels
227
+ self.use_conv_shortcut = conv_shortcut
228
+
229
+ self.norm1 = Normalize(in_channels)
230
+ self.conv1 = torch.nn.Conv2d(
231
+ in_channels, out_channels, kernel_size=3, stride=1, padding=1
232
+ )
233
+ if temb_channels > 0:
234
+ self.temb_proj = torch.nn.Linear(temb_channels, out_channels)
235
+ self.norm2 = Normalize(out_channels)
236
+ self.dropout = torch.nn.Dropout(dropout)
237
+ self.conv2 = torch.nn.Conv2d(
238
+ out_channels, out_channels, kernel_size=3, stride=1, padding=1
239
+ )
240
+ if self.in_channels != self.out_channels:
241
+ if self.use_conv_shortcut:
242
+ self.conv_shortcut = torch.nn.Conv2d(
243
+ in_channels, out_channels, kernel_size=3, stride=1, padding=1
244
+ )
245
+ else:
246
+ self.nin_shortcut = torch.nn.Conv2d(
247
+ in_channels, out_channels, kernel_size=1, stride=1, padding=0
248
+ )
249
+
250
+ def forward(self, x, temb):
251
+ h = x
252
+ h = self.norm1(h)
253
+ h = nonlinearity(h)
254
+ h = self.conv1(h)
255
+
256
+ if temb is not None:
257
+ h = h + self.temb_proj(nonlinearity(temb))[:, :, None, None]
258
+
259
+ h = self.norm2(h)
260
+ h = nonlinearity(h)
261
+ h = self.dropout(h)
262
+ h = self.conv2(h)
263
+
264
+ if self.in_channels != self.out_channels:
265
+ if self.use_conv_shortcut:
266
+ x = self.conv_shortcut(x)
267
+ else:
268
+ x = self.nin_shortcut(x)
269
+
270
+ return x + h
271
+
272
+
273
+ class AttnBlock(nn.Module):
274
+ def __init__(self, in_channels):
275
+ super().__init__()
276
+ self.in_channels = in_channels
277
+
278
+ self.norm = Normalize(in_channels)
279
+ self.q = torch.nn.Conv2d(
280
+ in_channels, in_channels, kernel_size=1, stride=1, padding=0
281
+ )
282
+ self.k = torch.nn.Conv2d(
283
+ in_channels, in_channels, kernel_size=1, stride=1, padding=0
284
+ )
285
+ self.v = torch.nn.Conv2d(
286
+ in_channels, in_channels, kernel_size=1, stride=1, padding=0
287
+ )
288
+ self.proj_out = torch.nn.Conv2d(
289
+ in_channels, in_channels, kernel_size=1, stride=1, padding=0
290
+ )
291
+
292
+ def forward(self, x):
293
+ h_ = x
294
+ h_ = self.norm(h_)
295
+ q = self.q(h_)
296
+ k = self.k(h_)
297
+ v = self.v(h_)
298
+
299
+ # compute attention
300
+ b, c, h, w = q.shape
301
+ q = q.reshape(b, c, h * w)
302
+ q = q.permute(0, 2, 1) # b,hw,c
303
+ k = k.reshape(b, c, h * w) # b,c,hw
304
+ w_ = torch.bmm(q, k) # b,hw,hw w[b,i,j]=sum_c q[b,i,c]k[b,c,j]
305
+ w_ = w_ * (int(c) ** (-0.5))
306
+ w_ = F.softmax(w_, dim=2)
307
+
308
+ # attend to values
309
+ v = v.reshape(b, c, h * w)
310
+ w_ = w_.permute(0, 2, 1) # b,hw,hw (first hw of k, second of q)
311
+ h_ = torch.bmm(v, w_) # b, c,hw (hw of q) h_[b,c,j] = sum_i v[b,c,i] w_[b,i,j]
312
+ h_ = h_.reshape(b, c, h, w)
313
+
314
+ h_ = self.proj_out(h_)
315
+
316
+ return x + h_
317
+
318
+
319
+ def make_attn(in_channels, attn_type="vanilla"):
320
+ assert attn_type in ["vanilla", "linear", "none"], f"attn_type {attn_type} unknown"
321
+ # print(f"making attention of type '{attn_type}' with {in_channels} in_channels")
322
+ if attn_type == "vanilla":
323
+ return AttnBlock(in_channels)
324
+ elif attn_type == "none":
325
+ return nn.Identity(in_channels)
326
+ else:
327
+ raise ValueError("Unexpected attention type")
328
+
329
+
330
+ class Encoder(nn.Module):
331
+ def __init__(
332
+ self,
333
+ *,
334
+ ch,
335
+ out_ch,
336
+ ch_mult=(1, 2, 4, 8),
337
+ num_res_blocks,
338
+ attn_resolutions,
339
+ dropout=0.0,
340
+ resamp_with_conv=True,
341
+ in_channels,
342
+ resolution,
343
+ z_channels,
344
+ double_z=True,
345
+ use_linear_attn=False,
346
+ attn_type="vanilla",
347
+ **ignore_kwargs,
348
+ ):
349
+ super().__init__()
350
+ if use_linear_attn:
351
+ attn_type = "linear"
352
+ self.ch = ch
353
+ self.temb_ch = 0
354
+ self.num_resolutions = len(ch_mult)
355
+ self.num_res_blocks = num_res_blocks
356
+ self.resolution = resolution
357
+ self.in_channels = in_channels
358
+
359
+ # downsampling
360
+ self.conv_in = torch.nn.Conv2d(
361
+ in_channels, self.ch, kernel_size=3, stride=1, padding=1
362
+ )
363
+
364
+ curr_res = resolution
365
+ in_ch_mult = (1,) + tuple(ch_mult)
366
+ self.in_ch_mult = in_ch_mult
367
+ self.down = nn.ModuleList()
368
+ for i_level in range(self.num_resolutions):
369
+ block = nn.ModuleList()
370
+ attn = nn.ModuleList()
371
+ block_in = ch * in_ch_mult[i_level]
372
+ block_out = ch * ch_mult[i_level]
373
+ for i_block in range(self.num_res_blocks):
374
+ block.append(
375
+ ResnetBlock(
376
+ in_channels=block_in,
377
+ out_channels=block_out,
378
+ temb_channels=self.temb_ch,
379
+ dropout=dropout,
380
+ )
381
+ )
382
+ block_in = block_out
383
+ if curr_res in attn_resolutions:
384
+ attn.append(make_attn(block_in, attn_type=attn_type))
385
+ down = nn.Module()
386
+ down.block = block
387
+ down.attn = attn
388
+ if i_level != self.num_resolutions - 1:
389
+ down.downsample = Downsample(block_in, resamp_with_conv)
390
+ curr_res = curr_res // 2
391
+ self.down.append(down)
392
+
393
+ # middle
394
+ self.mid = nn.Module()
395
+ self.mid.block_1 = ResnetBlock(
396
+ in_channels=block_in,
397
+ out_channels=block_in,
398
+ temb_channels=self.temb_ch,
399
+ dropout=dropout,
400
+ )
401
+ self.mid.attn_1 = make_attn(block_in, attn_type=attn_type)
402
+ self.mid.block_2 = ResnetBlock(
403
+ in_channels=block_in,
404
+ out_channels=block_in,
405
+ temb_channels=self.temb_ch,
406
+ dropout=dropout,
407
+ )
408
+
409
+ # end
410
+ self.norm_out = Normalize(block_in)
411
+ self.conv_out = torch.nn.Conv2d(
412
+ block_in,
413
+ 2 * z_channels if double_z else z_channels,
414
+ kernel_size=3,
415
+ stride=1,
416
+ padding=1,
417
+ )
418
+
419
+ def forward(self, x):
420
+ # timestep embedding
421
+ temb = None
422
+
423
+ # downsampling
424
+ hs = [self.conv_in(x)]
425
+ for i_level in range(self.num_resolutions):
426
+ for i_block in range(self.num_res_blocks):
427
+ h = self.down[i_level].block[i_block](hs[-1], temb)
428
+ if len(self.down[i_level].attn) > 0:
429
+ h = self.down[i_level].attn[i_block](h)
430
+ hs.append(h)
431
+ if i_level != self.num_resolutions - 1:
432
+ hs.append(self.down[i_level].downsample(hs[-1]))
433
+
434
+ # middle
435
+ h = hs[-1]
436
+ h = self.mid.block_1(h, temb)
437
+ h = self.mid.attn_1(h)
438
+ h = self.mid.block_2(h, temb)
439
+
440
+ # end
441
+ h = self.norm_out(h)
442
+ h = nonlinearity(h)
443
+ h = self.conv_out(h)
444
+ return h
445
+
446
+
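+ # Illustrative note: with the chameleon/vqgan.yaml settings below (resolution 512, ch_mult of
+ # length 5), the Encoder applies four Downsample stages, a total factor of 16, so a 512x512 RGB
+ # image becomes a 32x32 map of 256-dim latents, i.e. 1024 discrete codes after quantization.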
447
+ class Decoder(nn.Module):
448
+ def __init__(
449
+ self,
450
+ *,
451
+ ch,
452
+ out_ch,
453
+ ch_mult=(1, 2, 4, 8),
454
+ num_res_blocks,
455
+ attn_resolutions,
456
+ dropout=0.0,
457
+ resamp_with_conv=True,
458
+ in_channels,
459
+ resolution,
460
+ z_channels,
461
+ give_pre_end=False,
462
+ tanh_out=False,
463
+ use_linear_attn=False,
464
+ attn_type="vanilla",
465
+ **ignorekwargs,
466
+ ):
467
+ super().__init__()
468
+ if use_linear_attn:
469
+ attn_type = "linear"
470
+ self.ch = ch
471
+ self.temb_ch = 0
472
+ self.num_resolutions = len(ch_mult)
473
+ self.num_res_blocks = num_res_blocks
474
+ self.resolution = resolution
475
+ self.in_channels = in_channels
476
+ self.give_pre_end = give_pre_end
477
+ self.tanh_out = tanh_out
478
+
479
+ # compute in_ch_mult, block_in and curr_res at lowest res
480
+ block_in = ch * ch_mult[self.num_resolutions - 1]
481
+ curr_res = resolution // 2 ** (self.num_resolutions - 1)
482
+ self.z_shape = (1, z_channels, curr_res, curr_res)
483
+
484
+ # z to block_in
485
+ self.conv_in = torch.nn.Conv2d(
486
+ z_channels, block_in, kernel_size=3, stride=1, padding=1
487
+ )
488
+
489
+ # middle
490
+ self.mid = nn.Module()
491
+ self.mid.block_1 = ResnetBlock(
492
+ in_channels=block_in,
493
+ out_channels=block_in,
494
+ temb_channels=self.temb_ch,
495
+ dropout=dropout,
496
+ )
497
+ self.mid.attn_1 = make_attn(block_in, attn_type=attn_type)
498
+ self.mid.block_2 = ResnetBlock(
499
+ in_channels=block_in,
500
+ out_channels=block_in,
501
+ temb_channels=self.temb_ch,
502
+ dropout=dropout,
503
+ )
504
+
505
+ # upsampling
506
+ self.up = nn.ModuleList()
507
+ for i_level in reversed(range(self.num_resolutions)):
508
+ block = nn.ModuleList()
509
+ attn = nn.ModuleList()
510
+ block_out = ch * ch_mult[i_level]
511
+ for i_block in range(self.num_res_blocks + 1):
512
+ block.append(
513
+ ResnetBlock(
514
+ in_channels=block_in,
515
+ out_channels=block_out,
516
+ temb_channels=self.temb_ch,
517
+ dropout=dropout,
518
+ )
519
+ )
520
+ block_in = block_out
521
+ if curr_res in attn_resolutions:
522
+ attn.append(make_attn(block_in, attn_type=attn_type))
523
+ up = nn.Module()
524
+ up.block = block
525
+ up.attn = attn
526
+ if i_level != 0:
527
+ up.upsample = Upsample(block_in, resamp_with_conv)
528
+ curr_res = curr_res * 2
529
+ self.up.insert(0, up) # prepend to get consistent order
530
+
531
+ # end
532
+ self.norm_out = Normalize(block_in)
533
+ self.conv_out = torch.nn.Conv2d(
534
+ block_in, out_ch, kernel_size=3, stride=1, padding=1
535
+ )
536
+
537
+ def forward(self, z):
538
+ # assert z.shape[1:] == self.z_shape[1:]
539
+ self.last_z_shape = z.shape
540
+
541
+ # timestep embedding
542
+ temb = None
543
+
544
+ # z to block_in
545
+ h = self.conv_in(z)
546
+
547
+ # middle
548
+ h = self.mid.block_1(h, temb)
549
+ h = self.mid.attn_1(h)
550
+ h = self.mid.block_2(h, temb)
551
+
552
+ # upsampling
553
+ for i_level in reversed(range(self.num_resolutions)):
554
+ for i_block in range(self.num_res_blocks + 1):
555
+ h = self.up[i_level].block[i_block](h, temb)
556
+ if len(self.up[i_level].attn) > 0:
557
+ h = self.up[i_level].attn[i_block](h)
558
+ if i_level != 0:
559
+ h = self.up[i_level].upsample(h)
560
+
561
+ # end
562
+ if self.give_pre_end:
563
+ return h
564
+
565
+ h = self.norm_out(h)
566
+ h = nonlinearity(h)
567
+ h = self.conv_out(h)
568
+ if self.tanh_out:
569
+ h = torch.tanh(h)
570
+ return h
571
+
572
+
573
+ class VQModel(nn.Module):
574
+ def __init__(
575
+ self,
576
+ ddconfig,
577
+ n_embed,
578
+ embed_dim,
579
+ ckpt_path=None,
580
+ ignore_keys=[],
581
+ image_key="image",
582
+ colorize_nlabels=None,
583
+ monitor=None,
584
+ scheduler_config=None,
585
+ lr_g_factor=1.0,
586
+ remap=None,
587
+ sane_index_shape=False, # tell vector quantizer to return indices as bhw
588
+ ):
589
+ super().__init__()
590
+ self.image_key = image_key
591
+ self.encoder = Encoder(**ddconfig)
592
+ self.decoder = Decoder(**ddconfig)
593
+ self.quantize = VectorQuantizer(
594
+ n_embed,
595
+ embed_dim,
596
+ beta=0.25,
597
+ remap=remap,
598
+ sane_index_shape=sane_index_shape,
599
+ )
600
+ self.quant_conv = torch.nn.Conv2d(ddconfig["z_channels"], embed_dim, 1)
601
+ self.post_quant_conv = torch.nn.Conv2d(embed_dim, ddconfig["z_channels"], 1)
602
+ if ckpt_path is not None:
603
+ self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys)
604
+ self.image_key = image_key
605
+ if colorize_nlabels is not None:
606
+ assert isinstance(colorize_nlabels, int)
607
+ self.register_buffer("colorize", torch.randn(3, colorize_nlabels, 1, 1))
608
+ if monitor is not None:
609
+ self.monitor = monitor
610
+ self.scheduler_config = scheduler_config
611
+ self.lr_g_factor = lr_g_factor
612
+
613
+ def init_from_ckpt(self, path, ignore_keys=list()):
614
+ sd = torch.load(path, map_location="cpu")["state_dict"]
615
+ keys = list(sd.keys())
616
+ for k in keys:
617
+ for ik in ignore_keys:
618
+ if k.startswith(ik):
619
+ print("Deleting key {} from state_dict.".format(k))
620
+ del sd[k]
621
+ self.load_state_dict(sd, strict=False)
622
+ print(f"VQModel loaded from {path}")
623
+
624
+ def encode(self, x):
625
+ h = self.encoder(x)
626
+ h = self.quant_conv(h)
627
+ quant, emb_loss, info = self.quantize(h)
628
+ return quant, emb_loss, info
629
+
630
+ def decode(self, quant):
631
+ quant = self.post_quant_conv(quant)
632
+ dec = self.decoder(quant)
633
+ return dec
634
+
635
+ def decode_code(self, code_b):
636
+ quant_b = self.quantize.embed_code(code_b)
637
+ dec = self.decode(quant_b)
638
+ return dec
639
+
640
+ def forward(self, input):
641
+ quant, diff, _ = self.encode(input)
642
+ dec = self.decode(quant)
643
+ return dec, diff
644
+
645
+ def get_input(self, batch, k):
646
+ x = batch[k]
647
+ if len(x.shape) == 3:
648
+ x = x[..., None]
649
+ x = x.permute(0, 3, 1, 2).to(memory_format=torch.contiguous_format)
650
+ return x.float()
651
+
652
+ def get_last_layer(self):
653
+ return self.decoder.conv_out.weight
654
+
655
+ def log_images(self, batch, **kwargs):
656
+ log = dict()
657
+ x = self.get_input(batch, self.image_key)
658
+ x = x.to(next(self.parameters()).device)  # plain nn.Module has no .device attribute; use a parameter's device
659
+ xrec, _ = self(x)
660
+ if x.shape[1] > 3:
661
+ # colorize with random projection
662
+ assert xrec.shape[1] > 3
663
+ x = self.to_rgb(x)
664
+ xrec = self.to_rgb(xrec)
665
+ log["inputs"] = x
666
+ log["reconstructions"] = xrec
667
+ return log
668
+
669
+ def to_rgb(self, x):
670
+ assert self.image_key == "segmentation"
671
+ if not hasattr(self, "colorize"):
672
+ self.register_buffer("colorize", torch.randn(3, x.shape[1], 1, 1).to(x))
673
+ x = F.conv2d(x, weight=self.colorize)
674
+ x = 2.0 * (x - x.min()) / (x.max() - x.min()) - 1.0
675
+ return x
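For orientation, a minimal round-trip sketch for the VQModel defined above (not part of the uploaded file). It assumes the module-level imports at the top of vqgan.py (torch, nn, F), the ddconfig values from chameleon/vqgan.yaml below, and an illustrative checkpoint path; the input tensor stands in for a preprocessed image in [-1, 1]:

    import torch

    ddconfig = dict(
        double_z=False, z_channels=256, resolution=512, in_channels=3, out_ch=3,
        ch=128, ch_mult=(1, 1, 2, 2, 4), num_res_blocks=2, attn_resolutions=[], dropout=0.0,
    )
    model = VQModel(ddconfig, n_embed=8192, embed_dim=256, ckpt_path="chameleon/vqgan.ckpt").eval()

    x = torch.randn(1, 3, 512, 512)                         # stand-in for a normalized image batch
    with torch.no_grad():
        quant, emb_loss, (_, _, indices) = model.encode(x)  # quantized latents and discrete code indices
        recon = model.decode(quant)                          # (1, 3, 512, 512) reconstruction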
chameleon/vqgan.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4ede986bf6b171db3081ce171ad88e4ac970793cea14c180b3e5ac5105f4cb43
3
+ size 281270377
chameleon/vqgan.yaml ADDED
@@ -0,0 +1,57 @@
1
+ model:
2
+ base_learning_rate: 4.5e-06
3
+ target: taming.models.vqgan.VQModel
4
+ params:
5
+ embed_dim: 256
6
+ n_embed: 8192
7
+ ddconfig:
8
+ double_z: false
9
+ z_channels: 256
10
+ resolution: 512
11
+ in_channels: 3
12
+ out_ch: 3
13
+ ch: 128
14
+ ch_mult:
15
+ - 1
16
+ - 1
17
+ - 2
18
+ - 2
19
+ - 4
20
+ num_res_blocks: 2
21
+ attn_resolutions: []
22
+ dropout: 0.0
23
+ lossconfig:
24
+ target: taming.modules.losses.vqperceptual_vit_vqgan.VQLPIPSWithDiscriminator
25
+ params:
26
+ disc_start: 100001
27
+ perceptual_weight: 1.0
28
+ adversarial_weight: 0.5
29
+ disc_params:
30
+ size: 512
31
+ ckpt_path: manifold://fair_onellm_checkpoints/tree/v2/tokenizer/vqgan_wm_0209.ckpt
32
+ data:
33
+ target: main.DataModuleFromConfig
34
+ params:
35
+ batch_size: 4
36
+ num_workers: 10
37
+ image_size: 512
38
+ filter_image_size: 512
39
+ dataset: coco
40
+ aesthetics_th: 0
41
+ clipsim_th: 0
42
+ --distributed-world-size: null
43
+ '32': null
44
+ --distributed-port: null
45
+ '17338': null
46
+ --save-dir: null
47
+ /checkpoint/shellysheynin/shutterstock/512x512_1024tokens_4node_shutterstock_laion_no_attn_styleGAN:
48
+ log_every-500:
49
+ ngpu32: null
50
+ --tensorboard-logdir: null
51
+ /checkpoint/shellysheynin/tensorboard_logs/2023-03-30/512x512_1024tokens_4node_shutterstock_laion_no_attn_styleGAN:
52
+ log_every-500:
53
+ ngpu32: null
54
+ '14561': null
55
+ /checkpoint/shellysheynin/tensorboard_logs/2023-04-02/512x512_1024tokens_4node_shutterstock_laion_no_attn_styleGAN:
56
+ log_every-500:
57
+ ngpu32: null
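A small sketch of how this config is meant to be consumed by the VQModel class from vqgan.py (assuming PyYAML; only model.params is needed here, and the trailing keys such as --distributed-world-size and the /checkpoint/... paths look like training-launcher arguments serialized into the file, which can be ignored for inference):

    import yaml

    with open("chameleon/vqgan.yaml") as f:
        params = yaml.safe_load(f)["model"]["params"]

    vq = VQModel(
        ddconfig=params["ddconfig"],
        n_embed=params["n_embed"],
        embed_dim=params["embed_dim"],
        ckpt_path="chameleon/vqgan.ckpt",   # the LFS weight file added above
    )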
conversation.py ADDED
@@ -0,0 +1,460 @@
1
+ import dataclasses
2
+ from enum import auto, Enum
3
+ from typing import List, Tuple
4
+ import base64
5
+ from io import BytesIO
6
+ from PIL import Image
7
+
8
+
9
+ class SeparatorStyle(Enum):
10
+ """Different separator style."""
11
+ SINGLE = auto()
12
+ TWO = auto()
13
+ MPT = auto()
14
+ PLAIN = auto()
15
+ LLAMA_2 = auto()
16
+ GEMMA = auto()
17
+
18
+
19
+ @dataclasses.dataclass
20
+ class Conversation:
21
+ """A class that keeps all conversation history."""
22
+ system: str
23
+ roles: List[str]
24
+ messages: List[List[str]]
25
+ offset: int
26
+ sep_style: SeparatorStyle = SeparatorStyle.SINGLE
27
+ sep: str = "###"
28
+ sep2: str = None
29
+ version: str = "Unknown"
30
+
31
+ skip_next: bool = False
32
+
33
+ def get_prompt(self):
34
+ messages = self.messages
35
+ if len(messages) > 0 and type(messages[0][1]) is tuple:
36
+ messages = self.messages.copy()
37
+ init_role, init_msg = messages[0].copy()
38
+ init_msg = init_msg[0].replace("<image>", "").strip()
39
+ if 'mmtag' in self.version:
40
+ messages[0] = (init_role, init_msg)
41
+ messages.insert(0, (self.roles[0], "<Image><image></Image>"))
42
+ messages.insert(1, (self.roles[1], "Received."))
43
+ else:
44
+ messages[0] = (init_role, "<image>\n" + init_msg)
45
+
46
+ if self.sep_style == SeparatorStyle.SINGLE:
47
+ ret = self.system + self.sep
48
+ for role, message in messages:
49
+ if message:
50
+ if type(message) is tuple:
51
+ message = message[0]
52
+ ret += role + ": " + message + self.sep
53
+ else:
54
+ ret += role + ":"
55
+ elif self.sep_style == SeparatorStyle.TWO:
56
+ seps = [self.sep, self.sep2]
57
+ ret = self.system + seps[0]
58
+ for i, (role, message) in enumerate(messages):
59
+ if message:
60
+ if type(message) is tuple:
61
+ message = message[0]
62
+ ret += role + ": " + message + seps[i % 2]
63
+ else:
64
+ ret += role + ":"
65
+ elif self.sep_style == SeparatorStyle.MPT:
66
+ ret = self.system + self.sep
67
+ for role, message in messages:
68
+ if message:
69
+ if type(message) is tuple:
70
+ message = message[0]
71
+ ret += role + message + self.sep
72
+ else:
73
+ ret += role
74
+ elif self.sep_style == SeparatorStyle.LLAMA_2:
75
+ wrap_sys = lambda msg: f"<<SYS>>\n{msg}\n<</SYS>>\n\n" if len(msg) > 0 else msg
76
+ wrap_inst = lambda msg: f"[INST] {msg} [/INST]"
77
+ ret = ""
78
+
79
+ for i, (role, message) in enumerate(messages):
80
+ if i == 0:
81
+ assert message, "first message should not be none"
82
+ assert role == self.roles[0], "first message should come from user"
83
+ if message:
84
+ if type(message) is tuple:
85
+ message, _, _ = message
86
+ if i == 0: message = wrap_sys(self.system) + message
87
+ if i % 2 == 0:
88
+ message = wrap_inst(message)
89
+ ret += self.sep + message
90
+ else:
91
+ ret += " " + message + " " + self.sep2
92
+ else:
93
+ ret += ""
94
+ ret = ret.lstrip(self.sep)
95
+ elif self.sep_style == SeparatorStyle.GEMMA:
96
+ seps = [self.sep, self.sep2]
97
+ ret = self.system + seps[0]
98
+ for i, (role, message) in enumerate(messages):
99
+ if message:
100
+ if type(message) is tuple:
101
+ message, _, _ = message
102
+ ret += "<start_of_turn>" + role + "\n" + message + "<end_of_turn>\n" + seps[i % 2]
103
+ else:
104
+ ret += "<start_of_turn>" + role + "\n"
105
+ elif self.sep_style == SeparatorStyle.PLAIN:
106
+ seps = [self.sep, self.sep2]
107
+ ret = self.system
108
+ for i, (role, message) in enumerate(messages):
109
+ if message:
110
+ if type(message) is tuple:
111
+ message, _, _ = message
112
+ ret += message + seps[i % 2]
113
+ else:
114
+ ret += ""
115
+ else:
116
+ raise ValueError(f"Invalid style: {self.sep_style}")
117
+
118
+ return ret
119
+
120
+ def append_message(self, role, message):
121
+ self.messages.append([role, message])
122
+
123
+ def process_image(self, image, image_process_mode, return_pil=False, image_format='PNG', max_len=1344, min_len=672):
124
+ if image_process_mode == "Pad":
125
+ def expand2square(pil_img, background_color=(122, 116, 104)):
126
+ width, height = pil_img.size
127
+ if width == height:
128
+ return pil_img
129
+ elif width > height:
130
+ result = Image.new(pil_img.mode, (width, width), background_color)
131
+ result.paste(pil_img, (0, (width - height) // 2))
132
+ return result
133
+ else:
134
+ result = Image.new(pil_img.mode, (height, height), background_color)
135
+ result.paste(pil_img, ((height - width) // 2, 0))
136
+ return result
137
+ image = expand2square(image)
138
+ elif image_process_mode in ["Default", "Crop"]:
139
+ pass
140
+ elif image_process_mode == "Resize":
141
+ image = image.resize((336, 336))
142
+ else:
143
+ raise ValueError(f"Invalid image_process_mode: {image_process_mode}")
144
+ if max(image.size) > max_len:
145
+ max_hw, min_hw = max(image.size), min(image.size)
146
+ aspect_ratio = max_hw / min_hw
147
+ shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
148
+ longest_edge = int(shortest_edge * aspect_ratio)
149
+ W, H = image.size
150
+ if H > W:
151
+ H, W = longest_edge, shortest_edge
152
+ else:
153
+ H, W = shortest_edge, longest_edge
154
+ image = image.resize((W, H))
155
+ if return_pil:
156
+ return image
157
+ else:
158
+ buffered = BytesIO()
159
+ image.save(buffered, format=image_format)
160
+ img_b64_str = base64.b64encode(buffered.getvalue()).decode()
161
+ return img_b64_str
162
+
163
+ def get_images(self, return_pil=False):
164
+ images = []
165
+ for i, (role, msg) in enumerate(self.messages[self.offset:]):
166
+ if i % 2 == 0:
167
+ if type(msg) is tuple:
168
+ msg, image, image_process_mode = msg
169
+ image = self.process_image(image, image_process_mode, return_pil=return_pil)
170
+ images.append(image)
171
+ return images
172
+
173
+ def to_gradio_chatbot(self):
174
+ ret = []
175
+ for i, (role, msg) in enumerate(self.messages[self.offset:]):
176
+ if i % 2 == 0:
177
+ if type(msg) is tuple:
178
+ msg, image, image_process_mode = msg
179
+ img_b64_str = self.process_image(
180
+ image, "Default", return_pil=False,
181
+ image_format='JPEG')
182
+ img_str = f'<img src="data:image/jpeg;base64,{img_b64_str}" alt="user upload image" />'
183
+ msg = img_str + msg.replace('<image>', '').strip()
184
+ ret.append([msg, None])
185
+ else:
186
+ ret.append([msg, None])
187
+ else:
188
+ if type(msg) is tuple and len(msg) == 2:
189
+ msg, img_b64_str = msg
190
+ img_str = f'<img src="data:image/jpeg;base64,{img_b64_str}" alt="user upload image" />'
191
+ msg = msg.strip() + img_str
192
+ ret[-1][-1] = msg
193
+ return ret
194
+
195
+ def copy(self):
196
+ return Conversation(
197
+ system=self.system,
198
+ roles=self.roles,
199
+ messages=[[x, y] for x, y in self.messages],
200
+ offset=self.offset,
201
+ sep_style=self.sep_style,
202
+ sep=self.sep,
203
+ sep2=self.sep2,
204
+ version=self.version)
205
+
206
+ def dict(self):
207
+ if len(self.get_images()) > 0:
208
+ return {
209
+ "system": self.system,
210
+ "roles": self.roles,
211
+ "messages": [[x, y[0] if type(y) is tuple else y] for x, y in self.messages],
212
+ "offset": self.offset,
213
+ "sep": self.sep,
214
+ "sep2": self.sep2,
215
+ }
216
+ return {
217
+ "system": self.system,
218
+ "roles": self.roles,
219
+ "messages": self.messages,
220
+ "offset": self.offset,
221
+ "sep": self.sep,
222
+ "sep2": self.sep2,
223
+ }
224
+
225
+
226
+ conv_vicuna_v0 = Conversation(
227
+ system="A chat between a curious human and an artificial intelligence assistant. "
228
+ "The assistant gives helpful, detailed, and polite answers to the human's questions.",
229
+ roles=("Human", "Assistant"),
230
+ messages=(
231
+ ("Human", "What are the key differences between renewable and non-renewable energy sources?"),
232
+ ("Assistant",
233
+ "Renewable energy sources are those that can be replenished naturally in a relatively "
234
+ "short amount of time, such as solar, wind, hydro, geothermal, and biomass. "
235
+ "Non-renewable energy sources, on the other hand, are finite and will eventually be "
236
+ "depleted, such as coal, oil, and natural gas. Here are some key differences between "
237
+ "renewable and non-renewable energy sources:\n"
238
+ "1. Availability: Renewable energy sources are virtually inexhaustible, while non-renewable "
239
+ "energy sources are finite and will eventually run out.\n"
240
+ "2. Environmental impact: Renewable energy sources have a much lower environmental impact "
241
+ "than non-renewable sources, which can lead to air and water pollution, greenhouse gas emissions, "
242
+ "and other negative effects.\n"
243
+ "3. Cost: Renewable energy sources can be more expensive to initially set up, but they typically "
244
+ "have lower operational costs than non-renewable sources.\n"
245
+ "4. Reliability: Renewable energy sources are often more reliable and can be used in more remote "
246
+ "locations than non-renewable sources.\n"
247
+ "5. Flexibility: Renewable energy sources are often more flexible and can be adapted to different "
248
+ "situations and needs, while non-renewable sources are more rigid and inflexible.\n"
249
+ "6. Sustainability: Renewable energy sources are more sustainable over the long term, while "
250
+ "non-renewable sources are not, and their depletion can lead to economic and social instability.\n")
251
+ ),
252
+ offset=2,
253
+ sep_style=SeparatorStyle.SINGLE,
254
+ sep="###",
255
+ )
256
+
257
+ conv_vicuna_v1 = Conversation(
258
+ system="A chat between a curious user and an artificial intelligence assistant. "
259
+ "The assistant gives helpful, detailed, and polite answers to the user's questions.",
260
+ roles=("USER", "ASSISTANT"),
261
+ version="v1",
262
+ messages=(),
263
+ offset=0,
264
+ sep_style=SeparatorStyle.TWO,
265
+ sep=" ",
266
+ sep2="</s>",
267
+ )
268
+
269
+ conv_llama_2 = Conversation(
270
+ system="""You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
271
+
272
+ If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.""",
273
+ roles=("USER", "ASSISTANT"),
274
+ version="llama_v2",
275
+ messages=(),
276
+ offset=0,
277
+ sep_style=SeparatorStyle.LLAMA_2,
278
+ sep="<s>",
279
+ sep2="</s>",
280
+ )
281
+
282
+ conv_llava_llama_2 = Conversation(
283
+ system="You are a helpful language and vision assistant. "
284
+ "You are able to understand the visual content that the user provides, "
285
+ "and assist the user with a variety of tasks using natural language.",
286
+ roles=("USER", "ASSISTANT"),
287
+ version="llama_v2",
288
+ messages=(),
289
+ offset=0,
290
+ sep_style=SeparatorStyle.LLAMA_2,
291
+ sep="<s>",
292
+ sep2="</s>",
293
+ )
294
+
295
+ conv_mpt = Conversation(
296
+ system="""<|im_start|>system
297
+ A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers.""",
298
+ roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
299
+ version="mpt",
300
+ messages=(),
301
+ offset=0,
302
+ sep_style=SeparatorStyle.MPT,
303
+ sep="<|im_end|>",
304
+ )
305
+
306
+ conv_llava_plain = Conversation(
307
+ system="",
308
+ roles=("", ""),
309
+ messages=(
310
+ ),
311
+ offset=0,
312
+ sep_style=SeparatorStyle.PLAIN,
313
+ sep="\n",
314
+ )
315
+
316
+ conv_llava_v0 = Conversation(
317
+ system="A chat between a curious human and an artificial intelligence assistant. "
318
+ "The assistant gives helpful, detailed, and polite answers to the human's questions.",
319
+ roles=("Human", "Assistant"),
320
+ messages=(
321
+ ),
322
+ offset=0,
323
+ sep_style=SeparatorStyle.SINGLE,
324
+ sep="###",
325
+ )
326
+
327
+ conv_llava_v0_mmtag = Conversation(
328
+ system="A chat between a curious user and an artificial intelligence assistant. "
329
+ "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
330
+ "The visual content will be provided with the following format: <Image>visual content</Image>.",
331
+ roles=("Human", "Assistant"),
332
+ messages=(
333
+ ),
334
+ offset=0,
335
+ sep_style=SeparatorStyle.SINGLE,
336
+ sep="###",
337
+ version="v0_mmtag",
338
+ )
339
+
340
+ conv_llava_v1 = Conversation(
341
+ system="A chat between a curious human and an artificial intelligence assistant. "
342
+ "The assistant gives helpful, detailed, and polite answers to the human's questions.",
343
+ roles=("USER", "ASSISTANT"),
344
+ version="v1",
345
+ messages=(),
346
+ offset=0,
347
+ sep_style=SeparatorStyle.TWO,
348
+ sep=" ",
349
+ sep2="</s>",
350
+ )
351
+
352
+ conv_vicuna_imgsp_v1 = Conversation(
353
+ system="A chat between a curious user and an artificial intelligence assistant. "
354
+ "The assistant gives helpful, detailed, and polite answers to the user's questions.",
355
+ roles=("USER", "ASSISTANT"),
356
+ version="imgsp_v1",
357
+ messages=(),
358
+ offset=0,
359
+ sep_style=SeparatorStyle.TWO,
360
+ sep=" ",
361
+ sep2="</s>",
362
+ )
363
+
364
+ conv_llava_plain_guided = Conversation(
365
+ system="",
366
+ roles=("", ""),
367
+ version="plain_guided",
368
+ messages=(
369
+ ),
370
+ offset=0,
371
+ sep_style=SeparatorStyle.PLAIN,
372
+ sep="\n",
373
+ )
374
+
375
+ conv_llava_v1_mmtag = Conversation(
376
+ system="A chat between a curious user and an artificial intelligence assistant. "
377
+ "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
378
+ "The visual content will be provided with the following format: <Image>visual content</Image>.",
379
+ roles=("USER", "ASSISTANT"),
380
+ messages=(),
381
+ offset=0,
382
+ sep_style=SeparatorStyle.TWO,
383
+ sep=" ",
384
+ sep2="</s>",
385
+ version="v1_mmtag",
386
+ )
387
+
388
+ conv_phi_2 = Conversation(
389
+ system="A chat between a curious user and an artificial intelligence assistant. "
390
+ "The assistant gives helpful, detailed, and polite answers to the user's questions.",
391
+ roles=("USER", "ASSISTANT"),
392
+ version="phi2",
393
+ messages=(),
394
+ offset=0,
395
+ sep_style=SeparatorStyle.TWO,
396
+ sep=" ",
397
+ sep2="<|endoftext|>",
398
+ )
399
+
400
+ conv_mistral_instruct = Conversation(
401
+ system="",
402
+ roles=("USER", "ASSISTANT"),
403
+ version="llama_v2",
404
+ messages=(),
405
+ offset=0,
406
+ sep_style=SeparatorStyle.LLAMA_2,
407
+ sep="<s>",
408
+ sep2="</s>",
409
+ )
410
+
411
+ conv_gemma = Conversation(
412
+ system="",
413
+ roles=("user", "model"),
414
+ version="gemma",
415
+ messages=(),
416
+ offset=0,
417
+ sep_style=SeparatorStyle.GEMMA,
418
+ sep="",
419
+ sep2="<eos>",
420
+ )
421
+
422
+ conv_chatml_direct = Conversation(
423
+ system="""<|im_start|>system
424
+ Answer the questions.""",
425
+ roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
426
+ version="mpt",
427
+ messages=(),
428
+ offset=0,
429
+ sep_style=SeparatorStyle.MPT,
430
+ sep="<|im_end|>",
431
+ )
432
+
433
+ default_conversation = conv_vicuna_v1
434
+ conv_templates = {
435
+ "default": conv_vicuna_v0,
436
+ "v0": conv_vicuna_v0,
437
+ "v1": conv_vicuna_v1,
438
+ "vicuna_v1": conv_vicuna_v1,
439
+ "phi_2": conv_phi_2,
440
+ "gemma": conv_gemma,
441
+ "llama_2": conv_llama_2,
442
+ "imgsp_v1": conv_vicuna_imgsp_v1,
443
+ "plain_guided": conv_llava_plain_guided,
444
+ "mistral_instruct": conv_mistral_instruct,
445
+ "chatml_direct": conv_chatml_direct,
446
+ "mistral_direct": conv_chatml_direct,
447
+ "plain": conv_llava_plain,
448
+ "v0_plain": conv_llava_plain,
449
+ "llava_v0": conv_llava_v0,
450
+ "v0_mmtag": conv_llava_v0_mmtag,
451
+ "llava_v1": conv_llava_v1,
452
+ "v1_mmtag": conv_llava_v1_mmtag,
453
+ "llava_llama_2": conv_llava_llama_2,
454
+
455
+ "mpt": conv_mpt,
456
+ }
457
+
458
+
459
+ if __name__ == "__main__":
460
+ print(default_conversation.get_prompt())
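A hedged usage sketch for the templates above (not part of the file): it builds a single-turn multimodal prompt with the gemma template; the image path is illustrative, and the tuple layout (text, PIL image, process mode) follows process_image/get_images above:

    from PIL import Image

    conv = conv_templates["gemma"].copy()
    image = Image.open("baklava.png")    # illustrative; any RGB image works
    conv.append_message(conv.roles[0], ("Describe this image.\n<image>", image, "Default"))
    conv.append_message(conv.roles[1], None)

    prompt = conv.get_prompt()                 # turns joined with the GEMMA separators
    images = conv.get_images(return_pil=True)  # PIL images, resized via process_image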
helpers.py ADDED
@@ -0,0 +1,99 @@
1
+ import torch
2
+ from torch.nn import functional as F
3
+ from PIL import Image
4
+
5
+
6
+ ### from https://huggingface.co/transformers/v3.2.0/_modules/transformers/generation_utils.html
7
+ def top_k_top_p_filtering(
8
+ logits,
9
+ top_k: int = 0,
10
+ top_p: float = 1.0,
11
+ filter_value: float = -float("Inf"),
12
+ min_tokens_to_keep: int = 1,
13
+ ):
14
+ """Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
15
+ Args:
16
+ logits: logits distribution shape (batch size, vocabulary size)
17
+ if top_k > 0: keep only top k tokens with highest probability (top-k filtering).
18
+ if top_p < 1.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
19
+ Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
20
+ Make sure we keep at least min_tokens_to_keep per batch example in the output
21
+ From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
22
+ """
23
+
24
+ # restrict sampling to image-token ids: mask out the first 256000 logits (assumed to be the text vocabulary)
+ logits[:, :256000] = filter_value
25
+ if top_k > 0:
26
+ top_k = min(max(top_k, min_tokens_to_keep), logits.size(-1)) # Safety check
27
+ # Remove all tokens with a probability less than the last token of the top-k
28
+
29
+ indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
30
+ logits[indices_to_remove] = filter_value
31
+
32
+ if top_p < 1.0:
33
+ sorted_logits, sorted_indices = torch.sort(logits, descending=True)
34
+ cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
35
+
36
+ # Remove tokens with cumulative probability above the threshold (token with 0 are kept)
37
+ sorted_indices_to_remove = cumulative_probs > top_p
38
+ if min_tokens_to_keep > 1:
39
+ # Keep at least min_tokens_to_keep (set to min_tokens_to_keep-1 because we add the first one below)
40
+ sorted_indices_to_remove[..., :min_tokens_to_keep] = 0
41
+ # Shift the indices to the right so the first token above the threshold is also kept
42
+ sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
43
+ sorted_indices_to_remove[..., 0] = 0
44
+
45
+ # scatter sorted tensors to original indexing
46
+ indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
47
+ logits[indices_to_remove] = filter_value
49
+ return logits
50
+
51
+
52
+ def sample(logits, temperature: float=1.0, top_k: int=0, top_p: float=1.0, sample_logits=True):
53
+ logits = logits[:, -1, :] / max(temperature, 1e-5)
54
+ if top_k > 0 or top_p < 1.0:
55
+ logits = top_k_top_p_filtering(logits, top_k=top_k, top_p=top_p)
56
+ probs = F.softmax(logits, dim=-1)
57
+ if sample_logits:
58
+ idx = torch.multinomial(probs, num_samples=1)
59
+ else:
60
+ _, idx = torch.topk(probs, k=1, dim=-1)
61
+ return idx, probs
62
+
63
+
64
+ def expand2square(pil_img, background_color):
65
+ width, height = pil_img.size
66
+ if width == height:
67
+ return pil_img
68
+ elif width > height:
69
+ result = Image.new(pil_img.mode, (width, width), background_color)
70
+ result.paste(pil_img, (0, (width - height) // 2))
71
+ return result
72
+ else:
73
+ result = Image.new(pil_img.mode, (height, height), background_color)
74
+ result.paste(pil_img, ((height - width) // 2, 0))
75
+ return result
76
+
77
+
78
+
79
+ def tokenizer_image_token(prompt, tokenizer, image_token_index=-200, return_tensors=None):
80
+ prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split('<image>')]
81
+
82
+ def insert_separator(X, sep):
83
+ return [ele for sublist in zip(X, [sep]*len(X)) for ele in sublist][:-1]
84
+
85
+ input_ids = []
86
+ offset = 0
87
+ if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id:
88
+ offset = 1
89
+ input_ids.append(prompt_chunks[0][0])
90
+
91
+ for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)):
92
+ input_ids.extend(x[offset:])
93
+
94
+ if return_tensors is not None:
95
+ if return_tensors == 'pt':
96
+ return torch.tensor(input_ids, dtype=torch.long)
97
+ raise ValueError(f'Unsupported tensor type: {return_tensors}')
98
+ return input_ids
99
+
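To show how these helpers fit together, a hypothetical decoding loop for image-token generation (the model object, the 1024-step horizon, and the 256000-id offset are assumptions for illustration, not defined in this file):

    import torch

    tokens = torch.tensor(input_ids, dtype=torch.long).unsqueeze(0)  # prompt ids, e.g. from tokenizer_image_token
    generated = []
    for _ in range(1024):                          # one code per step, e.g. a 32x32 VQGAN grid
        logits = model(tokens).logits              # (1, seq_len, vocab_size); `model` is a placeholder LM
        idx, _ = sample(logits, temperature=0.9, top_k=0, top_p=0.95)
        generated.append(idx)
        tokens = torch.cat([tokens, idx], dim=1)
    codes = torch.cat(generated, dim=1) - 256000   # assumed offset back into the VQGAN codebook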
requirements.txt CHANGED
@@ -1 +1,6 @@
1
- huggingface_hub==0.25.2
 
 
 
 
 
 
1
+ torch
2
+ transformers==4.39.2
3
+ spaces
4
+ pillow
5
+ accelerate
6
+ tqdm