Update inferencer.py
inferencer.py CHANGED (+42 -41)
@@ -51,8 +51,9 @@ class InterleaveInferencer:
             new_token_ids=self.new_token_ids,
         )

-        with torch.amp.autocast("cuda", enabled=True, dtype=torch.bfloat16):
-            past_key_values = self.model.forward_cache_update_text(past_key_values, **generation_input)
+        # with torch.amp.autocast("cuda", enabled=True, dtype=torch.bfloat16):
+        past_key_values = self.model.forward_cache_update_text(past_key_values, **generation_input)
+
         gen_context['kv_lens'] = kv_lens
         gen_context['ropes'] = ropes
         gen_context['past_key_values'] = past_key_values
@@ -77,8 +78,8 @@ class InterleaveInferencer:
             transforms=self.vae_transform,
             new_token_ids=self.new_token_ids,
         )
-        with torch.amp.autocast("cuda", enabled=True, dtype=torch.bfloat16):
-            past_key_values = self.model.forward_cache_update_vae(self.vae_model, past_key_values, **generation_input)
+        # with torch.amp.autocast("cuda", enabled=True, dtype=torch.bfloat16):
+        past_key_values = self.model.forward_cache_update_vae(self.vae_model, past_key_values, **generation_input)

         if vit:
             ## update vit
@@ -89,8 +90,8 @@ class InterleaveInferencer:
             transforms=self.vit_transform,
             new_token_ids=self.new_token_ids,
         )
-        with torch.amp.autocast("cuda", enabled=True, dtype=torch.bfloat16):
-            past_key_values = self.model.forward_cache_update_vit(past_key_values, **generation_input)
+        # with torch.amp.autocast("cuda", enabled=True, dtype=torch.bfloat16):
+        past_key_values = self.model.forward_cache_update_vit(past_key_values, **generation_input)

         gen_context['kv_lens'] = kv_lens
         gen_context['ropes'] = ropes
@@ -146,28 +147,28 @@ class InterleaveInferencer:
             image_sizes=[image_shape],
         )

-        with torch.amp.autocast("cuda", enabled=True, dtype=torch.bfloat16):
-            unpacked_latent = self.model.generate_image(
-                past_key_values=past_key_values,
-                cfg_text_past_key_values=cfg_text_past_key_values,
-                cfg_img_past_key_values=cfg_img_past_key_values,
-                num_timesteps=num_timesteps,
-                cfg_text_scale=cfg_text_scale,
-                cfg_img_scale=cfg_img_scale,
-                cfg_interval=cfg_interval,
-                cfg_renorm_min=cfg_renorm_min,
-                cfg_renorm_type=cfg_renorm_type,
-                timestep_shift=timestep_shift,
-                **generation_input,
-                cfg_text_packed_position_ids=generation_input_cfg_text['cfg_packed_position_ids'],
-                cfg_text_packed_query_indexes=generation_input_cfg_text['cfg_packed_query_indexes'],
-                cfg_text_key_values_lens=generation_input_cfg_text['cfg_key_values_lens'],
-                cfg_text_packed_key_value_indexes=generation_input_cfg_text['cfg_packed_key_value_indexes'],
-                cfg_img_packed_position_ids=generation_input_cfg_img['cfg_packed_position_ids'],
-                cfg_img_packed_query_indexes=generation_input_cfg_img['cfg_packed_query_indexes'],
-                cfg_img_key_values_lens=generation_input_cfg_img['cfg_key_values_lens'],
-                cfg_img_packed_key_value_indexes=generation_input_cfg_img['cfg_packed_key_value_indexes'],
-            )
+        # with torch.amp.autocast("cuda", enabled=True, dtype=torch.bfloat16):
+        unpacked_latent = self.model.generate_image(
+            past_key_values=past_key_values,
+            cfg_text_past_key_values=cfg_text_past_key_values,
+            cfg_img_past_key_values=cfg_img_past_key_values,
+            num_timesteps=num_timesteps,
+            cfg_text_scale=cfg_text_scale,
+            cfg_img_scale=cfg_img_scale,
+            cfg_interval=cfg_interval,
+            cfg_renorm_min=cfg_renorm_min,
+            cfg_renorm_type=cfg_renorm_type,
+            timestep_shift=timestep_shift,
+            **generation_input,
+            cfg_text_packed_position_ids=generation_input_cfg_text['cfg_packed_position_ids'],
+            cfg_text_packed_query_indexes=generation_input_cfg_text['cfg_packed_query_indexes'],
+            cfg_text_key_values_lens=generation_input_cfg_text['cfg_key_values_lens'],
+            cfg_text_packed_key_value_indexes=generation_input_cfg_text['cfg_packed_key_value_indexes'],
+            cfg_img_packed_position_ids=generation_input_cfg_img['cfg_packed_position_ids'],
+            cfg_img_packed_query_indexes=generation_input_cfg_img['cfg_packed_query_indexes'],
+            cfg_img_key_values_lens=generation_input_cfg_img['cfg_key_values_lens'],
+            cfg_img_packed_key_value_indexes=generation_input_cfg_img['cfg_packed_key_value_indexes'],
+        )

         image = self.decode_image(unpacked_latent[0], image_shape)
         return image
@@ -193,19 +194,19 @@ class InterleaveInferencer:
         kv_lens = gen_context['kv_lens']
         ropes = gen_context['ropes']

-        with torch.amp.autocast("cuda", enabled=True, dtype=torch.bfloat16):
-            generation_input = self.model.prepare_start_tokens(kv_lens, ropes, self.new_token_ids)
-            for unpacked_latent in self.model.generate_text(
-                past_key_values=past_key_values,
-                max_length=max_length,
-                do_sample=do_sample,
-                temperature=temperature,
-                end_token_id=self.new_token_ids['eos_token_id'],
-                **generation_input,
-            ):
-                output = self.tokenizer.decode(unpacked_latent)
-                if output != "<|im_end|>":
-                    yield output
+        # with torch.amp.autocast("cuda", enabled=True, dtype=torch.bfloat16):
+        generation_input = self.model.prepare_start_tokens(kv_lens, ropes, self.new_token_ids)
+        for unpacked_latent in self.model.generate_text(
+            past_key_values=past_key_values,
+            max_length=max_length,
+            do_sample=do_sample,
+            temperature=temperature,
+            end_token_id=self.new_token_ids['eos_token_id'],
+            **generation_input,
+        ):
+            output = self.tokenizer.decode(unpacked_latent)
+            if output != "<|im_end|>":
+                yield output

     @torch.no_grad()
     def interleave_inference(
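The commit's pattern is the same in every hunk: the `torch.amp.autocast("cuda", enabled=True, dtype=torch.bfloat16)` context managers are commented out and the wrapped calls are de-indented, so the cache-update and generation calls now run in the model's own dtype. The standalone sketch below is not part of inferencer.py; it only illustrates what the removed wrapper does (the tensor names `x` and `w` are placeholders for illustration).

# Minimal sketch: ops inside the autocast context run in bfloat16 on CUDA,
# while the same op outside runs in the tensors' native float32 dtype.
import torch

def matmul_dtype(use_autocast: bool) -> torch.dtype:
    x = torch.randn(8, 8, device="cuda")
    w = torch.randn(8, 8, device="cuda")
    if use_autocast:
        with torch.amp.autocast("cuda", enabled=True, dtype=torch.bfloat16):
            return (x @ w).dtype  # autocast downcasts the matmul to bfloat16
    return (x @ w).dtype          # without autocast: float32

if __name__ == "__main__":
    if torch.cuda.is_available():
        print(matmul_dtype(True))   # torch.bfloat16
        print(matmul_dtype(False))  # torch.float32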