ImportError: cannot import name 'intel' from 'triton._C.libtriton' (/home/fkurushin/venv/pqr/lib/python3.11/site-packages/triton/_C/libtriton.so)

#2
by Fedor99 - opened

Hello! I have some problems running your model on GPU.

The minimal code to reproduce the error:

from transformers import AutoTokenizer, AutoModel
MODEL_NAME = "deepvk/USER2-small"
TOKENIZER_NAME = "deepvk/USER2-small"
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
model.to('cuda:7')
output = model(
    **tokenizer(
        ["iphone 15", "iphone 16"], padding=True, truncation=True, return_tensors="pt"
    )
)

The error itself:

TorchRuntimeError: Dynamo failed to run FX node with fake tensors: call_function <function embedding at 0x7f2364d46ac0>(*(FakeTensor(..., size=(s0, s1), dtype=torch.int64), Parameter(FakeTensor(..., device='cuda:7', size=(50368, 384), requires_grad=True)), 50283, None, 2.0, False, False), **{}): got RuntimeError('Unhandled FakeTensor Device Propagation for aten.embedding.default, found two different devices cuda:7, cpu')

from user code:
   File "/home/fkurushin/venv/s2p3_11/lib/python3.11/site-packages/transformers/models/modernbert/modeling_modernbert.py", line 207, in compiled_embeddings
    return self.drop(self.norm(self.tok_embeddings(input_ids)))
  File "/home/fkurushin/venv/s2p3_11/lib/python3.11/site-packages/torch/nn/modules/sparse.py", line 190, in forward
    return F.embedding(

Set TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS="+dynamo"

This is the Python setup:

pytorch-triton==3.4.0
pytorch-triton-xpu==3.3.1
torch==2.7.1+cu128
torchaudio==2.7.1+cu128
torchcodec==0.4.0+cu128
torchvision==0.22.1+cu128
...
triton==3.3.1

OS:

Distributor ID: Debian
Description:    Debian GNU/Linux 12 (bookworm)
Release:        12
Codename:       bookworm

Thank you!

deepvk org

Hi!

**tokenizer(
        ["iphone 15", "iphone 16"], padding=True, truncation=True, return_tensors="pt"
    )

It looks like the tokenizer returns tensors on the CPU while the model is on the GPU. Try moving the tensors to the GPU before passing them into the model.
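
For example, a minimal sketch of that fix (reusing the tokenizer and model from the snippet above; the device name is just an example):

inputs = tokenizer(
    ["iphone 15", "iphone 16"], padding=True, truncation=True, return_tensors="pt"
)
inputs = inputs.to("cuda:7")  # BatchEncoding.to moves all contained tensors to the model's device
output = model(**inputs)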

Hi, thank you for the reply. The first example was bad, I'm sorry.

from transformers import AutoTokenizer, AutoModel
MODEL_NAME = "deepvk/USER2-small"
TOKENIZER_NAME = "deepvk/USER2-small"
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
model.to('cuda:6')
tok = tokenizer(
    ["iphone 15", "iphone 16"], padding=True, truncation=True, return_tensors="pt"
)
tok = tok.to('cuda:6')
output = model(
    **tok
)

This is the GPU example and the GPU error; the same code runs on CPU without any problems.

---------------------------------------------------------------------------
InductorError                             Traceback (most recent call last)
Cell In[3], line 1
----> 1 output = model(
      2     **tok
      3 )

File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/nn/modules/module.py:1751, in Module._wrapped_call_impl(self, *args, **kwargs)
   1749     return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1750 else:
-> 1751     return self._call_impl(*args, **kwargs)

File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/nn/modules/module.py:1762, in Module._call_impl(self, *args, **kwargs)
   1757 # If we don't have any hooks, we want to skip the rest of the logic in
   1758 # this function, and just call forward.
   1759 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1760         or _global_backward_pre_hooks or _global_backward_hooks
   1761         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1762     return forward_call(*args, **kwargs)
   1764 result = None
   1765 called_always_called_hooks = set()

File ~/venv/s2p3_11/lib/python3.11/site-packages/transformers/models/modernbert/modeling_modernbert.py:850, in ModernBertModel.forward(self, input_ids, attention_mask, sliding_window_mask, position_ids, inputs_embeds, indices, cu_seqlens, max_seqlen, batch_size, seq_len, output_attentions, output_hidden_states, return_dict)
    844         position_ids = torch.arange(seq_len, device=device).unsqueeze(0)
    846     attention_mask, sliding_window_mask = self._update_attention_mask(
    847         attention_mask, output_attentions=output_attentions
    848     )
--> 850 hidden_states = self.embeddings(input_ids=input_ids, inputs_embeds=inputs_embeds)
    852 for encoder_layer in self.layers:
    853     if output_hidden_states:

File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/nn/modules/module.py:1751, in Module._wrapped_call_impl(self, *args, **kwargs)
   1749     return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1750 else:
-> 1751     return self._call_impl(*args, **kwargs)

File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/nn/modules/module.py:1762, in Module._call_impl(self, *args, **kwargs)
   1757 # If we don't have any hooks, we want to skip the rest of the logic in
   1758 # this function, and just call forward.
   1759 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1760         or _global_backward_pre_hooks or _global_backward_hooks
   1761         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1762     return forward_call(*args, **kwargs)
   1764 result = None
   1765 called_always_called_hooks = set()

File ~/venv/s2p3_11/lib/python3.11/site-packages/transformers/models/modernbert/modeling_modernbert.py:216, in ModernBertEmbeddings.forward(self, input_ids, inputs_embeds)
    213     hidden_states = self.drop(self.norm(inputs_embeds))
    214 else:
    215     hidden_states = (
--> 216         self.compiled_embeddings(input_ids)
    217         if self.config.reference_compile
    218         else self.drop(self.norm(self.tok_embeddings(input_ids)))
    219     )
    220 return hidden_states

File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/_dynamo/eval_frame.py:663, in _TorchDynamoContext.__call__.<locals>._fn(*args, **kwargs)
    659     raise e.with_traceback(None) from None
    660 except ShortenTraceback as e:
    661     # Failures in the backend likely don't have useful
    662     # data in the TorchDynamo frames, so we strip them out.
--> 663     raise e.remove_dynamo_frames() from None  # see TORCHDYNAMO_VERBOSE=1
    664 finally:
    665     # Restore the dynamic layer stack depth if necessary.
    666     set_eval_frame(None)

File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/_inductor/compile_fx.py:760, in _compile_fx_inner(gm, example_inputs, **graph_kwargs)
    758     raise
    759 except Exception as e:
--> 760     raise InductorError(e, currentframe()).with_traceback(
    761         e.__traceback__
    762     ) from None
    763 finally:
    764     TritonBundler.end_compile()

File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/_inductor/compile_fx.py:745, in _compile_fx_inner(gm, example_inputs, **graph_kwargs)
    743 TritonBundler.begin_compile()
    744 try:
--> 745     mb_compiled_graph = fx_codegen_and_compile(
    746         gm, example_inputs, inputs_to_check, **graph_kwargs
    747     )
    748     assert mb_compiled_graph is not None
    749     mb_compiled_graph._time_taken_ns = time.time_ns() - start_time

File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/_inductor/compile_fx.py:1295, in fx_codegen_and_compile(gm, example_inputs, inputs_to_check, **graph_kwargs)
   1291     from .compile_fx_subproc import _SubprocessFxCompile
   1293     scheme = _SubprocessFxCompile()
-> 1295 return scheme.codegen_and_compile(gm, example_inputs, inputs_to_check, graph_kwargs)

File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/_inductor/compile_fx.py:1197, in _InProcessFxCompile.codegen_and_compile(self, gm, example_inputs, inputs_to_check, graph_kwargs)
   1184             compiled_fn = AotCodeCompiler.compile(
   1185                 graph,
   1186                 wrapper_code.value,
   (...)   1194                 ],
   1195             )
   1196     else:
-> 1197         compiled_fn = graph.compile_to_module().call
   1199 num_bytes, nodes_num_elem, node_runtimes = graph.count_bytes()
   1200 metrics.num_bytes_accessed += num_bytes

File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/_inductor/graph.py:2083, in GraphLowering.compile_to_module(self)
   2076 def compile_to_module(self) -> ModuleType:
   2077     with dynamo_timed(
   2078         "GraphLowering.compile_to_module",
   2079         phase_name="code_gen",
   2080         log_pt2_compile_event=True,
   2081         dynamo_compile_column_us="inductor_code_gen_cumulative_compile_time_us",
   2082     ):
-> 2083         return self._compile_to_module()

File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/_inductor/graph.py:2130, in GraphLowering._compile_to_module(self)
   2124     trace_structured(
   2125         "inductor_output_code",
   2126         lambda: {"filename": path},
   2127         payload_fn=lambda: wrapper_code.value,
   2128     )
   2129 with dynamo_timed("PyCodeCache.load_by_key_path", log_pt2_compile_event=True):
-> 2130     mod = PyCodeCache.load_by_key_path(
   2131         key,
   2132         path,
   2133         linemap=linemap,  # type: ignore[arg-type]
   2134         attrs={**self.constants, **self.torchbind_constants},
   2135     )
   2136 self.cache_key = key
   2137 self.cache_path = path

File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/_inductor/codecache.py:2747, in PyCodeCache.load_by_key_path(cls, key, path, linemap, attrs)
   2744 if linemap is None:
   2745     linemap = []
-> 2747 mod = _reload_python_module(key, path)
   2749 # unzip into separate lines/nodes lists
   2750 cls.linemaps[path] = list(zip(*linemap))

File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/_inductor/runtime/compile_tasks.py:36, in _reload_python_module(key, path)
     34 mod.__file__ = path
     35 mod.key = key  # type: ignore[attr-defined]
---> 36 exec(code, mod.__dict__, mod.__dict__)
     37 sys.modules[mod.__name__] = mod
     38 return mod

File /tmp/torchinductor_fkurushin/tf/ctf7mar2xkgqznrahfvgzfthle4v7qpb4opxscap5r6ee3rwcbvv.py:119
     36 # kernel path: /tmp/torchinductor_fkurushin/au/caujvggywsoktbv4uj7v54rnzomxiwuv64al5jvxs5gitkhjc4zz.py
     37 # Topologically Sorted Source Nodes: [embedding, layer_norm], Original ATen: [aten.embedding, aten.native_layer_norm]
     38 # Source node to ATen node mapping:
   (...)     47 #   %mul_3 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%sub_2, %rsqrt), kwargs = {})
     48 #   %mul_4 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%mul_3, %primals_5), kwargs = {})
     49 triton_per_fused_embedding_native_layer_norm_0 = async_compile.triton('triton_per_fused_embedding_native_layer_norm_0', '''
     50 import triton
     51 import triton.language as tl
   (...)    115     tl.store(out_ptr1 + (x0), tmp16, None)
    116 ''', device_str='cuda')
--> 119 async_compile.wait(globals())
    120 del async_compile
    122 def call(args):

File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/_inductor/async_compile.py:424, in AsyncCompile.wait(self, scope)
    417 if get_compile_threads() > 1:
    418     with dynamo_timed(
    419         "async_compile.wait",
    420         log_pt2_compile_event=True,
    421         dynamo_compile_column_us="triton_compile_time_us",
    422         log_waitcounter=True,
    423     ):
--> 424         self._wait_futures(scope)
    426 _compile_end()

File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/_inductor/async_compile.py:445, in AsyncCompile._wait_futures(self, scope)
    443     pbar.set_postfix_str(key)
    444 try:
--> 445     scope[key] = result.result()
    446 except BrokenProcessPool as e:
    447     raise RuntimeError(
    448         "A compilation subprocess exited unexpectedly. This "
    449         "is likely due to a crash. To facilitate debugging, "
    450         "you can re-run with TORCHINDUCTOR_COMPILE_THREADS=1 "
    451         "to cause compilation to occur in the main process."
    452     ) from e

File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/_inductor/codecache.py:3224, in LambdaFuture.result(self)
   3223 def result(self) -> Callable[..., Any]:  # type: ignore[override]
-> 3224     return self.result_fn()

File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/_inductor/async_compile.py:325, in AsyncCompile.triton.<locals>.get_result()
    322 # Now that we've compiled, we should clear the future
    323 # so it can't be used again
    324 CompiledTritonKernels.remove_future(source_code)
--> 325 kernel.precompile(
    326     warm_cache_only=False, reload_kernel=reload_kernel_in_parent
    327 )
    328 get_metrics_context().add_top_n(
    329     "triton_kernel_compile_times_us", kernel_name, elapsed_us
    330 )
    331 return kernel

File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/_inductor/runtime/triton_heuristics.py:277, in CachingAutotuner.precompile(self, warm_cache_only, reload_kernel)
    275     self._reload_kernel = reload_kernel
    276 self._precompile_worker()
--> 277 self._make_launchers()
    278 self._dynamic_scale_rblock()

File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/_inductor/runtime/triton_heuristics.py:434, in CachingAutotuner._make_launchers(self)
    432 for result in self.compile_results:
    433     try:
--> 434         launchers.append(result.make_launcher())
    436     except (OutOfResources, PTXASError) as e:
    437         exc = e

File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/_inductor/runtime/triton_heuristics.py:1153, in TritonCompileResult.make_launcher(self)
   1140     def_args = [
   1141         name
   1142         for name in fn.arg_names
   1143         if name not in cfg_dict and name not in none_args
   1144     ]
   1146 binary_shared = (
   1147     binary.shared if hasattr(binary, "shared") else binary.metadata.shared
   1148 )
   1150 scope = {
   1151     "grid_meta": cfg.kwargs,
   1152     "bin": binary,
-> 1153     "launch_enter_hook": binary.__class__.launch_enter_hook,
   1154     "launch_exit_hook": binary.__class__.launch_exit_hook,
   1155     "metadata": (
   1156         binary.packed_metadata
   1157         if hasattr(binary, "packed_metadata")
   1158         else binary.metadata
   1159     ),
   1160     "shared": binary_shared,
   1161     "num_warps": (
   1162         binary.num_warps
   1163         if hasattr(binary, "num_warps")
   1164         else binary.metadata.num_warps
   1165     ),
   1166     "cta_args": (
   1167         (
   1168             binary.num_ctas,
   1169             *get_first_attr(binary, "cluster_dims", "clusterDims"),
   1170         )
   1171         if hasattr(binary, "num_ctas")
   1172         else (
   1173             (binary.metadata.num_ctas, *binary.metadata.cluster_dims)
   1174             if hasattr(binary, "metadata")
   1175             else ()
   1176         )
   1177     ),
   1178     "function": get_first_attr(binary, "function", "cu_function"),
   1179     "runner": get_first_attr(binary, "run", "c_wrapper"),
   1180 }
   1182 if not hasattr(binary, "launch_metadata"):
   1183     # launch args before CompiledKernel.launch_metadata is added.
   1184     # TODO(jansel): delete this branch in mid-2025
   1185     runner_args = [
   1186         "grid_0",
   1187         "grid_1",
   (...)   1197         *call_args,
   1198     ]

InductorError: AttributeError: type object 'CompiledKernel' has no attribute 'launch_enter_hook'

Set TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS="+dynamo"
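
In case it is useful: the traceback goes through ModernBertEmbeddings.compiled_embeddings, which is used only when config.reference_compile is enabled (modeling_modernbert.py line 216 above). Below is a minimal sketch, assuming reference_compile can be passed to from_pretrained as a config override, to check whether the eager (non-compiled) embedding path avoids the Inductor/Triton error:

from transformers import AutoTokenizer, AutoModel

MODEL_NAME = "deepvk/USER2-small"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# reference_compile=False should route embeddings through the eager branch
# instead of the torch.compile-d one shown in the traceback above
model = AutoModel.from_pretrained(MODEL_NAME, reference_compile=False)
model.to("cuda:6")

tok = tokenizer(
    ["iphone 15", "iphone 16"], padding=True, truncation=True, return_tensors="pt"
).to("cuda:6")
output = model(**tok)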

Thank you
