ImportError: cannot import name 'intel' from 'triton._C.libtriton' (/home/fkurushin/venv/pqr/lib/python3.11/site-packages/triton/_C/libtriton.so)
#2 opened by Fedor99
Hello! I have some problems running your model on GPU. The minimal code to reproduce the error:
from transformers import AutoTokenizer, AutoModel

MODEL_NAME = "deepvk/USER2-small"
TOKENIZER_NAME = "deepvk/USER2-small"

tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
model.to('cuda:7')

output = model(
    **tokenizer(
        ["iphone 15", "iphone 16"], padding=True, truncation=True, return_tensors="pt"
    )
)
The error itself:
TorchRuntimeError: Dynamo failed to run FX node with fake tensors: call_function <function embedding at 0x7f2364d46ac0>(*(FakeTensor(..., size=(s0, s1), dtype=torch.int64), Parameter(FakeTensor(..., device='cuda:7', size=(50368, 384), requires_grad=True)), 50283, None, 2.0, False, False), **{}): got RuntimeError('Unhandled FakeTensor Device Propagation for aten.embedding.default, found two different devices cuda:7, cpu')
from user code:
File "/home/fkurushin/venv/s2p3_11/lib/python3.11/site-packages/transformers/models/modernbert/modeling_modernbert.py", line 207, in compiled_embeddings
return self.drop(self.norm(self.tok_embeddings(input_ids)))
File "/home/fkurushin/venv/s2p3_11/lib/python3.11/site-packages/torch/nn/modules/sparse.py", line 190, in forward
return F.embedding(
Set TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS="+dynamo"
This is the Python setup:
pytorch-triton==3.4.0
pytorch-triton-xpu==3.3.1
torch==2.7.1+cu128
torchaudio==2.7.1+cu128
torchcodec==0.4.0+cu128
torchvision==0.22.1+cu128
...
triton==3.3.1
OS:
Distributor ID: Debian
Description: Debian GNU/Linux 12 (bookworm)
Release: 12
Codename: bookworm
Thank you!
Hi!
**tokenizer(
    ["iphone 15", "iphone 16"], padding=True, truncation=True, return_tensors="pt"
)
It looks like the tokenizer returns tensors on the CPU, but the model is on the GPU. Try moving the tensors to the GPU before passing them into the model.
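A minimal sketch of that fix, using BatchEncoding.to to move every tensor in the batch onto the model's device:

inputs = tokenizer(
    ["iphone 15", "iphone 16"], padding=True, truncation=True, return_tensors="pt"
)
inputs = inputs.to(model.device)  # move input_ids / attention_mask onto the same GPU as the model
output = model(**inputs)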
Hi, thank you for the reply. My example was bad, I'm sorry.
from transformers import AutoTokenizer, AutoModel

MODEL_NAME = "deepvk/USER2-small"
TOKENIZER_NAME = "deepvk/USER2-small"

tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
model.to('cuda:6')

tok = tokenizer(
    ["iphone 15", "iphone 16"], padding=True, truncation=True, return_tensors="pt"
)
tok = tok.to('cuda:6')
output = model(**tok)
This is the GPU example, and the GPU error is below; the same code runs on CPU without any problems.
---------------------------------------------------------------------------
InductorError Traceback (most recent call last)
Cell In[3], line 1
----> 1 output = model(
2 **tok
3 )
File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/nn/modules/module.py:1751, in Module._wrapped_call_impl(self, *args, **kwargs)
1749 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1750 else:
-> 1751 return self._call_impl(*args, **kwargs)
File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/nn/modules/module.py:1762, in Module._call_impl(self, *args, **kwargs)
1757 # If we don't have any hooks, we want to skip the rest of the logic in
1758 # this function, and just call forward.
1759 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1760 or _global_backward_pre_hooks or _global_backward_hooks
1761 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1762 return forward_call(*args, **kwargs)
1764 result = None
1765 called_always_called_hooks = set()
File ~/venv/s2p3_11/lib/python3.11/site-packages/transformers/models/modernbert/modeling_modernbert.py:850, in ModernBertModel.forward(self, input_ids, attention_mask, sliding_window_mask, position_ids, inputs_embeds, indices, cu_seqlens, max_seqlen, batch_size, seq_len, output_attentions, output_hidden_states, return_dict)
844 position_ids = torch.arange(seq_len, device=device).unsqueeze(0)
846 attention_mask, sliding_window_mask = self._update_attention_mask(
847 attention_mask, output_attentions=output_attentions
848 )
--> 850 hidden_states = self.embeddings(input_ids=input_ids, inputs_embeds=inputs_embeds)
852 for encoder_layer in self.layers:
853 if output_hidden_states:
File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/nn/modules/module.py:1751, in Module._wrapped_call_impl(self, *args, **kwargs)
1749 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1750 else:
-> 1751 return self._call_impl(*args, **kwargs)
File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/nn/modules/module.py:1762, in Module._call_impl(self, *args, **kwargs)
1757 # If we don't have any hooks, we want to skip the rest of the logic in
1758 # this function, and just call forward.
1759 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1760 or _global_backward_pre_hooks or _global_backward_hooks
1761 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1762 return forward_call(*args, **kwargs)
1764 result = None
1765 called_always_called_hooks = set()
File ~/venv/s2p3_11/lib/python3.11/site-packages/transformers/models/modernbert/modeling_modernbert.py:216, in ModernBertEmbeddings.forward(self, input_ids, inputs_embeds)
213 hidden_states = self.drop(self.norm(inputs_embeds))
214 else:
215 hidden_states = (
--> 216 self.compiled_embeddings(input_ids)
217 if self.config.reference_compile
218 else self.drop(self.norm(self.tok_embeddings(input_ids)))
219 )
220 return hidden_states
File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/_dynamo/eval_frame.py:663, in _TorchDynamoContext.__call__.<locals>._fn(*args, **kwargs)
659 raise e.with_traceback(None) from None
660 except ShortenTraceback as e:
661 # Failures in the backend likely don't have useful
662 # data in the TorchDynamo frames, so we strip them out.
--> 663 raise e.remove_dynamo_frames() from None # see TORCHDYNAMO_VERBOSE=1
664 finally:
665 # Restore the dynamic layer stack depth if necessary.
666 set_eval_frame(None)
File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/_inductor/compile_fx.py:760, in _compile_fx_inner(gm, example_inputs, **graph_kwargs)
758 raise
759 except Exception as e:
--> 760 raise InductorError(e, currentframe()).with_traceback(
761 e.__traceback__
762 ) from None
763 finally:
764 TritonBundler.end_compile()
File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/_inductor/compile_fx.py:745, in _compile_fx_inner(gm, example_inputs, **graph_kwargs)
743 TritonBundler.begin_compile()
744 try:
--> 745 mb_compiled_graph = fx_codegen_and_compile(
746 gm, example_inputs, inputs_to_check, **graph_kwargs
747 )
748 assert mb_compiled_graph is not None
749 mb_compiled_graph._time_taken_ns = time.time_ns() - start_time
File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/_inductor/compile_fx.py:1295, in fx_codegen_and_compile(gm, example_inputs, inputs_to_check, **graph_kwargs)
1291 from .compile_fx_subproc import _SubprocessFxCompile
1293 scheme = _SubprocessFxCompile()
-> 1295 return scheme.codegen_and_compile(gm, example_inputs, inputs_to_check, graph_kwargs)
File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/_inductor/compile_fx.py:1197, in _InProcessFxCompile.codegen_and_compile(self, gm, example_inputs, inputs_to_check, graph_kwargs)
1184 compiled_fn = AotCodeCompiler.compile(
1185 graph,
1186 wrapper_code.value,
(...) 1194 ],
1195 )
1196 else:
-> 1197 compiled_fn = graph.compile_to_module().call
1199 num_bytes, nodes_num_elem, node_runtimes = graph.count_bytes()
1200 metrics.num_bytes_accessed += num_bytes
File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/_inductor/graph.py:2083, in GraphLowering.compile_to_module(self)
2076 def compile_to_module(self) -> ModuleType:
2077 with dynamo_timed(
2078 "GraphLowering.compile_to_module",
2079 phase_name="code_gen",
2080 log_pt2_compile_event=True,
2081 dynamo_compile_column_us="inductor_code_gen_cumulative_compile_time_us",
2082 ):
-> 2083 return self._compile_to_module()
File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/_inductor/graph.py:2130, in GraphLowering._compile_to_module(self)
2124 trace_structured(
2125 "inductor_output_code",
2126 lambda: {"filename": path},
2127 payload_fn=lambda: wrapper_code.value,
2128 )
2129 with dynamo_timed("PyCodeCache.load_by_key_path", log_pt2_compile_event=True):
-> 2130 mod = PyCodeCache.load_by_key_path(
2131 key,
2132 path,
2133 linemap=linemap, # type: ignore[arg-type]
2134 attrs={**self.constants, **self.torchbind_constants},
2135 )
2136 self.cache_key = key
2137 self.cache_path = path
File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/_inductor/codecache.py:2747, in PyCodeCache.load_by_key_path(cls, key, path, linemap, attrs)
2744 if linemap is None:
2745 linemap = []
-> 2747 mod = _reload_python_module(key, path)
2749 # unzip into separate lines/nodes lists
2750 cls.linemaps[path] = list(zip(*linemap))
File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/_inductor/runtime/compile_tasks.py:36, in _reload_python_module(key, path)
34 mod.__file__ = path
35 mod.key = key # type: ignore[attr-defined]
---> 36 exec(code, mod.__dict__, mod.__dict__)
37 sys.modules[mod.__name__] = mod
38 return mod
File /tmp/torchinductor_fkurushin/tf/ctf7mar2xkgqznrahfvgzfthle4v7qpb4opxscap5r6ee3rwcbvv.py:119
36 # kernel path: /tmp/torchinductor_fkurushin/au/caujvggywsoktbv4uj7v54rnzomxiwuv64al5jvxs5gitkhjc4zz.py
37 # Topologically Sorted Source Nodes: [embedding, layer_norm], Original ATen: [aten.embedding, aten.native_layer_norm]
38 # Source node to ATen node mapping:
(...) 47 # %mul_3 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%sub_2, %rsqrt), kwargs = {})
48 # %mul_4 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%mul_3, %primals_5), kwargs = {})
49 triton_per_fused_embedding_native_layer_norm_0 = async_compile.triton('triton_per_fused_embedding_native_layer_norm_0', '''
50 import triton
51 import triton.language as tl
(...) 115 tl.store(out_ptr1 + (x0), tmp16, None)
116 ''', device_str='cuda')
--> 119 async_compile.wait(globals())
120 del async_compile
122 def call(args):
File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/_inductor/async_compile.py:424, in AsyncCompile.wait(self, scope)
417 if get_compile_threads() > 1:
418 with dynamo_timed(
419 "async_compile.wait",
420 log_pt2_compile_event=True,
421 dynamo_compile_column_us="triton_compile_time_us",
422 log_waitcounter=True,
423 ):
--> 424 self._wait_futures(scope)
426 _compile_end()
File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/_inductor/async_compile.py:445, in AsyncCompile._wait_futures(self, scope)
443 pbar.set_postfix_str(key)
444 try:
--> 445 scope[key] = result.result()
446 except BrokenProcessPool as e:
447 raise RuntimeError(
448 "A compilation subprocess exited unexpectedly. This "
449 "is likely due to a crash. To facilitate debugging, "
450 "you can re-run with TORCHINDUCTOR_COMPILE_THREADS=1 "
451 "to cause compilation to occur in the main process."
452 ) from e
File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/_inductor/codecache.py:3224, in LambdaFuture.result(self)
3223 def result(self) -> Callable[..., Any]: # type: ignore[override]
-> 3224 return self.result_fn()
File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/_inductor/async_compile.py:325, in AsyncCompile.triton.<locals>.get_result()
322 # Now that we've compiled, we should clear the future
323 # so it can't be used again
324 CompiledTritonKernels.remove_future(source_code)
--> 325 kernel.precompile(
326 warm_cache_only=False, reload_kernel=reload_kernel_in_parent
327 )
328 get_metrics_context().add_top_n(
329 "triton_kernel_compile_times_us", kernel_name, elapsed_us
330 )
331 return kernel
File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/_inductor/runtime/triton_heuristics.py:277, in CachingAutotuner.precompile(self, warm_cache_only, reload_kernel)
275 self._reload_kernel = reload_kernel
276 self._precompile_worker()
--> 277 self._make_launchers()
278 self._dynamic_scale_rblock()
File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/_inductor/runtime/triton_heuristics.py:434, in CachingAutotuner._make_launchers(self)
432 for result in self.compile_results:
433 try:
--> 434 launchers.append(result.make_launcher())
436 except (OutOfResources, PTXASError) as e:
437 exc = e
File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/_inductor/runtime/triton_heuristics.py:1153, in TritonCompileResult.make_launcher(self)
1140 def_args = [
1141 name
1142 for name in fn.arg_names
1143 if name not in cfg_dict and name not in none_args
1144 ]
1146 binary_shared = (
1147 binary.shared if hasattr(binary, "shared") else binary.metadata.shared
1148 )
1150 scope = {
1151 "grid_meta": cfg.kwargs,
1152 "bin": binary,
-> 1153 "launch_enter_hook": binary.__class__.launch_enter_hook,
1154 "launch_exit_hook": binary.__class__.launch_exit_hook,
1155 "metadata": (
1156 binary.packed_metadata
1157 if hasattr(binary, "packed_metadata")
1158 else binary.metadata
1159 ),
1160 "shared": binary_shared,
1161 "num_warps": (
1162 binary.num_warps
1163 if hasattr(binary, "num_warps")
1164 else binary.metadata.num_warps
1165 ),
1166 "cta_args": (
1167 (
1168 binary.num_ctas,
1169 *get_first_attr(binary, "cluster_dims", "clusterDims"),
1170 )
1171 if hasattr(binary, "num_ctas")
1172 else (
1173 (binary.metadata.num_ctas, *binary.metadata.cluster_dims)
1174 if hasattr(binary, "metadata")
1175 else ()
1176 )
1177 ),
1178 "function": get_first_attr(binary, "function", "cu_function"),
1179 "runner": get_first_attr(binary, "run", "c_wrapper"),
1180 }
1182 if not hasattr(binary, "launch_metadata"):
1183 # launch args before CompiledKernel.launch_metadata is added.
1184 # TODO(jansel): delete this branch in mid-2025
1185 runner_args = [
1186 "grid_0",
1187 "grid_1",
(...) 1197 *call_args,
1198 ]
InductorError: AttributeError: type object 'CompiledKernel' has no attribute 'launch_enter_hook'
Set TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS="+dynamo"
Thank you
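One more thing worth checking: the package list above shows triton==3.3.1 installed alongside pytorch-triton==3.4.0, and both distributions provide the same triton import package, so the CompiledKernel class that Inductor sees may not be the version torch 2.7.1 expects; the ImportError about triton._C.libtriton in the title would fit the same mismatch. A hedged sketch of two things to try, assuming the reference_compile flag (which the traceback shows ModernBERT consulting) can be overridden at load time:

import torch
import triton

# Check which Triton actually wins in this environment
print(torch.__version__, triton.__version__, triton.__file__)

from transformers import AutoModel

# Sketch: load the model with ModernBERT's compiled embedding path disabled,
# so the Triton kernel compile seen in the traceback is never reached
model = AutoModel.from_pretrained("deepvk/USER2-small", reference_compile=False)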