import os
import torch
from pathlib import Path
from setuptools import setup, find_packages
from distutils.sysconfig import get_python_lib
from torch.utils.cpp_extension import BuildExtension, CUDAExtension

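# Build the host-side sources with g++. Note that CC is pointed at g++ as
# well, so any C sources are compiled as C++.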
os.environ["CC"] = "g++" |
|
os.environ["CXX"] = "g++" |
|
AUTOAWQ_KERNELS_VERSION = "0.0.7" |
|
PYPI_BUILD = os.getenv("PYPI_BUILD", "0") == "1" |
|
CUDA_VERSION = os.getenv("CUDA_VERSION", None) or torch.version.cuda |
|
ROCM_VERSION = os.environ.get("ROCM_VERSION", None) or torch.version.hip |
|
|
|
|
|
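# Local (non-PyPI) builds get a local version tag for the toolkit they were
# compiled against, e.g. 0.0.7+cu121 or 0.0.7+rocm57.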
if not PYPI_BUILD:
    if CUDA_VERSION:
        CUDA_VERSION = "".join(CUDA_VERSION.split("."))[:3]
        AUTOAWQ_KERNELS_VERSION += f"+cu{CUDA_VERSION}"
    elif ROCM_VERSION:
        ROCM_VERSION = "".join(ROCM_VERSION.split("."))[:3]
        AUTOAWQ_KERNELS_VERSION += f"+rocm{ROCM_VERSION}"
    else:
        raise RuntimeError(
            "Your system must have either an NVIDIA or an AMD GPU to build this package."
        )

print(f"Building AutoAWQ Kernels version {AUTOAWQ_KERNELS_VERSION}")

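# Package metadata shared by every build configuration.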
common_setup_kwargs = {
    "version": AUTOAWQ_KERNELS_VERSION,
    "name": "autoawq_kernels",
    "author": "Casper Hansen",
    "license": "MIT",
    "python_requires": ">=3.8.0",
    "description": "AutoAWQ Kernels implements the AWQ kernels.",
    "long_description": (Path(__file__).parent / "README.md").read_text(
        encoding="UTF-8"
    ),
    "long_description_content_type": "text/markdown",
    "url": "https://github.com/casper-hansen/AutoAWQ_kernels",
    "keywords": ["awq", "autoawq", "quantization", "transformers"],
    "platforms": ["linux", "windows"],
    "classifiers": [
        "Environment :: GPU :: NVIDIA CUDA :: 11.8",
        "Environment :: GPU :: NVIDIA CUDA :: 12",
        "License :: OSI Approved :: MIT License",
        "Natural Language :: English",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
        "Programming Language :: C++",
    ],
}

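# torch is pinned: the compiled extensions link against torch's C++ ABI and
# only load reliably under the torch version they were built with.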
requirements = [
    "torch==2.3.1",
]

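# Header search paths: CUDA runtime headers from a pip-installed
# nvidia-cuda-runtime package (when present) plus the repository root.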
def get_include_dirs():
    include_dirs = []

    if CUDA_VERSION:
        conda_cuda_include_dir = os.path.join(
            get_python_lib(), "nvidia/cuda_runtime/include"
        )
        if os.path.isdir(conda_cuda_include_dir):
            include_dirs.append(conda_cuda_include_dir)

    this_dir = os.path.dirname(os.path.abspath(__file__))
    include_dirs.append(this_dir)

    return include_dirs

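# Older torch releases ship ATen/CUDAGeneratorImpl.h at the top of the
# include tree; the kernel sources use this define to select the matching
# include path.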
def get_generator_flag():
    generator_flag = []

    torch_dir = torch.__path__[0]
    if os.path.exists(
        os.path.join(torch_dir, "include", "ATen", "CUDAGeneratorImpl.h")
    ):
        generator_flag = ["-DOLD_GENERATOR_PATH"]

    return generator_flag

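# Emit one -gencode pair per target architecture. The build targets SM 7.5
# through SM 9.0 by default and refuses to proceed if a local GPU is below
# SM 7.5.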
def get_compute_capabilities(compute_capabilities={75, 80, 86, 89, 90}):
    capability_flags = []

    if CUDA_VERSION:
        for i in range(torch.cuda.device_count()):
            major, minor = torch.cuda.get_device_capability(i)
            cc = major * 10 + minor
            if cc < 75:
                raise RuntimeError(
                    "GPUs with compute capability less than 7.5 are not supported."
                )

    for cap in compute_capabilities:
        capability_flags += ["-gencode", f"arch=compute_{cap},code=sm_{cap}"]

    return capability_flags

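# Per-platform compiler flags: on Windows only the architecture flags are
# passed to nvcc (toggled via INCLUDE_ARCH); elsewhere the full set of
# optimization, C++17 and bfloat16 flags is used.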
def get_extra_compile_args(arch_flags, generator_flags):
    extra_compile_args = {}

    if os.name == "nt" and CUDA_VERSION:
        include_arch = os.getenv("INCLUDE_ARCH", "1") == "1"
        if include_arch:
            extra_compile_args = {"nvcc": arch_flags}
    elif CUDA_VERSION:
        extra_compile_args = {
            "cxx": ["-g", "-O3", "-fopenmp", "-lgomp", "-std=c++17", "-DENABLE_BF16"],
            "nvcc": [
                "-O3",
                "-std=c++17",
                "-DENABLE_BF16",
                "-U__CUDA_NO_HALF_OPERATORS__",
                "-U__CUDA_NO_HALF_CONVERSIONS__",
                "-U__CUDA_NO_BFLOAT16_OPERATORS__",
                "-U__CUDA_NO_BFLOAT16_CONVERSIONS__",
                "-U__CUDA_NO_BFLOAT162_OPERATORS__",
                "-U__CUDA_NO_BFLOAT162_CONVERSIONS__",
                "--expt-relaxed-constexpr",
                "--expt-extended-lambda",
                "--use_fast_math",
            ]
            + arch_flags
            + generator_flags,
        }

    return extra_compile_args

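# On Windows the CUDA libraries are not on the default linker search path,
# so point the linker at cuBLAS inside CUDA_PATH.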
def get_extra_link_args():
    extra_link_args = []

    if os.name == "nt" and CUDA_VERSION:
        cuda_path = os.environ.get("CUDA_PATH", None)
        extra_link_args = ["-L", f"{cuda_path}/lib/x64/cublas.lib"]

    return extra_link_args

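# Resolve include directories and compiler/linker flags once; the extension
# definitions below reuse them.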
include_dirs = get_include_dirs()
extra_link_args = get_extra_link_args()
generator_flags = get_generator_flag()
arch_flags = get_compute_capabilities()
extra_compile_args = get_extra_compile_args(arch_flags, generator_flags)

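# Each CUDAExtension below becomes an importable module. The core AWQ
# kernels (awq_ext, awq_v2_ext) are built only for CUDA; the remaining
# extensions are registered unconditionally.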
extensions = []
if CUDA_VERSION:
    extensions.append(
        CUDAExtension(
            "awq_ext",
            [
                "awq_ext/pybind_awq.cpp",
                "awq_ext/quantization/gemm_cuda_gen.cu",
                "awq_ext/layernorm/layernorm.cu",
                "awq_ext/position_embedding/pos_encoding_kernels.cu",
                "awq_ext/quantization/gemv_cuda.cu",
                "awq_ext/vllm/moe_alig_block.cu",
                "awq_ext/vllm/activation.cu",
                "awq_ext/vllm/topk_softmax_kernels.cu",
            ],
            extra_compile_args=extra_compile_args,
        )
    )

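    # The v2 GEMM/GEMV kernels are built only for SM 8.0 and newer, so the
    # architecture flags are rebuilt without SM 7.5.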
    arch_flags = get_compute_capabilities({80, 86, 89, 90})
    extra_compile_args_v2 = get_extra_compile_args(arch_flags, generator_flags)
    extensions.append(
        CUDAExtension(
            "awq_v2_ext",
            [
                "awq_ext/pybind_awq_v2.cpp",
                "awq_ext/quantization_new/gemv/gemv_cuda.cu",
                "awq_ext/quantization_new/gemm/gemm_cuda.cu",
            ],
            extra_compile_args=extra_compile_args_v2,
        )
    )

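# ExLlama and ExLlamaV2 4-bit matmul kernels.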
extensions.append(
    CUDAExtension(
        "exl_ext",
        [
            "awq_ext/exllama/exllama_ext.cpp",
            "awq_ext/exllama/cuda_buffers.cu",
            "awq_ext/exllama/cuda_func/column_remap.cu",
            "awq_ext/exllama/cuda_func/q4_matmul.cu",
            "awq_ext/exllama/cuda_func/q4_matrix.cu",
        ],
        extra_compile_args=extra_compile_args,
        extra_link_args=extra_link_args,
    )
)
extensions.append(
    CUDAExtension(
        "exlv2_ext",
        [
            "awq_ext/exllamav2/ext.cpp",
            "awq_ext/exllamav2/cuda/q_matrix.cu",
            "awq_ext/exllamav2/cuda/q_gemm.cu",
        ],
        extra_compile_args=extra_compile_args,
        extra_link_args=extra_link_args,
    )
)

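# The FasterTransformer attention extension is skipped on Windows and on
# non-CUDA builds.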
if os.name != "nt" and CUDA_VERSION: |
|
|
|
extensions.append( |
|
CUDAExtension( |
|
"awq_ft_ext", |
|
[ |
|
"awq_ext/pybind_awq_ft.cpp", |
|
"awq_ext/attention/ft_attention.cpp", |
|
"awq_ext/attention/decoder_masked_multihead_attention.cu", |
|
], |
|
extra_compile_args=extra_compile_args, |
|
) |
|
) |
|
|
|
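# Register the extensions and torch's BuildExtension driver with setuptools.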
additional_setup_kwargs = {
    "ext_modules": extensions,
    "cmdclass": {"build_ext": BuildExtension},
}
common_setup_kwargs.update(additional_setup_kwargs)

setup(
    packages=find_packages(),
    install_requires=requirements,
    include_dirs=include_dirs,
    **common_setup_kwargs,
)