# NVIDIA Apex, built from source with the same stdenv as `torch`.
#
# The C++ extension is always requested (APEX_CPP_EXT). The CUDA and contrib
# extensions, together with their CUDA build inputs, are only enabled when
# `cudaSupport` is true. The package is marked broken without CUDA support.
#
# Tests are disabled in the default build; `passthru.gpuCheck` re-enables them
# in a derivation that requires the "cuda" system feature.
{
  lib,
  buildPythonPackage,
  fetchFromGitHub,

  # build-system
  setuptools,
  torch,

  # buildInputs
  pybind11,

  # nativeBuildInputs
  writableTmpDirAsHomeHook,

  # dependencies
  cxxfilt,
  numpy,
  packaging,
  pytest,
  pyyaml,
  tqdm,

  # tests
  onnxscript,
  pytestCheckHook,
  torchvision,

  # Self-reference, used by passthru.gpuCheck
  apex,
  cudaPackages,
  cudaSupport ? torch.cudaSupport,
}:

buildPythonPackage.override { inherit (torch) stdenv; } (finalAttrs: {
  pname = "apex";
  version = "25.09";
  pyproject = true;
  __structuredAttrs = true;

  src = fetchFromGitHub {
    owner = "nvidia";
    repo = "apex";
    tag = finalAttrs.version;
    hash = "sha256-/WcFCDjNXWbCnWoprYYAUcLt9p1CqJLzPXcBkPn+ics=";
  };

  patches = [
    # Fix incompatibility with more recent versions of cudnn to de-vendor it:
    # error: ‘throw_if’ is not a member of ‘cudnn_frontend’
    ./fix-cudnn-frontend-compat.patch
  ];

  # Don't use git submodules for cuda dependencies
  postPatch = ''
    substituteInPlace setup.py \
      --replace-fail \
        'subprocess.run(["git", "submodule", "update", "--init", "apex/contrib/csrc/multihead_attn/cutlass"])' \
        "" \
      --replace-fail \
        'subprocess.run(["git", "submodule", "update", "--init", "apex/contrib/csrc/cudnn-frontend/"])' \
        ""
  '';

  env = {
    # Requested regardless of cudaSupport
    APEX_CPP_EXT = 1;
  }
  // lib.optionalAttrs cudaSupport {
    CUDA_HOME = (lib.getBin cudaPackages.cuda_nvcc).outPath;
    # Semicolon-separated list of the capabilities torch itself was built for
    TORCH_CUDA_ARCH_LIST = lib.concatStringsSep ";" torch.cudaCapabilities;

    # Even if APEX_ALL_CONTRIB_EXT is enabled, APEX_CUDA_EXT must be explicitly enabled
    APEX_CUDA_EXT = 1;

    # Enable all contrib extensions at once
    # https://github.com/NVIDIA/apex/tree/25.09#custom-ccuda-extensions-and-install-options
    APEX_ALL_CONTRIB_EXT = 1;

    NVCC_APPEND_FLAGS = lib.toString [
      # Make kernel compilation slightly more parallel
      "--threads 2"
    ];
  };

  # NIX_BUILD_CORES is only known at build time, hence a shell export rather
  # than an `env` entry.
  preBuild = ''
    export APEX_PARALLEL_BUILD=$NIX_BUILD_CORES
  '';

  build-system = [
    setuptools
    torch
  ];

  buildInputs = [
    pybind11
  ]
  ++ lib.optionals cudaSupport (
    with cudaPackages;
    [
      cuda_cudart # cuda_runtime.h
      cuda_profiler_api # cuda_profiler_api.h
      cudnn # cudnn.h
      cudnn-frontend # cudnn_frontend.h
      cutlass # cutlass/cutlass.h
      libcublas # cublas_v2.h
      libcufile # cufile.h
      libcurand # curand_kernel.h
      libcusolver # cusolverDn.h
      libcusparse # cusparse.h
      nccl # nccl.h
    ]
  );

  nativeBuildInputs = [
    writableTmpDirAsHomeHook
  ];

  dependencies = [
    cxxfilt
    numpy
    packaging
    pytest
    pyyaml
    tqdm
  ];

  pythonImportsCheck = [
    "apex"
    # Listed once here (not repeated in the CUDA-only list below), since
    # APEX_CPP_EXT is set unconditionally.
    "apex_C"
  ]
  ++ lib.optionals cudaSupport [
    "_apex_gpu_direct_storage"
    "_apex_nccl_allocator"
    "amp_C"
    "bnp"
    "fmhalib"
    "fused_layer_norm_cuda"
    "nccl_p2p_cuda"
    "syncbn"
  ];

  nativeCheckInputs = [
    onnxscript
    pytestCheckHook
    torchvision
  ];

  # Remove the source checkout's `apex` directory before running the tests.
  # NOTE(review): presumably so the installed package is imported instead of
  # the source tree — confirm.
  preCheck = ''
    rm -rf apex
  ''
  # Otherwise, test collection fails with:
  # ModuleNotFoundError: No module named 'test_fused_optimizer'
  + ''
    rm tests/L0/run_optimizers/__init__.py
  '';

  # Tests are not run in the default build; see passthru.gpuCheck below.
  doCheck = false;

  disabledTestPaths = [
    # Try to read the driver version from nvidia-smi (failing in the sandbox)
    # TypeError: expected string or bytes-like object, got 'NoneType'
    "tests/L0/run_transformer/"

    # apex.parallel was removed in https://github.com/NVIDIA/apex/pull/1896, but some tests still
    # try to import it
    "tests/distributed/DDP/ddp_race_condition_test.py"
    "tests/distributed/synced_batchnorm/"
  ];

  disabledTests = [
    # RuntimeError: The tensor has a non-zero number of elements, but its data is not allocated yet.
    # torch.onnx._internal.exporter._errors.TorchExportError: Failed to export the model with torch.export.
    "test_layer_norm_export_cuda"
    "test_rms_export_cuda"
  ];

  # Same package with the test suite enabled, restricted to builders that
  # advertise the "cuda" system feature.
  passthru.gpuCheck = apex.overridePythonAttrs {
    requiredSystemFeatures = [ "cuda" ];
    doCheck = true;
  };

  meta = {
    description = "Tools for easy mixed precision and distributed training in Pytorch";
    homepage = "https://github.com/nvidia/apex";
    license = lib.licenses.bsd3;
    maintainers = with lib.maintainers; [ GaetanLepage ];
    broken = !cudaSupport;
  };
})