summaryrefslogtreecommitdiff
path: root/pkgs/development/rocm-modules/composable_kernel/base.nix
blob: da94466388c82abd19aa87a711dc2966247d3255 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
# Base derivation for composable_kernel. It is not meant to be built on its own
# (preBuild aborts and meta.broken is set); it exists to be overridden.
{
  lib,
  stdenv,
  fetchFromGitHub,
  rocmUpdateScript,
  cmake,
  rocm-cmake,
  clr,
  rocminfo,
  python3,
  hipify,
  gitMinimal,
  gtest,
  jemalloc,
  zstd,
  # When true: adds a "test" output, passes GOOGLETEST_DIR to CMake and moves
  # $out/bin/test_* into that output. When false, add_subdirectory(test) is patched out.
  buildTests ? false,
  # When true: adds an "example" output and moves $out/bin/example_* into it.
  # When false, add_subdirectory(example) is patched out.
  buildExamples ? false,
  # limits prebuilt kernel selection to those needed for MIOPEN (currently "*conv*")
  # Other kernels can still be used if treating CK as a header only library
  # and building specific instances, as done with ck4inductor/torch
  miOpenReqLibsOnly ? true,
  # When false, postPatch rewrites "CK_BUILD_DEPRECATED 1" to "CK_BUILD_DEPRECATED 0"
  # in include/ck/ck.hpp.
  withDeprecatedKernels ? false,
  # GPU architectures passed to CMake as a ;-separated GPU_ARCHS value.
  # Uses clr.localGpuTargets when that attribute exists, otherwise the list below.
  # An empty list omits the GPU_ARCHS flag entirely.
  gpuTargets ? (
    clr.localGpuTargets or [
      "gfx900"
      "gfx906"
      "gfx908"
      "gfx90a"
      "gfx942"
      "gfx950"
      "gfx10-3-generic"
      "gfx11-generic"
      "gfx12-generic"
    ]
  ),
}:

stdenv.mkDerivation (finalAttrs: {
  pname = "composable_kernel_base";
  version = "7.2.3";

  # Guard: abort at the start of the build phase. This base derivation only
  # exists so that other derivations can override it and build it in chunks.
  preBuild = ''
    echo "This derivation isn't intended to be built directly and only exists to be overridden and built in chunks";
    exit 1
  '';

  # "out" always exists; tests and examples each get a dedicated output,
  # but only when the corresponding build option is enabled.
  outputs =
    [ "out" ]
    ++ lib.optional buildTests "test"
    ++ lib.optional buildExamples "example";

  # Fetch ROCm/rocm-libraries at rev "rocm-<version>", requesting only the
  # two paths listed in sparseCheckout rather than the whole repository.
  src = fetchFromGitHub {
    owner = "ROCm";
    repo = "rocm-libraries";
    rev = "rocm-${finalAttrs.version}";
    sparseCheckout = [
      "projects/composablekernel"
      "shared"
    ];
    hash = "sha256-Zs6wwPmys1kUlgDD4XzKKw273nH/Ur3HtuYxJjvjDs0=";
  };
  # Build from the composablekernel project directory inside the fetched tree.
  sourceRoot = "${finalAttrs.src.name}/projects/composablekernel";

  nativeBuildInputs = [
    # Deliberately not using ninja: we're jankily composing build outputs
    # from multiple drvs, and ninja won't believe they're up to date
    gitMinimal
    cmake
    rocminfo
    clr
    hipify
    zstd
    python3
  ];

  # NOTE(review): clr and zstd appear in both nativeBuildInputs and buildInputs.
  # Presumably intentional with strictDeps = true (needed both as build-time
  # tools and as libraries to link against) — confirm before deduplicating.
  buildInputs = [
    rocm-cmake
    clr
    zstd
  ];

  strictDeps = true;
  enableParallelBuilding = true;

  # Environment variables exported to the builder.
  env = {
    ROCM_PATH = clr;
    # Speed up build by ~7% with jemalloc (template torture test workload means allocation heavy clang invocations)
    LD_PRELOAD = "${jemalloc}/lib/libjemalloc.so";
    MALLOC_CONF = "background_thread:true,metadata_thp:auto,dirty_decay_ms:10000,muzzy_decay_ms:10000";
  };

  # Flags passed to CMake at configure time. Each cache variable is defined at
  # most once here: when the same -D variable is given twice the later one wins,
  # which silently hides the earlier definition.
  cmakeFlags = [
    (lib.cmakeBool "MIOPEN_REQ_LIBS_ONLY" miOpenReqLibsOnly)
    (lib.cmakeBool "DISABLE_DL_KERNELS" true)
    (lib.cmakeBool "DISABLE_DPP_KERNELS" true)
    (lib.cmakeBool "CK_TIME_KERNEL" false)
    "-DCMAKE_MODULE_PATH=${clr}/hip/cmake"
    "-DCMAKE_POLICY_DEFAULT_CMP0069=NEW"
    "-DDL_KERNELS=OFF"
    # CK_USE_CODEGEN Required for migraphx which uses device_gemm_multiple_d.hpp
    # but migraphx requires an incompatible fork of CK and fails anyway
    # "-DCK_USE_CODEGEN=ON"
    # It might be worth skipping fp64 in future with this:
    # "-DDTYPES=fp32;fp16;fp8;bf16;int8"
    # Manually define CMAKE_INSTALL_<DIR>
    # See: https://github.com/NixOS/nixpkgs/pull/197838
    "-DCMAKE_INSTALL_BINDIR=bin"
    "-DCMAKE_INSTALL_LIBDIR=lib"
    "-DCMAKE_INSTALL_INCLUDEDIR=include"
    "-DBUILD_DEV=OFF"
    # The MHA library is always built.
    # NOTE(review): this list used to also set BUILD_MHA_LIB from
    # (!miOpenReqLibsOnly) before this flag; that value was always overridden
    # by this later definition, so the dead entry was dropped. If MHA is meant
    # to follow miOpenReqLibsOnly, replace this line with
    # (lib.cmakeBool "BUILD_MHA_LIB" (!miOpenReqLibsOnly)) instead.
    "-DBUILD_MHA_LIB=ON"
    "-DROCM_PATH=${clr}"
    "-DENABLE_CLANG_CPP_CHECKS=OFF"
    "-DCMAKE_HIP_COMPILER_ROCM_ROOT=${clr}"

    # FP8 can build for 908/90a but very slow build
    # and produces unusably slow kernels that are huge
    "-DCK_USE_FP8_ON_UNSUPPORTED_ARCH=OFF"
  ]
  ++ lib.optionals (gpuTargets != [ ]) [
    # We intentionally set GPU_ARCHS and not AMD/GPU_TARGETS
    # per readme this is required if archs are dissimilar
    # In rocm-6.3.x not setting any arch flag worked
    # but setting dissimilar arches always failed
    "-DGPU_ARCHS=${lib.concatStringsSep ";" gpuTargets}"
  ]
  ++ lib.optionals buildTests [
    "-DGOOGLETEST_DIR=${gtest.src}" # Custom linker names
  ];

  patches = [
    # Hacky fix for a failure on some targets when every target is selected
    # out for a kernel that is not optional at link time
    ./fix-empty-offload-targets.diff
  ];

  # Source tweaks applied after `patches`. The script is assembled from an
  # unconditional part plus snippets that depend on the build options.
  postPatch = lib.concatStrings [
    # Reduce configure time by commenting out clang_tidy_check, which would
    # otherwise add thousands of clang-tidy targets that are never called.
    # Also never build the profiler, and strip -Werror from the warning setup.
    ''
      substituteInPlace library/src/utility/CMakeLists.txt library/src/tensor_operation_instance/gpu/CMakeLists.txt \
        --replace-fail clang_tidy_check '#clang_tidy_check'
      substituteInPlace CMakeLists.txt \
        --replace-fail "add_subdirectory(profiler)" ""
      substituteInPlace cmake/EnableCompilerWarnings.cmake \
        --replace-fail "-Werror" ""
    ''
    # Turn off deprecated kernels unless explicitly requested
    (lib.optionalString (!withDeprecatedKernels) ''
      substituteInPlace include/ck/ck.hpp \
        --replace-fail "CK_BUILD_DEPRECATED 1" "CK_BUILD_DEPRECATED 0"
    '')
    # Optionally remove tests
    (lib.optionalString (!buildTests) ''
      substituteInPlace CMakeLists.txt \
        --replace-fail "add_subdirectory(test)" ""
      substituteInPlace codegen/CMakeLists.txt \
        --replace-fail "include(ROCMTest)" ""
    '')
    # Optionally remove examples
    (lib.optionalString (!buildExamples) ''
      substituteInPlace CMakeLists.txt \
        --replace-fail "add_subdirectory(example)" ""
    '')
  ];

  # Move test and example binaries out of $out into their own outputs;
  # each snippet is only emitted when the matching output exists.
  postInstall = lib.concatStrings [
    (lib.optionalString buildTests ''
      mkdir -p $test/bin
      mv $out/bin/test_* $test/bin
    '')
    (lib.optionalString buildExamples ''
      mkdir -p $example/bin
      mv $out/bin/example_* $example/bin
    '')
  ];

  passthru =
    let
      # Targets checked by anyMfmaTarget.
      mfmaTargets = [
        "gfx908"
        "gfx90a"
        "gfx942"
        "gfx950"
      ];
    in
    {
      inherit gpuTargets miOpenReqLibsOnly;
      updateScript = rocmUpdateScript { inherit finalAttrs; };
      # True when at least one selected target name starts with "gfx9".
      anyGfx9Target = lib.any (lib.hasPrefix "gfx9") gpuTargets;
      # True when at least one selected target is listed in mfmaTargets.
      anyMfmaTarget = lib.any (target: lib.elem target mfmaTargets) gpuTargets;
    };

  meta = {
    description = "Performance portable programming model for machine learning tensor operators";
    homepage = "https://github.com/ROCm/rocm-libraries/tree/develop/projects/composablekernel";
    license = [ lib.licenses.mit ];
    teams = [ lib.teams.rocm ];
    platforms = lib.platforms.linux;
    # This base package shouldn't be built directly, so it is marked broken.
    broken = true;
  };
})