pkgs/development/python-modules/exllamav3/default.nix


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102

# Package expression for the exllamav3 Python module, built from the upstream
# GitHub release tag. CUDA toolkit inputs and the CUDA build environment are
# only wired in when torch was built with CUDA support; otherwise the package
# is flagged as broken.
{
  lib,
  buildPythonPackage,
  fetchFromGitHub,
  nix-update-script,
  cudaPackages,

  # build-system
  setuptools,

  # nativeBuildInputs
  ninja,

  # dependencies
  flash-attn,
  formatron,
  kbnf,
  marisa-trie,
  numpy,
  pillow,
  pydantic,
  pyyaml,
  rich,
  safetensors,
  tokenizers,
  torch,
  typing-extensions,
}:
let
  inherit (cudaPackages)
    cuda_cudart
    cuda_nvcc
    libcublas
    libcurand
    libcusolver
    libcusparse
    ;

  # Subset of torch's CUDA capabilities that sort strictly above "7.9"
  # (8.0 and up). Turing (7.5) and anything older is dropped.
  supportedCudaCapabilities = lib.filter (lib.versionOlder "7.9") torch.cudaCapabilities;
in
buildPythonPackage (finalAttrs: {
  pname = "exllamav3";
  version = "0.0.25";
  pyproject = true;

  src = fetchFromGitHub {
    owner = "turboderp-org";
    repo = "exllamav3";
    tag = "v${finalAttrs.version}";
    hash = "sha256-CltM0bQ3mvQwUYulsVByS7mcIIy6O/P1+nq4h5UAO6E=";
  };

  build-system = [ setuptools ];

  nativeBuildInputs = [ ninja ];

  # Headers needed by the CUDA extension; empty when torch has no CUDA support.
  buildInputs = lib.optionals torch.cudaSupport [
    cuda_cudart # cuda_runtime.h
    libcusparse # cusparse.h
    libcublas # cublas_v2.h
    libcusolver # cusolverDn.h
    libcurand # curand_kernel.h
  ];

  # Relax the version constraint declared upstream for pydantic.
  pythonRelaxDeps = [ "pydantic" ];

  dependencies = [
    flash-attn
    formatron
    kbnf
    marisa-trie
    numpy
    pillow
    pydantic
    pyyaml
    rich
    safetensors
    tokenizers
    torch
    typing-extensions
  ];

  env = lib.optionalAttrs torch.cudaSupport {
    CUDA_HOME = lib.getDev cuda_nvcc;
    # Semicolon-separated arch list limited to the capabilities selected above
    # (8.0+, i.e. GPUs newer than Turing). Upstream context:
    # https://github.com/turboderp-org/exllamav3/issues/44
    TORCH_CUDA_ARCH_LIST = lib.concatStringsSep ";" supportedCudaCapabilities;
  };

  # The test suite needs GPU hardware and external model files, so only the
  # import check is run.
  doCheck = false;

  pythonImportsCheck = [ "exllamav3" ];

  passthru = {
    updateScript = nix-update-script { };
  };

  meta = {
    description = "Quantization and inference library for running LLMs locally on modern consumer-class GPUs";
    homepage = "https://github.com/turboderp-org/exllamav3";
    changelog = "https://github.com/turboderp-org/exllamav3/releases/tag/${finalAttrs.src.tag}";
    license = lib.licenses.mit;
    maintainers = [ lib.maintainers.BatteredBunny ];
    platforms = [
      "x86_64-windows"
      "x86_64-linux"
    ];
    # The package is not functional without CUDA, so it is marked broken
    # whenever torch lacks CUDA support.
    broken = !torch.cudaSupport;
  };
})