path: root/pkgs/development/python-modules/pytorch-tokenizers/default.nix
{
  lib,
  buildPythonPackage,
  fetchFromGitHub,
  replaceVars,

  # build-system
  cmake,
  pybind11,
  setuptools,

  # dependencies
  sentencepiece,
  tiktoken,
  tokenizers,

  # tests
  pytestCheckHook,
  transformers,
}:

let
  # https://github.com/meta-pytorch/tokenizers/blob/v1.0.1/CMakeLists.txt#L174-L175
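  # Upstream fetches pybind11 itself during the build; pin it here and patch
  # the build to use this copy instead (see dont-fetch-pybind11.patch below).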
  pybind11-src = fetchFromGitHub {
    owner = "pybind";
    repo = "pybind11";
    tag = "v2.13.6";
    hash = "sha256-SNLdtrOjaC3lGHN9MAqTf51U9EzNKQLyTMNPe0GcdrU=";
  };
in
buildPythonPackage rec {
  pname = "pytorch-tokenizers";
  version = "1.0.1";
  pyproject = true;

  src = fetchFromGitHub {
    owner = "meta-pytorch";
    repo = "tokenizers";
    tag = "v${version}";
    fetchSubmodules = true;
    hash = "sha256-1BGazimbauNBN/VfLiuhk21VEhbP07GEpPc+GAfKTQY=";
  };

  patches = [
    (replaceVars ./dont-fetch-pybind11.patch {
      pybind11 = pybind11-src;
    })
    # error: ‘uint32_t’ does not name a type
    ./add-missing-cstdint-sentencepiece.patch
  ];

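  # pip and pytest are not needed as build requirements in the Nix sandbox.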
  postPatch = ''
    substituteInPlace pyproject.toml \
      --replace-fail '"pip>=23",' "" \
      --replace-fail '"pytest",' ""
  '';

  build-system = [
    cmake
    pybind11
    setuptools
  ];
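  # cmake is driven by the setuptools build itself, so skip the cmake setup
  # hook's configure phase.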
  dontUseCmakeConfigure = true;

  dependencies = [
    sentencepiece
    tiktoken
    tokenizers
  ];

  pythonImportsCheck = [
    "pytorch_tokenizers"
    "pytorch_tokenizers.pytorch_tokenizers_cpp"
  ];

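  # Remove the source package directory so that the tests run against the
  # installed package, which contains the compiled extension module.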
  preCheck = ''
    rm -rf pytorch_tokenizers
  '';

  nativeCheckInputs = [
    pytestCheckHook
    transformers
  ];

  disabledTestPaths = [
    # Requires downloading models from Hugging Face
    "test/test_hf_tokenizer.py"
  ];

  meta = {
    description = "C++ implementations for various tokenizers (sentencepiece, tiktoken, etc.)";
    homepage = "https://github.com/meta-pytorch/tokenizers";
    license = lib.licenses.bsd3;
    maintainers = with lib.maintainers; [ GaetanLepage ];
  };
}