1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
|
{
  lib,
  buildPythonPackage,
  fetchFromGitHub,
  replaceVars,

  # build-system
  cmake,
  pybind11,
  setuptools,

  # dependencies
  sentencepiece,
  tiktoken,
  tokenizers,

  # tests
  pytestCheckHook,
  transformers,
}:

# Package expression for the `pytorch-tokenizers` Python module, built from the
# meta-pytorch/tokenizers GitHub repository (submodules included).
let
  # pybind11 source tree at the v2.13.6 tag; it is substituted into
  # dont-fetch-pybind11.patch below.
  # Upstream reference:
  # https://github.com/meta-pytorch/tokenizers/blob/v1.0.1/CMakeLists.txt#L174-L175
  # NOTE(review): presumably this keeps the build from downloading pybind11
  # itself -- confirm against the patch contents.
  pybind11Source = fetchFromGitHub {
    owner = "pybind";
    repo = "pybind11";
    tag = "v2.13.6";
    hash = "sha256-SNLdtrOjaC3lGHN9MAqTf51U9EzNKQLyTMNPe0GcdrU=";
  };
in
buildPythonPackage rec {
  pname = "pytorch-tokenizers";
  version = "1.0.1";
  pyproject = true;

  src = fetchFromGitHub {
    owner = "meta-pytorch";
    repo = "tokenizers";
    tag = "v${version}";
    hash = "sha256-1BGazimbauNBN/VfLiuhk21VEhbP07GEpPc+GAfKTQY=";
    fetchSubmodules = true;
  };

  patches = [
    # Fills the patch's `pybind11` placeholder with the pinned source above.
    (replaceVars ./dont-fetch-pybind11.patch {
      pybind11 = pybind11Source;
    })

    # Works around the compile failure:
    #   error: ‘uint32_t’ does not name a type
    ./add-missing-cstdint-sentencepiece.patch
  ];

  # Remove the "pip>=23" and "pytest" entries from pyproject.toml.
  # `--replace-fail` aborts the build if either entry is no longer present.
  postPatch = ''
    substituteInPlace pyproject.toml \
    --replace-fail '"pip>=23",' "" \
    --replace-fail '"pytest",' ""
  '';

  build-system = [
    cmake
    pybind11
    setuptools
  ];

  # cmake is listed as a build tool above, but its configure hook is turned off.
  # NOTE(review): presumably the Python build backend drives cmake itself --
  # confirm against upstream's build scripts.
  dontUseCmakeConfigure = true;

  dependencies = [
    sentencepiece
    tiktoken
    tokenizers
  ];

  # Both the Python package and its compiled extension module must import.
  pythonImportsCheck = [
    "pytorch_tokenizers"
    "pytorch_tokenizers.pytorch_tokenizers_cpp"
  ];

  nativeCheckInputs = [
    pytestCheckHook
    transformers
  ];

  # Delete the in-tree `pytorch_tokenizers` directory before the tests run.
  # NOTE(review): presumably so the tests pick up the installed package rather
  # than the source checkout -- confirm.
  preCheck = ''
    rm -rf pytorch_tokenizers
  '';

  disabledTestPaths = [
    # Needs to download models from huggingface
    "test/test_hf_tokenizer.py"
  ];

  meta = {
    description = "C++ implementations for various tokenizers (sentencepiece, tiktoken, etc.)";
    homepage = "https://github.com/meta-pytorch/tokenizers";
    license = lib.licenses.bsd3;
    maintainers = [ lib.maintainers.GaetanLepage ];
  };
}
|