{
  lib,
  buildPythonPackage,
  fetchFromGitHub,
  cython,
  setuptools,
  regex,
  pytestCheckHook,
}:

buildPythonPackage rec {
  pname = "curated-tokenizers";
  version = "2.0.0";
  pyproject = true;

  src = fetchFromGitHub {
    owner = "explosion";
    repo = "curated-tokenizers";
    tag = "v${version}";
    hash = "sha256-VkDV/9c5b8TzYlthCZ38ufbrne4rihtkmkZ/gyAQXLE=";
    fetchSubmodules = true;
  };

  # Fix GCC 15 build failures caused by a missing <cstdint> include
  # in the vendored sentencepiece sources.
  postPatch = ''
    sed -i '1i #include <cstdint>' sentencepiece/src/sentencepiece_processor.h
  '';
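
  # Cython is needed to compile the tokenizer extension modules.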
  build-system = [
    cython
    setuptools
  ];

  dependencies = [
    regex
  ];

  nativeCheckInputs = [
    pytestCheckHook
  ];

  # Point pytest at the package's own tests so the vendored
  # sentencepiece test suite is not run.
  enabledTestPaths = [ "tests" ];

  preCheck = ''
    # Move the tests out and remove the local package directory so the
    # installed package is used; otherwise relative imports won't resolve.
    mv curated_tokenizers/tests tests
    rm -r curated_tokenizers
  '';

  pythonImportsCheck = [ "curated_tokenizers" ];

  meta = {
    description = "Lightweight piece tokenization library";
    homepage = "https://github.com/explosion/curated-tokenizers";
    changelog = "https://github.com/explosion/curated-tokenizers/releases/tag/${src.tag}";
    license = lib.licenses.mit;
    maintainers = with lib.maintainers; [ danieldk ];
  };
}