summaryrefslogtreecommitdiff
path: root/pkgs/development/python-modules/dedupe/default.nix
blob: fd8f38f70d4bd88d8fea0545ad0ed1eb2a12bf59 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
{
  lib,
  buildPythonPackage,
  fetchFromGitHub,

  # build-system
  cython,
  setuptools,

  # dependencies
  affinegap,
  btrees,
  categorical-distance,
  dedupe-levenshtein-search,
  doublemetaphone,
  haversine,
  highered,
  numpy,
  scikit-learn,
  simplecosine,
  zope-index,
  dedupe,

  # tests
  pytest-cov-stub,
  pytestCheckHook,
  python,
  runCommand,
}:

# Python package definition for dedupe, built from the upstream GitHub
# repository at the tag matching `version`.
buildPythonPackage rec {
  pname = "dedupe";
  version = "3.0.3";
  pyproject = true;

  src = fetchFromGitHub {
    owner = "dedupeio";
    repo = "dedupe";
    tag = "v${version}";
    hash = "sha256-tfBJeaeZw5w5OwM+AOfy9H6P2zbShjN/kuzEbpxATHI=";
  };

  # cython is listed as a build-time requirement alongside setuptools.
  build-system = [
    cython
    setuptools
  ];

  dependencies = [
    affinegap
    btrees
    categorical-distance
    dedupe-levenshtein-search
    doublemetaphone
    haversine
    highered
    numpy
    scikit-learn
    simplecosine
    zope-index
  ];

  nativeCheckInputs = [
    pytest-cov-stub
    pytestCheckHook
  ];

  # Remove source directory so pytest imports compiled extension from $out
  preCheck = ''
    rm -rf dedupe
  '';

  pythonImportsCheck = [
    "dedupe"
  ];

  # Extra test, exposed via passthru rather than run in the package's own
  # check phase: executes the benchmark modules shipped in the source tree
  # against the built `dedupe` package (taken from the function arguments).
  passthru.tests.benchmarks =
    runCommand "dedupe-benchmarks-test"
      {
        nativeBuildInputs = [ (python.withPackages (ps: [ dedupe ])) ];
      }
      ''
        # Copy benchmarks to writable location
        cp -r ${src}/benchmarks benchmarks
        chmod -R +w benchmarks
        cd benchmarks

        # Run all three canonical benchmarks
        for benchmark in canonical canonical_gazetteer canonical_matching; do
          echo "Running $benchmark benchmark..."
          # stderr is captured to a file instead of being shown, to suppress
          # Python 3.13 multiprocessing resource tracker warnings from
          # scikit-learn/joblib subprocesses on successful runs.
          # On failure the captured stderr is replayed so the traceback ends up
          # in the build log, then we exit immediately (fail-fast).
          if ! PYTHONPATH=$PWD python -m "benchmarks.$benchmark" 2>stderr.log; then
            echo "Benchmark $benchmark failed; captured stderr follows:" >&2
            cat stderr.log >&2
            exit 1
          fi
        done

        touch $out
      '';

  meta = {
    description = "Library for accurate and scalable fuzzy matching, deduplication and entity resolution";
    homepage = "https://github.com/dedupeio/dedupe";
    # Points at the changelog as of the fetched tag.
    changelog = "https://github.com/dedupeio/dedupe/blob/${src.tag}/CHANGELOG.md";
    license = lib.licenses.mit;
    maintainers = with lib.maintainers; [ daniel-fahey ];
  };
}