summaryrefslogtreecommitdiff
path: root/pkgs/development/python-modules/unstructured/default.nix
blob: 9d811ac4b06f521d5681462b4fcd4fba658af8e3 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
# Package expression for the `unstructured` Python library, fetched from the
# Unstructured-IO/unstructured GitHub repository at the tag equal to `version`.
{
  lib,
  buildPythonPackage,
  fetchFromGitHub,

  # build-system
  setuptools,

  # runtime: networking and async
  anyio,
  backoff,
  certifi,
  httpcore,
  httpx,
  h11,
  nest-asyncio,
  requests,
  requests-toolbelt,
  sniffio,
  urllib3,

  # runtime: parsing and text processing
  beautifulsoup4,
  chardet,
  charset-normalizer,
  emoji,
  filetype,
  html5lib,
  idna,
  joblib,
  # jsonpath-python,
  nltk,
  olefile,
  orderly-set,
  python-dateutil,
  python-iso639,
  python-magic,
  python-oxmsg,
  rapidfuzz,
  regex,
  soupsieve,
  webencodings,

  # runtime: data handling and typing helpers
  dataclasses-json,
  deepdiff,
  marshmallow,
  mypy-extensions,
  packaging,
  typing-extensions,
  typing-inspect,

  # runtime: system utilities
  cffi,
  cryptography,
  psutil,
  pycparser,
  six,
  tqdm,
  wrapt,

  # document format support (some of these also appear in the base
  # `dependencies` list below, e.g. pypdf and unstructured-client)
  markdown,
  pdfminer-six,
  pdfplumber,
  # pi-heif,
  pikepdf,
  pypandoc,
  pypdf,
  python-docx,
  unstructured-client,
  # unstructured-pytesseract,

  # inputs used by the optional-dependencies groups
  # csv
  pytz,
  tzdata,
  # markdown (exposed below as `req-markdown`)
  importlib-metadata,
  zipp,
  # pdf / paddleocr
  opencv-python,
  # currently unused: the paddleocr group keeps paddlepaddle commented out
  paddlepaddle,
  pdf2image,
  # unstructured-paddleocr,
  # pptx
  lxml,
  pillow,
  python-pptx,
  xlsxwriter,
  # xlsx
  et-xmlfile,
  networkx,
  numpy,
  openpyxl,
  pandas,
  xlrd,
  # huggingface
  langdetect,
  sacremoses,
  sentencepiece,
  torch,
  transformers,
  # local-inference
  unstructured-inference,

  # test inputs (click is also a base runtime dependency)
  pytestCheckHook,
  black,
  coverage,
  click,
  freezegun,
  # , label-studio-sdk
  mypy,
  pytest-cov-stub,
  pytest-mock,
  vcrpy,
  grpcio,
}:
buildPythonPackage rec {
  pname = "unstructured";
  version = "0.18.27";
  pyproject = true;

  # The upstream tag is the bare version string (no prefix).
  src = fetchFromGitHub {
    owner = "Unstructured-IO";
    repo = "unstructured";
    tag = version;
    hash = "sha256-QPCnMDKk10AeiMRNFMRekvSdqKoAyCJmwMnr9qJIzmg=";
  };

  build-system = [ setuptools ];

  # Base runtime dependencies, always propagated with the package.
  dependencies = [
    anyio
    backoff
    beautifulsoup4
    certifi
    cffi
    chardet
    charset-normalizer
    click
    cryptography
    dataclasses-json
    deepdiff
    emoji
    filetype
    h11
    html5lib
    httpcore
    httpx
    idna
    joblib
    # jsonpath-python
    langdetect
    lxml
    marshmallow
    mypy-extensions
    nest-asyncio
    nltk
    numpy
    olefile
    orderly-set
    packaging
    psutil
    pycparser
    pypdf
    python-dateutil
    python-iso639
    python-magic
    python-oxmsg
    rapidfuzz
    regex
    requests
    requests-toolbelt
    six
    sniffio
    soupsieve
    tqdm
    typing-extensions
    typing-inspect
    unstructured-client
    urllib3
    webencodings
    wrapt
  ];

  # Extra dependency groups. The set is `rec` so that `all-docs` can refer to
  # its sibling groups by name.
  optional-dependencies = rec {
    # Per-format groups.
    csv = [
      numpy
      pandas
      python-dateutil
      pytz
      tzdata
    ];
    docx = [
      lxml
      python-docx
      typing-extensions
    ];
    epub = [
      pypandoc
    ];
    # Not named `markdown`: inside this `rec` set that name would shadow the
    # `markdown` package listed below.
    req-markdown = [
      importlib-metadata
      markdown
      zipp
    ];
    odt = [
      lxml
      pypandoc
      python-docx
      typing-extensions
    ];
    org = [ pypandoc ];
    pdf = [
      pdf2image
      pdfminer-six
      pdfplumber
      # pi-heif
      pikepdf
      pypdf
      unstructured-inference
      # unstructured-pytesseract
    ];
    pptx = [
      lxml
      pillow
      python-pptx
      xlsxwriter
    ];
    xlsx = [
      et-xmlfile
      networkx
      numpy
      openpyxl
      pandas
      xlrd
    ];

    # Union of the document-format groups above, concatenated in this order.
    # `paddleocr` and `huggingface` are not part of it.
    all-docs = lib.concatLists [
      csv
      docx
      epub
      pdf
      req-markdown
      odt
      org
      pptx
      xlsx
    ];

    # OCR and model-related groups.
    paddleocr = [
      opencv-python
      # paddlepaddle # 3.12 not supported for now
      pdf2image
      # unstructured-paddleocr
    ];
    huggingface = [
      langdetect
      sacremoses
      sentencepiece
      torch
      transformers
    ];
  };

  # The test suite tries to download the `punkt` data from nltk, so tests stay
  # disabled until that data can be made available to the build.
  doCheck = false;

  # Only consulted when the check phase runs; kept so tests can be re-enabled.
  nativeCheckInputs = [
    pytestCheckHook
    black
    coverage
    click
    freezegun
    mypy
    pytest-cov-stub
    pytest-mock
    vcrpy
    grpcio
  ];

  # With tests off, the import check is the remaining smoke test.
  pythonImportsCheck = [ "unstructured" ];

  meta = {
    description = "Open source libraries and APIs to build custom preprocessing pipelines for labeling, training, or production machine learning pipelines";
    homepage = "https://github.com/Unstructured-IO/unstructured";
    changelog = "https://github.com/Unstructured-IO/unstructured/blob/${src.tag}/CHANGELOG.md";
    license = lib.licenses.asl20;
    maintainers = [ lib.maintainers.happysalada ];
    # NOTE(review): confirm this release still installs an
    # `unstructured-ingest` executable.
    mainProgram = "unstructured-ingest";
  };
}