about summary refs log tree commit diff
path: root/pkgs/development/python-modules/unstructured/default.nix
blob: afa02b44773985a0267645fc012f5e6a4a285ad9 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
{
  # nixpkgs helpers
  lib,
  buildPythonPackage,
  fetchFromGitHub,

  # dependencies listed in propagatedBuildInputs
  chardet,
  filetype,
  lxml,
  markdown,
  msg-parser,
  nltk,
  openpyxl,
  pandas,
  pdf2image,
  pdfminer-six,
  pillow,
  pypandoc,
  python-docx,
  python-magic,
  python-pptx,
  requests,
  tabulate,
  xlrd,

  # dependencies referenced by optional-dependencies
  # (left commented out, not taken as arguments: discord-py, gcsfs, dropboxdrivefs)
  adlfs,
  atlassian-python-api,
  elasticsearch8,
  fsspec,
  google-api-python-client,
  jq,
  langdetect,
  praw,
  pygithub,
  python-gitlab,
  s3fs,
  sacremoses,
  sentencepiece,
  slack-sdk,
  torch,
  transformers,
  unstructured-inference,
  wikipedia,

  # dependencies listed in nativeCheckInputs
  # (left commented out, not taken as an argument: label-studio-sdk)
  black,
  click,
  coverage,
  freezegun,
  grpcio,
  mypy,
  pytest-cov,
  pytest-mock,
  pytestCheckHook,
  vcrpy,
}:
let
  # Upstream release being packaged; also used for the source tag and the
  # changelog URL below.
  version = "0.15.13";

  # Packages backing the upstream `huggingface` extra. Bound once so the
  # correctly spelled key and its legacy alias stay in sync.
  huggingfaceDeps = [
    langdetect
    sacremoses
    sentencepiece
    torch
    transformers
  ];

  # Extras, keyed by extra name and exposed via `passthru.optional-dependencies`.
  # An empty list means the dependency named in the trailing comment is not
  # wired in here, so that extra currently pulls in nothing.
  optional-dependencies = {
    huggingface = huggingfaceDeps;
    # Deprecated alias: this key was originally misspelled. It is kept so that
    # existing references to `optional-dependencies.huggingflace` still evaluate.
    huggingflace = huggingfaceDeps;
    local-inference = [ unstructured-inference ];
    s3 = [
      s3fs
      fsspec
    ];
    azure = [
      adlfs
      fsspec
    ];
    discord = [ ]; # discord-py
    github = [ pygithub ];
    gitlab = [ python-gitlab ];
    reddit = [ praw ];
    slack = [ slack-sdk ];
    wikipedia = [ wikipedia ];
    google-drive = [ google-api-python-client ];
    gcs = [ ]; # gcsfs fsspec
    elasticsearch = [
      elasticsearch8
      jq
    ];
    dropbox = [ ]; # dropboxdrivefs fsspec
    confluence = [ atlassian-python-api ];
  };
in
# Builds the `unstructured` Python package from its tagged GitHub source.
buildPythonPackage {
  pname = "unstructured";
  inherit version;
  format = "setuptools";

  src = fetchFromGitHub {
    repo = "unstructured";
    owner = "Unstructured-IO";
    hash = "sha256-DbOuNh+p+4vsEO6AQUeMq25RTLm5Zn9FyzcTKJedbTM=";
    rev = "refs/tags/${version}";
  };

  propagatedBuildInputs = [
    chardet
    filetype
    lxml
    msg-parser
    nltk
    openpyxl
    pandas
    pdf2image
    pdfminer-six
    pillow
    pypandoc
    python-docx
    python-pptx
    python-magic
    markdown
    requests
    tabulate
    xlrd
  ];

  # Expose the extras table so consumers can select optional dependency sets.
  passthru = {
    inherit optional-dependencies;
  };

  # The tests try to download the punkt data from nltk, so they stay disabled
  # until that data can be made available to the build.
  doCheck = false;

  # Only relevant once doCheck is enabled.
  nativeCheckInputs = [
    pytestCheckHook
    black
    coverage
    click
    freezegun
    mypy
    pytest-cov
    pytest-mock
    vcrpy
    grpcio
  ];

  # With the test suite off, the import check is the only verification configured here.
  pythonImportsCheck = [ "unstructured" ];

  meta = {
    homepage = "https://github.com/Unstructured-IO/unstructured";
    changelog = "https://github.com/Unstructured-IO/unstructured/blob/${version}/CHANGELOG.md";
    description = "Open source libraries and APIs to build custom preprocessing pipelines for labeling, training, or production machine learning pipelines";
    mainProgram = "unstructured-ingest";
    license = lib.licenses.asl20;
    maintainers = [ lib.maintainers.happysalada ];
  };
}