1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
|
# Inputs of this package expression. Every name below is supplied by the
# caller; commented-out names are not taken as inputs.
{
  lib,
  buildPythonPackage,
  fetchFromGitHub,

  # inputs listed in propagatedBuildInputs
  chardet,
  filetype,
  lxml,
  msg-parser,
  nltk,
  openpyxl,
  pandas,
  pdf2image,
  pdfminer-six,
  pillow,
  pypandoc,
  python-docx,
  python-pptx,
  python-magic,
  markdown,
  requests,
  tabulate,
  xlrd,

  # inputs referenced by the optional-dependencies groups
  langdetect,
  sacremoses,
  sentencepiece,
  torch,
  transformers,
  unstructured-inference,
  s3fs,
  fsspec,
  adlfs,
  # discord-py is not taken as an input
  pygithub,
  python-gitlab,
  praw,
  slack-sdk,
  wikipedia,
  google-api-python-client,
  # gcsfs is not taken as an input
  elasticsearch8,
  jq,
  # dropboxdrivefs is not taken as an input
  atlassian-python-api,

  # inputs listed in nativeCheckInputs
  pytestCheckHook,
  black,
  coverage,
  click,
  freezegun,
  # label-studio-sdk is not taken as an input
  mypy,
  pytest-cov,
  pytest-mock,
  vcrpy,
  grpcio,
}:
let
  # Release of the package; also used to build the source tag and changelog URL.
  version = "0.15.13";

  # Dependency list shared by the correctly spelled `huggingface` group and
  # its legacy misspelled alias below. Kept as a separate binding (rather than
  # making the attrset `rec`) so that entries such as `wikipedia = [ wikipedia ]`
  # keep referring to the function arguments instead of to themselves.
  huggingfaceDeps = [
    langdetect
    sacremoses
    sentencepiece
    torch
    transformers
  ];

  # Groups of optional inputs, exposed through passthru.optional-dependencies.
  # Groups that are empty have their intended inputs noted in a trailing
  # comment; those inputs are not taken by this expression.
  optional-dependencies = {
    huggingface = huggingfaceDeps;
    # NOTE(review): `huggingflace` looks like a misspelling of the upstream
    # `huggingface` extra — confirm against upstream's setup.py. The old key
    # is kept as an alias so existing references keep evaluating.
    huggingflace = huggingfaceDeps;
    local-inference = [ unstructured-inference ];
    s3 = [
      s3fs
      fsspec
    ];
    azure = [
      adlfs
      fsspec
    ];
    discord = [ ]; # discord-py
    github = [ pygithub ];
    gitlab = [ python-gitlab ];
    reddit = [ praw ];
    slack = [ slack-sdk ];
    wikipedia = [ wikipedia ];
    google-drive = [ google-api-python-client ];
    gcs = [ ]; # gcsfs fsspec
    elasticsearch = [
      elasticsearch8
      jq
    ];
    dropbox = [ ]; # dropboxdrivefs fsspec
    confluence = [ atlassian-python-api ];
  };
in
# Package definition: fetches the tagged GitHub source and builds it with
# the setuptools format.
buildPythonPackage {
  pname = "unstructured";
  inherit version;
  format = "setuptools";

  # Source is pinned to the git tag named after `version`.
  src = fetchFromGitHub {
    owner = "Unstructured-IO";
    repo = "unstructured";
    rev = "refs/tags/${version}";
    hash = "sha256-DbOuNh+p+4vsEO6AQUeMq25RTLm5Zn9FyzcTKJedbTM=";
  };

  propagatedBuildInputs = [
    chardet
    filetype
    lxml
    msg-parser
    nltk
    openpyxl
    pandas
    pdf2image
    pdfminer-six
    pillow
    pypandoc
    python-docx
    python-pptx
    python-magic
    markdown
    requests
    tabulate
    xlrd
  ];

  pythonImportsCheck = [ "unstructured" ];

  # Tests are disabled because the test suite tries to download the punkt
  # data from nltk. TODO: find a way to provide that data so the tests can
  # be enabled.
  doCheck = false;

  # Kept in place for when the check phase is enabled.
  nativeCheckInputs = [
    pytestCheckHook
    black
    coverage
    click
    freezegun
    mypy
    pytest-cov
    pytest-mock
    vcrpy
    grpcio
  ];

  # Expose the optional dependency groups to consumers of this package.
  passthru = {
    inherit optional-dependencies;
  };

  meta = {
    description = "Open source libraries and APIs to build custom preprocessing pipelines for labeling, training, or production machine learning pipelines";
    mainProgram = "unstructured-ingest";
    homepage = "https://github.com/Unstructured-IO/unstructured";
    changelog = "https://github.com/Unstructured-IO/unstructured/blob/${version}/CHANGELOG.md";
    license = lib.licenses.asl20;
    maintainers = [ lib.maintainers.happysalada ];
  };
}
|