{
lib,
stdenv,
python,
buildPythonPackage,
pythonRelaxDepsHook,
fetchFromGitHub,
which,
ninja,
cmake,
packaging,
setuptools,
torch,
outlines,
wheel,
psutil,
ray,
pandas,
pyarrow,
sentencepiece,
numpy,
transformers,
xformers,
fastapi,
uvicorn,
pydantic,
aioprometheus,
pynvml,
openai,
pyzmq,
tiktoken,
torchvision,
py-cpuinfo,
lm-format-enforcer,
prometheus-fastapi-instrumentator,
cupy,
writeShellScript,
config,
cudaSupport ? config.cudaSupport,
cudaPackages ? { },
# One of rocmSupport or cudaSupport has to be enabled; default to ROCm, the free (open-source) backend, when CUDA is not configured
rocmSupport ? !config.cudaSupport,
rocmPackages ? { },
gpuTargets ? [ ],
}@args:
let
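# vllm's CMake build pulls in CUTLASS via FetchContent, which cannot download
# anything inside the sandbox, so pin it here and point FetchContent at the
# store path through cmakeFlags below.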
cutlass = fetchFromGitHub {
owner = "NVIDIA";
repo = "cutlass";
rev = "refs/tags/v3.5.0";
sha256 = "sha256-D/s7eYsa5l/mfx73tE4mnFcTQdYqGmXa9d9TCryw4e4=";
};
in
buildPythonPackage rec {
pname = "vllm";
version = "0.5.3.post1";
pyproject = true;
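# Use the CUDA-aware stdenv so the host compiler is one that nvcc supports.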
stdenv = if cudaSupport then cudaPackages.backendStdenv else args.stdenv;
src = fetchFromGitHub {
owner = "vllm-project";
repo = pname;
rev = "refs/tags/v${version}";
hash = "sha256-++DK2Y2zz+1KrEcdQc5XFrSjc7fCwMD2DQ/RqY7PoFU=";
};
patches = [
./0001-setup.py-don-t-ask-for-hipcc-version.patch
./0002-setup.py-nix-support-respect-cmakeFlags.patch
];
# Bypass the Python version check: upstream hard-codes the supported minor
# versions and lags behind the interpreters that `ray` already supports.
postPatch = ''
substituteInPlace CMakeLists.txt \
--replace-fail \
'set(PYTHON_SUPPORTED_VERSIONS' \
'set(PYTHON_SUPPORTED_VERSIONS "${lib.versions.majorMinor python.version}"'
'';
nativeBuildInputs = [
cmake
ninja
pythonRelaxDepsHook
which
] ++ lib.optionals rocmSupport [ rocmPackages.hipcc ];
build-system = [
packaging
setuptools
wheel
];
buildInputs =
(lib.optionals cudaSupport (
with cudaPackages;
[
cuda_cudart # cuda_runtime.h, -lcudart
cuda_cccl # <thrust/*>, <cub/*>
libcusparse # cusparse.h
libcusolver # cusolverDn.h
cuda_nvcc
cuda_nvtx # nvToolsExt.h
libcublas # cublas_v2.h
]
))
++ (lib.optionals rocmSupport (
with rocmPackages;
[
clr
rocthrust
rocprim
hipsparse
hipblas
]
));
dependencies =
[
aioprometheus
fastapi
lm-format-enforcer
numpy
openai
outlines
pandas
prometheus-fastapi-instrumentator
psutil
py-cpuinfo
pyarrow
pydantic
pyzmq
ray
sentencepiece
tiktoken
torch
torchvision
transformers
uvicorn
xformers
]
++ uvicorn.optional-dependencies.standard
++ aioprometheus.optional-dependencies.starlette
++ lib.optionals cudaSupport [
cupy
pynvml
];
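# vllm's setup.py drives CMake itself, so skip the cmake hook's configure
# phase; the setup.py patch above makes it honour cmakeFlags, which we use to
# hand it the pre-fetched CUTLASS sources.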
dontUseCmakeConfigure = true;
cmakeFlags = [ (lib.cmakeFeature "FETCHCONTENT_SOURCE_DIR_CUTLASS" "${lib.getDev cutlass}") ];
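# CUDA builds locate the toolkit (nvcc) via CUDA_HOME; ROCm builds locate theirs via ROCM_HOME.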
env =
lib.optionalAttrs cudaSupport { CUDA_HOME = "${lib.getDev cudaPackages.cuda_nvcc}"; }
// lib.optionalAttrs rocmSupport {
# Otherwise the build tries to detect the gfx architectures of the host GPU, which is impossible inside the build sandbox.
PYTORCH_ROCM_ARCH = lib.strings.concatStringsSep ";" rocmPackages.clr.gpuTargets;
ROCM_HOME = "${rocmPackages.clr}";
};
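# Accept the versions packaged in nixpkgs instead of upstream's exact pins.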
pythonRelaxDeps = true;
pythonImportsCheck = [ "vllm" ];
meta = with lib; {
description = "High-throughput and memory-efficient inference and serving engine for LLMs";
changelog = "https://github.com/vllm-project/vllm/releases/tag/v${version}";
homepage = "https://github.com/vllm-project/vllm";
license = licenses.asl20;
maintainers = with maintainers; [
happysalada
lach
];
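# At least one GPU backend is required; a CPU-only build is not supported by this derivation.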
broken = !cudaSupport && !rocmSupport;
};
}