Diffstat (limited to 'pkgs/by-name/lo/local-ai')
-rw-r--r--  pkgs/by-name/lo/local-ai/module.nix  |  28
-rw-r--r--  pkgs/by-name/lo/local-ai/package.nix |  83
-rw-r--r--  pkgs/by-name/lo/local-ai/tests.nix   |  30
3 files changed, 105 insertions, 36 deletions
diff --git a/pkgs/by-name/lo/local-ai/module.nix b/pkgs/by-name/lo/local-ai/module.nix
index d7b70048121f3..e26a3637820f8 100644
--- a/pkgs/by-name/lo/local-ai/module.nix
+++ b/pkgs/by-name/lo/local-ai/module.nix
@@ -5,7 +5,7 @@ let
 in
 {
   options.services.local-ai = {
-    enable = lib.mkEnableOption "Enable service";
+    enable = lib.mkEnableOption "local-ai";
 
     package = lib.mkPackageOption pkgs "local-ai" { };
 
@@ -28,25 +28,33 @@ in
       type = types.either types.package types.str;
       default = "models";
     };
+
+    parallelRequests = mkOption {
+      type = types.int;
+      default = 1;
+    };
+
+    logLevel = mkOption {
+      type = types.enum [ "error" "warn" "info" "debug" "trace" ];
+      default = "warn";
+    };
   };
 
   config = lib.mkIf cfg.enable {
     systemd.services.local-ai = {
       wantedBy = [ "multi-user.target" ];
+      environment.LLAMACPP_PARALLEL = toString cfg.parallelRequests;
       serviceConfig = {
         DynamicUser = true;
         ExecStart = lib.escapeShellArgs ([
           "${cfg.package}/bin/local-ai"
-          "--debug"
-          "--address"
-          ":${toString cfg.port}"
-          "--threads"
-          (toString cfg.threads)
-          "--localai-config-dir"
-          "."
-          "--models-path"
-          (toString cfg.models)
+          "--address=:${toString cfg.port}"
+          "--threads=${toString cfg.threads}"
+          "--localai-config-dir=."
+          "--models-path=${cfg.models}"
+          "--log-level=${cfg.logLevel}"
         ]
+        ++ lib.optional (cfg.parallelRequests > 1) "--parallel-requests"
         ++ cfg.extraArgs);
         RuntimeDirectory = "local-ai";
         WorkingDirectory = "%t/local-ai";
diff --git a/pkgs/by-name/lo/local-ai/package.nix b/pkgs/by-name/lo/local-ai/package.nix
index 061122c3f848a..3e2c3fc165e49 100644
--- a/pkgs/by-name/lo/local-ai/package.nix
+++ b/pkgs/by-name/lo/local-ai/package.nix
@@ -3,7 +3,6 @@
 , stdenv
 , lib
 , addDriverRunpath
-, fetchpatch
 , fetchFromGitHub
 , protobuf
 , protoc-gen-go
@@ -17,6 +16,7 @@
 , pkg-config
 , buildGoModule
 , makeWrapper
+, ncurses
 
 # apply feature parameter names according to
 # https://github.com/NixOS/rfcs/pull/169
@@ -61,7 +61,7 @@ let
     else if with_clblas then "clblas"
     else "";
 
-  inherit (cudaPackages) libcublas cuda_nvcc cuda_cccl cuda_cudart cudatoolkit;
+  inherit (cudaPackages) libcublas cuda_nvcc cuda_cccl cuda_cudart libcufft;
 
   go-llama = effectiveStdenv.mkDerivation {
     name = "go-llama";
@@ -78,13 +78,12 @@ let
     ];
 
     buildInputs = [ ]
+      ++ lib.optionals with_cublas [ cuda_cccl cuda_cudart libcublas ]
       ++ lib.optionals with_clblas [ clblast ocl-icd opencl-headers ]
       ++ lib.optionals with_openblas [ openblas.dev ];
 
     nativeBuildInputs = [ cmake ]
-      # backward compatiblity with nixos-23.11
-      # use cuda_nvcc after release of nixos-24.05
-      ++ lib.optionals with_cublas [ cudatoolkit ];
+      ++ lib.optionals with_cublas [ cuda_nvcc ];
 
     dontUseCmakeConfigure = true;
@@ -95,13 +94,33 @@ let
     '';
   };
 
+  llama-cpp-rpc = (llama-cpp-grpc.overrideAttrs (prev: {
+    name = "llama-cpp-rpc";
+    cmakeFlags = prev.cmakeFlags ++ [
+      (lib.cmakeBool "LLAMA_AVX" false)
+      (lib.cmakeBool "LLAMA_AVX2" false)
+      (lib.cmakeBool "LLAMA_AVX512" false)
+      (lib.cmakeBool "LLAMA_FMA" false)
+      (lib.cmakeBool "LLAMA_F16C" false)
+      (lib.cmakeBool "LLAMA_RPC" true)
+    ];
+    postPatch = prev.postPatch + ''
+      sed -i examples/rpc/CMakeLists.txt \
+        -e '$a\install(TARGETS rpc-server RUNTIME)'
+    '';
+  })).override {
+    cudaSupport = false;
+    openclSupport = false;
+    blasSupport = false;
+  };
+
   llama-cpp-grpc = (llama-cpp.overrideAttrs (final: prev: {
     name = "llama-cpp-grpc";
     src = fetchFromGitHub {
       owner = "ggerganov";
       repo = "llama.cpp";
-      rev = "784e11dea1f5ce9638851b2b0dddb107e2a609c8";
-      hash = "sha256-yAQAUo5J+a6O2kTqhFL1UH0tANxpQn3JhAd3MByaC6I=";
+      rev = "37bef8943312d91183ff06d8f1214082a17344a5";
+      hash = "sha256-E3kCMDK5TXozBsprp4D581WHTVP9aljxB1KZUKug1pM=";
       fetchSubmodules = true;
     };
     postPatch = prev.postPatch + ''
@@ -124,6 +143,8 @@ let
       (lib.cmakeBool "LLAMA_FMA" enable_fma)
       (lib.cmakeBool "LLAMA_F16C" enable_f16c)
     ];
+    postInstall = null;
+
     buildInputs = prev.buildInputs ++ [
       protobuf # provides also abseil_cpp as propagated build input
       grpc
@@ -254,15 +275,15 @@ let
     src = fetchFromGitHub {
       owner = "ggerganov";
       repo = "whisper.cpp";
-      rev = "858452d58dba3acdc3431c9bced2bb8cfd9bf418";
-      hash = "sha256-2fT3RgGpBex1mF6GJsVDo4rb0F31YqxTymsXcrpQAZk=";
+      rev = "b29b3b29240aac8b71ce8e5a4360c1f1562ad66f";
+      hash = "sha256-vSd+AP9AexbG4wvdkk6wjxYQBZdKWGK2Ix7c86MUfB8=";
     };
 
     nativeBuildInputs = [ cmake pkg-config ]
       ++ lib.optionals with_cublas [ cuda_nvcc ];
 
     buildInputs = [ ]
-      ++ lib.optionals with_cublas [ cuda_cccl cuda_cudart libcublas ]
+      ++ lib.optionals with_cublas [ cuda_cccl cuda_cudart libcublas libcufft ]
       ++ lib.optionals with_clblas [ clblast ocl-icd opencl-headers ]
       ++ lib.optionals with_openblas [ openblas.dev ];
 
@@ -286,8 +307,8 @@ let
     src = fetchFromGitHub {
       owner = "go-skynet";
       repo = "go-bert.cpp";
-      rev = "6abe312cded14042f6b7c3cd8edf082713334a4d";
-      hash = "sha256-lh9cvXc032Eq31kysxFOkRd0zPjsCznRl0tzg9P2ygo=";
+      rev = "710044b124545415f555e4260d16b146c725a6e4";
+      hash = "sha256-UNrs3unYjvSzCVaVISFFBDD+s37lmN6/7ajmGNcYgrU=";
       fetchSubmodules = true;
     };
     buildFlags = [ "libgobert.a" ];
@@ -305,8 +326,8 @@ let
     src = fetchFromGitHub {
       owner = "mudler";
       repo = "go-stable-diffusion";
-      rev = "362df9da29f882dbf09ade61972d16a1f53c3485";
-      hash = "sha256-A5KvMZOviPsIpPHxM8cacT+qE2x1iFJAbPsRs4sLijY=";
+      rev = "4a3cd6aeae6f66ee57eae9a0075f8c58c3a6a38f";
+      hash = "sha256-KXUvMP6cDyWib4rG0RmVRm3pgrdsfKXaH3k0v5/mTe8=";
       fetchSubmodules = true;
     };
     buildFlags = [ "libstablediffusion.a" ];
@@ -342,8 +363,8 @@ let
     src = fetchFromGitHub {
      owner = "M0Rf30";
      repo = "go-tiny-dream";
-      rev = "22a12a4bc0ac5455856f28f3b771331a551a4293";
-      hash = "sha256-DAVHD6E0OKHf4C2ldoI0Mm7813DIrmWFONUhSCQPCfc=";
+      rev = "c04fa463ace9d9a6464313aa5f9cd0f953b6c057";
+      hash = "sha256-uow3vbAI4F/fTGjYOKOLqTpKq7NgGYSZhGlEhn7h6s0=";
       fetchSubmodules = true;
     };
     postUnpack = ''
@@ -373,18 +394,18 @@ let
     stdenv;
 
   pname = "local-ai";
-  version = "2.13.0";
+  version = "2.17.1";
 
   src = fetchFromGitHub {
     owner = "go-skynet";
     repo = "LocalAI";
     rev = "v${version}";
-    hash = "sha256-jZE8Ow9FFhnx/jvsURLYlYtSuKpE4UWBezxg/mpHs9g=";
+    hash = "sha256-G9My4t3vJ1sWyD+vxUgON4ezXURVAAgu1nAtTjd3ZR8=";
   };
 
   self = buildGoModule.override { stdenv = effectiveStdenv; } {
     inherit pname version src;
-    vendorHash = "sha256-nWNK2YekQnBSLx4ouNSe6esIe0yFuo69E0HStYLQANg=";
+    vendorHash = "sha256-Hu7aJFi40CKNWAxYOR47VBZI1A/9SlBIVQVcB8iqcxA=";
 
     env.NIX_CFLAGS_COMPILE = lib.optionalString with_stablediffusion
       " -isystem ${opencv}/include/opencv4";
 
@@ -404,11 +425,24 @@ let
         -e 's;git clone.*go-tiny-dream$;${cp} ${if with_tinydream then go-tiny-dream else go-tiny-dream.src} sources/go-tiny-dream;' \
         -e 's, && git checkout.*,,g' \
         -e '/mod download/ d' \
+        -e '/^ALL_GRPC_BACKENDS+=backend-assets\/grpc\/llama-cpp-fallback/ d' \
+        -e '/^ALL_GRPC_BACKENDS+=backend-assets\/grpc\/llama-cpp-avx/ d' \
+        -e '/^ALL_GRPC_BACKENDS+=backend-assets\/grpc\/llama-cpp-cuda/ d' \
 
-      ${cp} ${llama-cpp-grpc}/bin/*grpc-server backend/cpp/llama/grpc-server
-      echo "grpc-server:" > backend/cpp/llama/Makefile
-    ''
-    ;
+    '' + lib.optionalString with_cublas ''
+      sed -i Makefile \
+        -e '/^CGO_LDFLAGS_WHISPER?=/ s;$;-L${libcufft}/lib -L${cuda_cudart}/lib;'
+    '';
+
+    postConfigure = ''
+      shopt -s extglob
+      mkdir -p backend-assets/grpc
+      cp ${llama-cpp-grpc}/bin/grpc-server backend-assets/grpc/llama-cpp-avx2
+      cp ${llama-cpp-rpc}/bin/grpc-server backend-assets/grpc/llama-cpp-grpc
+
+      mkdir -p backend-assets/util
+      cp ${llama-cpp-rpc}/bin/rpc-server backend-assets/util/llama-cpp-rpc-server
+    '';
 
     buildInputs = [ ]
       ++ lib.optionals with_cublas [ libcublas ]
@@ -422,6 +456,7 @@ let
       protoc-gen-go
       protoc-gen-go-grpc
      makeWrapper
+      ncurses # tput
     ]
     ++ lib.optionals with_cublas [ cuda_nvcc ];
 
@@ -495,7 +530,7 @@ let
 
     inherit go-tiny-dream go-rwkv go-bert go-llama gpt4all go-piper
       llama-cpp-grpc whisper-cpp go-tiny-dream-ncnn espeak-ng' piper-phonemize
-      piper-tts';
+      piper-tts' llama-cpp-rpc;
   };
 
   passthru.features = {
diff --git a/pkgs/by-name/lo/local-ai/tests.nix b/pkgs/by-name/lo/local-ai/tests.nix
index 7cebc6fff9387..5740362f24efd 100644
--- a/pkgs/by-name/lo/local-ai/tests.nix
+++ b/pkgs/by-name/lo/local-ai/tests.nix
@@ -6,6 +6,7 @@
 , writers
 , symlinkJoin
 , jq
+, prom2json
 }:
 let
   common-config = { config, ... }: {
@@ -14,6 +15,7 @@ let
       enable = true;
       package = self;
       threads = config.virtualisation.cores;
+      logLevel = "debug";
     };
   };
 
@@ -26,7 +28,7 @@ in
     command = "local-ai --help";
   };
 
-  health = testers.runNixOSTest ({ config, ... }: {
+  health = testers.runNixOSTest {
     name = self.name + "-health";
     nodes.machine = common-config;
     testScript =
@@ -36,8 +38,11 @@ in
       ''
        machine.wait_for_open_port(${port})
         machine.succeed("curl -f http://localhost:${port}/readyz")
+
+        machine.succeed("${prom2json}/bin/prom2json http://localhost:${port}/metrics > metrics.json")
+        machine.copy_from_vm("metrics.json")
       '';
-  });
+  };
 
   # https://localai.io/features/embeddings/#bert-embeddings
   bert =
@@ -78,8 +83,13 @@ in
         machine.succeed("curl -f http://localhost:${port}/readyz")
         machine.succeed("curl -f http://localhost:${port}/v1/models --output models.json")
         machine.succeed("${jq}/bin/jq --exit-status 'debug | .data[].id == \"${model}\"' models.json")
+
         machine.succeed("curl -f http://localhost:${port}/embeddings --json @${writers.writeJSON "request.json" requests.request} --output embeddings.json")
+        machine.copy_from_vm("embeddings.json")
         machine.succeed("${jq}/bin/jq --exit-status 'debug | .model == \"${model}\"' embeddings.json")
+
+        machine.succeed("${prom2json}/bin/prom2json http://localhost:${port}/metrics > metrics.json")
+        machine.copy_from_vm("metrics.json")
       '';
   };
 
@@ -92,6 +102,7 @@ in
     # https://localai.io/advanced/#full-config-model-file-reference
     model-configs.${model} = rec {
       context_size = 8192;
+      backend = "llama-cpp";
       parameters = {
         # https://huggingface.co/lmstudio-community/Meta-Llama-3-8B-Instruct-GGUF
         # https://ai.meta.com/blog/meta-llama-3/
@@ -157,6 +168,8 @@ in
       virtualisation.cores = 4;
       virtualisation.memorySize = 8192;
       services.local-ai.models = models;
+      # TODO: Add test case parallel requests
+      services.local-ai.parallelRequests = 2;
     };
     passthru = { inherit models requests; };
     testScript =
@@ -170,16 +183,22 @@ in
         machine.succeed("${jq}/bin/jq --exit-status 'debug | .data[].id == \"${model}\"' models.json")
 
         machine.succeed("curl -f http://localhost:${port}/v1/chat/completions --json @${writers.writeJSON "request-chat-completions.json" requests.chat-completions} --output chat-completions.json")
+        machine.copy_from_vm("chat-completions.json")
         machine.succeed("${jq}/bin/jq --exit-status 'debug | .object == \"chat.completion\"' chat-completions.json")
         machine.succeed("${jq}/bin/jq --exit-status 'debug | .choices | first.message.content | tonumber == 3' chat-completions.json")
 
         machine.succeed("curl -f http://localhost:${port}/v1/edits --json @${writers.writeJSON "request-edit-completions.json" requests.edit-completions} --output edit-completions.json")
+        machine.copy_from_vm("edit-completions.json")
         machine.succeed("${jq}/bin/jq --exit-status 'debug | .object == \"edit\"' edit-completions.json")
         machine.succeed("${jq}/bin/jq --exit-status '.usage.completion_tokens | debug == ${toString requests.edit-completions.max_tokens}' edit-completions.json")
 
         machine.succeed("curl -f http://localhost:${port}/v1/completions --json @${writers.writeJSON "request-completions.json" requests.completions} --output completions.json")
+        machine.copy_from_vm("completions.json")
         machine.succeed("${jq}/bin/jq --exit-status 'debug | .object ==\"text_completion\"' completions.json")
         machine.succeed("${jq}/bin/jq --exit-status '.usage.completion_tokens | debug == ${toString model-configs.${model}.parameters.max_tokens}' completions.json")
+
+        machine.succeed("${prom2json}/bin/prom2json http://localhost:${port}/metrics > metrics.json")
+        machine.copy_from_vm("metrics.json")
       '';
   };
 
@@ -240,9 +259,16 @@ in
         machine.succeed("curl -f http://localhost:${port}/readyz")
         machine.succeed("curl -f http://localhost:${port}/v1/models --output models.json")
         machine.succeed("${jq}/bin/jq --exit-status 'debug' models.json")
+
         machine.succeed("curl -f http://localhost:${port}/tts --json @${writers.writeJSON "request.json" requests.request} --output out.wav")
+        machine.copy_from_vm("out.wav")
+
         machine.succeed("curl -f http://localhost:${port}/v1/audio/transcriptions --header 'Content-Type: multipart/form-data' --form file=@out.wav --form model=${model-stt} --output transcription.json")
+        machine.copy_from_vm("transcription.json")
         machine.succeed("${jq}/bin/jq --exit-status 'debug | .segments | first.text == \"${requests.request.input}\"' transcription.json")
+
+        machine.succeed("${prom2json}/bin/prom2json http://localhost:${port}/metrics > metrics.json")
+        machine.copy_from_vm("metrics.json")
       '';
   };
 }