Diffstat (limited to 'pkgs/by-name/lo/local-ai')
-rw-r--r--  pkgs/by-name/lo/local-ai/lib.nix     |  30
-rw-r--r--  pkgs/by-name/lo/local-ai/module.nix  |  56
-rw-r--r--  pkgs/by-name/lo/local-ai/package.nix |  40
-rw-r--r--  pkgs/by-name/lo/local-ai/tests.nix   | 288
4 files changed, 298 insertions(+), 116 deletions(-)
diff --git a/pkgs/by-name/lo/local-ai/lib.nix b/pkgs/by-name/lo/local-ai/lib.nix
new file mode 100644
index 0000000000000..46f3ba88e5db7
--- /dev/null
+++ b/pkgs/by-name/lo/local-ai/lib.nix
@@ -0,0 +1,30 @@
+{ lib
+, writers
+, writeText
+, linkFarmFromDrvs
+}: {
+  genModels = configs:
+    let
+      name = lib.strings.sanitizeDerivationName
+        (builtins.concatStringsSep "_" ([ "local-ai-models" ] ++ (builtins.attrNames configs)));
+
+      genModelFiles = name: config:
+        let
+          templateName = type: name + "_" + type;
+
+          config' = lib.recursiveUpdate config ({
+            inherit name;
+          } // lib.optionalAttrs (lib.isDerivation config.parameters.model) {
+            parameters.model = config.parameters.model.name;
+          } // lib.optionalAttrs (config ? template) {
+            template = builtins.mapAttrs (n: _: templateName n) config.template;
+          });
+        in
+        [ (writers.writeYAML "${name}.yaml" config') ]
+        ++ lib.optional (lib.isDerivation config.parameters.model)
+          config.parameters.model
+        ++ lib.optionals (config ? template)
+          (lib.mapAttrsToList (n: writeText "${templateName n}.tmpl") config.template);
+    in
+    linkFarmFromDrvs name (lib.flatten (lib.mapAttrsToList genModelFiles configs));
+}
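
For reference, a minimal usage sketch of the new genModels helper. It assumes the helper is reachable through the package's passthru as pkgs.local-ai.lib (wired up in package.nix below); the URL, hash, and attribute values are placeholders modeled on the bert example in tests.nix.

let
  pkgs = import <nixpkgs> { };

  # Hypothetical model definition; URL and hash are placeholders.
  models = pkgs.local-ai.lib.genModels {
    embedding = {
      backend = "bert-embeddings";
      embeddings = true;
      parameters.model = pkgs.fetchurl {
        url = "https://example.org/ggml-model-f16.bin";
        hash = pkgs.lib.fakeHash;
      };
    };
  };
in
# `models` is a link farm containing embedding.yaml next to the model file,
# suitable for --models-path or services.local-ai.models.
models
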
diff --git a/pkgs/by-name/lo/local-ai/module.nix b/pkgs/by-name/lo/local-ai/module.nix
new file mode 100644
index 0000000000000..d7b70048121f3
--- /dev/null
+++ b/pkgs/by-name/lo/local-ai/module.nix
@@ -0,0 +1,56 @@
+{ pkgs, config, lib, ... }:
+let
+  cfg = config.services.local-ai;
+  inherit (lib) mkOption types;
+in
+{
+  options.services.local-ai = {
+    enable = lib.mkEnableOption "local-ai";
+
+    package = lib.mkPackageOption pkgs "local-ai" { };
+
+    extraArgs = mkOption {
+      type = types.listOf types.str;
+      default = [ ];
+    };
+
+    port = mkOption {
+      type = types.port;
+      default = 8080;
+    };
+
+    threads = mkOption {
+      type = types.int;
+      default = 1;
+    };
+
+    models = mkOption {
+      type = types.either types.package types.str;
+      default = "models";
+    };
+  };
+
+  config = lib.mkIf cfg.enable {
+    systemd.services.local-ai = {
+      wantedBy = [ "multi-user.target" ];
+      serviceConfig = {
+        DynamicUser = true;
+        ExecStart = lib.escapeShellArgs ([
+          "${cfg.package}/bin/local-ai"
+          "--debug"
+          "--address"
+          ":${toString cfg.port}"
+          "--threads"
+          (toString cfg.threads)
+          "--localai-config-dir"
+          "."
+          "--models-path"
+          (toString cfg.models)
+        ]
+        ++ cfg.extraArgs);
+        RuntimeDirectory = "local-ai";
+        WorkingDirectory = "%t/local-ai";
+      };
+    };
+  };
+}
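
A minimal sketch of wiring the new module into a NixOS configuration. The module is imported explicitly here, as the tests below also do; all values other than enable either mirror the option defaults or are illustrative.

{ pkgs, ... }: {
  imports = [ ./module.nix ];

  services.local-ai = {
    enable = true;
    package = pkgs.local-ai;
    port = 8080;
    threads = 4;
    # either a directory name resolved inside the service's runtime
    # directory (%t/local-ai), or a derivation such as the genModels
    # output sketched above
    models = "models";
  };
}
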
diff --git a/pkgs/by-name/lo/local-ai/package.nix b/pkgs/by-name/lo/local-ai/package.nix
index f597097dc8e36..061122c3f848a 100644
--- a/pkgs/by-name/lo/local-ai/package.nix
+++ b/pkgs/by-name/lo/local-ai/package.nix
@@ -6,6 +6,8 @@
 , fetchpatch
 , fetchFromGitHub
 , protobuf
+, protoc-gen-go
+, protoc-gen-go-grpc
 , grpc
 , openssl
 , llama-cpp
@@ -61,8 +63,8 @@ let
 
   inherit (cudaPackages) libcublas cuda_nvcc cuda_cccl cuda_cudart cudatoolkit;
 
-  go-llama-ggml = effectiveStdenv.mkDerivation {
-    name = "go-llama-ggml";
+  go-llama = effectiveStdenv.mkDerivation {
+    name = "go-llama";
     src = fetchFromGitHub {
       owner = "go-skynet";
       repo = "go-llama.cpp";
@@ -98,8 +100,8 @@ let
     src = fetchFromGitHub {
       owner = "ggerganov";
       repo = "llama.cpp";
-      rev = "1b67731e184e27a465b8c5476061294a4af668ea";
-      hash = "sha256-0WWbsklpW6HhFRkvWpYh8Lhi8VIansS/zmyIKNQRkIs=";
+      rev = "784e11dea1f5ce9638851b2b0dddb107e2a609c8";
+      hash = "sha256-yAQAUo5J+a6O2kTqhFL1UH0tANxpQn3JhAd3MByaC6I=";
       fetchSubmodules = true;
     };
     postPatch = prev.postPatch + ''
@@ -252,8 +254,8 @@ let
     src = fetchFromGitHub {
       owner = "ggerganov";
       repo = "whisper.cpp";
-      rev = "8f253ef3af1c62c04316ba4afa7145fc4d701a8c";
-      hash = "sha256-yHHjhpQIn99A/hqFwAb7TfTf4Q9KnKat93zyXS70bT8=";
+      rev = "858452d58dba3acdc3431c9bced2bb8cfd9bf418";
+      hash = "sha256-2fT3RgGpBex1mF6GJsVDo4rb0F31YqxTymsXcrpQAZk=";
     };
 
     nativeBuildInputs = [ cmake pkg-config ]
@@ -371,18 +373,18 @@ let
       stdenv;
 
   pname = "local-ai";
-  version = "2.12.4";
+  version = "2.13.0";
   src = fetchFromGitHub {
     owner = "go-skynet";
     repo = "LocalAI";
     rev = "v${version}";
-    hash = "sha256-piu2B6u4ZfxiOd9SXrE7jiiiwL2SM8EqXo2s5qeKRl0=";
+    hash = "sha256-jZE8Ow9FFhnx/jvsURLYlYtSuKpE4UWBezxg/mpHs9g=";
   };
 
   self = buildGoModule.override { stdenv = effectiveStdenv; } {
     inherit pname version src;
 
-    vendorHash = "sha256-8Hu1y/PK21twnB7D22ltslFFzRrsB8d1R2hkgIFB/XY=";
+    vendorHash = "sha256-nWNK2YekQnBSLx4ouNSe6esIe0yFuo69E0HStYLQANg=";
 
     env.NIX_CFLAGS_COMPILE = lib.optionalString with_stablediffusion " -isystem ${opencv}/include/opencv4";
 
@@ -392,12 +394,12 @@ let
       in
       ''
         sed -i Makefile \
-          -e 's;git clone.*go-llama-ggml$;${cp} ${go-llama-ggml} sources/go-llama-ggml;' \
+          -e 's;git clone.*go-llama\.cpp$;${cp} ${go-llama} sources/go-llama\.cpp;' \
           -e 's;git clone.*gpt4all$;${cp} ${gpt4all} sources/gpt4all;' \
           -e 's;git clone.*go-piper$;${cp} ${if with_tts then go-piper else go-piper.src} sources/go-piper;' \
-          -e 's;git clone.*go-rwkv$;${cp} ${go-rwkv} sources/go-rwkv;' \
+          -e 's;git clone.*go-rwkv\.cpp$;${cp} ${go-rwkv} sources/go-rwkv\.cpp;' \
           -e 's;git clone.*whisper\.cpp$;${cp} ${whisper-cpp.src} sources/whisper\.cpp;' \
-          -e 's;git clone.*go-bert$;${cp} ${go-bert} sources/go-bert;' \
+          -e 's;git clone.*go-bert\.cpp$;${cp} ${go-bert} sources/go-bert\.cpp;' \
           -e 's;git clone.*diffusion$;${cp} ${if with_stablediffusion then go-stable-diffusion else go-stable-diffusion.src} sources/go-stable-diffusion;' \
           -e 's;git clone.*go-tiny-dream$;${cp} ${if with_tinydream then go-tiny-dream else go-tiny-dream.src} sources/go-tiny-dream;' \
           -e 's, && git checkout.*,,g' \
@@ -415,14 +417,19 @@ let
       ++ lib.optionals with_stablediffusion go-stable-diffusion.buildInputs
       ++ lib.optionals with_tts go-piper.buildInputs;
 
-    nativeBuildInputs = [ makeWrapper ]
-      ++ lib.optionals with_cublas [ cuda_nvcc ];
+    nativeBuildInputs = [
+      protobuf
+      protoc-gen-go
+      protoc-gen-go-grpc
+      makeWrapper
+    ]
+    ++ lib.optionals with_cublas [ cuda_nvcc ];
 
     enableParallelBuilding = false;
 
     modBuildPhase = ''
       mkdir sources
-      make prepare-sources
+      make prepare-sources protogen-go
       go mod tidy -v
     '';
 
@@ -486,7 +493,7 @@ let
 
     passthru.local-packages = {
       inherit
-        go-tiny-dream go-rwkv go-bert go-llama-ggml gpt4all go-piper
+        go-tiny-dream go-rwkv go-bert go-llama gpt4all go-piper
         llama-cpp-grpc whisper-cpp go-tiny-dream-ncnn espeak-ng' piper-phonemize
         piper-tts';
     };
@@ -498,6 +505,7 @@ let
     };
 
     passthru.tests = callPackages ./tests.nix { inherit self; };
+    passthru.lib = callPackages ./lib.nix { };
 
     meta = with lib; {
       description = "OpenAI alternative to run local LLMs, image and audio generation";
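
The tests below are gated on self.features flags (with_tts, with_cublas, with_clblas). A hedged sketch of selecting such a variant, assuming these flags are ordinary package arguments of package.nix that can be overridden (the diff only shows them being used, not declared):

# Assumption: with_cublas and with_tts are top-level arguments of package.nix.
pkgs.local-ai.override {
  with_cublas = true; # CUDA (cuBLAS) accelerated backends
  with_tts = true;    # piper / espeak-ng text-to-speech backends
}
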
diff --git a/pkgs/by-name/lo/local-ai/tests.nix b/pkgs/by-name/lo/local-ai/tests.nix
index 82d1b775dab82..7cebc6fff9387 100644
--- a/pkgs/by-name/lo/local-ai/tests.nix
+++ b/pkgs/by-name/lo/local-ai/tests.nix
@@ -5,156 +5,244 @@
 , fetchurl
 , writers
 , symlinkJoin
-, linkFarmFromDrvs
 , jq
 }:
+let
+  common-config = { config, ... }: {
+    imports = [ ./module.nix ];
+    services.local-ai = {
+      enable = true;
+      package = self;
+      threads = config.virtualisation.cores;
+    };
+  };
+
+  inherit (self.lib) genModels;
+in
 {
   version = testers.testVersion {
     package = self;
     version = "v" + self.version;
+    command = "local-ai --help";
   };
 
-  health =
+  health = testers.runNixOSTest ({ config, ... }: {
+    name = self.name + "-health";
+    nodes.machine = common-config;
+    testScript =
+      let
+        port = "8080";
+      in
+      ''
+        machine.wait_for_open_port(${port})
+        machine.succeed("curl -f http://localhost:${port}/readyz")
+      '';
+  });
+
+  # https://localai.io/features/embeddings/#bert-embeddings
+  bert =
     let
-      port = "8080";
+      model = "embedding";
+      model-configs.${model} = {
+        # Note: q4_0 and q4_1 models cannot be loaded
+        parameters.model = fetchurl {
+          url = "https://huggingface.co/skeskinen/ggml/resolve/main/all-MiniLM-L6-v2/ggml-model-f16.bin";
+          sha256 = "9c195b2453a4fef60a4f6be3a88a39211366214df6498a4fe4885c9e22314f50";
+        };
+        backend = "bert-embeddings";
+        embeddings = true;
+      };
+
+      models = genModels model-configs;
+
+      requests.request = {
+        inherit model;
+        input = "Your text string goes here";
+      };
     in
     testers.runNixOSTest {
-      name = self.name + "-health";
+      name = self.name + "-bert";
       nodes.machine = {
-        systemd.services.local-ai = {
-          wantedBy = [ "multi-user.target" ];
-          serviceConfig.ExecStart = "${self}/bin/local-ai --debug --localai-config-dir . --address :${port}";
-        };
+        imports = [ common-config ];
+        virtualisation.cores = 2;
+        virtualisation.memorySize = 2048;
+        services.local-ai.models = models;
       };
-      testScript = ''
-        machine.wait_for_open_port(${port})
-        machine.succeed("curl -f http://localhost:${port}/readyz")
-      '';
+      passthru = { inherit models requests; };
+      testScript =
+        let
+          port = "8080";
+        in
+        ''
+          machine.wait_for_open_port(${port})
+          machine.succeed("curl -f http://localhost:${port}/readyz")
+          machine.succeed("curl -f http://localhost:${port}/v1/models --output models.json")
+          machine.succeed("${jq}/bin/jq --exit-status 'debug | .data[].id == \"${model}\"' models.json")
+          machine.succeed("curl -f http://localhost:${port}/embeddings --json @${writers.writeJSON "request.json" requests.request} --output embeddings.json")
+          machine.succeed("${jq}/bin/jq --exit-status 'debug | .model == \"${model}\"' embeddings.json")
+        '';
     };
 
+} // lib.optionalAttrs (!self.features.with_cublas && !self.features.with_clblas) {
   # https://localai.io/docs/getting-started/manual/
   llama =
     let
-      port = "8080";
-      gguf = fetchurl {
-        url = "https://huggingface.co/TheBloke/Luna-AI-Llama2-Uncensored-GGUF/resolve/main/luna-ai-llama2-uncensored.Q4_K_M.gguf";
-        sha256 = "6a9dc401c84f0d48996eaa405174999c3a33bf12c2bfd8ea4a1e98f376de1f15";
+      model = "gpt-3.5-turbo";
+
+      # https://localai.io/advanced/#full-config-model-file-reference
+      model-configs.${model} = rec {
+        context_size = 8192;
+        parameters = {
+          # https://huggingface.co/lmstudio-community/Meta-Llama-3-8B-Instruct-GGUF
+          # https://ai.meta.com/blog/meta-llama-3/
+          model = fetchurl {
+            url = "https://huggingface.co/lmstudio-community/Meta-Llama-3-8B-Instruct-GGUF/resolve/main/Meta-Llama-3-8B-Instruct-Q4_K_M.gguf";
+            sha256 = "ab9e4eec7e80892fd78f74d9a15d0299f1e22121cea44efd68a7a02a3fe9a1da";
+          };
+          # defaults from:
+          # https://deepinfra.com/meta-llama/Meta-Llama-3-8B-Instruct
+          temperature = 0.7;
+          top_p = 0.9;
+          top_k = 0;
+          # the following parameter leads to outputs like: !!!!!!!!!!!!!!!!!!!
+          #repeat_penalty = 1;
+          presence_penalty = 0;
+          frequency_penalty = 0;
+          max_tokens = 100;
+        };
+        stopwords = [ "<|eot_id|>" ];
+        template = {
+          # Templates implement the following specification:
+          # https://github.com/meta-llama/llama3/tree/main?tab=readme-ov-file#instruction-tuned-models
+          # ... and are inspired by:
+          # https://github.com/mudler/LocalAI/blob/master/embedded/models/llama3-instruct.yaml
+          #
+          # The rules for template evaluation are defined here:
+          # https://pkg.go.dev/text/template
+          chat_message = ''
+            <|start_header_id|>{{.RoleName}}<|end_header_id|>
+
+            {{.Content}}${builtins.head stopwords}'';
+
+          chat = "<|begin_of_text|>{{.Input}}<|start_header_id|>assistant<|end_header_id|>";
+        };
+      };
+
+      models = genModels model-configs;
+
+      requests = {
+        # https://localai.io/features/text-generation/#chat-completions
+        chat-completions = {
+          inherit model;
+          messages = [{ role = "user"; content = "1 + 2 = ?"; }];
+        };
+        # https://localai.io/features/text-generation/#edit-completions
+        edit-completions = {
+          inherit model;
+          instruction = "rephrase";
+          input = "Black cat jumped out of the window";
+          max_tokens = 50;
+        };
+        # https://localai.io/features/text-generation/#completions
+        completions = {
+          inherit model;
+          prompt = "A long time ago in a galaxy far, far away";
+        };
       };
-      models = linkFarmFromDrvs "models" [
-        gguf
-      ];
     in
     testers.runNixOSTest {
       name = self.name + "-llama";
-      nodes.machine =
-        let
-          cores = 4;
-        in
-        {
-          virtualisation = {
-            inherit cores;
-            memorySize = 8192;
-          };
-          systemd.services.local-ai = {
-            wantedBy = [ "multi-user.target" ];
-            serviceConfig.ExecStart = "${self}/bin/local-ai --debug --threads ${toString cores} --models-path ${models} --localai-config-dir . --address :${port}";
-          };
-        };
+      nodes.machine = {
+        imports = [ common-config ];
+        virtualisation.cores = 4;
+        virtualisation.memorySize = 8192;
+        services.local-ai.models = models;
+      };
+      passthru = { inherit models requests; };
       testScript =
         let
-          # https://localai.io/features/text-generation/#chat-completions
-          request-chat-completions = {
-            model = gguf.name;
-            messages = [{ role = "user"; content = "Say this is a test!"; }];
-            temperature = 0.7;
-          };
-          # https://localai.io/features/text-generation/#edit-completions
-          request-edit-completions = {
-            model = gguf.name;
-            instruction = "rephrase";
-            input = "Black cat jumped out of the window";
-            temperature = 0.7;
-          };
-          # https://localai.io/features/text-generation/#completions
-          request-completions = {
-            model = gguf.name;
-            prompt = "A long time ago in a galaxy far, far away";
-            temperature = 0.7;
-          };
+          port = "8080";
         in
         ''
           machine.wait_for_open_port(${port})
           machine.succeed("curl -f http://localhost:${port}/readyz")
           machine.succeed("curl -f http://localhost:${port}/v1/models --output models.json")
-          machine.succeed("${jq}/bin/jq --exit-status 'debug | .data[].id == \"${gguf.name}\"' models.json")
-          machine.succeed("curl -f http://localhost:${port}/v1/chat/completions --json @${writers.writeJSON "request-chat-completions.json" request-chat-completions} --output chat-completions.json")
+          machine.succeed("${jq}/bin/jq --exit-status 'debug | .data[].id == \"${model}\"' models.json")
+
+          machine.succeed("curl -f http://localhost:${port}/v1/chat/completions --json @${writers.writeJSON "request-chat-completions.json" requests.chat-completions} --output chat-completions.json")
           machine.succeed("${jq}/bin/jq --exit-status 'debug | .object == \"chat.completion\"' chat-completions.json")
-          machine.succeed("curl -f http://localhost:${port}/v1/edits --json @${writers.writeJSON "request-edit-completions.json" request-edit-completions} --output edit-completions.json")
+          machine.succeed("${jq}/bin/jq --exit-status 'debug | .choices | first.message.content | tonumber == 3' chat-completions.json")
+
+          machine.succeed("curl -f http://localhost:${port}/v1/edits --json @${writers.writeJSON "request-edit-completions.json" requests.edit-completions} --output edit-completions.json")
           machine.succeed("${jq}/bin/jq --exit-status 'debug | .object == \"edit\"' edit-completions.json")
-          machine.succeed("curl -f http://localhost:${port}/v1/completions --json @${writers.writeJSON "request-completions.json" request-completions} --output completions.json")
+          machine.succeed("${jq}/bin/jq --exit-status '.usage.completion_tokens | debug == ${toString requests.edit-completions.max_tokens}' edit-completions.json")
+
+          machine.succeed("curl -f http://localhost:${port}/v1/completions --json @${writers.writeJSON "request-completions.json" requests.completions} --output completions.json")
           machine.succeed("${jq}/bin/jq --exit-status 'debug | .object ==\"text_completion\"' completions.json")
+          machine.succeed("${jq}/bin/jq --exit-status '.usage.completion_tokens | debug == ${toString model-configs.${model}.parameters.max_tokens}' completions.json")
         '';
     };
 
-} // lib.optionalAttrs self.features.with_tts {
+} // lib.optionalAttrs (self.features.with_tts && !self.features.with_cublas && !self.features.with_clblas) {
   # https://localai.io/features/text-to-audio/#piper
   tts =
     let
-      port = "8080";
-      voice-en-us = fetchzip {
-        url = "https://github.com/rhasspy/piper/releases/download/v0.0.2/voice-en-us-danny-low.tar.gz";
-        hash = "sha256-5wf+6H5HeQY0qgdqnAG1vSqtjIFM9lXH53OgouuPm0M=";
-        stripRoot = false;
-      };
-      ggml-tiny-en = fetchurl {
-        url = "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny.en-q5_1.bin";
-        hash = "sha256-x3xXZvHO8JtrfUfyG1Rsvd1BV4hrO11tT3CekeZsfCs=";
-      };
-      whisper-en = {
-        name = "whisper-en";
+      model-stt = "whisper-en";
+      model-configs.${model-stt} = {
         backend = "whisper";
-        parameters.model = ggml-tiny-en.name;
+        parameters.model = fetchurl {
+          url = "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny.en-q5_1.bin";
+          hash = "sha256-x3xXZvHO8JtrfUfyG1Rsvd1BV4hrO11tT3CekeZsfCs=";
+        };
       };
-      models = symlinkJoin {
-        name = "models";
-        paths = [
-          voice-en-us
-          (linkFarmFromDrvs "whisper-en" [
-            (writers.writeYAML "whisper-en.yaml" whisper-en)
-            ggml-tiny-en
-          ])
-        ];
+
+      model-tts = "piper-en";
+      model-configs.${model-tts} = {
+        backend = "piper";
+        parameters.model = "en-us-danny-low.onnx";
       };
-    in
-    testers.runNixOSTest {
-      name = self.name + "-tts";
-      nodes.machine =
+
+      models =
         let
-          cores = 2;
+          models = genModels model-configs;
         in
-        {
-          virtualisation = {
-            inherit cores;
-          };
-          systemd.services.local-ai = {
-            wantedBy = [ "multi-user.target" ];
-            serviceConfig.ExecStart = "${self}/bin/local-ai --debug --threads ${toString cores} --models-path ${models} --localai-config-dir . --address :${port}";
-          };
+        symlinkJoin {
+          inherit (models) name;
+          paths = [
+            models
+            (fetchzip {
+              url = "https://github.com/rhasspy/piper/releases/download/v0.0.2/voice-en-us-danny-low.tar.gz";
+              hash = "sha256-5wf+6H5HeQY0qgdqnAG1vSqtjIFM9lXH53OgouuPm0M=";
+              stripRoot = false;
+            })
+          ];
         };
+
+      requests.request = {
+        model = model-tts;
+        input = "Hello, how are you?";
+      };
+    in
+    testers.runNixOSTest {
+      name = self.name + "-tts";
+      nodes.machine = {
+        imports = [ common-config ];
+        virtualisation.cores = 2;
+        services.local-ai.models = models;
+      };
+      passthru = { inherit models requests; };
       testScript =
         let
-          request = {
-            model = "en-us-danny-low.onnx";
-            backend = "piper";
-            input = "Hello, how are you?";
-          };
+          port = "8080";
         in
         ''
           machine.wait_for_open_port(${port})
           machine.succeed("curl -f http://localhost:${port}/readyz")
-          machine.succeed("curl -f http://localhost:${port}/tts --json @${writers.writeJSON "request.json" request} --output out.wav")
-          machine.succeed("curl -f http://localhost:${port}/v1/audio/transcriptions --header 'Content-Type: multipart/form-data' --form file=@out.wav --form model=${whisper-en.name} --output transcription.json")
-          machine.succeed("${jq}/bin/jq --exit-status 'debug | .segments | first.text == \"${request.input}\"' transcription.json")
+          machine.succeed("curl -f http://localhost:${port}/v1/models --output models.json")
+          machine.succeed("${jq}/bin/jq --exit-status 'debug' models.json")
+          machine.succeed("curl -f http://localhost:${port}/tts --json @${writers.writeJSON "request.json" requests.request} --output out.wav")
+          machine.succeed("curl -f http://localhost:${port}/v1/audio/transcriptions --header 'Content-Type: multipart/form-data' --form file=@out.wav --form model=${model-stt} --output transcription.json")
+          machine.succeed("${jq}/bin/jq --exit-status 'debug | .segments | first.text == \"${requests.request.input}\"' transcription.json")
         '';
     };
 }