tree-wide: cudaPackages should not break default eval

cudaPackages: guard expressions against null values
author: Connor Baker <connor.baker@tweag.io> 2023-12-14 22:19:02 +0000
committer: Connor Baker <connor.baker@tweag.io> 2024-01-10 01:15:01 +0000
commit: 9bebd9e72d6b552fcfd3d1e6716eca6563944f42 (patch)
tree: d0b45b861d0d47be9e43c26481f68ce53a3d6987
parent: 501a1af970ca54cb300474a00aacfbd01f8a5b24 (diff)
14 files changed, 119 insertions, 98 deletions
diff --git a/pkgs/applications/science/math/caffe/default.nix b/pkgs/applications/science/math/caffe/default.nix
index 6595f0b846ddb..25f7229a845ae 100644
--- a/pkgs/applications/science/math/caffe/default.nix
+++ b/pkgs/applications/science/math/caffe/default.nix
@@ -153,7 +153,7 @@ stdenv.mkDerivation rec {
       || cudaSupport
       || !(leveldbSupport -> (leveldb != null && snappy != null))
       || !(cudnnSupport -> (hasCudnn && cudaSupport))
-      || !(ncclSupport -> cudaSupport)
+      || !(ncclSupport -> (cudaSupport && !nccl.meta.unsupported))
       || !(pythonSupport -> (python != null && numpy != null))
     ;
     license = licenses.bsd2;
diff --git a/pkgs/development/cuda-modules/cudnn/shims.nix b/pkgs/development/cuda-modules/cudnn/shims.nix
index e9eca8ef7c8b9..a36ee26dab5dc 100644
--- a/pkgs/development/cuda-modules/cudnn/shims.nix
+++ b/pkgs/development/cuda-modules/cudnn/shims.nix
@@ -1,10 +1,18 @@
 # Shims to mimic the shape of ../modules/generic/manifests/{feature,redistrib}/release.nix
-{package, redistArch}:
 {
-  featureRelease.${redistArch}.outputs = {
-    lib = true;
-    static = true;
-    dev = true;
+  lib,
+  package,
+  # redistArch :: String
+  # String is `"unsupported"` if the given architecture is unsupported.
+  redistArch,
+}:
+{
+  featureRelease = lib.optionalAttrs (redistArch != "unsupported") {
+    ${redistArch}.outputs = {
+      lib = true;
+      static = true;
+      dev = true;
+    };
   };
   redistribRelease = {
     name = "NVIDIA CUDA Deep Neural Network library (cuDNN)";
diff --git a/pkgs/development/cuda-modules/cutensor/extension.nix b/pkgs/development/cuda-modules/cutensor/extension.nix
index b762fd22ede88..534941887c6e4 100644
--- a/pkgs/development/cuda-modules/cutensor/extension.nix
+++ b/pkgs/development/cuda-modules/cutensor/extension.nix
@@ -92,6 +92,7 @@ let
   # A release is supported if it has a libPath that matches our CUDA version for our platform.
   # LibPath are not constant across the same release -- one platform may support fewer
   # CUDA versions than another.
+  # redistArch :: String
   redistArch = flags.getRedistArch hostPlatform.system;
   # platformIsSupported :: Manifests -> Boolean
   platformIsSupported =
diff --git a/pkgs/development/cuda-modules/flags.nix b/pkgs/development/cuda-modules/flags.nix
index a123c7bce5a16..d5e01be01fd51 100644
--- a/pkgs/development/cuda-modules/flags.nix
+++ b/pkgs/development/cuda-modules/flags.nix
@@ -131,39 +131,29 @@ let
   # `linux-aarch64` redist (which is for Jetson devices) if we're building any Jetson devices.
   # Since both are based on aarch64, we can only have one or the other, otherwise there's an
   # ambiguity as to which should be used.
+  # NOTE: This function *will* be called by unsupported systems because `cudaPackages` is part of
+  # `all-packages.nix`, which is evaluated on all systems. As such, we need to handle unsupported
+  # systems gracefully.
   # getRedistArch :: String -> String
-  getRedistArch =
-    nixSystem:
-    if nixSystem == "aarch64-linux" then
-      if jetsonTargets != [] then "linux-aarch64" else "linux-sbsa"
-    else if nixSystem == "x86_64-linux" then
-      "linux-x86_64"
-    else if nixSystem == "ppc64le-linux" then
-      "linux-ppc64le"
-    else if nixSystem == "x86_64-windows" then
-      "windows-x86_64"
-    else
-      "unsupported";
+  getRedistArch = nixSystem: attrsets.attrByPath [ nixSystem ] "unsupported" {
+    aarch64-linux = if jetsonTargets != [] then "linux-aarch64" else "linux-sbsa";
+    x86_64-linux = "linux-x86_64";
+    ppc64le-linux = "linux-ppc64le";
+    x86_64-windows = "windows-x86_64";
+  };
 
   # Maps NVIDIA redist arch to Nix system.
-  # It is imperative that we include the boolean condition based on jetsonTargets to ensure
-  # we don't advertise availability of packages only available on server-grade ARM
-  # as being available for the Jetson, since both `linux-sbsa` and `linux-aarch64` are
-  # mapped to the Nix system `aarch64-linux`.
-  getNixSystem =
-    redistArch:
-    if redistArch == "linux-sbsa" && jetsonTargets == [] then
-      "aarch64-linux"
-    else if redistArch == "linux-aarch64" && jetsonTargets != [] then
-      "aarch64-linux"
-    else if redistArch == "linux-x86_64" then
-      "x86_64-linux"
-    else if redistArch == "linux-ppc64le" then
-      "ppc64le-linux"
-    else if redistArch == "windows-x86_64" then
-      "x86_64-windows"
-    else
-      "unsupported-${redistArch}";
+  # NOTE: This function *will* be called by unsupported systems because `cudaPackages` is part of
+  # `all-packages.nix`, which is evaluated on all systems. As such, we need to handle unsupported
+  # systems gracefully.
+  # getNixSystem :: String -> String
+  getNixSystem = redistArch: attrsets.attrByPath [ redistArch ] "unsupported-${redistArch}" {
+    linux-sbsa = "aarch64-linux";
+    linux-aarch64 = "aarch64-linux";
+    linux-x86_64 = "x86_64-linux";
+    linux-ppc64le = "ppc64le-linux";
+    windows-x86_64 = "x86_64-windows";
+  };
 
   formatCapabilities =
     {
diff --git a/pkgs/development/cuda-modules/generic-builders/manifest.nix b/pkgs/development/cuda-modules/generic-builders/manifest.nix
index 5a4c5280d7dbd..64204346791a8 100644
--- a/pkgs/development/cuda-modules/generic-builders/manifest.nix
+++ b/pkgs/development/cuda-modules/generic-builders/manifest.nix
@@ -42,6 +42,9 @@ let
   # Get the redist architectures for which package provides distributables.
   # These are used by meta.platforms.
   supportedRedistArchs = builtins.attrNames featureRelease;
+  # redistArch :: String
+  # The redistArch is the name of the architecture for which the redistributable is built.
+  # It is `"unsupported"` if the redistributable is not supported on the target platform.
   redistArch = flags.getRedistArch hostPlatform.system;
 in
 backendStdenv.mkDerivation (
@@ -86,8 +89,18 @@ backendStdenv.mkDerivation (
           "sample"
           "python"
         ];
+        # Filter out outputs that don't exist in the redistributable.
+        # NOTE: In the case the redistributable isn't supported on the target platform,
+        # we will have `outputs = [ "out" ] ++ possibleOutputs`. This is of note because platforms which
+        # aren't supported would otherwise have evaluation errors when trying to access outputs other than `out`.
+        # The alternative would be to have `outputs = [ "out" ]` when `redistArch = "unsupported"`, but that would
+        # require adding guards throughout the entirety of the CUDA package set to ensure `cudaSupport` is true --
+        # recall that OfBorg will evaluate packages marked as broken and that `cudaPackages` will be evaluated with
+        # `cudaSupport = false`!
         additionalOutputs =
-          if redistArch == "unsupported" then possibleOutputs else builtins.filter hasOutput possibleOutputs;
+          if redistArch == "unsupported"
+          then possibleOutputs
+          else builtins.filter hasOutput possibleOutputs;
         # The out output is special -- it's the default output and we always include it.
         outputs = [ "out" ] ++ additionalOutputs;
       in
@@ -114,19 +127,28 @@ backendStdenv.mkDerivation (
     # Useful for introspecting why something went wrong.
     # Maps descriptions of why the derivation would be marked broken to
     # booleans indicating whether that description is true.
-    brokenConditions = {};
-
-    src = fetchurl {
-      url =
-        if (builtins.hasAttr redistArch redistribRelease) then
-          "https://developer.download.nvidia.com/compute/${redistName}/redist/${
-            redistribRelease.${redistArch}.relative_path
-          }"
-        else
-          "cannot-construct-an-url-for-the-${redistArch}-platform";
-      sha256 = redistribRelease.${redistArch}.sha256 or lib.fakeHash;
+    # brokenConditions :: AttrSet Bool
+    brokenConditions = {
+      # Using an unrecognized redistArch
+      "Unrecognized NixOS platform ${hostPlatform.system}" = redistArch == "unsupported";
+      # Trying to build for a platform that doesn't have a redistributable
+      "Unsupported NixOS platform (or configuration) ${hostPlatform.system}" = finalAttrs.src == null;
     };
 
+    # src :: Optional Derivation
+    src = trivial.pipe redistArch [
+      # If redistArch doesn't exist in redistribRelease, return null.
+      (redistArch: redistribRelease.${redistArch} or null)
+      # If the release is non-null, fetch the source; otherwise, return null.
+      (trivial.mapNullable (
+        { relative_path, sha256, ... }:
+        fetchurl {
+          url = "https://developer.download.nvidia.com/compute/${redistName}/redist/${relative_path}";
+          inherit sha256;
+        }
+      ))
+    ];
+
     postPatch = ''
       if [[ -d pkg-config ]] ; then
         mkdir -p share/pkg-config
@@ -284,16 +306,12 @@ backendStdenv.mkDerivation (
     meta = {
       description = "${redistribRelease.name}. By downloading and using the packages you accept the terms and conditions of the ${finalAttrs.meta.license.shortName}";
       sourceProvenance = [sourceTypes.binaryNativeCode];
-      platforms =
-        lists.concatMap
-          (
-            redistArch:
-            let
-              nixSystem = flags.getNixSystem redistArch;
-            in
-            lists.optionals (!(strings.hasPrefix "unsupported-" nixSystem)) [ nixSystem ]
-          )
-          supportedRedistArchs;
+      platforms = trivial.pipe supportedRedistArchs [
+        # Map each redist arch to the equivalent nix system, or to "unsupported-${redistArch}" if there is no equivalent.
+        (builtins.map flags.getNixSystem)
+        # Filter out unsupported systems
+        (builtins.filter (nixSystem: !(strings.hasPrefix "unsupported-" nixSystem)))
+      ];
       broken = lists.any trivial.id (attrsets.attrValues finalAttrs.brokenConditions);
       license = licenses.unfree;
       maintainers = teams.cuda.members;
diff --git a/pkgs/development/cuda-modules/generic-builders/multiplex.nix b/pkgs/development/cuda-modules/generic-builders/multiplex.nix
index 5480da7307261..6353b07545a4a 100644
--- a/pkgs/development/cuda-modules/generic-builders/multiplex.nix
+++ b/pkgs/development/cuda-modules/generic-builders/multiplex.nix
@@ -20,7 +20,7 @@
   # The featureRelease is used to populate meta.platforms (by way of looking at the attribute names)
   # and to determine the outputs of the package.
   # shimFn :: {package, redistArch} -> AttrSet
-  shimsFn ? ({package, redistArch}: throw "shimsFn must be provided"),
+  shimsFn ? (throw "shimsFn must be provided"),
   # fixupFn :: Path
   # A path (or nix expression) to be evaluated with callPackage and then
   # provided to the package's overrideAttrs function.
@@ -29,16 +29,8 @@
   # - cudaVersion
   # - mkVersionedPackageName
   # - package
-  fixupFn ? (
-    {
-      final,
-      cudaVersion,
-      mkVersionedPackageName,
-      package,
-      ...
-    }:
-    throw "fixupFn must be provided"
-  ),
+  # - ...
+  fixupFn ? (throw "fixupFn must be provided"),
 }:
 let
   inherit (lib)
@@ -80,9 +72,11 @@ let
     && strings.versionAtLeast package.maxCudaVersion cudaVersion;
 
   # Get all of the packages for our given platform.
+  # redistArch :: String
+  # Value is `"unsupported"` if the platform is not supported.
   redistArch = flags.getRedistArch hostPlatform.system;
 
-  allReleases = builtins.concatMap (xs: xs) (builtins.attrValues releaseSets);
+  allReleases = lists.flatten (builtins.attrValues releaseSets);
 
   # All the supported packages we can build for our platform.
   # perSystemReleases :: List Package
diff --git a/pkgs/development/cuda-modules/nccl/default.nix b/pkgs/development/cuda-modules/nccl/default.nix
index c56d59cb42068..6e385688d0f8d 100644
--- a/pkgs/development/cuda-modules/nccl/default.nix
+++ b/pkgs/development/cuda-modules/nccl/default.nix
@@ -100,6 +100,9 @@ backendStdenv.mkDerivation (
       homepage = "https://developer.nvidia.com/nccl";
       license = licenses.bsd3;
       platforms = platforms.linux;
+      # NCCL is not supported on Jetson, because it does not use NVLink or PCI-e for inter-GPU communication.
+      # https://forums.developer.nvidia.com/t/can-jetson-orin-support-nccl/232845/9
+      badPlatforms = lib.optionals cudaFlags.isJetsonBuild [ "aarch64-linux" ];
       maintainers =
         with maintainers;
         [
diff --git a/pkgs/development/cuda-modules/tensorrt/fixup.nix b/pkgs/development/cuda-modules/tensorrt/fixup.nix
index 43a7dfb817840..51ca3d652bd1a 100644
--- a/pkgs/development/cuda-modules/tensorrt/fixup.nix
+++ b/pkgs/development/cuda-modules/tensorrt/fixup.nix
@@ -11,18 +11,17 @@
 }:
 let
   inherit (lib)
+    attrsets
     maintainers
     meta
     strings
     versions
     ;
-  targetArch =
-    if hostPlatform.isx86_64 then
-      "x86_64-linux-gnu"
-    else if hostPlatform.isAarch64 then
-      "aarch64-linux-gnu"
-    else
-      "unsupported";
+  # targetArch :: String
+  targetArch = attrsets.attrByPath [ hostPlatform.system ] "unsupported" {
+    x86_64-linux = "x86_64-linux-gnu";
+    aarch64-linux = "aarch64-linux-gnu";
+  };
 in
 finalAttrs: prevAttrs: {
   # Useful for inspecting why something went wrong.
@@ -69,7 +68,7 @@ finalAttrs: prevAttrs: {
 
   preInstall =
     (prevAttrs.preInstall or "")
-    + ''
+    + strings.optionalString (targetArch != "unsupported") ''
       # Replace symlinks to bin and lib with the actual directories from targets.
       for dir in bin lib; do
         rm "$dir"
diff --git a/pkgs/development/cuda-modules/tensorrt/shims.nix b/pkgs/development/cuda-modules/tensorrt/shims.nix
index 8be3e7988bb34..12465434ec85c 100644
--- a/pkgs/development/cuda-modules/tensorrt/shims.nix
+++ b/pkgs/development/cuda-modules/tensorrt/shims.nix
@@ -1,13 +1,21 @@
 # Shims to mimic the shape of ../modules/generic/manifests/{feature,redistrib}/release.nix
-{package, redistArch}:
 {
-  featureRelease.${redistArch}.outputs = {
-    bin = true;
-    lib = true;
-    static = true;
-    dev = true;
-    sample = true;
-    python = true;
+  lib,
+  package,
+  # redistArch :: String
+  # String is `"unsupported"` if the given architecture is unsupported.
+  redistArch,
+}:
+{
+  featureRelease = lib.optionalAttrs (redistArch != "unsupported") {
+    ${redistArch}.outputs = {
+      bin = true;
+      lib = true;
+      static = true;
+      dev = true;
+      sample = true;
+      python = true;
+    };
   };
   redistribRelease = {
     name = "TensorRT: a high-performance deep learning interface";
diff --git a/pkgs/development/libraries/science/math/magma/generic.nix b/pkgs/development/libraries/science/math/magma/generic.nix
index 1aaab46e1d1d0..b27b42bf3ae85 100644
--- a/pkgs/development/libraries/science/math/magma/generic.nix
+++ b/pkgs/development/libraries/science/math/magma/generic.nix
@@ -159,7 +159,7 @@ stdenv.mkDerivation {
     description = "Matrix Algebra on GPU and Multicore Architectures";
     license = licenses.bsd3;
     homepage = "http://icl.cs.utk.edu/magma/index.html";
-    platforms = platforms.unix;
+    platforms = platforms.linux;
     maintainers = with maintainers; [ connorbaker ];
 
     # Cf. https://bitbucket.org/icl/magma/src/fcfe5aa61c1a4c664b36a73ebabbdbab82765e9f/CMakeLists.txt#lines-20
diff --git a/pkgs/development/libraries/xgboost/default.nix b/pkgs/development/libraries/xgboost/default.nix
index 2a44ffc443825..0af51a40dfb1e 100644
--- a/pkgs/development/libraries/xgboost/default.nix
+++ b/pkgs/development/libraries/xgboost/default.nix
@@ -14,7 +14,7 @@
 , rPackages
 }@inputs:
 
-assert ncclSupport -> cudaSupport;
+assert ncclSupport -> (cudaSupport && !cudaPackages.nccl.meta.unsupported);
 # Disable regular tests when building the R package
 # because 1) the R package runs its own tests and
 # 2) the R package creates a different binary shared
diff --git a/pkgs/development/python-modules/jaxlib/default.nix b/pkgs/development/python-modules/jaxlib/default.nix
index 27b9e61fbc821..d8dc4d67a5942 100644
--- a/pkgs/development/python-modules/jaxlib/default.nix
+++ b/pkgs/development/python-modules/jaxlib/default.nix
@@ -64,7 +64,8 @@ let
     # aarch64-darwin is broken because of https://github.com/bazelbuild/rules_cc/pull/136
     # however even with that fix applied, it doesn't work for everyone:
     # https://github.com/NixOS/nixpkgs/pull/184395#issuecomment-1207287129
-    broken = stdenv.isDarwin;
+    # NOTE: We always build with NCCL; if it is unsupported, then our build is broken.
+    broken = stdenv.isDarwin || nccl.meta.unsupported;
   };
 
   cudatoolkit_joined = symlinkJoin {
diff --git a/pkgs/development/python-modules/torch/default.nix b/pkgs/development/python-modules/torch/default.nix
index 8fb227cbd36be..802d1a920141e 100644
--- a/pkgs/development/python-modules/torch/default.nix
+++ b/pkgs/development/python-modules/torch/default.nix
@@ -7,7 +7,8 @@
   magma,
   magma-hip,
   magma-cuda-static,
-  useSystemNccl ? true,
+  # Use the system NCCL as long as it is supported.
+  useSystemNccl ? !cudaPackages.nccl.meta.unsupported,
   MPISupport ? false, mpi,
   buildDocs ? false,
 
@@ -57,6 +58,7 @@
 let
   inherit (lib) attrsets lists strings trivial;
   inherit (cudaPackages) cudaFlags cudnn nccl;
+  ncclSupported = cudaSupport && !cudaPackages.nccl.meta.unsupported;
 
   setBool = v: if v then "1" else "0";
 
@@ -121,6 +123,7 @@ let
     "Unsupported CUDA version" = cudaSupport && !(builtins.elem cudaPackages.cudaMajorVersion [ "11" "12" ]);
     "MPI cudatoolkit does not match cudaPackages.cudatoolkit" = MPISupport && cudaSupport && (mpi.cudatoolkit != cudaPackages.cudatoolkit);
     "Magma cudaPackages does not match cudaPackages" = cudaSupport && (effectiveMagma.cudaPackages != cudaPackages);
+    "Requested system NCCL, but cudaPackages.nccl is not supported" = cudaSupport && useSystemNccl && !ncclSupported; # NCCL is only relevant to CUDA builds; without this guard every non-CUDA Linux build is flagged
   };
 in buildPythonPackage rec {
   pname = "torch";
@@ -273,9 +276,9 @@ in buildPythonPackage rec {
   PYTORCH_BUILD_VERSION = version;
   PYTORCH_BUILD_NUMBER = 0;
 
-  USE_NCCL = setBool (cudaSupport && cudaPackages ? nccl);
-  USE_SYSTEM_NCCL = setBool useSystemNccl;                  # don't build pytorch's third_party NCCL
-  USE_STATIC_NCCL = setBool useSystemNccl;
+  USE_NCCL = setBool (cudaSupport && ncclSupported);
+  USE_SYSTEM_NCCL = setBool (cudaSupport && useSystemNccl);                  # don't build pytorch's third_party NCCL
+  USE_STATIC_NCCL = setBool (cudaSupport && useSystemNccl);
 
   # Suppress a weird warning in mkl-dnn, part of ideep in pytorch
   # (upstream seems to have fixed this in the wrong place?)
@@ -363,7 +366,7 @@ in buildPythonPackage rec {
     ] ++ lists.optionals (cudaPackages ? cudnn) [
       cudnn.dev
       cudnn.lib
-    ] ++ lists.optionals (useSystemNccl && cudaPackages ? nccl) [
+    ] ++ lists.optionals (useSystemNccl && ncclSupported) [
       # Some platforms do not support NCCL (i.e., Jetson)
       nccl.dev # Provides nccl.h AND a static copy of NCCL!
     ] ++ lists.optionals (strings.versionOlder cudaVersion "11.8") [
diff --git a/pkgs/top-level/cuda-packages.nix b/pkgs/top-level/cuda-packages.nix
index f997963ff468c..f20a361522031 100644
--- a/pkgs/top-level/cuda-packages.nix
+++ b/pkgs/top-level/cuda-packages.nix
@@ -73,10 +73,6 @@ let
         # Loose packages
         cudatoolkit = final.callPackage ../development/cuda-modules/cudatoolkit {};
         saxpy = final.callPackage ../development/cuda-modules/saxpy {};
-      }
-      # NCCL is not supported on Jetson, because it does not use NVLink or PCI-e for inter-GPU communication.
-      # https://forums.developer.nvidia.com/t/can-jetson-orin-support-nccl/232845/9
-      // attrsets.optionalAttrs (!flags.isJetsonBuild) {
         nccl = final.callPackage ../development/cuda-modules/nccl {};
         nccl-tests = final.callPackage ../development/cuda-modules/nccl-tests {};
       }
author	Connor Baker <connor.baker@tweag.io>	2023-12-14 22:19:02 +0000
committer	Connor Baker <connor.baker@tweag.io>	2024-01-10 01:15:01 +0000
commit	9bebd9e72d6b552fcfd3d1e6716eca6563944f42 (patch)
tree	d0b45b861d0d47be9e43c26481f68ce53a3d6987
parent	501a1af970ca54cb300474a00aacfbd01f8a5b24 (diff)