diff options
author | Connor Baker <connor.baker@tweag.io> | 2023-12-14 22:19:02 +0000 |
---|---|---|
committer | Connor Baker <connor.baker@tweag.io> | 2024-01-10 01:15:01 +0000 |
commit | 9bebd9e72d6b552fcfd3d1e6716eca6563944f42 (patch) | |
tree | d0b45b861d0d47be9e43c26481f68ce53a3d6987 | |
parent | 501a1af970ca54cb300474a00aacfbd01f8a5b24 (diff) |
tree-wide: cudaPackages should not break default eval
cudaPackages: guard expressions against null values
-rw-r--r-- | pkgs/applications/science/math/caffe/default.nix | 2 | ||||
-rw-r--r-- | pkgs/development/cuda-modules/cudnn/shims.nix | 18 | ||||
-rw-r--r-- | pkgs/development/cuda-modules/cutensor/extension.nix | 1 | ||||
-rw-r--r-- | pkgs/development/cuda-modules/flags.nix | 50 | ||||
-rw-r--r-- | pkgs/development/cuda-modules/generic-builders/manifest.nix | 62 | ||||
-rw-r--r-- | pkgs/development/cuda-modules/generic-builders/multiplex.nix | 18 | ||||
-rw-r--r-- | pkgs/development/cuda-modules/nccl/default.nix | 3 | ||||
-rw-r--r-- | pkgs/development/cuda-modules/tensorrt/fixup.nix | 15 | ||||
-rw-r--r-- | pkgs/development/cuda-modules/tensorrt/shims.nix | 24 | ||||
-rw-r--r-- | pkgs/development/libraries/science/math/magma/generic.nix | 2 | ||||
-rw-r--r-- | pkgs/development/libraries/xgboost/default.nix | 2 | ||||
-rw-r--r-- | pkgs/development/python-modules/jaxlib/default.nix | 3 | ||||
-rw-r--r-- | pkgs/development/python-modules/torch/default.nix | 13 | ||||
-rw-r--r-- | pkgs/top-level/cuda-packages.nix | 4 |
14 files changed, 119 insertions, 98 deletions
diff --git a/pkgs/applications/science/math/caffe/default.nix b/pkgs/applications/science/math/caffe/default.nix index 6595f0b846ddb..25f7229a845ae 100644 --- a/pkgs/applications/science/math/caffe/default.nix +++ b/pkgs/applications/science/math/caffe/default.nix @@ -153,7 +153,7 @@ stdenv.mkDerivation rec { || cudaSupport || !(leveldbSupport -> (leveldb != null && snappy != null)) || !(cudnnSupport -> (hasCudnn && cudaSupport)) - || !(ncclSupport -> cudaSupport) + || !(ncclSupport -> (cudaSupport && !nccl.meta.unsupported)) || !(pythonSupport -> (python != null && numpy != null)) ; license = licenses.bsd2; diff --git a/pkgs/development/cuda-modules/cudnn/shims.nix b/pkgs/development/cuda-modules/cudnn/shims.nix index e9eca8ef7c8b9..a36ee26dab5dc 100644 --- a/pkgs/development/cuda-modules/cudnn/shims.nix +++ b/pkgs/development/cuda-modules/cudnn/shims.nix @@ -1,10 +1,18 @@ # Shims to mimic the shape of ../modules/generic/manifests/{feature,redistrib}/release.nix -{package, redistArch}: { - featureRelease.${redistArch}.outputs = { - lib = true; - static = true; - dev = true; + lib, + package, + # redistArch :: String + # String is "unsupported" if the given architecture is unsupported. + redistArch, +}: +{ + featureRelease = lib.optionalAttrs (redistArch != "unsupported") { + ${redistArch}.outputs = { + lib = true; + static = true; + dev = true; + }; }; redistribRelease = { name = "NVIDIA CUDA Deep Neural Network library (cuDNN)"; diff --git a/pkgs/development/cuda-modules/cutensor/extension.nix b/pkgs/development/cuda-modules/cutensor/extension.nix index b762fd22ede88..534941887c6e4 100644 --- a/pkgs/development/cuda-modules/cutensor/extension.nix +++ b/pkgs/development/cuda-modules/cutensor/extension.nix @@ -92,6 +92,7 @@ let # A release is supported if it has a libPath that matches our CUDA version for our platform. # LibPath are not constant across the same release -- one platform may support fewer # CUDA versions than another. 
+ # redistArch :: String redistArch = flags.getRedistArch hostPlatform.system; # platformIsSupported :: Manifests -> Boolean platformIsSupported = diff --git a/pkgs/development/cuda-modules/flags.nix b/pkgs/development/cuda-modules/flags.nix index a123c7bce5a16..d5e01be01fd51 100644 --- a/pkgs/development/cuda-modules/flags.nix +++ b/pkgs/development/cuda-modules/flags.nix @@ -131,39 +131,29 @@ let # `linux-aarch64` redist (which is for Jetson devices) if we're building any Jetson devices. # Since both are based on aarch64, we can only have one or the other, otherwise there's an # ambiguity as to which should be used. + # NOTE: This function *will* be called by unsupported systems because `cudaPackages` is part of + # `all-packages.nix`, which is evaluated on all systems. As such, we need to handle unsupported + # systems gracefully. # getRedistArch :: String -> String - getRedistArch = - nixSystem: - if nixSystem == "aarch64-linux" then - if jetsonTargets != [] then "linux-aarch64" else "linux-sbsa" - else if nixSystem == "x86_64-linux" then - "linux-x86_64" - else if nixSystem == "ppc64le-linux" then - "linux-ppc64le" - else if nixSystem == "x86_64-windows" then - "windows-x86_64" - else - "unsupported"; + getRedistArch = nixSystem: attrsets.attrByPath [ nixSystem ] "unsupported" { + aarch64-linux = if jetsonTargets != [] then "linux-aarch64" else "linux-sbsa"; + x86_64-linux = "linux-x86_64"; + ppc64le-linux = "linux-ppc64le"; + x86_64-windows = "windows-x86_64"; + }; # Maps NVIDIA redist arch to Nix system. - # It is imperative that we include the boolean condition based on jetsonTargets to ensure - # we don't advertise availability of packages only available on server-grade ARM - # as being available for the Jetson, since both `linux-sbsa` and `linux-aarch64` are - # mapped to the Nix system `aarch64-linux`. 
- getNixSystem = - redistArch: - if redistArch == "linux-sbsa" && jetsonTargets == [] then - "aarch64-linux" - else if redistArch == "linux-aarch64" && jetsonTargets != [] then - "aarch64-linux" - else if redistArch == "linux-x86_64" then - "x86_64-linux" - else if redistArch == "linux-ppc64le" then - "ppc64le-linux" - else if redistArch == "windows-x86_64" then - "x86_64-windows" - else - "unsupported-${redistArch}"; + # NOTE: This function *will* be called by unsupported systems because `cudaPackages` is part of + # `all-packages.nix`, which is evaluated on all systems. As such, we need to handle unsupported + # systems gracefully. + # getNixSystem :: String -> String + getNixSystem = redistArch: attrsets.attrByPath [ redistArch ] "unsupported-${redistArch}" { + linux-sbsa = "aarch64-linux"; + linux-aarch64 = "aarch64-linux"; + linux-x86_64 = "x86_64-linux"; + linux-ppc64le = "ppc64le-linux"; + windows-x86_64 = "x86_64-windows"; + }; formatCapabilities = { diff --git a/pkgs/development/cuda-modules/generic-builders/manifest.nix b/pkgs/development/cuda-modules/generic-builders/manifest.nix index 5a4c5280d7dbd..64204346791a8 100644 --- a/pkgs/development/cuda-modules/generic-builders/manifest.nix +++ b/pkgs/development/cuda-modules/generic-builders/manifest.nix @@ -42,6 +42,9 @@ let # Get the redist architectures for which package provides distributables. # These are used by meta.platforms. supportedRedistArchs = builtins.attrNames featureRelease; + # redistArch :: String + # The redistArch is the name of the architecture for which the redistributable is built. + # It is `"unsupported"` if the redistributable is not supported on the target platform. redistArch = flags.getRedistArch hostPlatform.system; in backendStdenv.mkDerivation ( @@ -86,8 +89,18 @@ backendStdenv.mkDerivation ( "sample" "python" ]; + # Filter out outputs that don't exist in the redistributable. 
+ # NOTE: In the case the redistributable isn't supported on the target platform, + # we will have `outputs = [ "out" ] ++ possibleOutputs`. This is of note because platforms which + # aren't supported would otherwise have evaluation errors when trying to access outputs other than `out`. + # The alternative would be to have `outputs = [ "out" ]` when `redistArch = "unsupported"`, but that would + # require adding guards throughout the entirety of the CUDA package set to ensure `cudaSupport` is true -- + # recall that OfBorg will evaluate packages marked as broken and that `cudaPackages` will be evaluated with + # `cudaSupport = false`! additionalOutputs = - if redistArch == "unsupported" then possibleOutputs else builtins.filter hasOutput possibleOutputs; + if redistArch == "unsupported" + then possibleOutputs + else builtins.filter hasOutput possibleOutputs; # The out output is special -- it's the default output and we always include it. outputs = [ "out" ] ++ additionalOutputs; in @@ -114,19 +127,28 @@ backendStdenv.mkDerivation ( # Useful for introspecting why something went wrong. # Maps descriptions of why the derivation would be marked broken to # booleans indicating whether that description is true. 
- brokenConditions = {}; - - src = fetchurl { - url = - if (builtins.hasAttr redistArch redistribRelease) then - "https://developer.download.nvidia.com/compute/${redistName}/redist/${ - redistribRelease.${redistArch}.relative_path - }" - else - "cannot-construct-an-url-for-the-${redistArch}-platform"; - sha256 = redistribRelease.${redistArch}.sha256 or lib.fakeHash; + # brokenConditions :: AttrSet Bool + brokenConditions = { + # Using an unrecognized redistArch + "Unrecognized NixOS platform ${hostPlatform.system}" = redistArch == "unsupported"; + # Trying to build for a platform that doesn't have a redistributable + "Unsupported NixOS platform (or configuration) ${hostPlatform.system}" = finalAttrs.src == null; }; + # src :: Optional Derivation + src = trivial.pipe redistArch [ + # If redistArch doesn't exist in redistribRelease, return null. + (redistArch: redistribRelease.${redistArch} or null) + # If the release is non-null, fetch the source; otherwise, return null. + (trivial.mapNullable ( + { relative_path, sha256, ... }: + fetchurl { + url = "https://developer.download.nvidia.com/compute/${redistName}/redist/${relative_path}"; + inherit sha256; + } + )) + ]; + postPatch = '' if [[ -d pkg-config ]] ; then mkdir -p share/pkg-config @@ -284,16 +306,12 @@ backendStdenv.mkDerivation ( meta = { description = "${redistribRelease.name}. By downloading and using the packages you accept the terms and conditions of the ${finalAttrs.meta.license.shortName}"; sourceProvenance = [sourceTypes.binaryNativeCode]; - platforms = - lists.concatMap - ( - redistArch: - let - nixSystem = flags.getNixSystem redistArch; - in - lists.optionals (!(strings.hasPrefix "unsupported-" nixSystem)) [ nixSystem ] - ) - supportedRedistArchs; + platforms = trivial.pipe supportedRedistArchs [ + # Map each redist arch to the equivalent nix system, or to the string "unsupported-<redistArch>" if there is no equivalent. 
+ (builtins.map flags.getNixSystem) + # Filter out unsupported systems + (builtins.filter (nixSystem: !(strings.hasPrefix "unsupported-" nixSystem))) + ]; broken = lists.any trivial.id (attrsets.attrValues finalAttrs.brokenConditions); license = licenses.unfree; maintainers = teams.cuda.members; diff --git a/pkgs/development/cuda-modules/generic-builders/multiplex.nix b/pkgs/development/cuda-modules/generic-builders/multiplex.nix index 5480da7307261..6353b07545a4a 100644 --- a/pkgs/development/cuda-modules/generic-builders/multiplex.nix +++ b/pkgs/development/cuda-modules/generic-builders/multiplex.nix @@ -20,7 +20,7 @@ # The featureRelease is used to populate meta.platforms (by way of looking at the attribute names) # and to determine the outputs of the package. # shimFn :: {package, redistArch} -> AttrSet - shimsFn ? ({package, redistArch}: throw "shimsFn must be provided"), + shimsFn ? (throw "shimsFn must be provided"), # fixupFn :: Path # A path (or nix expression) to be evaluated with callPackage and then # provided to the package's overrideAttrs function. @@ -29,16 +29,8 @@ # - cudaVersion # - mkVersionedPackageName # - package - fixupFn ? ( - { - final, - cudaVersion, - mkVersionedPackageName, - package, - ... - }: - throw "fixupFn must be provided" - ), + # - ... + fixupFn ? (throw "fixupFn must be provided"), }: let inherit (lib) @@ -80,9 +72,11 @@ let && strings.versionAtLeast package.maxCudaVersion cudaVersion; # Get all of the packages for our given platform. + # redistArch :: String + # Value is `"unsupported"` if the platform is not supported. redistArch = flags.getRedistArch hostPlatform.system; - allReleases = builtins.concatMap (xs: xs) (builtins.attrValues releaseSets); + allReleases = lists.flatten (builtins.attrValues releaseSets); # All the supported packages we can build for our platform. 
# perSystemReleases :: List Package diff --git a/pkgs/development/cuda-modules/nccl/default.nix b/pkgs/development/cuda-modules/nccl/default.nix index c56d59cb42068..6e385688d0f8d 100644 --- a/pkgs/development/cuda-modules/nccl/default.nix +++ b/pkgs/development/cuda-modules/nccl/default.nix @@ -100,6 +100,9 @@ backendStdenv.mkDerivation ( homepage = "https://developer.nvidia.com/nccl"; license = licenses.bsd3; platforms = platforms.linux; + # NCCL is not supported on Jetson, because it does not use NVLink or PCI-e for inter-GPU communication. + # https://forums.developer.nvidia.com/t/can-jetson-orin-support-nccl/232845/9 + badPlatforms = lib.optionals cudaFlags.isJetsonBuild [ "aarch64-linux" ]; maintainers = with maintainers; [ diff --git a/pkgs/development/cuda-modules/tensorrt/fixup.nix b/pkgs/development/cuda-modules/tensorrt/fixup.nix index 43a7dfb817840..51ca3d652bd1a 100644 --- a/pkgs/development/cuda-modules/tensorrt/fixup.nix +++ b/pkgs/development/cuda-modules/tensorrt/fixup.nix @@ -11,18 +11,17 @@ }: let inherit (lib) + attrsets maintainers meta strings versions ; - targetArch = - if hostPlatform.isx86_64 then - "x86_64-linux-gnu" - else if hostPlatform.isAarch64 then - "aarch64-linux-gnu" - else - "unsupported"; + # targetArch :: String + targetArch = attrsets.attrByPath [ hostPlatform.system ] "unsupported" { + x86_64-linux = "x86_64-linux-gnu"; + aarch64-linux = "aarch64-linux-gnu"; + }; in finalAttrs: prevAttrs: { # Useful for inspecting why something went wrong. @@ -69,7 +68,7 @@ finalAttrs: prevAttrs: { preInstall = (prevAttrs.preInstall or "") - + '' + + strings.optionalString (targetArch != "unsupported") '' # Replace symlinks to bin and lib with the actual directories from targets. 
for dir in bin lib; do rm "$dir" diff --git a/pkgs/development/cuda-modules/tensorrt/shims.nix b/pkgs/development/cuda-modules/tensorrt/shims.nix index 8be3e7988bb34..12465434ec85c 100644 --- a/pkgs/development/cuda-modules/tensorrt/shims.nix +++ b/pkgs/development/cuda-modules/tensorrt/shims.nix @@ -1,13 +1,21 @@ # Shims to mimic the shape of ../modules/generic/manifests/{feature,redistrib}/release.nix -{package, redistArch}: { - featureRelease.${redistArch}.outputs = { - bin = true; - lib = true; - static = true; - dev = true; - sample = true; - python = true; + lib, + package, + # redistArch :: String + # String is `"unsupported"` if the given architecture is unsupported. + redistArch, +}: +{ + featureRelease = lib.optionalAttrs (redistArch != "unsupported") { + ${redistArch}.outputs = { + bin = true; + lib = true; + static = true; + dev = true; + sample = true; + python = true; + }; }; redistribRelease = { name = "TensorRT: a high-performance deep learning interface"; diff --git a/pkgs/development/libraries/science/math/magma/generic.nix b/pkgs/development/libraries/science/math/magma/generic.nix index 1aaab46e1d1d0..b27b42bf3ae85 100644 --- a/pkgs/development/libraries/science/math/magma/generic.nix +++ b/pkgs/development/libraries/science/math/magma/generic.nix @@ -159,7 +159,7 @@ stdenv.mkDerivation { description = "Matrix Algebra on GPU and Multicore Architectures"; license = licenses.bsd3; homepage = "http://icl.cs.utk.edu/magma/index.html"; - platforms = platforms.unix; + platforms = platforms.linux; maintainers = with maintainers; [ connorbaker ]; # Cf. 
https://bitbucket.org/icl/magma/src/fcfe5aa61c1a4c664b36a73ebabbdbab82765e9f/CMakeLists.txt#lines-20 diff --git a/pkgs/development/libraries/xgboost/default.nix b/pkgs/development/libraries/xgboost/default.nix index 2a44ffc443825..0af51a40dfb1e 100644 --- a/pkgs/development/libraries/xgboost/default.nix +++ b/pkgs/development/libraries/xgboost/default.nix @@ -14,7 +14,7 @@ , rPackages }@inputs: -assert ncclSupport -> cudaSupport; +assert ncclSupport -> (cudaSupport && !cudaPackages.nccl.meta.unsupported); # Disable regular tests when building the R package # because 1) the R package runs its own tests and # 2) the R package creates a different binary shared diff --git a/pkgs/development/python-modules/jaxlib/default.nix b/pkgs/development/python-modules/jaxlib/default.nix index 27b9e61fbc821..d8dc4d67a5942 100644 --- a/pkgs/development/python-modules/jaxlib/default.nix +++ b/pkgs/development/python-modules/jaxlib/default.nix @@ -64,7 +64,8 @@ let # aarch64-darwin is broken because of https://github.com/bazelbuild/rules_cc/pull/136 # however even with that fix applied, it doesn't work for everyone: # https://github.com/NixOS/nixpkgs/pull/184395#issuecomment-1207287129 - broken = stdenv.isDarwin; + # NOTE: We always build with NCCL; if it is unsupported, then our build is broken. + broken = stdenv.isDarwin || nccl.meta.unsupported; }; cudatoolkit_joined = symlinkJoin { diff --git a/pkgs/development/python-modules/torch/default.nix b/pkgs/development/python-modules/torch/default.nix index 8fb227cbd36be..802d1a920141e 100644 --- a/pkgs/development/python-modules/torch/default.nix +++ b/pkgs/development/python-modules/torch/default.nix @@ -7,7 +7,8 @@ magma, magma-hip, magma-cuda-static, - useSystemNccl ? true, + # Use the system NCCL as long as it is supported. + useSystemNccl ? !cudaPackages.nccl.meta.unsupported, MPISupport ? false, mpi, buildDocs ? 
false, @@ -57,6 +58,7 @@ let inherit (lib) attrsets lists strings trivial; inherit (cudaPackages) cudaFlags cudnn nccl; + ncclSupported = cudaSupport && !cudaPackages.nccl.meta.unsupported; setBool = v: if v then "1" else "0"; @@ -121,6 +123,7 @@ let "Unsupported CUDA version" = cudaSupport && !(builtins.elem cudaPackages.cudaMajorVersion [ "11" "12" ]); "MPI cudatoolkit does not match cudaPackages.cudatoolkit" = MPISupport && cudaSupport && (mpi.cudatoolkit != cudaPackages.cudatoolkit); "Magma cudaPackages does not match cudaPackages" = cudaSupport && (effectiveMagma.cudaPackages != cudaPackages); + "Requested system NCCL, but cudaPackages.nccl is not supported" = useSystemNccl && !ncclSupported; }; in buildPythonPackage rec { pname = "torch"; @@ -273,9 +276,9 @@ in buildPythonPackage rec { PYTORCH_BUILD_VERSION = version; PYTORCH_BUILD_NUMBER = 0; - USE_NCCL = setBool (cudaSupport && cudaPackages ? nccl); - USE_SYSTEM_NCCL = setBool useSystemNccl; # don't build pytorch's third_party NCCL - USE_STATIC_NCCL = setBool useSystemNccl; + USE_NCCL = setBool (cudaSupport && ncclSupported); + USE_SYSTEM_NCCL = setBool (cudaSupport && useSystemNccl); # don't build pytorch's third_party NCCL + USE_STATIC_NCCL = setBool (cudaSupport && useSystemNccl); # Suppress a weird warning in mkl-dnn, part of ideep in pytorch # (upstream seems to have fixed this in the wrong place?) @@ -363,7 +366,7 @@ in buildPythonPackage rec { ] ++ lists.optionals (cudaPackages ? cudnn) [ cudnn.dev cudnn.lib - ] ++ lists.optionals (useSystemNccl && cudaPackages ? nccl) [ + ] ++ lists.optionals (useSystemNccl && ncclSupported) [ # Some platforms do not support NCCL (i.e., Jetson) nccl.dev # Provides nccl.h AND a static copy of NCCL! 
] ++ lists.optionals (strings.versionOlder cudaVersion "11.8") [ diff --git a/pkgs/top-level/cuda-packages.nix b/pkgs/top-level/cuda-packages.nix index f997963ff468c..f20a361522031 100644 --- a/pkgs/top-level/cuda-packages.nix +++ b/pkgs/top-level/cuda-packages.nix @@ -73,10 +73,6 @@ let # Loose packages cudatoolkit = final.callPackage ../development/cuda-modules/cudatoolkit {}; saxpy = final.callPackage ../development/cuda-modules/saxpy {}; - } - # NCCL is not supported on Jetson, because it does not use NVLink or PCI-e for inter-GPU communication. - # https://forums.developer.nvidia.com/t/can-jetson-orin-support-nccl/232845/9 - // attrsets.optionalAttrs (!flags.isJetsonBuild) { nccl = final.callPackage ../development/cuda-modules/nccl {}; nccl-tests = final.callPackage ../development/cuda-modules/nccl-tests {}; } |