about summary refs log tree commit diff
path: root/nixos/modules/system
diff options
context:
space:
mode:
Diffstat (limited to 'nixos/modules/system')
-rw-r--r-- nixos/modules/system/activation/activation-script.nix 2
-rw-r--r-- nixos/modules/system/activation/lib/lib.sh 5
-rw-r--r-- nixos/modules/system/activation/lib/test.nix 36
-rwxr-xr-x nixos/modules/system/activation/lib/test.sh 34
-rw-r--r-- nixos/modules/system/activation/specialisation.nix 19
-rwxr-xr-x nixos/modules/system/activation/switch-to-configuration.pl 5
-rw-r--r-- nixos/modules/system/boot/binfmt.nix 4
-rw-r--r-- nixos/modules/system/boot/initrd-ssh.nix 10
-rw-r--r-- nixos/modules/system/boot/loader/generations-dir/generations-dir.nix 2
-rw-r--r-- nixos/modules/system/boot/loader/systemd-boot/boot-counting.md 38
-rw-r--r-- nixos/modules/system/boot/loader/systemd-boot/systemd-boot-builder.py 355
-rw-r--r-- nixos/modules/system/boot/loader/systemd-boot/systemd-boot.nix 52
-rw-r--r-- nixos/modules/system/boot/networkd.nix 26
-rw-r--r-- nixos/modules/system/boot/plymouth.nix 2
-rw-r--r-- nixos/modules/system/boot/stage-1.nix 1
-rw-r--r-- nixos/modules/system/boot/systemd.nix 39
-rw-r--r-- nixos/modules/system/boot/systemd/initrd.nix 28
-rw-r--r-- nixos/modules/system/boot/systemd/journald.nix 3
-rw-r--r-- nixos/modules/system/boot/systemd/nspawn.nix 3
-rw-r--r-- nixos/modules/system/boot/systemd/shutdown.nix 10
-rw-r--r-- nixos/modules/system/boot/systemd/sysusers.nix 215
-rw-r--r-- nixos/modules/system/boot/systemd/tmpfiles.nix 6
-rw-r--r-- nixos/modules/system/etc/etc.nix 41
23 files changed, 671 insertions, 265 deletions
diff --git a/nixos/modules/system/activation/activation-script.nix b/nixos/modules/system/activation/activation-script.nix
index fc29aa3cb2f71..195ad31b1e56c 100644
--- a/nixos/modules/system/activation/activation-script.nix
+++ b/nixos/modules/system/activation/activation-script.nix
@@ -33,6 +33,8 @@ let
     ''
       #!${pkgs.runtimeShell}
 
+      source ${./lib/lib.sh}
+
       systemConfig='@out@'
 
       export PATH=/empty
diff --git a/nixos/modules/system/activation/lib/lib.sh b/nixos/modules/system/activation/lib/lib.sh
new file mode 100644
index 0000000000000..5ecf94e81604c
--- /dev/null
+++ b/nixos/modules/system/activation/lib/lib.sh
@@ -0,0 +1,5 @@
+# shellcheck shell=bash
+
+warn() {
+    printf "\033[1;35mwarning:\033[0m %s\n" "$*" >&2
+}
diff --git a/nixos/modules/system/activation/lib/test.nix b/nixos/modules/system/activation/lib/test.nix
new file mode 100644
index 0000000000000..39886d305195a
--- /dev/null
+++ b/nixos/modules/system/activation/lib/test.nix
@@ -0,0 +1,36 @@
+# Run:
+#   nix-build -A nixosTests.activation-lib
+{ lib, stdenv, testers }:
+let
+  inherit (lib) fileset;
+
+  runTests = stdenv.mkDerivation {
+    name = "tests-activation-lib";
+    src = fileset.toSource {
+      root = ./.;
+      fileset = fileset.unions [
+        ./lib.sh
+        ./test.sh
+      ];
+    };
+    buildPhase = ":";
+    doCheck = true;
+    postUnpack = ''
+      patchShebangs --build .
+    '';
+    checkPhase = ''
+      ./test.sh
+    '';
+    installPhase = ''
+      touch $out
+    '';
+  };
+
+  runShellcheck = testers.shellcheck {
+    src = runTests.src;
+  };
+
+in
+lib.recurseIntoAttrs {
+  inherit runTests runShellcheck;
+}
diff --git a/nixos/modules/system/activation/lib/test.sh b/nixos/modules/system/activation/lib/test.sh
new file mode 100755
index 0000000000000..9b146383ad4b0
--- /dev/null
+++ b/nixos/modules/system/activation/lib/test.sh
@@ -0,0 +1,34 @@
+#!/usr/bin/env bash
+
+# Run:
+#   ./test.sh
+# or:
+#   nix-build -A nixosTests.activation-lib
+
+cd "$(dirname "${BASH_SOURCE[0]}")"
+set -euo pipefail
+
+# report failure
+onerr() {
+  set +e
+  # find failed statement
+  echo "call trace:"
+  local i=0
+  while t="$(caller $i)"; do
+    line="${t%% *}"
+    file="${t##* }"
+    echo "  $file:$line" >&2
+    ((i++))
+  done
+  # red
+  printf "\033[1;31mtest failed\033[0m\n" >&2
+  exit 1
+}
+trap onerr ERR
+
+source ./lib.sh
+
+(warn hi, this works >/dev/null) 2>&1 | grep -E $'.*warning:.* hi, this works' >/dev/null
+
+# green
+printf "\033[1;32mok\033[0m\n"
diff --git a/nixos/modules/system/activation/specialisation.nix b/nixos/modules/system/activation/specialisation.nix
index fdab287802fa5..fc348ad94c03a 100644
--- a/nixos/modules/system/activation/specialisation.nix
+++ b/nixos/modules/system/activation/specialisation.nix
@@ -1,10 +1,14 @@
-{ config, lib, pkgs, extendModules, noUserModules, ... }:
+{ config, lib, extendModules, noUserModules, ... }:
 
 let
   inherit (lib)
+    attrNames
     concatStringsSep
+    filter
+    length
     mapAttrs
     mapAttrsToList
+    match
     mkOption
     types
     ;
@@ -73,6 +77,19 @@ in
   };
 
   config = {
+    assertions = [(
+      let
+        invalidNames = filter (name: match "[[:alnum:]_]+" name == null) (attrNames config.specialisation);
+      in
+      {
+        assertion = length invalidNames == 0;
+        message = ''
+          Specialisation names can only contain alphanumeric characters and underscores
+          Invalid specialisation names: ${concatStringsSep ", " invalidNames}
+        '';
+      }
+    )];
+
     system.systemBuilderCommands = ''
       mkdir $out/specialisation
       ${concatStringsSep "\n"
diff --git a/nixos/modules/system/activation/switch-to-configuration.pl b/nixos/modules/system/activation/switch-to-configuration.pl
index cabc1dcc2d65a..4beca4f0a42a9 100755
--- a/nixos/modules/system/activation/switch-to-configuration.pl
+++ b/nixos/modules/system/activation/switch-to-configuration.pl
@@ -1,5 +1,10 @@
 #! @perl@/bin/perl
 
+# NOTE: This script has an alternative implementation at
+# <nixpkgs/pkgs/by-name/sw/switch-to-configuration-ng>. Any behavioral
+# modifications to this script should also be made to that implementation.
+
+
 # Issue #166838 uncovered a situation in which a configuration not suitable
 # for the target architecture caused a cryptic error message instead of
 # a clean failure. Due to this mismatch, the perl interpreter in the shebang
diff --git a/nixos/modules/system/boot/binfmt.nix b/nixos/modules/system/boot/binfmt.nix
index 1d702442f7f66..a0ef92c0505f4 100644
--- a/nixos/modules/system/boot/binfmt.nix
+++ b/nixos/modules/system/boot/binfmt.nix
@@ -141,6 +141,10 @@ let
       magicOrExtension = ''\x00asm'';
       mask = ''\xff\xff\xff\xff'';
     };
+    s390x-linux = {
+      magicOrExtension = ''\x7fELF\x02\x02\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02\x00\x16'';
+      mask = ''\xff\xff\xff\xff\xff\xff\xff\x00\xff\xff\xff\xff\xff\xff\xff\xff\xff\xfe\xff\xff'';
+    };
     x86_64-windows.magicOrExtension = "MZ";
     i686-windows.magicOrExtension = "MZ";
   };
diff --git a/nixos/modules/system/boot/initrd-ssh.nix b/nixos/modules/system/boot/initrd-ssh.nix
index cbeec4588f593..650ce593e945b 100644
--- a/nixos/modules/system/boot/initrd-ssh.nix
+++ b/nixos/modules/system/boot/initrd-ssh.nix
@@ -166,6 +166,10 @@ in
         UseDNS no
       ''}
 
+      ${optionalString (!config.boot.initrd.systemd.enable) ''
+        SshdSessionPath /bin/sshd-session
+      ''}
+
       ${cfg.extraConfig}
     '';
   in mkIf enabled {
@@ -191,6 +195,7 @@ in
 
     boot.initrd.extraUtilsCommands = mkIf (!config.boot.initrd.systemd.enable) ''
       copy_bin_and_libs ${package}/bin/sshd
+      copy_bin_and_libs ${package}/libexec/sshd-session
       cp -pv ${pkgs.glibc.out}/lib/libnss_files.so.* $out/lib
     '';
 
@@ -265,7 +270,10 @@ in
             config.boot.initrd.network.ssh.authorizedKeys ++
             (map (file: lib.fileContents file) config.boot.initrd.network.ssh.authorizedKeyFiles));
       };
-      storePaths = ["${package}/bin/sshd"];
+      storePaths = [
+        "${package}/bin/sshd"
+        "${package}/libexec/sshd-session"
+      ];
 
       services.sshd = {
         description = "SSH Daemon";
diff --git a/nixos/modules/system/boot/loader/generations-dir/generations-dir.nix b/nixos/modules/system/boot/loader/generations-dir/generations-dir.nix
index 630c6e1870e6e..397326899d8d6 100644
--- a/nixos/modules/system/boot/loader/generations-dir/generations-dir.nix
+++ b/nixos/modules/system/boot/loader/generations-dir/generations-dir.nix
@@ -42,7 +42,7 @@ in
         default = false;
         type = types.bool;
         description = ''
-          Whether copy the necessary boot files into /boot, so
+          Whether to copy the necessary boot files into /boot, so
           /nix/store is not needed by the boot loader.
         '';
       };
diff --git a/nixos/modules/system/boot/loader/systemd-boot/boot-counting.md b/nixos/modules/system/boot/loader/systemd-boot/boot-counting.md
new file mode 100644
index 0000000000000..743584b525915
--- /dev/null
+++ b/nixos/modules/system/boot/loader/systemd-boot/boot-counting.md
@@ -0,0 +1,38 @@
+# Automatic boot assessment with systemd-boot {#sec-automatic-boot-assessment}
+
+## Overview {#sec-automatic-boot-assessment-overview}
+
+Automatic boot assessment (or boot-counting) is a feature of `systemd-boot` that allows for automatically detecting invalid boot entries.
+When the feature is active, each boot entry has an associated counter with a user defined number of trials. Whenever `systemd-boot` boots an entry, its counter is decreased by one, ultimately being marked as *bad* if the counter ever reaches zero. However, if an entry is successfully booted, systemd will permanently mark it as *good* and remove the counter altogether. Whenever an entry is marked as *bad*, it is sorted last in the `systemd-boot` menu.
+A complete explanation of how that feature works can be found [here](https://systemd.io/AUTOMATIC_BOOT_ASSESSMENT/).
+
+## Enabling the feature {#sec-automatic-boot-assessment-enable}
+
+The feature can be enabled by toggling the [boot.loader.systemd-boot.bootCounting.enable](#opt-boot.loader.systemd-boot.bootCounting.enable) option.
+
+## The boot-complete.target unit {#sec-automatic-boot-assessment-boot-complete-target}
+
+A *successful boot* for an entry is defined in terms of the `boot-complete.target` synchronisation point. It is up to the user to schedule all necessary units for the machine to be considered successfully booted before that synchronisation point.
+For example, suppose you are running `docker` on a machine and you want an entry to be considered *good* only if docker has started successfully.
+A configuration for that NixOS machine could look like this:
+
+```
+boot.loader.systemd-boot.bootCounting.enable = true;
+services.docker.enable = true;
+
+systemd.services.docker = {
+  before = [ "boot-complete.target" ];
+  wantedBy = [ "boot-complete.target" ];
+  unitConfig.FailureAction = "reboot";
+};
+```
+
+The systemd service must be of type `notify` or `oneshot` for systemd to detect the startup error properly.
+
+## Interaction with specialisations {#sec-automatic-boot-assessment-specialisations}
+
+When the boot-counting feature is enabled, `systemd-boot` will still try the boot entries in the same order as they are displayed in the boot menu. This means that the specialisations of a given generation will be tried directly after that generation, but that behavior is customizable with the [boot.loader.systemd-boot.sortKey](#opt-boot.loader.systemd-boot.sortKey) option.
+
+## Limitations {#sec-automatic-boot-assessment-limitations}
+
+This feature must be used with care to avoid data integrity issues. Rolling back to past generations can sometimes be dangerous, for example if some of the services have undefined behavior in the presence of unrecognized data migrations made by future versions of themselves.
diff --git a/nixos/modules/system/boot/loader/systemd-boot/systemd-boot-builder.py b/nixos/modules/system/boot/loader/systemd-boot/systemd-boot-builder.py
index 694d34d1c059a..c4324a8eae5bc 100644
--- a/nixos/modules/system/boot/loader/systemd-boot/systemd-boot-builder.py
+++ b/nixos/modules/system/boot/loader/systemd-boot/systemd-boot-builder.py
@@ -12,8 +12,9 @@ import subprocess
 import sys
 import warnings
 import json
-from typing import NamedTuple, Dict, List
+from typing import NamedTuple, Any, Type
 from dataclasses import dataclass
+from pathlib import Path
 
 # These values will be replaced with actual values during the package build
 EFI_SYS_MOUNT_POINT = "@efiSysMountPoint@"
@@ -21,34 +22,145 @@ BOOT_MOUNT_POINT = "@bootMountPoint@"
 LOADER_CONF = f"{EFI_SYS_MOUNT_POINT}/loader/loader.conf"  # Always stored on the ESP
 NIXOS_DIR = "@nixosDir@"
 TIMEOUT = "@timeout@"
-EDITOR = "@editor@" == "1"
+EDITOR = "@editor@" == "1" # noqa: PLR0133
 CONSOLE_MODE = "@consoleMode@"
 BOOTSPEC_TOOLS = "@bootspecTools@"
 DISTRO_NAME = "@distroName@"
 NIX = "@nix@"
 SYSTEMD = "@systemd@"
 CONFIGURATION_LIMIT = int("@configurationLimit@")
+REBOOT_FOR_BITLOCKER = bool("@rebootForBitlocker@")
 CAN_TOUCH_EFI_VARIABLES = "@canTouchEfiVariables@"
 GRACEFUL = "@graceful@"
 COPY_EXTRA_FILES = "@copyExtraFiles@"
 CHECK_MOUNTPOINTS = "@checkMountpoints@"
+BOOT_COUNTING_TRIES = "@bootCountingTries@"
+BOOT_COUNTING = "@bootCounting@" == "True"
 
 @dataclass
 class BootSpec:
     init: str
     initrd: str
     kernel: str
-    kernelParams: List[str]
+    kernelParams: list[str]  # noqa: N815
     label: str
     system: str
     toplevel: str
-    specialisations: Dict[str, "BootSpec"]
-    sortKey: str
-    initrdSecrets: str | None = None
+    specialisations: dict[str, "BootSpec"]
+    sortKey: str  # noqa: N815
+    devicetree: str | None = None  # noqa: N815
+    initrdSecrets: str | None = None  # noqa: N815
 
+@dataclass
+class Entry:
+    profile: str | None
+    generation_number: int
+    specialisation: str | None
+
+    @classmethod
+    def from_path(cls: Type["Entry"], path: Path) -> "Entry":
+        filename = path.name
+        # Matching nixos-$profile-generation-*.conf
+        rex_profile = re.compile(r"^nixos-(.*)-generation-.*\.conf$")
+        # Matching nixos*-generation-$number*.conf
+        rex_generation = re.compile(r"^nixos.*-generation-([0-9]+).*\.conf$")
+        # Matching nixos*-generation-$number-specialisation-$specialisation_name*.conf
+        rex_specialisation = re.compile(r"^nixos.*-generation-([0-9]+)-specialisation-([a-zA-Z0-9_]+).*\.conf$")
+        profile = rex_profile.sub(r"\1", filename) if rex_profile.match(filename) else None
+        specialisation = rex_specialisation.sub(r"\2", filename) if rex_specialisation.match(filename) else None
+        try:
+            generation_number = int(rex_generation.sub(r"\1", filename))
+        except ValueError:
+            raise
+        return cls(profile, generation_number, specialisation)
+
+@dataclass
+class DiskEntry:
+    entry: Entry
+    default: bool
+    counters: str | None
+    title: str | None
+    description: str | None
+    kernel: str
+    initrd: str
+    kernel_params: str | None
+    machine_id: str | None
+    sort_key: str
+    devicetree: str | None
+
+    @classmethod
+    def from_path(cls: Type["DiskEntry"], path: Path) -> "DiskEntry":
+        entry = Entry.from_path(path)
+        data = path.read_text().splitlines()
+        if '' in data:
+            data.remove('')
+        entry_map = dict(lines.split(' ', 1) for lines in data)
+        assert "linux" in entry_map
+        assert "initrd" in entry_map
+        filename = path.name
+        # Matching nixos*-generation-*$counters.conf
+        rex_counters = re.compile(r"^nixos.*-generation-.*(\+\d(-\d)?)\.conf$")
+        counters = rex_counters.sub(r"\1", filename) if rex_counters.match(filename) else None
+        disk_entry = cls(
+            entry=entry,
+            default=(entry_map.get("sort-key") == "default"),
+            counters=counters,
+            title=entry_map.get("title"),
+            description=entry_map.get("version"),
+            kernel=entry_map["linux"],
+            initrd=entry_map["initrd"],
+            kernel_params=entry_map.get("options"),
+            machine_id=entry_map.get("machine-id"),
+            sort_key=entry_map.get("sort_key", "nixos"),
+            devicetree=entry_map.get("devicetree"),
+        )
+        return disk_entry
+
+    def write(self, sorted_first: str) -> None:
+        # Compute a sort-key sorted before sorted_first
+        # This will compute something like: nixos -> nixor-default to make sure we come before other nixos entries,
+        # while still allowing users to prepend their own entries before it.
+        default_sort_key = sorted_first[:-1] + chr(ord(sorted_first[-1])-1) + "-default"
+        tmp_path = self.path.with_suffix(".tmp")
+        with tmp_path.open('w') as f:
+            # We use "sort-key" to sort the default generation first.
+            # The "default" string is sorted before "non-default" (alphabetically)
+            boot_entry = [
+                f"title {self.title}" if self.title is not None else None,
+                f"version {self.description}" if self.description is not None else None,
+                f"linux {self.kernel}",
+                f"initrd  {self.initrd}",
+                f"options {self.kernel_params}" if self.kernel_params is not None else None,
+                f"machine-id {self.machine_id}" if self.machine_id is not None else None,
+                f"sort-key {default_sort_key if self.default else self.sort_key}",
+                f"devicetree {self.devicetree}" if self.devicetree is not None else None,
+            ]
+
+            f.write("\n".join(filter(None, boot_entry)))
+            f.flush()
+            os.fsync(f.fileno())
+        tmp_path.rename(self.path)
+
+
+    @property
+    def path(self) -> Path:
+        pieces = [
+            "nixos",
+            self.entry.profile or None,
+            "generation",
+            str(self.entry.generation_number),
+            f"specialisation-{self.entry.specialisation}" if self.entry.specialisation else None,
+        ]
+        prefix = "-".join(p for p in pieces if p)
+        return Path(f"{BOOT_MOUNT_POINT}/loader/entries/{prefix}{self.counters if self.counters else ''}.conf")
 
 libc = ctypes.CDLL("libc.so.6")
 
+FILE = None | int
+
+def run(cmd: list[str], stdout: FILE = None) -> subprocess.CompletedProcess[str]:
+    return subprocess.run(cmd, check=True, text=True, stdout=stdout)
+
 class SystemIdentifier(NamedTuple):
     profile: str | None
     generation: int
@@ -73,37 +185,35 @@ def system_dir(profile: str | None, generation: int, specialisation: str | None)
     else:
         return d
 
-BOOT_ENTRY = """title {title}
-sort-key {sort_key}
-version Generation {generation} {description}
-linux {kernel}
-initrd {initrd}
-options {kernel_params}
-"""
-
-def generation_conf_filename(profile: str | None, generation: int, specialisation: str | None) -> str:
-    pieces = [
-        "nixos",
-        profile or None,
-        "generation",
-        str(generation),
-        f"specialisation-{specialisation}" if specialisation else None,
-    ]
-    return "-".join(p for p in pieces if p) + ".conf"
-
-
-def write_loader_conf(profile: str | None, generation: int, specialisation: str | None) -> None:
-    with open(f"{LOADER_CONF}.tmp", 'w') as f:
-        if TIMEOUT != "":
-            f.write(f"timeout {TIMEOUT}\n")
-        f.write("default %s\n" % generation_conf_filename(profile, generation, specialisation))
+def write_loader_conf(profile: str | None) -> None:
+    with open(f"{EFI_SYS_MOUNT_POINT}/loader/loader.conf.tmp", 'w') as f:
+        f.write(f"timeout {TIMEOUT}\n")
+        if profile:
+            f.write("default nixos-%s-generation-*\n" % profile)
+        else:
+            f.write("default nixos-generation-*\n")
         if not EDITOR:
             f.write("editor 0\n")
+        if REBOOT_FOR_BITLOCKER:
+            f.write("reboot-for-bitlocker yes\n");
         f.write(f"console-mode {CONSOLE_MODE}\n")
         f.flush()
         os.fsync(f.fileno())
     os.rename(f"{LOADER_CONF}.tmp", LOADER_CONF)
 
+def scan_entries() -> list[DiskEntry]:
+    """
+    Scan all entries in $ESP/loader/entries/*
+    Does not support Type 2 entries as we do not support them for now.
+    Returns a list of DiskEntry.
+    """
+    entries = []
+    for path in Path(f"{EFI_SYS_MOUNT_POINT}/loader/entries/").glob("nixos*-generation-[1-9]*.conf"):
+        try:
+            entries.append(DiskEntry.from_path(path))
+        except ValueError:
+            continue
+    return entries
 
 def get_bootspec(profile: str | None, generation: int) -> BootSpec:
     system_directory = system_dir(profile, generation, None)
@@ -112,25 +222,30 @@ def get_bootspec(profile: str | None, generation: int) -> BootSpec:
         boot_json_f = open(boot_json_path, 'r')
         bootspec_json = json.load(boot_json_f)
     else:
-        boot_json_str = subprocess.check_output([
-        f"{BOOTSPEC_TOOLS}/bin/synthesize",
-        "--version",
-        "1",
-        system_directory,
-        "/dev/stdout"],
-        universal_newlines=True)
+        boot_json_str = run(
+            [
+                f"{BOOTSPEC_TOOLS}/bin/synthesize",
+                "--version",
+                "1",
+                system_directory,
+                "/dev/stdout",
+            ],
+            stdout=subprocess.PIPE,
+        ).stdout
         bootspec_json = json.loads(boot_json_str)
     return bootspec_from_json(bootspec_json)
 
-def bootspec_from_json(bootspec_json: Dict) -> BootSpec:
+def bootspec_from_json(bootspec_json: dict[str, Any]) -> BootSpec:
     specialisations = bootspec_json['org.nixos.specialisation.v1']
     specialisations = {k: bootspec_from_json(v) for k, v in specialisations.items()}
     systemdBootExtension = bootspec_json.get('org.nixos.systemd-boot', {})
     sortKey = systemdBootExtension.get('sortKey', 'nixos')
+    devicetree = systemdBootExtension.get('devicetree')
     return BootSpec(
         **bootspec_json['org.nixos.bootspec.v1'],
         specialisations=specialisations,
-        sortKey=sortKey
+        sortKey=sortKey,
+        devicetree=devicetree,
     )
 
 
@@ -143,12 +258,19 @@ def copy_from_file(file: str, dry_run: bool = False) -> str:
         copy_if_not_exists(store_file_path, f"{BOOT_MOUNT_POINT}{efi_file_path}")
     return efi_file_path
 
-def write_entry(profile: str | None, generation: int, specialisation: str | None,
-                machine_id: str, bootspec: BootSpec, current: bool) -> None:
+def write_entry(profile: str | None,
+                generation: int,
+                specialisation: str | None,
+                machine_id: str,
+                bootspec: BootSpec,
+                entries: list[DiskEntry],
+                sorted_first: str,
+                current: bool) -> None:
     if specialisation:
         bootspec = bootspec.specialisations[specialisation]
     kernel = copy_from_file(bootspec.kernel)
     initrd = copy_from_file(bootspec.initrd)
+    devicetree = copy_from_file(bootspec.devicetree) if bootspec.devicetree is not None else None
 
     title = "{name}{profile}{specialisation}".format(
         name=DISTRO_NAME,
@@ -157,7 +279,7 @@ def write_entry(profile: str | None, generation: int, specialisation: str | None
 
     try:
         if bootspec.initrdSecrets is not None:
-            subprocess.check_call([bootspec.initrdSecrets, f"{BOOT_MOUNT_POINT}%s" % (initrd)])
+            run([bootspec.initrdSecrets, f"{BOOT_MOUNT_POINT}%s" % (initrd)])
     except subprocess.CalledProcessError:
         if current:
             print("failed to create initrd secrets!", file=sys.stderr)
@@ -167,38 +289,46 @@ def write_entry(profile: str | None, generation: int, specialisation: str | None
                   f'for "{title} - Configuration {generation}", an older generation', file=sys.stderr)
             print("note: this is normal after having removed "
                   "or renamed a file in `boot.initrd.secrets`", file=sys.stderr)
-    entry_file = f"{BOOT_MOUNT_POINT}/loader/entries/%s" % (
-        generation_conf_filename(profile, generation, specialisation))
-    tmp_path = "%s.tmp" % (entry_file)
     kernel_params = "init=%s " % bootspec.init
-
     kernel_params = kernel_params + " ".join(bootspec.kernelParams)
     build_time = int(os.path.getctime(system_dir(profile, generation, specialisation)))
     build_date = datetime.datetime.fromtimestamp(build_time).strftime('%F')
-
-    with open(tmp_path, 'w') as f:
-        f.write(BOOT_ENTRY.format(title=title,
-                    sort_key=bootspec.sortKey,
-                    generation=generation,
-                    kernel=kernel,
-                    initrd=initrd,
-                    kernel_params=kernel_params,
-                    description=f"{bootspec.label}, built on {build_date}"))
-        if machine_id is not None:
-            f.write("machine-id %s\n" % machine_id)
-        f.flush()
-        os.fsync(f.fileno())
-    os.rename(tmp_path, entry_file)
-
+    counters = f"+{BOOT_COUNTING_TRIES}" if BOOT_COUNTING else ""
+    entry = Entry(profile, generation, specialisation)
+    # We check if the entry we are writing is already on disk
+    # and we update its "default entry" status
+    for entry_on_disk in entries:
+        if entry == entry_on_disk.entry:
+            entry_on_disk.default = current
+            entry_on_disk.write(sorted_first)
+            return
+
+    DiskEntry(
+        entry=entry,
+        title=title,
+        kernel=kernel,
+        initrd=initrd,
+        counters=counters,
+        kernel_params=kernel_params,
+        machine_id=machine_id,
+        description=f"Generation {generation} {bootspec.label}, built on {build_date}",
+        sort_key=bootspec.sortKey,
+        devicetree=devicetree,
+        default=current
+    ).write(sorted_first)
 
 def get_generations(profile: str | None = None) -> list[SystemIdentifier]:
-    gen_list = subprocess.check_output([
-        f"{NIX}/bin/nix-env",
-        "--list-generations",
-        "-p",
-        "/nix/var/nix/profiles/%s" % ("system-profiles/" + profile if profile else "system")],
-        universal_newlines=True)
-    gen_lines = gen_list.split('\n')
+    gen_list = run(
+        [
+            f"{NIX}/bin/nix-env",
+            "--list-generations",
+            "-p",
+            "/nix/var/nix/profiles/%s"
+            % ("system-profiles/" + profile if profile else "system"),
+        ],
+        stdout=subprocess.PIPE,
+    ).stdout
+    gen_lines = gen_list.split("\n")
     gen_lines.pop()
 
     configurationLimit = CONFIGURATION_LIMIT
@@ -213,30 +343,19 @@ def get_generations(profile: str | None = None) -> list[SystemIdentifier]:
     return configurations[-configurationLimit:]
 
 
-def remove_old_entries(gens: list[SystemIdentifier]) -> None:
-    rex_profile = re.compile(r"^" + re.escape(BOOT_MOUNT_POINT) + "/loader/entries/nixos-(.*)-generation-.*\.conf$")
-    rex_generation = re.compile(r"^" + re.escape(BOOT_MOUNT_POINT) + "/loader/entries/nixos.*-generation-([0-9]+)(-specialisation-.*)?\.conf$")
+def remove_old_entries(gens: list[SystemIdentifier], disk_entries: list[DiskEntry]) -> None:
     known_paths = []
     for gen in gens:
         bootspec = get_bootspec(gen.profile, gen.generation)
         known_paths.append(copy_from_file(bootspec.kernel, True))
         known_paths.append(copy_from_file(bootspec.initrd, True))
-    for path in glob.iglob(f"{BOOT_MOUNT_POINT}/loader/entries/nixos*-generation-[1-9]*.conf"):
-        if rex_profile.match(path):
-            prof = rex_profile.sub(r"\1", path)
-        else:
-            prof = None
-        try:
-            gen_number = int(rex_generation.sub(r"\1", path))
-        except ValueError:
-            continue
-        if not (prof, gen_number, None) in gens:
-            os.unlink(path)
-    for path in glob.iglob(f"{BOOT_MOUNT_POINT}/{NIXOS_DIR}/*"):
-        if not path in known_paths and not os.path.isdir(path):
+    for disk_entry in disk_entries:
+        if (disk_entry.entry.profile, disk_entry.entry.generation_number, None) not in gens:
+            os.unlink(disk_entry.path)
+    for path in glob.iglob(f"{EFI_SYS_MOUNT_POINT}/efi/nixos/*"):
+        if path not in known_paths and not os.path.isdir(path):
             os.unlink(path)
 
-
 def cleanup_esp() -> None:
     for path in glob.iglob(f"{EFI_SYS_MOUNT_POINT}/loader/entries/nixos*"):
         os.unlink(path)
@@ -255,7 +374,7 @@ def get_profiles() -> list[str]:
 def install_bootloader(args: argparse.Namespace) -> None:
     try:
         with open("/etc/machine-id") as machine_file:
-            machine_id = machine_file.readlines()[0]
+            machine_id = machine_file.readlines()[0].strip()
     except IOError as e:
         if e.errno != errno.ENOENT:
             raise
@@ -263,9 +382,7 @@ def install_bootloader(args: argparse.Namespace) -> None:
         # be there on newly installed systems, so let's generate one so that
         # bootctl can find it and we can also pass it to write_entry() later.
         cmd = [f"{SYSTEMD}/bin/systemd-machine-id-setup", "--print"]
-        machine_id = subprocess.run(
-          cmd, text=True, check=True, stdout=subprocess.PIPE
-        ).stdout.rstrip()
+        machine_id = run(cmd, stdout=subprocess.PIPE).stdout.rstrip()
 
     if os.getenv("NIXOS_INSTALL_GRUB") == "1":
         warnings.warn("NIXOS_INSTALL_GRUB env var deprecated, use NIXOS_INSTALL_BOOTLOADER", DeprecationWarning)
@@ -288,14 +405,32 @@ def install_bootloader(args: argparse.Namespace) -> None:
         if os.path.exists(LOADER_CONF):
             os.unlink(LOADER_CONF)
 
-        subprocess.check_call([f"{SYSTEMD}/bin/bootctl", f"--esp-path={EFI_SYS_MOUNT_POINT}"] + bootctl_flags + ["install"])
+        run(
+            [f"{SYSTEMD}/bin/bootctl", f"--esp-path={EFI_SYS_MOUNT_POINT}"]
+            + bootctl_flags
+            + ["install"]
+        )
     else:
         # Update bootloader to latest if needed
-        available_out = subprocess.check_output([f"{SYSTEMD}/bin/bootctl", "--version"], universal_newlines=True).split()[2]
-        installed_out = subprocess.check_output([f"{SYSTEMD}/bin/bootctl", f"--esp-path={EFI_SYS_MOUNT_POINT}", "status"], universal_newlines=True)
+        available_out = run(
+            [f"{SYSTEMD}/bin/bootctl", "--version"], stdout=subprocess.PIPE
+        ).stdout.split()[2]
+        installed_out = run(
+            [f"{SYSTEMD}/bin/bootctl", f"--esp-path={EFI_SYS_MOUNT_POINT}", "status"],
+            stdout=subprocess.PIPE,
+        ).stdout
 
         # See status_binaries() in systemd bootctl.c for code which generates this
-        installed_match = re.search(r"^\W+File:.*/EFI/(?:BOOT|systemd)/.*\.efi \(systemd-boot ([\d.]+[^)]*)\)$",
+        # Matches
+        # Available Boot Loaders on ESP:
+        #  ESP: /boot (/dev/disk/by-partuuid/9b39b4c4-c48b-4ebf-bfea-a56b2395b7e0)
+        # File: └─/EFI/systemd/systemd-bootx64.efi (systemd-boot 255.2)
+        # But also:
+        # Available Boot Loaders on ESP:
+        #  ESP: /boot (/dev/disk/by-partuuid/9b39b4c4-c48b-4ebf-bfea-a56b2395b7e0)
+        # File: ├─/EFI/systemd/HashTool.efi
+        #       └─/EFI/systemd/systemd-bootx64.efi (systemd-boot 255.2)
+        installed_match = re.search(r"^\W+.*/EFI/(?:BOOT|systemd)/.*\.efi \(systemd-boot ([\d.]+[^)]*)\)$",
                       installed_out, re.IGNORECASE | re.MULTILINE)
 
         available_match = re.search(r"^\((.*)\)$", available_out)
@@ -311,7 +446,11 @@ def install_bootloader(args: argparse.Namespace) -> None:
 
         if installed_version < available_version:
             print("updating systemd-boot from %s to %s" % (installed_version, available_version))
-            subprocess.check_call([f"{SYSTEMD}/bin/bootctl", f"--esp-path={EFI_SYS_MOUNT_POINT}"] + bootctl_flags + ["update"])
+            run(
+                [f"{SYSTEMD}/bin/bootctl", f"--esp-path={EFI_SYS_MOUNT_POINT}"]
+                + bootctl_flags
+                + ["update"]
+            )
 
     os.makedirs(f"{BOOT_MOUNT_POINT}/{NIXOS_DIR}", exist_ok=True)
     os.makedirs(f"{BOOT_MOUNT_POINT}/loader/entries", exist_ok=True)
@@ -319,18 +458,32 @@ def install_bootloader(args: argparse.Namespace) -> None:
     gens = get_generations()
     for profile in get_profiles():
         gens += get_generations(profile)
-
-    remove_old_entries(gens)
+    entries = scan_entries()
+    remove_old_entries(gens, entries)
+    # Compute the sort-key that will be sorted first.
+    sorted_first = ""
+    for gen in gens:
+        try:
+            bootspec = get_bootspec(gen.profile, gen.generation)
+            if bootspec.sortKey < sorted_first or sorted_first == "":
+                sorted_first = bootspec.sortKey
+        except OSError as e:
+            # See https://github.com/NixOS/nixpkgs/issues/114552
+            if e.errno == errno.EINVAL:
+                profile = f"profile '{gen.profile}'" if gen.profile else "default profile"
+                print("ignoring {} in the list of boot entries because of the following error:\n{}".format(profile, e), file=sys.stderr)
+            else:
+                raise e
 
     for gen in gens:
         try:
             bootspec = get_bootspec(gen.profile, gen.generation)
             is_default = os.path.dirname(bootspec.init) == args.default_config
-            write_entry(*gen, machine_id, bootspec, current=is_default)
+            write_entry(*gen, machine_id, bootspec, entries, sorted_first, current=is_default)
             for specialisation in bootspec.specialisations.keys():
-                write_entry(gen.profile, gen.generation, specialisation, machine_id, bootspec, current=is_default)
+                write_entry(gen.profile, gen.generation, specialisation, machine_id, bootspec, entries, sorted_first, current=(is_default and bootspec.specialisations[specialisation].sortKey == bootspec.sortKey))
             if is_default:
-                write_loader_conf(*gen)
+                write_loader_conf(gen.profile)
         except OSError as e:
             # See https://github.com/NixOS/nixpkgs/issues/114552
             if e.errno == errno.EINVAL:
@@ -362,7 +515,7 @@ def install_bootloader(args: argparse.Namespace) -> None:
 
     os.makedirs(f"{BOOT_MOUNT_POINT}/{NIXOS_DIR}/.extra-files", exist_ok=True)
 
-    subprocess.check_call(COPY_EXTRA_FILES)
+    run([COPY_EXTRA_FILES])
 
 
 def main() -> None:
@@ -370,7 +523,7 @@ def main() -> None:
     parser.add_argument('default_config', metavar='DEFAULT-CONFIG', help=f"The default {DISTRO_NAME} config to boot")
     args = parser.parse_args()
 
-    subprocess.check_call(CHECK_MOUNTPOINTS)
+    run([CHECK_MOUNTPOINTS])
 
     try:
         install_bootloader(args)
diff --git a/nixos/modules/system/boot/loader/systemd-boot/systemd-boot.nix b/nixos/modules/system/boot/loader/systemd-boot/systemd-boot.nix
index e73048dc2ecbe..bd4dbe96ff3a7 100644
--- a/nixos/modules/system/boot/loader/systemd-boot/systemd-boot.nix
+++ b/nixos/modules/system/boot/loader/systemd-boot/systemd-boot.nix
@@ -22,6 +22,8 @@ let
   '';
 
   systemdBootBuilder = pkgs.substituteAll rec {
+    name = "systemd-boot";
+
     src = checkedSource;
 
     isExecutable = true;
@@ -34,11 +36,11 @@ let
 
     nix = config.nix.package.out;
 
-    timeout = optionalString (config.boot.loader.timeout != null) config.boot.loader.timeout;
+    timeout = if config.boot.loader.timeout == null then "menu-force" else config.boot.loader.timeout;
 
     configurationLimit = if cfg.configurationLimit == null then 0 else cfg.configurationLimit;
 
-    inherit (cfg) consoleMode graceful editor;
+    inherit (cfg) consoleMode graceful editor rebootForBitlocker;
 
     inherit (efi) efiSysMountPoint canTouchEfiVariables;
 
@@ -78,6 +80,8 @@ let
         ${pkgs.coreutils}/bin/install -D $empty_file "${bootMountPoint}/${nixosDir}/.extra-files/loader/entries/"${escapeShellArg n}
       '') cfg.extraEntries)}
     '';
+    bootCountingTries = cfg.bootCounting.tries;
+    bootCounting = if cfg.bootCounting.enable then "True" else "False";
   };
 
   finalSystemdBootBuilder = pkgs.writeScript "install-systemd-boot.sh" ''
@@ -87,7 +91,10 @@ let
   '';
 in {
 
-  meta.maintainers = with lib.maintainers; [ julienmalka ];
+  meta = {
+    maintainers = with lib.maintainers; [ julienmalka ];
+    doc = ./boot-counting.md;
+  };
 
   imports =
     [ (mkRenamedOptionModule [ "boot" "loader" "gummiboot" "enable" ] [ "boot" "loader" "systemd-boot" "enable" ])
@@ -184,6 +191,15 @@ in {
       '';
     };
 
+    installDeviceTree = mkOption {
+      default = with config.hardware.deviceTree; enable && name != null;
+      defaultText = literalExpression ''with config.hardware.deviceTree; enable && name != null'';
+      description = ''
+        Install the devicetree blob specified by `config.hardware.deviceTree.name`
+        to the ESP and instruct systemd-boot to pass this DTB to linux.
+      '';
+    };
+
     extraInstallCommands = mkOption {
       default = "";
       example = ''
@@ -317,6 +333,31 @@ in {
       '';
     };
 
+    bootCounting = {
+      enable = mkEnableOption "automatic boot assessment";
+      tries = mkOption {
+        default = 3;
+        type = types.int;
+        description = "number of tries each entry should start with";
+      };
+    };
+
+    rebootForBitlocker = mkOption {
+      default = false;
+
+      type = types.bool;
+
+      description = ''
+        Enable *EXPERIMENTAL* BitLocker support.
+
+        Try to detect BitLocker encrypted drives along with an active
+        TPM. If both are found and Windows Boot Manager is selected in
+        the boot menu, set the "BootNext" EFI variable and restart the
+        system. The firmware will then start Windows Boot Manager
+        directly, leaving the TPM PCRs in expected states so that
+        Windows can unseal the encryption key.
+      '';
+    };
   };
 
   config = mkIf cfg.enable {
@@ -337,6 +378,10 @@ in {
         assertion = (config.boot.kernelPackages.kernel.features or { efiBootStub = true; }) ? efiBootStub;
         message = "This kernel does not support the EFI boot stub";
       }
+      {
+        assertion = cfg.installDeviceTree -> (config.hardware.deviceTree.enable && config.hardware.deviceTree.name != null); # `->` is right-associative, so both requirements are grouped explicitly
+        message = "Cannot install devicetree without 'config.hardware.deviceTree.enable' enabled and 'config.hardware.deviceTree.name' set";
+      }
     ] ++ concatMap (filename: [
       {
         assertion = !(hasInfix "/" filename);
@@ -394,6 +439,7 @@ in {
 
     boot.bootspec.extensions."org.nixos.systemd-boot" = {
       inherit (config.boot.loader.systemd-boot) sortKey;
+      devicetree = lib.mkIf cfg.installDeviceTree "${config.hardware.deviceTree.package}/${config.hardware.deviceTree.name}";
     };
 
     system = {
diff --git a/nixos/modules/system/boot/networkd.nix b/nixos/modules/system/boot/networkd.nix
index 761bbe6e03d4a..43fa93aadf1c2 100644
--- a/nixos/modules/system/boot/networkd.nix
+++ b/nixos/modules/system/boot/networkd.nix
@@ -18,12 +18,16 @@ let
           "ManageForeignRoutes"
           "RouteTable"
           "IPv6PrivacyExtensions"
+          "IPv4Forwarding"
+          "IPv6Forwarding"
         ])
         (assertValueOneOf "SpeedMeter" boolValues)
         (assertInt "SpeedMeterIntervalSec")
         (assertValueOneOf "ManageForeignRoutingPolicyRules" boolValues)
         (assertValueOneOf "ManageForeignRoutes" boolValues)
         (assertValueOneOf "IPv6PrivacyExtensions" (boolValues ++ ["prefer-public" "kernel"]))
+        (assertValueOneOf "IPv4Forwarding" boolValues)
+        (assertValueOneOf "IPv6Forwarding" boolValues)
       ];
 
       sectionDHCPv4 = checkUnitConfig "DHCPv4" [
@@ -119,10 +123,12 @@ let
           "VNetHeader"
           "User"
           "Group"
+          "KeepCarrier"
         ])
         (assertValueOneOf "MultiQueue" boolValues)
         (assertValueOneOf "PacketInfo" boolValues)
         (assertValueOneOf "VNetHeader" boolValues)
+        (assertValueOneOf "KeepCarrier" boolValues)
       ];
 
       # See https://www.freedesktop.org/software/systemd/man/latest/systemd.netdev.html#%5BIPVTAP%5D%20Section%20Options
@@ -632,6 +638,7 @@ let
           "LinkLocalAddressing"
           "IPv6LinkLocalAddressGenerationMode"
           "IPv6StableSecretAddress"
+          "IPv4LLStartAddress"
           "IPv4LLRoute"
           "DefaultRouteOnDevice"
           "LLMNR"
@@ -649,17 +656,23 @@ let
           "DNSDefaultRoute"
           "NTP"
           "IPForward"
+          "IPv4Forwarding"
+          "IPv6Forwarding"
           "IPMasquerade"
           "IPv6PrivacyExtensions"
           "IPv6AcceptRA"
           "IPv6DuplicateAddressDetection"
           "IPv6HopLimit"
+          "IPv4ReversePathFilter"
+          "IPv4AcceptLocal"
+          "IPv4RouteLocalnet"
           "IPv4ProxyARP"
           "IPv6ProxyNDP"
           "IPv6ProxyNDPAddress"
           "IPv6SendRA"
           "DHCPPrefixDelegation"
           "IPv6MTUBytes"
+          "KeepMaster"
           "Bridge"
           "Bond"
           "VRF"
@@ -693,7 +706,9 @@ let
         (assertValueOneOf "LLDP" (boolValues ++ ["routers-only"]))
         (assertValueOneOf "EmitLLDP" (boolValues ++ ["nearest-bridge" "non-tpmr-bridge" "customer-bridge"]))
         (assertValueOneOf "DNSDefaultRoute" boolValues)
-        (assertValueOneOf "IPForward" (boolValues ++ ["ipv4" "ipv6"]))
+        (assertRemoved "IPForward" "IPv4Forwarding and IPv6Forwarding in systemd.network(5) and networkd.conf(5)")
+        (assertValueOneOf "IPv4Forwarding" boolValues)
+        (assertValueOneOf "IPv6Forwarding" boolValues)
         (assertValueOneOf "IPMasquerade" (boolValues ++ ["ipv4" "ipv6" "both"]))
         (assertValueOneOf "IPv6PrivacyExtensions" (boolValues ++ ["prefer-public" "kernel"]))
         (assertValueOneOf "IPv6AcceptRA" boolValues)
@@ -701,11 +716,15 @@ let
         (assertMinimum "IPv6DuplicateAddressDetection" 0)
         (assertInt "IPv6HopLimit")
         (assertMinimum "IPv6HopLimit" 0)
+        (assertValueOneOf "IPv4ReversePathFilter" ["no" "strict" "loose"])
+        (assertValueOneOf "IPv4AcceptLocal" boolValues)
+        (assertValueOneOf "IPv4RouteLocalnet" boolValues)
         (assertValueOneOf "IPv4ProxyARP" boolValues)
         (assertValueOneOf "IPv6ProxyNDP" boolValues)
         (assertValueOneOf "IPv6SendRA" boolValues)
         (assertValueOneOf "DHCPPrefixDelegation" boolValues)
         (assertByteFormat "IPv6MTUBytes")
+        (assertValueOneOf "KeepMaster" boolValues)
         (assertValueOneOf "ActiveSlave" boolValues)
         (assertValueOneOf "PrimarySlave" boolValues)
         (assertValueOneOf "ConfigureWithoutCarrier" boolValues)
@@ -759,8 +778,7 @@ let
         ])
         (assertInt "TypeOfService")
         (assertRange "TypeOfService" 0 255)
-        (assertInt "FirewallMark")
-        (assertRange "FirewallMark" 1 4294967295)
+        (assertRangeWithOptionalMask "FirewallMark" 1 4294967295)
         (assertInt "Priority")
         (assertPortOrPortRange "SourcePort")
         (assertPortOrPortRange "DestinationPort")
@@ -999,6 +1017,7 @@ let
           "BootServerAddress"
           "BootServerName"
           "BootFilename"
+          "IPv6OnlyPreferredSec"
         ])
         (assertInt "PoolOffset")
         (assertMinimum "PoolOffset" 0)
@@ -2824,6 +2843,7 @@ let
         "systemd-networkd-wait-online.service"
         "systemd-networkd.service"
         "systemd-networkd.socket"
+        "systemd-networkd-persistent-storage.service"
       ];
 
       environment.etc."systemd/networkd.conf" = renderConfig cfg.config;
diff --git a/nixos/modules/system/boot/plymouth.nix b/nixos/modules/system/boot/plymouth.nix
index 4fed6335f7421..68c3286b22a06 100644
--- a/nixos/modules/system/boot/plymouth.nix
+++ b/nixos/modules/system/boot/plymouth.nix
@@ -219,7 +219,7 @@ in
         # Fonts
         "/etc/plymouth/fonts".source = pkgs.runCommand "plymouth-initrd-fonts" {} ''
           mkdir -p $out
-          cp ${cfg.font} $out
+          cp ${escapeShellArg cfg.font} $out
         '';
         "/etc/fonts/fonts.conf".text = ''
           <?xml version="1.0"?>
diff --git a/nixos/modules/system/boot/stage-1.nix b/nixos/modules/system/boot/stage-1.nix
index ae05bc5ae88c4..082380216d2a7 100644
--- a/nixos/modules/system/boot/stage-1.nix
+++ b/nixos/modules/system/boot/stage-1.nix
@@ -131,6 +131,7 @@ let
 
       # Copy udev.
       copy_bin_and_libs ${udev}/bin/udevadm
+      cp ${lib.getLib udev.kmod}/lib/libkmod.so* $out/lib
       copy_bin_and_libs ${udev}/lib/systemd/systemd-sysctl
       for BIN in ${udev}/lib/udev/*_id; do
         copy_bin_and_libs $BIN
diff --git a/nixos/modules/system/boot/systemd.nix b/nixos/modules/system/boot/systemd.nix
index 14a4ab596b52c..85e9b0a68b46a 100644
--- a/nixos/modules/system/boot/systemd.nix
+++ b/nixos/modules/system/boot/systemd.nix
@@ -37,6 +37,8 @@ let
       "cryptsetup.target"
       "cryptsetup-pre.target"
       "remote-cryptsetup.target"
+    ] ++ optionals cfg.package.withTpm2Tss [
+      "tpm2.target"
     ] ++ [
       "sigpwr.target"
       "timers.target"
@@ -105,6 +107,10 @@ let
       "systemd-rfkill.service"
       "systemd-rfkill.socket"
 
+      # Boot counting
+      "boot-complete.target"
+    ] ++ lib.optional config.boot.loader.systemd-boot.bootCounting.enable "systemd-bless-boot.service" ++ [
+
       # Hibernate / suspend.
       "hibernate.target"
       "suspend.target"
@@ -112,6 +118,7 @@ let
       "sleep.target"
       "hybrid-sleep.target"
       "systemd-hibernate.service"
+      "systemd-hibernate-clear.service"
       "systemd-hybrid-sleep.service"
       "systemd-suspend.service"
       "systemd-suspend-then-hibernate.service"
@@ -136,6 +143,16 @@ let
       "systemd-ask-password-wall.path"
       "systemd-ask-password-wall.service"
 
+      # Varlink APIs
+      "systemd-bootctl@.service"
+      "systemd-bootctl.socket"
+      "systemd-creds@.service"
+      "systemd-creds.socket"
+    ] ++ lib.optionals cfg.package.withTpm2Tss [
+      "systemd-pcrlock@.service"
+      "systemd-pcrlock.socket"
+    ] ++ [
+
       # Slices / containers.
       "slices.target"
     ] ++ optionals cfg.package.withImportd [
@@ -158,6 +175,7 @@ let
     ] ++ optionals cfg.package.withHostnamed [
       "dbus-org.freedesktop.hostname1.service"
       "systemd-hostnamed.service"
+      "systemd-hostnamed.socket"
     ] ++ optionals cfg.package.withPortabled [
       "dbus-org.freedesktop.portable1.service"
       "systemd-portabled.service"
@@ -323,14 +341,6 @@ in
       '';
     };
 
-    enableUnifiedCgroupHierarchy = mkOption {
-      default = true;
-      type = types.bool;
-      description = ''
-        Whether to enable the unified cgroup hierarchy (cgroupsv2); see {manpage}`cgroups(7)`.
-      '';
-    };
-
     extraConfig = mkOption {
       default = "";
       type = types.lines;
@@ -489,7 +499,7 @@ in
     system.nssModules = [ cfg.package.out ];
     system.nssDatabases = {
       hosts = (mkMerge [
-        (mkOrder 400 ["mymachines"]) # 400 to ensure it comes before resolve (which is mkBefore'd)
+        (mkOrder 400 ["mymachines"]) # 400 to ensure it comes before resolve (which is 501)
         (mkOrder 999 ["myhostname"]) # after files (which is 998), but before regular nss modules
       ]);
       passwd = (mkMerge [
@@ -676,12 +686,6 @@ in
     # https://github.com/systemd/systemd/pull/12226
     boot.kernel.sysctl."kernel.pid_max" = mkIf pkgs.stdenv.is64bit (lib.mkDefault 4194304);
 
-    boot.kernelParams = optional (!cfg.enableUnifiedCgroupHierarchy) "systemd.unified_cgroup_hierarchy=0";
-
-    # Avoid potentially degraded system state due to
-    # "Userspace Out-Of-Memory (OOM) Killer was skipped because of a failed condition check (ConditionControlGroupController=v2)."
-    systemd.oomd.enable = mkIf (!cfg.enableUnifiedCgroupHierarchy) false;
-
     services.logrotate.settings = {
       "/var/log/btmp" = mapAttrs (_: mkDefault) {
         frequency = "monthly";
@@ -705,5 +709,10 @@ in
       (mkRenamedOptionModule [ "boot" "systemd" "services" ] [ "systemd" "services" ])
       (mkRenamedOptionModule [ "jobs" ] [ "systemd" "services" ])
       (mkRemovedOptionModule [ "systemd" "generator-packages" ] "Use systemd.packages instead.")
+      (mkRemovedOptionModule ["systemd" "enableUnifiedCgroupHierarchy"] ''
+          In 256 support for cgroup v1 ('legacy' and 'hybrid' hierarchies) is now considered obsolete and systemd by default will refuse to boot under it.
+          To forcibly reenable cgroup v1 support, you can set boot.kernelParams = [ "systemd.unified_cgroup_hierarchy=0" "SYSTEMD_CGROUP_ENABLE_LEGACY_FORCE=1" ].
+          NixOS does not officially support this configuration and might cause your system to be unbootable in future versions. You are on your own.
+      '')
     ];
 }
diff --git a/nixos/modules/system/boot/systemd/initrd.nix b/nixos/modules/system/boot/systemd/initrd.nix
index 6107a2594baf8..0caea104b1b52 100644
--- a/nixos/modules/system/boot/systemd/initrd.nix
+++ b/nixos/modules/system/boot/systemd/initrd.nix
@@ -70,6 +70,7 @@ let
     "systemd-tmpfiles-setup-dev.service"
     "systemd-tmpfiles-setup.service"
     "timers.target"
+    "tpm2.target"
     "umount.target"
     "systemd-bsod.service"
   ] ++ cfg.additionalUpstreamUnits;
@@ -102,7 +103,7 @@ let
   initrdBinEnv = pkgs.buildEnv {
     name = "initrd-bin-env";
     paths = map getBin cfg.initrdBin;
-    pathsToLink = ["/bin" "/sbin"];
+    pathsToLink = ["/bin"];
     postBuild = concatStringsSep "\n" (mapAttrsToList (n: v: "ln -sf '${v}' $out/bin/'${n}'") cfg.extraBin);
   };
 
@@ -111,8 +112,7 @@ let
     inherit (config.boot.initrd) compressor compressorArgs prepend;
     inherit (cfg) strip;
 
-    contents = map (path: { object = path; symlink = ""; }) (subtractLists cfg.suppressedStorePaths cfg.storePaths)
-      ++ mapAttrsToList (_: v: { object = v.source; symlink = v.target; }) (filterAttrs (_: v: v.enable) cfg.contents);
+    contents = lib.filter ({ source, ... }: !lib.elem source cfg.suppressedStorePaths) cfg.storePaths;
   };
 
 in {
@@ -160,7 +160,7 @@ in {
       description = "Set of files that have to be linked into the initrd";
       example = literalExpression ''
         {
-          "/etc/hostname".text = "mymachine";
+          "/etc/machine-id".source = /etc/machine-id;
         }
       '';
       default = {};
@@ -171,7 +171,7 @@ in {
       description = ''
         Store paths to copy into the initrd as well.
       '';
-      type = with types; listOf (oneOf [ singleLineStr package ]);
+      type = utils.systemdUtils.types.initrdStorePath;
       default = [];
     };
 
@@ -344,7 +344,8 @@ in {
     };
 
     enableTpm2 = mkOption {
-      default = true;
+      default = cfg.package.withTpm2Tss;
+      defaultText = "boot.initrd.systemd.package.withTpm2Tss";
       type = types.bool;
       description = ''
         Whether to enable TPM2 support in the initrd.
@@ -407,7 +408,7 @@ in {
         fsck = "${cfg.package.util-linux}/bin/fsck";
       };
 
-      managerEnvironment.PATH = "/bin:/sbin";
+      managerEnvironment.PATH = "/bin";
 
       contents = {
         "/tmp/.keep".text = "systemd requires the /tmp mount point in the initrd cpio archive";
@@ -416,7 +417,7 @@ in {
 
         "/etc/systemd/system.conf".text = ''
           [Manager]
-          DefaultEnvironment=PATH=/bin:/sbin
+          DefaultEnvironment=PATH=/bin
           ${cfg.extraConfig}
           ManagerEnvironment=${lib.concatStringsSep " " (lib.mapAttrsToList (n: v: "${n}=${lib.escapeShellArg v}") cfg.managerEnvironment)}
         '';
@@ -431,9 +432,9 @@ in {
         "/etc/shadow".text = "root:${if isBool cfg.emergencyAccess then optionalString (!cfg.emergencyAccess) "*" else cfg.emergencyAccess}:::::::";
 
         "/bin".source = "${initrdBinEnv}/bin";
-        "/sbin".source = "${initrdBinEnv}/sbin";
+        "/sbin".source = "${initrdBinEnv}/bin";
 
-        "/etc/sysctl.d/nixos.conf".text = "kernel.modprobe = /sbin/modprobe";
+        "/etc/sysctl.d/nixos.conf".text = "kernel.modprobe = /bin/modprobe";
         "/etc/modprobe.d/systemd.conf".source = "${cfg.package}/lib/modprobe.d/systemd.conf";
         "/etc/modprobe.d/ubuntu.conf".source = pkgs.runCommand "initrd-kmod-blacklist-ubuntu" { } ''
           ${pkgs.buildPackages.perl}/bin/perl -0pe 's/## file: iwlwifi.conf(.+?)##/##/s;' $src > $out
@@ -443,6 +444,9 @@ in {
         "/etc/os-release".source = config.boot.initrd.osRelease;
         "/etc/initrd-release".source = config.boot.initrd.osRelease;
 
+        # For systemd-journald's _HOSTNAME field; needs to be set early, cannot be backfilled.
+        "/etc/hostname".text = config.networking.hostName;
+
       } // optionalAttrs (config.environment.etc ? "modprobe.d/nixos.conf") {
         "/etc/modprobe.d/nixos.conf".source = config.environment.etc."modprobe.d/nixos.conf".source;
       };
@@ -460,6 +464,7 @@ in {
         "${cfg.package}/lib/systemd/systemd-sulogin-shell"
         "${cfg.package}/lib/systemd/systemd-sysctl"
         "${cfg.package}/lib/systemd/systemd-bsod"
+        "${cfg.package}/lib/systemd/systemd-sysroot-fstab-check"
 
         # generators
         "${cfg.package}/lib/systemd/system-generators/systemd-debug-generator"
@@ -486,7 +491,8 @@ in {
         # fido2 support
         "${cfg.package}/lib/cryptsetup/libcryptsetup-token-systemd-fido2.so"
         "${pkgs.libfido2}/lib/libfido2.so.1"
-      ] ++ jobScripts;
+      ] ++ jobScripts
+      ++ map (c: builtins.removeAttrs c ["text"]) (builtins.attrValues cfg.contents);
 
       targets.initrd.aliases = ["default.target"];
       units =
diff --git a/nixos/modules/system/boot/systemd/journald.nix b/nixos/modules/system/boot/systemd/journald.nix
index f9f05d2b08f41..180a5cf6c396b 100644
--- a/nixos/modules/system/boot/systemd/journald.nix
+++ b/nixos/modules/system/boot/systemd/journald.nix
@@ -72,7 +72,7 @@ in {
       type = types.lines;
       example = "Storage=volatile";
       description = ''
-        Extra config options for systemd-journald. See man journald.conf
+        Extra config options for systemd-journald. See {manpage}`journald.conf(5)`
         for available options.
       '';
     };
@@ -96,6 +96,7 @@ in {
       "systemd-journald@.service"
       "systemd-journal-flush.service"
       "systemd-journal-catalog-update.service"
+      "systemd-journald-sync@.service"
       ] ++ (optional (!config.boot.isContainer) "systemd-journald-audit.socket") ++ [
       "systemd-journald-dev-log.socket"
       "syslog.socket"
diff --git a/nixos/modules/system/boot/systemd/nspawn.nix b/nixos/modules/system/boot/systemd/nspawn.nix
index 11fbb88838e10..e9bf82c462a95 100644
--- a/nixos/modules/system/boot/systemd/nspawn.nix
+++ b/nixos/modules/system/boot/systemd/nspawn.nix
@@ -127,6 +127,9 @@ in {
         })
         {
           systemd.targets.multi-user.wants = [ "machines.target" ];
+          systemd.services."systemd-nspawn@".environment = {
+            SYSTEMD_NSPAWN_UNIFIED_HIERARCHY = mkDefault "1";
+          };
         }
       ];
 }
diff --git a/nixos/modules/system/boot/systemd/shutdown.nix b/nixos/modules/system/boot/systemd/shutdown.nix
index 5c2525a57b4be..48477954e20c7 100644
--- a/nixos/modules/system/boot/systemd/shutdown.nix
+++ b/nixos/modules/system/boot/systemd/shutdown.nix
@@ -2,10 +2,7 @@
 
   cfg = config.systemd.shutdownRamfs;
 
-  ramfsContents = let
-    storePaths = map (p: "${p}\n") cfg.storePaths;
-    contents = lib.mapAttrsToList (_: v: "${v.source}\n${v.target}") (lib.filterAttrs (_: v: v.enable) cfg.contents);
-  in pkgs.writeText "shutdown-ramfs-contents" (lib.concatStringsSep "\n" (storePaths ++ contents));
+  ramfsContents = pkgs.writeText "shutdown-ramfs-contents.json" (builtins.toJSON cfg.storePaths);
 
 in {
   options.systemd.shutdownRamfs = {
@@ -24,7 +21,7 @@ in {
       description = ''
         Store paths to copy into the shutdown ramfs as well.
       '';
-      type = lib.types.listOf lib.types.singleLineStr;
+      type = utils.systemdUtils.types.initrdStorePath;
       default = [];
     };
   };
@@ -35,7 +32,8 @@ in {
       "/etc/initrd-release".source = config.environment.etc.os-release.source;
       "/etc/os-release".source = config.environment.etc.os-release.source;
     };
-    systemd.shutdownRamfs.storePaths = [pkgs.runtimeShell "${pkgs.coreutils}/bin"];
+    systemd.shutdownRamfs.storePaths = [pkgs.runtimeShell "${pkgs.coreutils}/bin"]
+      ++ map (c: builtins.removeAttrs c ["text"]) (builtins.attrValues cfg.contents);
 
     systemd.mounts = [{
       what = "tmpfs";
diff --git a/nixos/modules/system/boot/systemd/sysusers.nix b/nixos/modules/system/boot/systemd/sysusers.nix
index 476251e140456..8d401436daa17 100644
--- a/nixos/modules/system/boot/systemd/sysusers.nix
+++ b/nixos/modules/system/boot/systemd/sysusers.nix
@@ -5,6 +5,8 @@ let
   cfg = config.systemd.sysusers;
   userCfg = config.users;
 
+  systemUsers = lib.filterAttrs (_username: opts: !opts.isNormalUser) userCfg.users;
+
   sysusersConfig = pkgs.writeTextDir "00-nixos.conf" ''
     # Type Name ID GECOS Home directory Shell
 
@@ -16,7 +18,7 @@ let
         in
           ''u ${username} ${uid}:${opts.group} "${opts.description}" ${opts.home} ${utils.toShellPath opts.shell}''
       )
-      userCfg.users)
+      systemUsers)
     }
 
     # Groups
@@ -30,32 +32,12 @@ let
     }
   '';
 
-  staticSysusersCredentials = pkgs.runCommand "static-sysusers-credentials" { } ''
-    mkdir $out; cd $out
-    ${lib.concatLines (
-      (lib.mapAttrsToList
-        (username: opts: "echo -n '${opts.initialHashedPassword}' > 'passwd.hashed-password.${username}'")
-        (lib.filterAttrs (_username: opts: opts.initialHashedPassword != null) userCfg.users))
-        ++
-      (lib.mapAttrsToList
-        (username: opts: "echo -n '${opts.initialPassword}' > 'passwd.plaintext-password.${username}'")
-        (lib.filterAttrs (_username: opts: opts.initialPassword != null) userCfg.users))
-        ++
-      (lib.mapAttrsToList
-        (username: opts: "cat '${opts.hashedPasswordFile}' > 'passwd.hashed-password.${username}'")
-        (lib.filterAttrs (_username: opts: opts.hashedPasswordFile != null) userCfg.users))
-      )
-    }
-  '';
-
-  staticSysusers = pkgs.runCommand "static-sysusers"
-    {
-      nativeBuildInputs = [ pkgs.systemd ];
-    } ''
-    mkdir $out
-    export CREDENTIALS_DIRECTORY=${staticSysusersCredentials}
-    systemd-sysusers --root $out ${sysusersConfig}/00-nixos.conf
-  '';
+  immutableEtc = config.system.etc.overlay.enable && !config.system.etc.overlay.mutable;
+  # The location of the password files when using an immutable /etc.
+  immutablePasswordFilesLocation = "/var/lib/nixos/etc";
+  passwordFilesLocation = if immutableEtc then immutablePasswordFilesLocation else "/etc";
+  # The filenames created by systemd-sysusers.
+  passwordFiles = [ "passwd" "group" "shadow" "gshadow" ];
 
 in
 
@@ -90,95 +72,114 @@ in
         assertion = config.users.mutableUsers -> config.system.etc.overlay.enable;
         message = "config.users.mutableUsers requires config.system.etc.overlay.enable.";
       }
-    ];
-
-    systemd = lib.mkMerge [
-      ({
-
-        # Create home directories, do not create /var/empty even if that's a user's
-        # home.
-        tmpfiles.settings.home-directories = lib.mapAttrs'
-          (username: opts: lib.nameValuePair opts.home {
-            d = {
-              mode = opts.homeMode;
-              user = username;
-              group = opts.group;
-            };
-          })
-          (lib.filterAttrs (_username: opts: opts.home != "/var/empty") userCfg.users);
-
-        # Create uid/gid marker files for those without an explicit id
-        tmpfiles.settings.nixos-uid = lib.mapAttrs'
-          (username: opts: lib.nameValuePair "/var/lib/nixos/uid/${username}" {
-            f = {
-              user = username;
-            };
-          })
-          (lib.filterAttrs (_username: opts: opts.uid == null) userCfg.users);
-
-        tmpfiles.settings.nixos-gid = lib.mapAttrs'
-          (groupname: opts: lib.nameValuePair "/var/lib/nixos/gid/${groupname}" {
-            f = {
-              group = groupname;
-            };
-          })
-          (lib.filterAttrs (_groupname: opts: opts.gid == null) userCfg.groups);
+    ] ++ (lib.mapAttrsToList
+      (_username: opts: {
+        assertion = !opts.isNormalUser;
+        message = "systemd-sysusers doesn't create normal users. You can currently only use it to create system users.";
       })
+      userCfg.users)
+    ++ lib.mapAttrsToList
+      (username: opts: {
+        assertion = (opts.password == opts.initialPassword || opts.password == null) &&
+          (opts.hashedPassword == opts.initialHashedPassword || opts.hashedPassword == null);
+        message = "${username} uses password or hashedPassword. systemd-sysusers only supports initial passwords. It'll never update your passwords.";
+      })
+      systemUsers;
+
+    systemd = {
+
+      # Create home directories, do not create /var/empty even if that's a user's
+      # home.
+      tmpfiles.settings.home-directories = lib.mapAttrs'
+        (username: opts: lib.nameValuePair opts.home {
+          d = {
+            mode = opts.homeMode;
+            user = username;
+            group = opts.group;
+          };
+        })
+        (lib.filterAttrs (_username: opts: opts.home != "/var/empty") systemUsers);
+
+      # Create uid/gid marker files for those without an explicit id
+      tmpfiles.settings.nixos-uid = lib.mapAttrs'
+        (username: opts: lib.nameValuePair "/var/lib/nixos/uid/${username}" {
+          f = {
+            user = username;
+          };
+        })
+        (lib.filterAttrs (_username: opts: opts.uid == null) systemUsers);
 
-      (lib.mkIf config.users.mutableUsers {
-        additionalUpstreamSystemUnits = [
-          "systemd-sysusers.service"
-        ];
-
-        services.systemd-sysusers = {
-          # Enable switch-to-configuration to restart the service.
-          unitConfig.ConditionNeedsUpdate = [ "" ];
-          requiredBy = [ "sysinit-reactivation.target" ];
-          before = [ "sysinit-reactivation.target" ];
-          restartTriggers = [ "${config.environment.etc."sysusers.d".source}" ];
-
-          serviceConfig = {
-            LoadCredential = lib.mapAttrsToList
-              (username: opts: "passwd.hashed-password.${username}:${opts.hashedPasswordFile}")
-              (lib.filterAttrs (_username: opts: opts.hashedPasswordFile != null) userCfg.users);
-            SetCredential = (lib.mapAttrsToList
-              (username: opts: "passwd.hashed-password.${username}:${opts.initialHashedPassword}")
-              (lib.filterAttrs (_username: opts: opts.initialHashedPassword != null) userCfg.users))
-            ++
-            (lib.mapAttrsToList
-              (username: opts: "passwd.plaintext-password.${username}:${opts.initialPassword}")
-              (lib.filterAttrs (_username: opts: opts.initialPassword != null) userCfg.users))
-            ;
+      tmpfiles.settings.nixos-gid = lib.mapAttrs'
+        (groupname: opts: lib.nameValuePair "/var/lib/nixos/gid/${groupname}" {
+          f = {
+            group = groupname;
           };
+        })
+        (lib.filterAttrs (_groupname: opts: opts.gid == null) userCfg.groups);
+
+      additionalUpstreamSystemUnits = [
+        "systemd-sysusers.service"
+      ];
+
+      services.systemd-sysusers = {
+        # Enable switch-to-configuration to restart the service.
+        unitConfig.ConditionNeedsUpdate = [ "" ];
+        requiredBy = [ "sysinit-reactivation.target" ];
+        before = [ "sysinit-reactivation.target" ];
+        restartTriggers = [ "${config.environment.etc."sysusers.d".source}" ];
+
+        serviceConfig = {
+          # When we have an immutable /etc we cannot write the files directly
+          # to /etc so we write it to a different directory and symlink them
+          # into /etc.
+          #
+          # We need to explicitly list the config file, otherwise
+          # systemd-sysusers cannot find it when we also pass another flag.
+          ExecStart = lib.mkIf immutableEtc
+            [ "" "${config.systemd.package}/bin/systemd-sysusers --root ${builtins.dirOf immutablePasswordFilesLocation} /etc/sysusers.d/00-nixos.conf" ];
+
+          # Make the source files writable before executing sysusers.
+          ExecStartPre = lib.mkIf (!userCfg.mutableUsers)
+            (lib.map
+              (file: "-${pkgs.util-linux}/bin/umount ${passwordFilesLocation}/${file}")
+              passwordFiles);
+          # Make the source files read-only after sysusers has finished.
+          ExecStartPost = lib.mkIf (!userCfg.mutableUsers)
+            (lib.map
+              (file: "${pkgs.util-linux}/bin/mount --bind -o ro ${passwordFilesLocation}/${file} ${passwordFilesLocation}/${file}")
+              passwordFiles);
+
+          LoadCredential = lib.mapAttrsToList
+            (username: opts: "passwd.hashed-password.${username}:${opts.hashedPasswordFile}")
+            (lib.filterAttrs (_username: opts: opts.hashedPasswordFile != null) systemUsers);
+          SetCredential = (lib.mapAttrsToList
+            (username: opts: "passwd.hashed-password.${username}:${opts.initialHashedPassword}")
+            (lib.filterAttrs (_username: opts: opts.initialHashedPassword != null) systemUsers))
+          ++
+          (lib.mapAttrsToList
+            (username: opts: "passwd.plaintext-password.${username}:${opts.initialPassword}")
+            (lib.filterAttrs (_username: opts: opts.initialPassword != null) systemUsers))
+          ;
         };
-      })
-    ];
+      };
 
-    environment.etc = lib.mkMerge [
-      (lib.mkIf (!userCfg.mutableUsers) {
-        "passwd" = {
-          source = "${staticSysusers}/etc/passwd";
-          mode = "0644";
-        };
-        "group" = {
-          source = "${staticSysusers}/etc/group";
-          mode = "0644";
-        };
-        "shadow" = {
-          source = "${staticSysusers}/etc/shadow";
-          mode = "0000";
-        };
-        "gshadow" = {
-          source = "${staticSysusers}/etc/gshadow";
-          mode = "0000";
-        };
-      })
+    };
 
-      (lib.mkIf userCfg.mutableUsers {
+    environment.etc = lib.mkMerge [
+      ({
         "sysusers.d".source = sysusersConfig;
       })
-    ];
 
+      # Statically create the symlinks to immutablePasswordFilesLocation when
+      # using an immutable /etc because we will not be able to do it at
+      # runtime!
+      (lib.mkIf immutableEtc (lib.listToAttrs (lib.map
+        (file: lib.nameValuePair file {
+          source = "${immutablePasswordFilesLocation}/${file}";
+          mode = "direct-symlink";
+        })
+        passwordFiles)))
+    ];
   };
 
   meta.maintainers = with lib.maintainers; [ nikstur ];
diff --git a/nixos/modules/system/boot/systemd/tmpfiles.nix b/nixos/modules/system/boot/systemd/tmpfiles.nix
index ded13728017d1..af37fb07d29bc 100644
--- a/nixos/modules/system/boot/systemd/tmpfiles.nix
+++ b/nixos/modules/system/boot/systemd/tmpfiles.nix
@@ -200,6 +200,10 @@ in
           rm -f $out/${removePrefix "tmpfiles.d/" name}
         '') config.system.build.etc.passthru.targets;
       }) + "/*";
+      "mtab" = {
+        mode = "direct-symlink";
+        source = "/proc/mounts";
+      };
     };
 
     systemd.tmpfiles.packages = [
@@ -244,13 +248,11 @@ in
       "L+ /nix/var/nix/gcroots/booted-system 0755 root root - /run/booted-system"
       "d  /run/lock                          0755 root root - -"
       "d  /var/db                            0755 root root - -"
-      "L  /etc/mtab                          -    -    -    - ../proc/mounts"
       "L  /var/lock                          -    -    -    - ../run/lock"
       # Boot-time cleanup
       "R! /etc/group.lock                    -    -    -    - -"
       "R! /etc/passwd.lock                   -    -    -    - -"
       "R! /etc/shadow.lock                   -    -    -    - -"
-      "R! /etc/mtab*                         -    -    -    - -"
       "R! /nix/var/nix/gcroots/tmp           -    -    -    - -"
       "R! /nix/var/nix/temproots             -    -    -    - -"
     ];
diff --git a/nixos/modules/system/etc/etc.nix b/nixos/modules/system/etc/etc.nix
index 0411faee6ebb8..69f4ab92548f0 100644
--- a/nixos/modules/system/etc/etc.nix
+++ b/nixos/modules/system/etc/etc.nix
@@ -64,14 +64,6 @@ let
 
   etcHardlinks = filter (f: f.mode != "symlink" && f.mode != "direct-symlink") etc';
 
-  build-composefs-dump = pkgs.buildPackages.runCommand "build-composefs-dump.py"
-    {
-      buildInputs = [ pkgs.buildPackages.python3 ];
-    } ''
-    install ${./build-composefs-dump.py} $out
-    patchShebangs --host $out
-  '';
-
 in
 
 {
@@ -255,6 +247,30 @@ in
           --options lowerdir=$tmpMetadataMount::${config.system.build.etcBasedir},${etcOverlayOptions} \
           $tmpEtcMount
 
+        # Before moving the new /etc overlay under the old /etc, we have to
+        # bind-mount everything mounted on top of /etc into the new /etc mount.
+        findmnt /etc --submounts --list --noheading --kernel --output TARGET | while read -r mountPoint; do
+          if [[ "$mountPoint" = "/etc" ]]; then
+            continue
+          fi
+
+          tmpMountPoint="$tmpEtcMount/''${mountPoint:5}"
+            ${if config.system.etc.overlay.mutable then ''
+              if [[ -f "$mountPoint" ]]; then
+                touch "$tmpMountPoint"
+              elif [[ -d "$mountPoint" ]]; then
+                mkdir -p "$tmpMountPoint"
+              fi
+            '' else ''
+              if [[ ! -e "$tmpMountPoint" ]]; then
+                echo "Skipping undeclared mountpoint in environment.etc: $mountPoint"
+                continue
+              fi
+            ''
+          }
+          mount --bind "$mountPoint" "$tmpMountPoint"
+        done
+
         # Move the new temporary /etc mount underneath the current /etc mount.
         #
         # This should eventually use util-linux to perform this move beneath,
@@ -263,8 +279,7 @@ in
         ${pkgs.move-mount-beneath}/bin/move-mount --move --beneath $tmpEtcMount /etc
 
         # Unmount the top /etc mount to atomically reveal the new mount.
-        umount /etc
-
+        umount --recursive /etc
       fi
     '' else ''
       # Set up the statically computed bits of /etc.
@@ -295,10 +310,12 @@ in
     system.build.etcMetadataImage =
       let
         etcJson = pkgs.writeText "etc-json" (builtins.toJSON etc');
-        etcDump = pkgs.runCommand "etc-dump" { } "${build-composefs-dump} ${etcJson} > $out";
+        etcDump = pkgs.runCommand "etc-dump" { } ''
+          ${lib.getExe pkgs.buildPackages.python3} ${./build-composefs-dump.py} ${etcJson} > $out
+        '';
       in
       pkgs.runCommand "etc-metadata.erofs" {
-        nativeBuildInputs = [ pkgs.composefs pkgs.erofs-utils ];
+        nativeBuildInputs = with pkgs.buildPackages; [ composefs erofs-utils ];
       } ''
         mkcomposefs --from-file ${etcDump} $out
         fsck.erofs $out