about summary refs log tree commit diff
path: root/machines/aszlig
diff options
context:
space:
mode:
author aszlig <aszlig@nix.build> 2019-02-09 14:16:51 +0100
committer aszlig <aszlig@nix.build> 2019-02-09 14:16:51 +0100
commit 15008e69542774c441e388ad4c2e28a2d27f9ba0 (patch)
tree f766a7ce05e3602ac01894f287d860439ef712ea /machines/aszlig
parent 807edf7acd83ccc2f39954174c7a0b4a82d9ed2e (diff)
machines/dnyarri: Stop bcache during sleep/scrub
I did have a major outage this week, because I was using bcache with
writeback mode on a RAID10 backing storage. Fortunately, I was able to
recover 99.9% of the data (only the most recent stuff wasn't
recoverable), but I certainly don't want this to happen again in the
future.

While I did use bcache with hibernate and writeback, the interesting
part is that the caching device went bonkers after a "normal" shutdown
rather than a suspend/hibernate, with "normal" being "with a bunch of
kernel warnings about zswap". Also, this happened around a btrfs scrub,
so the inconsistency was all over the place.

So first of all, I'm now going with writearound mode rather than
writeback mode for the time being. Although it's slower than writeback,
the chance that I need to do such a recovery again is close to 0% with
writethrough and writearound because all writes are synchronous.

Second, this very change makes sure that whenever the machine goes to
sleep or a scrub is started, the caching is disabled and afterwards it's
re-enabled. That way we shouldn't have lots of trash on the caching
device.

Signed-off-by: aszlig <aszlig@nix.build>
Diffstat (limited to 'machines/aszlig')
-rw-r--r-- machines/aszlig/dnyarri.nix 30
1 files changed, 29 insertions, 1 deletions
diff --git a/machines/aszlig/dnyarri.nix b/machines/aszlig/dnyarri.nix
index e060aac6..7e42f607 100644
--- a/machines/aszlig/dnyarri.nix
+++ b/machines/aszlig/dnyarri.nix
@@ -1,4 +1,4 @@
-{ pkgs, lib, ... }:
+{ config, pkgs, utils, lib, ... }:
 
 let
   mkDevice = category: num: uuid: {
@@ -19,6 +19,20 @@ let
     ];
   };
 
+  bcacheMode = "writearound";  # cache mode that bcacheStart writes back
+  # Shell snippet: set cache_mode of every /sys/block/bcache* device to bcacheMode.
+  bcacheStart = ''
+    for i in /sys/block/bcache[0-9]*/bcache/cache_mode; do
+      echo ${lib.escapeShellArg bcacheMode} > "$i"
+    done
+  '';
+  # Shell snippet: set cache_mode of every /sys/block/bcache* device to "none".
+  bcacheStop = ''
+    for i in /sys/block/bcache[0-9]*/bcache/cache_mode; do
+      echo none > "$i"
+    done
+  '';
+
 in {
   vuizvui.user.aszlig.profiles.workstation.enable = true;
 
@@ -78,10 +92,24 @@ in {
   powerManagement.powerUpCommands = ''
     ${pkgs.hdparm}/sbin/hdparm -B 255 /dev/disk/by-id/ata-ST31500541AS_5XW0AMNH
     ${pkgs.hdparm}/sbin/hdparm -B 255 /dev/disk/by-id/ata-ST31500541AS_6XW0M217
+    ${bcacheStart}
   '';
 
+  powerManagement.powerDownCommands = bcacheStop;  # disable caching before sleep
+
   services.btrfs.autoScrub.enable = true;
 
+  # Stop bcache before and re-enable it after each btrfs scrub service, so the
+  # caching device isn't filled with nonsense. Keys omit ".service" (NixOS adds it).
+  systemd.services = let
+    scrubServiceUnits = let
+      mkName = fs: "btrfs-scrub-${utils.escapeSystemdPath fs}";
+    in map mkName config.services.btrfs.autoScrub.fileSystems;
+  in lib.genAttrs scrubServiceUnits (lib.const {
+    preStart = bcacheStop;
+    postStart = bcacheStart;
+  });
+
   swapDevices = map ({ name, ... }: {
     device = "/dev/mapper/${name}";
   }) cryptDevices.swap;