From 15008e69542774c441e388ad4c2e28a2d27f9ba0 Mon Sep 17 00:00:00 2001 From: aszlig Date: Sat, 9 Feb 2019 14:16:51 +0100 Subject: machines/dnyarri: Stop bcache during sleep/scrub I did have a major outage this week, because I was using bcache with writeback mode on a RAID10 backing storage. Fortunately, I was able to recover 99.9% of the data (only the most recent stuff wasn't recoverable), but I certainly don't want this to happen again in the future. While I did use bcache with hibernate and writeback, the interesting part is that the caching device went bonkers after a "normal" shutdown rather than a suspend/hibernate, with "normal" being "with a bunch of kernel warnings about zswap". Also, this happened around a btrfs scrub, so the inconsistency was all over the place. So first of all, I'm now going with writearound mode rather than writeback mode for the time being. Although it's slower than writeback, the chance that I need to do such a recovery again is close to 0% with writethrough and writearound because all writes are synchronous. Second, this very change makes sure that whenever the machine goes to sleep or a scrub is started, the caching is disabled and afterwards it's re-enabled. That way we shouldn't have lots of trash on the caching device. Signed-off-by: aszlig --- machines/aszlig/dnyarri.nix | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) (limited to 'machines/aszlig/dnyarri.nix') diff --git a/machines/aszlig/dnyarri.nix b/machines/aszlig/dnyarri.nix index e060aac6..7e42f607 100644 --- a/machines/aszlig/dnyarri.nix +++ b/machines/aszlig/dnyarri.nix @@ -1,4 +1,4 @@ -{ pkgs, lib, ... }: +{ config, pkgs, utils, lib, ... 
}: let mkDevice = category: num: uuid: { @@ -19,6 +19,20 @@ let ]; }; + bcacheMode = "writearound"; + + bcacheStart = '' + for i in /sys/block/bcache[0-9]*/bcache/cache_mode; do + echo ${lib.escapeShellArg bcacheMode} > "$i" + done + ''; + + bcacheStop = '' + for i in /sys/block/bcache[0-9]*/bcache/cache_mode; do + echo none > "$i" + done + ''; + in { vuizvui.user.aszlig.profiles.workstation.enable = true; @@ -78,10 +92,24 @@ in { powerManagement.powerUpCommands = '' ${pkgs.hdparm}/sbin/hdparm -B 255 /dev/disk/by-id/ata-ST31500541AS_5XW0AMNH ${pkgs.hdparm}/sbin/hdparm -B 255 /dev/disk/by-id/ata-ST31500541AS_6XW0M217 + ${bcacheStart} ''; + powerManagement.powerDownCommands = bcacheStop; + services.btrfs.autoScrub.enable = true; + # Inject preStart/postStart for activating/deactivating bcache to the scrub + # services, so we don't get large amounts of nonsense on the caching device. + systemd.services = let + scrubServiceUnits = let + mkName = fs: "btrfs-scrub-${utils.escapeSystemdPath fs}.service"; + in map mkName config.services.btrfs.autoScrub.fileSystems; + in lib.genAttrs scrubServiceUnits (lib.const { + preStart = bcacheStop; + postStart = bcacheStart; + }); + swapDevices = map ({ name, ... }: { device = "/dev/mapper/${name}"; }) cryptDevices.swap; -- cgit 1.4.1