Skip to content

Commit a080e65

Browse files
committed
Added verification of snapshot
1 parent 9797570 commit a080e65

File tree

1 file changed

+91
-6
lines changed

1 file changed

+91
-6
lines changed

atomic-update.py

Lines changed: 91 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -84,21 +84,78 @@ def get_snaps(snapper_root_config):
8484
default_snap = item["number"] if item["default"] else default_snap
8585
return active_snap, default_snap
8686

87-
# Function to get latest atomic snapshot
88-
def get_atomic_snap(snapper_root_config):
87+
# Function to get latest atomic snapshot with the given status
# valid status: created, pending, finished
def get_atomic_snap(snapper_root_config, status):
    """Return the number of the last listed snapshot whose atomic status matches.

    Runs ``snapper --jsonout ... list`` for *snapper_root_config*, walks the
    resulting list in reverse order and returns the ``number`` of the first
    entry whose ``userdata["atomic"]`` equals *status*.
    (Presumably snapper lists snapshots oldest first, so the reverse walk
    yields the newest match -- confirm against snapper's output.)

    Args:
        snapper_root_config: name of the snapper config; also the top-level
            key of the JSON document that snapper prints.
        status: value to compare against the ``atomic`` userdata key.

    Returns:
        The matching snapshot's ``number``, or None when nothing matches.
    """
    snaps_json = shell_exec(f"snapper --jsonout -c {snapper_root_config} list --disable-used-space")[0]
    snaps = json.loads(snaps_json)
    for item in reversed(snaps[snapper_root_config]):
        try:
            if item["userdata"]["atomic"] == status:
                return item["number"]
        except (KeyError, TypeError):
            # Entries without usable userdata (key missing, or not a mapping)
            # are simply not atomic snapshots. Only these lookup errors are
            # ignored: a bare ``except`` would also swallow SystemExit or
            # KeyboardInterrupt raised while this loop is running.
            continue
    return None
9899

100+
# Function to verify snapshot by booting it up as a container
def verify_snapshot():
    """Boot the snapshot under TMP_MOUNT_DIR as a container and report failed units.

    Steps:
      1. Start ``systemd-nspawn --ephemeral --boot`` on TMP_MOUNT_DIR in the
         background.
      2. Poll ``machinectl list`` (up to 10 times, 1 s apart) for an nspawn
         container whose machine name starts with the last path component of
         TMP_MOUNT_DIR.
      3. Poll ``systemd-analyze time`` inside the container (up to 120 times,
         1 s apart) until its output contains "Startup finished".
      4. Collect the units listed by ``systemctl --failed`` inside the
         container, stop the container and return the unit names.

    Returns:
        list of the ``unit`` fields reported by ``systemctl --failed``.

    Exits:
        Calls cleanup() and sys.exit() when the container cannot be found or
        does not finish booting within the polling window.
    """
    logging.debug("Booting container")
    cmd = ["systemd-nspawn", "--directory", TMP_MOUNT_DIR, "--ephemeral", "--boot"]
    # NOTE(review): the Popen handle is intentionally not kept (as before); the
    # container is stopped through machinectl, either below or in cleanup().
    subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)

    logging.debug("Getting container id")
    # Loop-invariant: machine names are matched against the mount dir's basename.
    machine_prefix = TMP_MOUNT_DIR.split('/').pop()
    container_id = None
    for _ in range(10):
        out, _ret = shell_exec("LC_ALL=C machinectl --quiet --no-pager -o json list")
        try:
            containers = json.loads(out)
        except json.JSONDecodeError:
            # Unparseable (e.g. empty) output is treated as "nothing registered
            # yet" so that we retry instead of crashing with a traceback.
            containers = []
        for container in containers:
            if (container["class"] == "container" and container["service"] == "systemd-nspawn" and
                    container["machine"].startswith(machine_prefix)):
                container_id = container["machine"]
                break
        if container_id:
            break
        time.sleep(1)
    logging.debug(f"Container ID = {container_id}")
    if not container_id:
        logging.error("Could not bootup ephemeral container from snapshot. Cancelling task...")
        cleanup()
        # NOTE(review): exits with status 0 although this is an error path;
        # confirm against the script's exit-code convention before changing.
        sys.exit()

    logging.debug("Waiting for container bootup to finish...")
    startup_finished = False
    for _ in range(120):
        out, _ret = shell_exec(f"LC_ALL=C machinectl --quiet shell {container_id} /usr/bin/bash -c 'systemd-analyze time'")
        if "Startup finished" in out:
            startup_finished = True
            break
        time.sleep(1)
    if not startup_finished:
        logging.error("Timeout waiting for bootup of ephemeral container from snapshot. Cancelling task...")
        cleanup()
        # NOTE(review): same exit-status caveat as above.
        sys.exit()

    logging.debug("Getting failed systemd units")
    # Piped through ``cat -`` as in the original command, presumably so that
    # systemctl does not write to a terminal/pager -- confirm.
    out, _ret = shell_exec(f"LC_ALL=C machinectl --quiet shell {container_id} /usr/bin/bash -c 'systemctl --quiet --no-pager -o json --failed | cat -'")
    failed_units = [item["unit"] for item in json.loads(out)]
    logging.debug(f"Number of failed units = {len(failed_units)}")
    logging.debug(f"Failed units = {', '.join(failed_units)}")

    logging.debug("Stopping container...")
    shell_exec(f"machinectl stop {container_id}")
    return failed_units
146+
99147
# Function to cleanup on SIGINT or successful completion
100148
def cleanup():
101149
logging.info("Cleaning up...")
150+
logging.debug("Stopping ephemeral systemd-nspawn containers...")
151+
out, ret = shell_exec("LC_ALL=C machinectl --quiet --no-pager -o json list")
152+
containers = json.loads(out)
153+
for container in containers:
154+
if ( container["class"] == "container" and container["service"] == "systemd-nspawn" and
155+
container["machine"].startswith(f"{TMP_MOUNT_DIR.split('/').pop()}") ):
156+
container_id = container["machine"]
157+
shell_exec(f"machinectl stop {container_id}")
158+
logging.debug("Cleaning up temp mounts...")
102159
umount_command = f"""
103160
LC_ALL=C mount -l | grep '{TMP_MOUNT_DIR}' | awk '{{print $3}}' | awk '{{print length, $0}}' | sort -rn | awk '{{print $2}}' | awk '{{system("umount " $0)}}';
104161
"""
@@ -107,9 +164,17 @@ def cleanup():
107164
if out == "" and ret == 0:
108165
break
109166
time.sleep(0.01)
167+
logging.debug("Cleaning up temp dirs...")
110168
shell_exec(f"rmdir {quote(TMP_MOUNT_DIR)}")
111169
shell_exec(f"rmdir {quote(TMP_DIR)}")
170+
logging.debug("Cleaning up unfinished snapshots...")
171+
snapper_root_config = get_snapper_root_config()
172+
for status in ["created", "pending"]:
173+
snap_num = get_atomic_snap(snapper_root_config, status)
174+
if snap_num:
175+
shell_exec(f"snapper -c {snapper_root_config} delete {snap_num}")
112176

177+
# Function to handle SIGINT
113178
def sigint_handler(signum, frame):
114179
signal.signal(signum, signal.SIG_IGN) # ignore additional signals
115180
cleanup()
@@ -287,16 +352,18 @@ def sigint_handler(signum, frame):
287352
# create new read-write snapshot to perform atomic update in
288353
out, ret = shell_exec(f"snapper -c {snapper_root_config} create -c number " \
289354
f"-d 'Atomic update of #{base_snap}' " \
290-
f"-u 'atomic=yes' --from {base_snap} --read-write")
355+
f"-u 'atomic=created' --from {base_snap} --read-write")
291356
if ret != 0:
292357
logging.error(f"Could not create read-write snapshot to perform atomic update in")
293358
sys.exit(6)
294359
# get latest atomic snapshot number we just created
295-
atomic_snap = get_atomic_snap(snapper_root_config)
360+
atomic_snap = get_atomic_snap(snapper_root_config, "created")
296361
logging.debug(f"Latest atomic snapshot number: {atomic_snap}")
297362
logging.info(f"Using snapshot {base_snap} as base for new snapshot {atomic_snap}")
298363
snap_subvol = f"@/.snapshots/{atomic_snap}/snapshot"
299364
snap_dir = snap_subvol.lstrip("@")
365+
# update atomic snapshot status
366+
shell_exec(f"snapper -c {snapper_root_config} modify -u 'atomic=pending' {atomic_snap}")
300367
# check the latest atomic snapshot exists as btrfs subvolume
301368
out, ret = shell_exec(f"LC_ALL=C btrfs subvolume list / | grep '{snap_subvol}'")
302369
if ret != 0:
@@ -320,6 +387,10 @@ def sigint_handler(signum, frame):
320387
chroot {TMP_MOUNT_DIR} mount -a;
321388
"""
322389
shell_exec(commands)
390+
# verify snapshot prior to performing update
391+
if not NO_VERIFY:
392+
logging.info("Verifying snapshot prior to update...")
393+
pre_failed_units = verify_snapshot()
323394
if COMMAND == "dup":
324395
# check if dup has anything to do
325396
logging.info("Checking for packages to upgrade")
@@ -351,7 +422,7 @@ def sigint_handler(signum, frame):
351422
logging.info("Command run successfully")
352423
if SHELL:
353424
logging.info(f"Opening bash shell within chroot of snapshot {atomic_snap}")
354-
logging.info("Continue with 'exit' or discard with 'exit 1'")
425+
logging.info("Continue with 'exit 0' or discard with 'exit 1'")
355426
command = f"""
356427
chroot {TMP_MOUNT_DIR} bash -c "export PS1='atomic-update:\${{PWD}} # '; exec bash"
357428
"""
@@ -361,6 +432,20 @@ def sigint_handler(signum, frame):
361432
shell_exec(f"snapper -c {snapper_root_config} delete {atomic_snap}")
362433
cleanup()
363434
sys.exit()
435+
# verify snapshot after update
436+
if not NO_VERIFY:
437+
logging.info("Verifying snapshot post update...")
438+
post_failed_units = verify_snapshot()
439+
new_failed_units = list( set(post_failed_units) - set(pre_failed_units) )
440+
if new_failed_units:
441+
logging.error(f"Discarding snapshot {atomic_snap} as the following new " \
442+
f"systemd units have failed since update: {', '.join(new_failed_units)}")
443+
shell_exec(f"snapper -c {snapper_root_config} delete {atomic_snap}")
444+
cleanup()
445+
sys.exit()
446+
# on success, update atomic snapshot status
447+
shell_exec(f"snapper -c {snapper_root_config} modify -u 'atomic=finished' {atomic_snap}")
448+
# on success, set new snapshot as the default
364449
logging.info(f"Setting snapshot {atomic_snap} ({snap_dir}) as the new default")
365450
shell_exec(f"snapper -c {snapper_root_config} modify --default {atomic_snap}")
366451
# perform cleanup

0 commit comments

Comments
 (0)