Christoph's last Weblog entries

Backup strategy
27th June 2014

I've been working on my backup strategy for the notebook recently. The idea is to have a full backup every month and then incremental backups in between, as fine-grained as possible. As it's a mobile device there's no point in time where it is guaranteed to be up, connected and within reach of the backup server.

As I'm running Debian GNU/kFreeBSD on it, using ZFS and specifically zfs send comes quite naturally. I'm now generating a new file system snapshot every day (if the notebook happens to be online during that day) using cron.

@daily zfs snapshot base/root@`date -I`
@daily zfs snapshot base/home@`date -I`
@reboot zfs snapshot base/root@`date -I`
@reboot zfs snapshot base/home@`date -I`

When connected to the home network I synchronize all incrementals that are not yet on the backup server. This uses zfs send together with gpg to encrypt the data, which is then pushed to some sftp storage. For the first snapshot of every month a full backup is created. As there doesn't seem to be a way to merge zfs send streams without importing everything into a zfs pool, I create an additional incremental stream relative to the first snapshot of the previous month, so I'm able to delete older full backups and daily snapshots and still keep coarse-grained backups for a longer period of time.

#!/usr/bin/python
# -*- coding: utf-8 -*-

####################
# Config
#
# Host the script connects to on port 22; also the name looked up in
# ~/.ssh/known_hosts to obtain the expected host key.
SFTP_HOST = 'botero.siccegge.de'
# Remote directory the SFTP session changes into; backup files are
# created there and its listing determines what is already uploaded.
SFTP_DIR  = '/srv/backup/mitoraj'
# User name used for public-key authentication on the SFTP server.
SFTP_USER = 'root'
# Name of the ZFS pool; prepended (with a '/') to dataset names handed
# to `zfs send`.
ZPOOL     = 'base'
# Value passed to gpg as --recipient when encrypting the stream.
GPGUSER   = '9FED5C6CE206B70A585770CA965522B9D49AE731'
#
####################

import subprocess
import os.path
import sys
import paramiko


# Terminal escape sequences keyed by colour name; 'none' resets the
# colour again.  Looked up by print_colored() and sftp_send().
term = dict(
    green="\033[0;32m",
    red="\033[0;31m",
    yellow="\033[0;33m",
    purple="\033[0;35m",
    none="\033[0m",
)

# Module-wide SFTP client; stays None until sftp_connect() has run.
sftp = None

def print_colored(data, color):
    """Print *data* on stdout in the given colour, followed by a newline.

    *color* must be one of the keys of the module-level ``term`` table.
    The colour is reset before the newline is written and stdout is
    flushed afterwards.
    """
    out = sys.stdout
    for piece in (term[color], data, term['none'], '\n'):
        out.write(piece)
    out.flush()

def postprocess_datasets(datasets):
    """Group ``device@snapshot`` names by device.

    datasets: iterable of strings of the form ``device@snapshot``.

    Returns a dict mapping each device name to the sorted list of its
    snapshot names (the part between the first and second ``@``).

    Entries are grouped by their exact device name.  Matching by prefix
    would file the snapshots of e.g. ``home2`` under ``home`` as well.

    Raises IndexError if an entry contains no ``@``.
    """
    result = dict()
    for entry in datasets:
        parts = entry.split('@')
        device = parts[0]
        snapshot = parts[1]
        result.setdefault(device, []).append(snapshot)

    for snapshots in result.values():
        snapshots.sort()

    return result

def sftp_connect():
    """Open the SFTP session to the backup server and store it in ``sftp``.

    The expected host key for SFTP_HOST is read from ~/.ssh/known_hosts
    and handed to the transport when connecting.  Authentication tries
    every key offered by the running ssh agent as SFTP_USER until one is
    accepted.  On success the module-level ``sftp`` client is set and
    its working directory changed to SFTP_DIR.

    Raises paramiko.AuthenticationException (a paramiko.SSHException) if
    none of the agent's keys is accepted, instead of carrying on with an
    unauthenticated transport.
    """
    global sftp

    host_keys = paramiko.util.load_host_keys(os.path.expanduser('~/.ssh/known_hosts'))
    # list() so that picking the first entry also works where keys()
    # returns a view rather than a list.
    hostkeytype = list(host_keys[SFTP_HOST].keys())[0]
    hostkey = host_keys[SFTP_HOST][hostkeytype]

    agent = paramiko.Agent()
    transport = paramiko.Transport((SFTP_HOST, 22))
    transport.connect(hostkey=hostkey)

    for key in agent.get_keys():
        try:
            transport.auth_publickey(SFTP_USER, key)
            break
        except paramiko.SSHException:
            continue
    else:
        # Loop ended without a successful auth_publickey(): either the
        # agent holds no keys or the server rejected all of them.
        transport.close()
        raise paramiko.AuthenticationException(
            "no key from the ssh agent was accepted for %s@%s"
            % (SFTP_USER, SFTP_HOST))

    sftp = paramiko.SFTPClient.from_transport(transport)
    sftp.chdir(SFTP_DIR)

def _stderr_text(process):
    """Return everything *process* wrote to its stderr pipe as text."""
    data = process.stderr.read()
    if not isinstance(data, str):
        data = data.decode('utf-8', 'replace')
    return data


def sftp_send(dataset, reference=None):
    """Stream one snapshot, signed and encrypted, into a file on the server.

    Runs ``sudo zfs send`` for ``ZPOOL/dataset``, pipes the stream
    through gpg (ZLIB compression, signed, encrypted to GPGUSER) and
    writes gpg's output to a file in the current SFTP directory.

    dataset:   snapshot relative to the pool, in ``device@snapshot`` form.
    reference: optional snapshot passed to ``zfs send -i``; when given an
               incremental stream is produced.

    The remote file is named ``<dataset>.full.zfs.gpg`` for a full
    stream and ``<dataset>.from.<reference>.zfs.gpg`` for an incremental
    one.  One ``#`` is printed per chunk (up to 1 MiB) uploaded.

    Returns None.  Failures of zfs or gpg are reported on stdout in red;
    no exception is raised for them.
    """
    # Options go before the snapshot argument, as in the documented
    # synopsis, so this does not depend on argument permutation.
    zfscommand = ['sudo', 'zfs', 'send']
    if reference is not None:
        zfscommand = zfscommand + ['-i', reference]
    zfscommand.append('%s/%s' % (ZPOOL, dataset))

    zfs = subprocess.Popen(zfscommand, stdout=subprocess.PIPE)

    gpgcommand = [ 'gpg', '--batch', '--compress-algo', 'ZLIB',
                   '--sign', '--encrypt', '--recipient', GPGUSER ]
    gpg = subprocess.Popen(gpgcommand, stdout=subprocess.PIPE,
                                       stdin=zfs.stdout,
                                       stderr=subprocess.PIPE)
    # Drop our copy of the pipe's read end so that zfs receives SIGPIPE
    # if gpg exits before consuming the whole stream.
    zfs.stdout.close()

    # Catch a gpg that died right at start-up before creating any file.
    gpg.poll()
    if gpg.returncode not in [None, 0]:
        print_colored("Error:\n\n" + _stderr_text(gpg), 'red')
        zfs.wait()
        return

    if reference is None:
        filename = '%s.full.zfs.gpg' % dataset
    else:
        filename = '%s.from.%s.zfs.gpg' % (dataset, reference)

    with sftp.open(filename, 'w') as remotefile:
        sys.stdout.write(term['purple'])
        while True:
            junk = gpg.stdout.read(1024*1024)
            if len(junk) == 0:
                break

            sys.stdout.write('#')
            sys.stdout.flush()
            remotefile.write(junk)

    # End of gpg's output does not mean the stream was complete: check
    # how both processes actually ended before claiming success.
    errors = _stderr_text(gpg)
    gpg_status = gpg.wait()
    zfs_status = zfs.wait()
    if gpg_status != 0 or zfs_status != 0:
        print_colored(" FAILED (zfs exit %s, gpg exit %s) -- %s is incomplete\n\n%s"
                      % (zfs_status, gpg_status, filename, errors), 'red')
        return

    print_colored(" DONE", 'green')

def syncronize(local_datasets, remote_datasets):
    """Upload every local snapshot that is not yet on the backup server.

    local_datasets / remote_datasets: dicts mapping a device name to its
    sorted list of snapshot names, as built by postprocess_datasets().
    The first seven characters of a snapshot name are compared to detect
    a change of month (names are expected to start with ``YYYY-MM``).

    For each device, in snapshot order:
      * snapshots already present remotely are skipped;
      * the very first local snapshot is sent as a full stream;
      * a snapshot in the same month as its predecessor is sent as an
        incremental relative to that predecessor;
      * the first snapshot of a new month is sent as a full stream and
        additionally as an incremental relative to the first snapshot of
        the previous month.

    The "first snapshot of the month" reference is tracked per device
    and also updated for snapshots that are skipped because they are
    already on the server, so it is always defined when a month changes.
    """
    for device in local_datasets.keys():
        previous = ''
        current = ''
        lastmonth = None
        for dataset in local_datasets[device]:
            previous = current
            current = dataset
            # True for the very first snapshot as well ('' != 'YYYY-MM').
            starts_month = previous[:7] != dataset[:7]

            if device in remote_datasets and dataset in remote_datasets[device]:
                print_colored("%s@%s -- found on remote server" % (device, dataset), 'yellow')
                if starts_month:
                    lastmonth = dataset
                continue

            if previous == '':
                print_colored("Initial syncronization for device %s" % device, 'green')
                sftp_send("%s@%s" % (device, dataset))
                lastmonth = dataset
                continue

            if not starts_month:
                print_colored("%s@%s -- incremental backup (reference: %s)" %
                              (device, dataset, previous), 'green')
                sftp_send("%s@%s" % (device, dataset), previous)
            else:
                print_colored("%s@%s -- full backup" % (device, dataset), 'green')
                sftp_send("%s@%s" % (device, dataset))
                print_colored("%s@%s -- doing incremental backup" % (device, dataset), 'green')
                sftp_send("%s@%s" % (device, dataset), lastmonth)
                lastmonth = dataset

def get_remote_datasets():
    """Return the snapshots already present on the server, per device.

    Lists the current SFTP directory, keeps the file names containing
    an ``@`` and cuts each one at its first ``.`` to recover the
    ``device@snapshot`` name.  The names are then grouped by
    postprocess_datasets().
    """
    names = [name.split('.')[0] for name in sftp.listdir() if '@' in name]
    return postprocess_datasets(names)

def get_local_datasets():
    """Return the local snapshots of ZPOOL, grouped per device.

    Runs ``sudo zfs list -t snapshot -H -o name`` and keeps the lines
    that start with ``ZPOOL/``; that prefix is removed so the entries
    have the ``device@snapshot`` form expected by postprocess_datasets().

    Lines that do not belong to ZPOOL (other pools, empty output) are
    skipped rather than being truncated by a fixed number of characters.

    Raises subprocess.CalledProcessError if the zfs command fails.
    """
    output = subprocess.check_output(['sudo', 'zfs', 'list', '-t', 'snapshot', '-H', '-o', 'name'])
    if not isinstance(output, str):
        output = output.decode('utf-8')

    prefix = ZPOOL + '/'
    datasets = [ line[len(prefix):] for line in output.splitlines()
                                    if line.startswith(prefix) ]

    return postprocess_datasets(datasets)

def main():
    """Connect to the backup server, then upload the missing snapshots."""
    sftp_connect()
    local = get_local_datasets()
    remote = get_remote_datasets()
    syncronize(local, remote)

# Only run the backup when executed as a script, not when imported.
if __name__ == '__main__':
    main()

Rumor has it that btrfs has gained functionality similar to zfs send, so maybe I'll be able to extend this code and use it on my Linux nodes some day (after migrating them to btrfs, for a start).

Tags: foss, gnupg, kfreebsd.

Created by Chronicle v4.6