commit 4637861cb9416cdf3802a723008c9a1dd3106e8f Author: zawz Date: Wed Feb 22 15:22:49 2023 +0100 init diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..8a2013b --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +TODO +Zmakefile +/backups +/diffstore diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..1a0cbc8 --- /dev/null +++ b/Makefile @@ -0,0 +1,25 @@ + +var_exclude = DIFF_ENGINE COMPRESSION_TYPE COMPRESSION_LEVEL RETENTION_TYPE RETENTION_AMOUNT DEBUG ENCRYPTION +fct_exclude = diff_.* patch_.* cmp_.* dcmp_.* clean_.* + +diffstore: src/* + lxsh -o diffstore -M --exclude-var "$(var_exclude)" --exclude-fct "$(fct_exclude)" src/main.sh + +debug: src/* + lxsh -o diffstore src/main.sh + +bash: src/* + lxsh --bash -o diffstore src/main.sh + +build: diffstore + +install: build + mv diffstore /usr/local/bin + cp completion/diffstore.bash /etc/bash_completion.d + +uninstall: + rm /usr/local/bin/diffstore + rm /etc/bash_completion.d/diffstore.bash + +clear: + rm diffstore diff --git a/README.md b/README.md new file mode 100644 index 0000000..e613a6e --- /dev/null +++ b/README.md @@ -0,0 +1,25 @@ +# diffstore + +Utility to store files using differentials. + +This tool is mostly intended as a file backup history. +It can achieve much higher compression ratios while still being decently fast on compression. +However retrieving older backups takes much longer. + +This should only be used to store either very similar files or a history of one file, +in the case where files are too different, it can result in more disk usage. +This means input files have to be uncompressed and unencrypted. 
# Run the retention cleanup of a store by dispatching to clean_TYPE.
# $1 = dir
# $2 = type   (optional, defaults to $RETENTION_TYPE, then to 'all')
# $3 = amount (optional, defaults to $RETENTION_AMOUNT when that is set)
clean() {
  local store=$1
  shift 1
  # An empty/unset retention type would dispatch to the non-existent
  # 'clean_' function, so fall back to the no-op 'all' policy.
  local kind=${RETENTION_TYPE:-all}
  if [ $# -gt 0 ] ; then
    kind=$1
    shift 1
  fi
  # Forward the configured amount when the caller gave none; otherwise the
  # clean_TYPE functions silently use their own built-in default and the
  # stored RETENTION_AMOUNT setting is never honoured.
  if [ $# -eq 0 ] && [ -n "$RETENTION_AMOUNT" ] ; then
    set -- "$RETENTION_AMOUNT"
  fi
  "clean_$kind" "$store" "$@"
}
# Print the file extension registered for a compression type.
# $1 = compression (optional, defaults to $COMPRESSION_TYPE)
# Looks the type up in the "type:extension" lines of $comptypes and prints
# the extension; prints nothing when the type is not in the table.
compression_extension() {
  local wanted=${1-$COMPRESSION_TYPE}
  local entry
  # The type is compared literally (quoted inside the case pattern) instead
  # of being pasted into a grep regex, so names such as 'g.ip' or '.*' can no
  # longer match table entries by accident.
  while IFS= read -r entry ; do
    case $entry in
      "$wanted":*) printf '%s\n' "${entry#*:}" ;;
    esac
  done << EOF
$comptypes
EOF
}
# Assign config values to their variables and rewrite the store config file.
# $1 = dir , $2 = comma-separated variable names , $@ = values (same order)
# Each value is exported into the variable at the same position of $2, then
# the output of gen_config is written to "$dir/$configfile".
config_set() {
  local dir=$1 remaining=$2
  local varname val
  shift 2
  for val in "$@" ; do
    # More values than variables: the old `cut -d, -f$I` lookup returned the
    # whole list again when it held a single name (so the extra value
    # overwrote the setting) or an empty name (so `export` failed).
    if [ -z "$remaining" ] ; then
      echo "Ignoring extra config value '$val'" >&2
      continue
    fi
    # Pop the first name off the comma-separated list.
    varname=${remaining%%,*}
    case $remaining in
      *,*) remaining=${remaining#*,} ;;
      *) remaining= ;;
    esac
    [ -n "$varname" ] || continue
    export "$varname=$val"
  done
  gen_config > "$dir/$configfile"
}
# Give every configuration variable that has no value yet its default.
# Globals written: COMPRESSION_TYPE, DIFF_ENGINE, RETENTION_TYPE,
#                  RETENTION_AMOUNT, COMPRESSION_LEVEL, ENCRYPTION
# zstd and rdiff are preferred when they are available, otherwise the
# defaults are gzip and diff.
set_defaults() {
  if [ -z "$COMPRESSION_TYPE" ] ; then
    if command -v zstd >/dev/null 2>&1 ; then
      COMPRESSION_TYPE=zstd
    else
      # Must be 'gzip', not 'gz': the value is used both as a key of
      # $comptypes and to build the cmp_TYPE / dcmp_TYPE function names.
      COMPRESSION_TYPE=gzip
    fi
  fi
  if [ -z "$DIFF_ENGINE" ] ; then
    if command -v rdiff >/dev/null 2>&1 ; then
      DIFF_ENGINE=rdiff
    else
      DIFF_ENGINE=diff
    fi
  fi
  # ':-' so that set-but-empty values are defaulted too, like the two above.
  RETENTION_TYPE=${RETENTION_TYPE:-all}
  RETENTION_AMOUNT=${RETENTION_AMOUNT:-14}
  COMPRESSION_LEVEL=${COMPRESSION_LEVEL:--6}
  # Encryption is always reset to off here, whatever the environment says.
  ENCRYPTION=off
}
# Encrypt stdin to stdout when $ENCRYPTION is 'on', otherwise copy it as is.
# $1 = key (optional, defaults to $CRYPT_PW)
# The callers pipe into this function without arguments, so the old
# `-k "$1"` always encrypted with an empty key and ignored $CRYPT_PW.
encrypt() {
  if [ "$ENCRYPTION" = on ] ; then
    # The password goes through the environment rather than argv, where it
    # would be visible in the process list. Input/output default to
    # stdin/stdout.
    CRYPT_PW=${1-$CRYPT_PW} openssl enc -aes-256-cbc -pbkdf2 -salt -pass env:CRYPT_PW
  else
    cat
  fi
}

# Decrypt stdin to stdout when $ENCRYPTION is 'on', otherwise copy it as is.
# $1 = key (optional, defaults to $CRYPT_PW)
# Passing an explicit empty key ('decrypt ""') still reads data written by
# the previous empty-key behaviour.
decrypt() {
  if [ "$ENCRYPTION" = on ] ; then
    CRYPT_PW=${1-$CRYPT_PW} openssl enc -d -aes-256-cbc -pbkdf2 -pass env:CRYPT_PW
  else
    cat
  fi
}
# diff_ENGINE implementation for the plain 'diff' engine.
# $1 = old file , $2 = new file , $3 = output diff file
# Writes a unified diff (binary-safe through -a) of $1 -> $2 into $3.
# Returns 0 when the diff was produced, non-zero when diff itself failed.
diff_diff() {
  local rc=0
  diff -ua -- "$1" "$2" > "$3" || rc=$?
  # diff exits 0 for identical inputs and 1 when they differ; both are a
  # success here. Anything above 1 is a real failure (unreadable input, ...)
  # and must not be hidden, or a bogus patch would be stored.
  [ "$rc" -le 1 ]
}
# Print the help text of the 'config' operation on stdout.
# Corrected against the code: the store folder precedes 'config' on the
# command line, the password variable is CRYPT_PW, and the gzip compression
# type is spelled 'gzip'.
config_help() {
  cat << EOF
$(basename "$0") FOLDER config [OPERATION]
View or change configuration of FOLDER

Operations:
  help                        Display this message
  print                       Print config in human readable format
  env                         Print config as environment
  engine ENGINE               Engine to use for differentials. Supported: diff, rdiff
  retention TYPE [AMOUNT]     Configure automatic retention.
  compression TYPE [LEVEL]    Configure to use given compression. Supported types: gzip, xz, zstd
  encryption on|off           Enable or disable password-based encryption.
                              > Use env CRYPT_PW for automated password input

Retention types:
  all       Keep everything
  number    Keep N number of files
  days      Keep only files modified in the last N days
EOF
}
# Create the store directory if needed and write a default config into it.
# $1 = dir
init_store() {
  mkdir -p -- "$1"
  # The former `rm -rf "$1/*"` was dropped on purpose: its glob was quoted,
  # so it never removed the directory contents. It is not replaced by a
  # working wipe because main.sh calls init_store for any directory that
  # lacks a config file, which would delete unrelated data.
  gen_default_config "$1"
}
# Restore one stored file.
# $1 = dir
# $2 = name of the stored file (short names go through resolve_shortname)
# $3 = output file (optional, defaults to the resolved name)
# The latest file is simply decompressed. For an older one the latest file is
# decompressed to a temporary copy and patch_ENGINE replays the stored diffs
# on it, newest first, down to the requested entry.
# Returns 1 when the name is not in the store, otherwise the patch status.
get_file() {
  import_crypt
  local dir=$1
  local file dest latest list total steps tmpdest rc
  file=$(resolve_shortname "$dir" "$2")
  dest=${3-$file}
  latest=$(file_latest "$dir" | filename_pipe)

  if [ "$file" = "$latest" ] ; then
    echo "> Decompressing"
    decompress_stdout "$dir/$(file_latest "$dir")" > "$dest"
    return
  fi

  list=$(file_difflist "$dir" | filename_pipe)
  total=$(printf "%s\n" "$list" | wc -l)
  # Keep the requested entry and everything after it, then reverse the list.
  # -F -x match the name literally and as a whole line; as a regex, the dots
  # of a file name matched any character.
  list=$(printf "%s\n" "$list" | grep -Fx -A "$((total))" -- "$file" | tac)
  if [ -z "$list" ] ; then
    echo "file $file not found" >&2
    return 1
  fi

  # Hidden temporary file next to the destination. Prefixing the whole path
  # with a dot broke every destination that contains a directory part.
  case $dest in
    */*) tmpdest=${dest%/*}/.${dest##*/} ;;
    *) tmpdest=.$dest ;;
  esac

  echo "> Decompressing"
  decompress_stdout "$dir/$(file_latest "$dir")" > "$tmpdest"
  echo "> Patching"
  steps=$(printf "%s\n" "$list" | wc -l)
  rc=0
  # '|| rc=$?' keeps a failed patch from aborting (set -e) before cleanup.
  printf "%s\n" "$list" |
    NOBAR=true "patch_$DIFF_ENGINE" "$dir" "$tmpdest" "$dest" "$((steps))" || rc=$?
  rm -f -- "$tmpdest"
  return "$rc"
}