#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Hash files using the selected algorithm.

Compressed files are decompressed while reading, so the hash is that of the
uncompressed form.
"""
#-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------
# this script was created to help illustrate uses of zopen()
#-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------
# algorithm is determined by command name or first argument
# make executable names (or links) with one
# preferred algorithm for each command name
#-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------
# beware of files that uncompress into huge content
#-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------
import hashlib
import os

from ftrgen import ftrgen
from os import environ,sep
# exit is taken from sys on purpose: the builtin exit() is injected by the
# site module for interactive use and is missing under "python -S"
from sys import argv,exit,stderr,stdin
from zopen import zopen

# some constants and settings
CR = '\r'
NL = '\n'
ETEOL = '\x1b[K' # erase to end of line

# tune reading and size output
readsize = 2**24 # how many bytes to read per read() call
oversize = 2**48 # bias the running byte count starts at, so that its hex form
                 # has a fixed width once the leading '0x1' is stripped
whensize = 2**25 # show progress after this many bytes, and again after each such step

# the script aborts before hashing when more than this many errors were counted
error_max = 255 # default
var = 'ERROR_MAX' # name of the environment variable that overrides the default
if var in environ:
    val = environ[var]
    try:
        error_max = int(val)
    except ValueError:
        exit(f'bad int value for env var {var!r}: {val!r}')

#-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------
# function to handle error messages, print them and count them
#-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------
errorcount = 0 # number of messages reported through error() so far


def error(msg):
    """Print msg on stderr (flushed at once) and add one to errorcount.

    msg: the text to show; it is passed to print() unchanged.
    Returns None.
    """
    global errorcount
    print(msg,file=stderr,flush=True)
    errorcount += 1

#-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------
# function to find which algorithm is in a str
# returns name as a str if one is found
# returns count as an int if not one
#-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------
def which_algorithm(word):
    """Find the hash algorithm named by word.

    word: a str such as a command name or a command line argument.

    Returns the algorithm name (str) when word is exactly an available
    algorithm, or when the algorithm names found inside word identify a single
    one.  Overlapping names are resolved to the longest one when it contains
    every other match (for example 'sha512_256sum' contains both 'sha512' and
    'sha512_256' and resolves to 'sha512_256').  Otherwise returns the number
    of matching names (int): 0 for none, 2 or more for an ambiguous word.
    """
    available = hashlib.algorithms_available
    if word in available:
        return word
    found = {alg for alg in available if alg in word}
    if len(found) == 1:
        return found.pop()
    if found:
        # several names matched: accept the longest only if every other match
        # is merely a piece of it, otherwise the word really is ambiguous
        longest = max(found,key=len)
        if all(alg in longest for alg in found):
            return longest
    return len(found)


#-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------
# get command name and from it determine which hash algorithm was intended for this command
# if the command name has none then try the first argument as the hash algorithm
# hash algorithm names such as: md5 or sha256
#-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------
exe = argv.pop(0)
if not argv:
    # no arguments at all: list the available algorithms and stop (exit status 0)
    print(NL.join(sorted(hashlib.algorithms_available)))
    exit()
cmd = exe.rpartition(sep)[2] # command name without any leading directory part
algorithm = which_algorithm(cmd)
ext = ''
if algorithm == 0:
    # the command name holds no algorithm, so the first argument must name it
    # (argv cannot be empty here because of the early exit above; the check is kept as a guard)
    if argv:
        ext = ' or first argument'
        if argv[0].startswith('--alg'):
            print(NL.join(sorted(hashlib.algorithms_available)))
            exit()
        algorithm = which_algorithm(argv.pop(0))
if isinstance(algorithm,int):
    # an int is a count of matches: zero or too many, either way there is no usable algorithm
    if algorithm == 0:
        algorithm = 'no'
    exit(f'{algorithm} available hash algorithms match command name {cmd!r}{ext}')
# NOTE(review): hexdigest() of the shake_* algorithms needs a length argument,
# which the hashing loop below does not supply - confirm how they should be handled

#-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------
# gather all file names
# file names are read from stdin if not given in argv
# (that only happens when the algorithm was the sole argument, see the early exit above)
# this code does not zhash stdin
#-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------
count = len(argv)
names = []
try:
    for name1 in argv if count else stdin:
        # splitlines() drops the line ending of names read from stdin
        for name in name1.splitlines():
            if os.path.isdir(name):
                # a directory contributes the regular files that ftrgen() yields for it
                # (presumably a recursive tree walk - confirm against ftrgen)
                for ascend,depth,path in ftrgen(name):
                    if os.path.islink(path):
                        continue
                    if os.path.isfile(path):
                        if path.startswith('./'):
                            path = path[2:]
                        names.append(path)
            elif os.path.isfile(name):
                if name.startswith('./'):
                    name = name[2:]
                names.append(name)
            # anything that is neither a directory nor a regular file is left out
except KeyboardInterrupt:
    exit(CR+'ouch!'+ETEOL+CR)

#-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------
# check all files for open failures before any are hashed (plain open() is used here, not zopen())
#-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------
unopenable = set() # names already reported here, so the hashing loop does not report them twice
try:
    for name in names:
        try:
            with open(name,'rb'):
                pass
        except FileNotFoundError:
            error(f'file not found: {name!r}')
        except PermissionError:
            error(f'access denied: {name!r}')
        except IsADirectoryError:
            error(f'will not hash directory: {name!r}')
        except OSError: # everything else
            error(f'unknown error opening file: {name!r}')
        else:
            continue
        unopenable.add(name)
except KeyboardInterrupt:
    exit(CR+'Ouch!'+ETEOL+CR)
if errorcount > error_max:
    # errorcount is a plain int (it was wrongly subscripted here, which raised TypeError)
    exit(f'aborting due to {errorcount} error{"s"[errorcount==1:]} (max {error_max!r})')

#-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------
# hash all files
#-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------
try:
    for name in names:
        try:
            # total starts at oversize so that hex(total)[3:] is always the
            # byte count as a fixed width hex number (the '0x1' is cut off)
            total = oversize
            limit = oversize + whensize
            with zopen(name,'rb') as file: # decompression will be determined in zopen by extension(s) of file name
                hash_object = hashlib.new(algorithm)
                while True:
                    if total >= limit:
                        # progress line on stderr, rewritten in place
                        print(f'{hex(total)[3:]} {name}'[:135],end=ETEOL+CR,file=stderr,flush=True)
                        limit += whensize
                    data = file.read(readsize)
                    if not data: # EOF
                        break
                    hash_object.update(data)
                    total += len(data)
            print(end=CR+ETEOL+CR,file=stderr,flush=True)
        except (EOFError,OSError) as exc:
            # no hash line is printed for a file that could not be read completely;
            # say so unless the open check above already reported this file
            if name not in unopenable:
                print(end=CR+ETEOL,file=stderr,flush=True) # wipe any progress line first
                error(f'error reading file: {name!r}: {exc!r}')
            continue
        print(hash_object.hexdigest(),'*'+name,flush=True)
except KeyboardInterrupt:
    exit(CR+'OUCH!'+ETEOL+CR)
print(end=CR,file=stderr,flush=True)