from decimal import Decimal
from numbers import Number

#-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------
def tn(a):
    """Return the name of the type of `a` as a str (e.g. 'int', 'list')."""
    kind = type(a)
    return kind.__name__

#-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------
def un_utf8(
        s,
        errors=None, # true=exception, false=ignore
):
    """Convert a sequence of one or more UTF8 octet codes to Unicode code points.

function        un_utf8

reverses        to_utf8

purpose         convert a sequence of one or more UTF-8, MUTF-8, XUTF-8
                octet code values representing one or more Unicode code
                points to a sequence of Unicode code point values, one
                for each Unicode character.

arguments       s       a sequence of one or more UTF-8, MUTF-8, XUTF-8
                        octet code values in a type of bytes, bytearray,
                        str, list of numbers, or tuple of numbers (or a
                        dict whose values are any of these).  numbers
                        that are not int are converted with int(); for
                        complex numbers only the real part is used.
                        all UTF-8 types are accepted in this sequence.
                        the caller is responsible for detecting invalid
                        code points for the type of UTF-8 it is dealing
                        with.  a very high XUTF-8 code point will be
                        returned to the caller if the caller provides
                        an encoded sequence for such a code point.  this
                        includes Unicode UTF-16 surrogate code points
                        (decoded like everything else).

                errors  if true, a continuation octet (128..191) found
                        where a lead octet is expected raises
                        ValueError.  if false (the default), such an
                        octet is passed through as its own code point.
                        this setting is also applied to dictionary
                        values.

returns         a sequence of code points:
                  str input        -> str
                  bytes/bytearray  -> the same type when every code
                                      point is below 256, else str
                  tuple input      -> tuple of ints
                  list input       -> list of ints
                  dict input       -> dict with the same keys and
                                      converted values

raises          TypeError   the argument, or an element of it, is of an
                            unsupported type.
                ValueError  an octet value exceeds 255, or errors is
                            true and a stray continuation octet is met.
                            chr() also raises ValueError when a str
                            result is required and a code point is
                            outside of what str can hold.

note            this function has no conversion mode and will convert
                extended sequences as if they were valid, resulting
                in code points above 1114111 [0x10FFFF].

note            this function will convert sequences that are longer
                than needed to encode the code point.  the caller can
                detect such cases by encoding the resulting code points
                to test if they match.

note            continuation octets are not validated; only their low
                6 bits are used.  an incomplete multi-octet sequence at
                the end of the input is silently dropped, whatever the
                value of errors.

note            if a dictionary type is given, only the values, not the
                keys, are converted, and the result is returned as a
                dictionary with unchanged keys and converted values.

author          Phil D. Howard
                The author may be contacted by decoding the number:
                11054987560151472272755686915985840251291393453694611309
                (try converting the number to binary)
"""
#-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------
    errors = bool(errors)
    given = type(s) # remembered to pick the type of the result

    if isinstance(s,str):
        octets = [ord(x) for x in s]

    elif isinstance(s,(list,tuple,bytes,bytearray)):
        octets = []
        for c in s:
            if isinstance(c,complex):
                c = c.real
            if isinstance(c,(float,Decimal,Number)):
                c = int(c) # int is a Number too, so ints pass through here
            else:
                raise TypeError(f'unknown/unsupported type {tn(c)} of data')
            octets.append(c)

    elif isinstance(s,dict):
        # pass errors along so dictionary values get the same strictness
        return {k:un_utf8(v,errors) for k,v in s.items()}

    else:
        raise TypeError(f'unknown/unsupported type {tn(s)} of UTF-8 {s!r}')
#-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------
    # lead octets from 192 up: (exclusive upper bound, continuation octets, payload mask)
    leads = (
        (0b11100000, 1, 0b00011111), # 2 octets, 5+1*6 = 11 bits
        (0b11110000, 2, 0b00001111), # 3 octets, 4+2*6 = 16 bits
        (0b11111000, 3, 0b00000111), # 4 octets, 3+3*6 = 21 bits
        (0b11111100, 4, 0b00000011), # 5 octets, 2+4*6 = 26 bits
        (0b11111110, 5, 0b00000001), # 6 octets, 1+5*6 = 31 bits
        (0b11111111, 6, 0b00000000), # 7 octets (254), 0+6*6 = 36 bits
        (0b100000000,7, 0b00000000), # 8 octets (255), 0+7*6 = 42 bits
    )

    codes = []
    total = len(octets)
    at = 0 # walk by index; popping from the front of a list is O(n) per octet
    while at < total:
        c = octets[at]
        at += 1

        if   c < 0b10000000: # in range(128) (ASCII)
            pass # is a code point

        elif c < 0b11000000: # in range(128,192)
            if errors:
                raise ValueError(f'bad UTF-8 code {c!r} in range(128,192)')
            # else this octet just becomes its own code point

        elif c < 0b100000000: # in range(192,256), a lead octet
            for bound,extra,mask in leads:
                if c < bound:
                    break
            if total - at < extra:
                break # incomplete sequence at end of input is dropped
            c &= mask
            for k in octets[at:at+extra]:
                c = c << 6 | 0b00111111 & k
            at += extra

        else:
            raise ValueError(f'bad UTF-8 code {c!r} exceeds 255')

        codes.append(c)
#-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------
    if given in (bytes,bytearray) and all(x<256 for x in codes):
        return given(codes) # return as bytes of Unicode characters of up to 8-bits
    if given in (str,bytes,bytearray):
        return ''.join([chr(c) for c in codes]) # return a str of Unicode characters
    if given is tuple:
        return tuple(codes) # return a tuple of Unicode code point ints
    return codes # return a list of Unicode code point ints
#-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------