from decimal import Decimal
from numbers import Number

#-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------
def tn(a):
    """Return the name of the type of ``a`` (used to build error messages)."""
    return type(a).__name__

#-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------
# Layout of multi-octet sequences, keyed by the lead octet.  Each row is
# (exclusive upper bound of the lead octet, number of continuation octets,
#  mask selecting the payload bits carried by the lead octet itself).
_UTF8_LEAD_TABLE = (
    (0b11100000, 1, 0b00011111),    # range(192,224): 2 octets, 5+1*6 = 11 bits
    (0b11110000, 2, 0b00001111),    # range(224,240): 3 octets, 4+2*6 = 16 bits
    (0b11111000, 3, 0b00000111),    # range(240,248): 4 octets, 3+3*6 = 21 bits
    (0b11111100, 4, 0b00000011),    # range(248,252): 5 octets, 2+4*6 = 26 bits
    (0b11111110, 5, 0b00000001),    # range(252,254): 6 octets, 1+5*6 = 31 bits
    (0b11111111, 6, 0b00000000),    # 254:            7 octets, 0+6*6 = 36 bits
    (0b100000000, 7, 0b00000000),   # 255:            8 octets, 0+7*6 = 42 bits
)


def _utf8_lead_layout(lead):
    """Return (continuation octet count, lead payload mask) for a lead octet >= 192."""
    for limit, extra, mask in _UTF8_LEAD_TABLE:
        if lead < limit:
            return extra, mask
    raise ValueError(f'bad UTF-8 code {lead!r} exceeds 255')


def un_utf8(
        s,
        errors=None,  # true=exception, false=ignore
):
    """Convert a sequence of UTF-8 octet codes to Unicode code points.

    This function reverses to_utf8.

    It decodes UTF-8, MUTF-8 and XUTF-8 (extended, up to 8 octets per code
    point) alike.  It has no conversion mode: the caller is responsible for
    detecting code points that are invalid for the flavour of UTF-8 it is
    dealing with.  In particular:

    * extended sequences are decoded as if they were valid, so the result
      may contain code points above 1114111 [0x10FFFF];
    * UTF-16 surrogate code points are decoded like everything else;
    * sequences that are longer than needed to encode their code point
      (overlong forms) are accepted; the caller can detect them by
      re-encoding the result and comparing;
    * continuation octets are not validated: only their low 6 bits are used.

    Args:
        s: the octet codes, as a str (each character's ordinal is taken as
            one octet), bytes, bytearray, or a list/tuple of numbers.
            Numbers are converted with int(); for complex values the real
            part is used.  If a dict is given, only its values (not its
            keys) are converted.
        errors: any truthy value selects strict mode, in which a stray
            continuation octet (in range(128,192)) in lead position, or a
            multi-octet sequence cut short by the end of input, raises
            ValueError.  When falsy (the default) a stray continuation
            octet becomes its own code point and a truncated trailing
            sequence is dropped.

    Returns:
        For str input, a str.  For bytes or bytearray input, an object of
        the same type when every decoded code point is below 256, otherwise
        a str.  For tuple input, a tuple of ints.  For dict input, a dict
        with the same keys and converted values.  For anything else that is
        accepted (list, or a subclass of the types above), a list of ints.

    Raises:
        TypeError: if s, or an element of a list/tuple, has an unsupported
            type.
        ValueError: if an octet in lead position exceeds 255 (regardless of
            errors), in strict mode as described above, or from chr() when
            a str result would contain a code point above 0x10FFFF.

    Author: Phil D. Howard

    The author may be contacted by decoding the number:
    11054987560151472272755686915985840251291393453694611309
    (hint: try converting the number to binary)
    """
    errors = bool(errors)
    t = type(s)

    # Normalise the input to a plain list of ints.
    if isinstance(s, str):
        octets = [ord(ch) for ch in s]
    elif isinstance(s, (list, tuple, bytes, bytearray)):
        octets = []
        for item in s:
            if isinstance(item, complex):
                item = item.real
            # int, float, Decimal, Fraction, ... all count as Number
            if not isinstance(item, (Number, Decimal)):
                raise TypeError(f'unknown/unsupported type {tn(item)} of data')
            octets.append(int(item))
    elif isinstance(s, dict):
        # propagate the error mode so nested values are checked consistently
        return {k: un_utf8(v, errors=errors) for k, v in s.items()}
    else:
        raise TypeError(f'unknown/unsupported type {tn(s)} of UTF-8 {s!r}')

    #-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------
    # Decode by walking an index over the list; popping from the front of a
    # list is O(n) per octet and would make the whole decode quadratic.
    decoded = []
    pos = 0
    end = len(octets)
    while pos < end:
        lead = octets[pos]
        pos += 1
        if lead < 0b10000000:       # below 128 (ASCII): is its own code point
            decoded.append(lead)
            continue
        if lead < 0b11000000:       # in range(128,192): stray continuation octet
            if errors:
                raise ValueError(f'bad UTF-8 code {lead!r} in range(128,192)')
            decoded.append(lead)    # lenient: the octet becomes its own code point
            continue
        extra, mask = _utf8_lead_layout(lead)
        if end - pos < extra:
            # input ends in the middle of a sequence
            if errors:
                raise ValueError(
                    f'truncated UTF-8 sequence: lead octet {lead!r} needs '
                    f'{extra} more octet(s) but only {end - pos} remain'
                )
            break                   # lenient: drop the incomplete tail
        code = lead & mask
        for octet in octets[pos:pos + extra]:
            code = code << 6 | 0b00111111 & octet
        pos += extra
        decoded.append(code)

    #-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------
    if t in (bytes, bytearray) and all(cp < 256 for cp in decoded):
        return t(decoded)           # same bytes type when every code point fits in 8 bits
    if t in (str, bytes, bytearray):
        return ''.join([chr(cp) for cp in decoded])  # a str of Unicode characters
    if t is tuple:
        return tuple(decoded)       # a tuple of Unicode code point ints
    return decoded                  # a list of Unicode code point ints
#-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------.-------