from decimal import Decimal

# One row per multi-octet form:
#   (exclusive upper limit, lead-octet marker, continuation octets, XUTF-8 only)
# The lead octet is the marker plus whatever bits of the value remain above
# the 6-bit continuation groups.
_MULTI_OCTET_FORMS = (
    (2**11, 0xC0, 1, False),  # 110xxxxx + 1 x 10xxxxxx
    (2**16, 0xE0, 2, False),  # 1110xxxx + 2 x 10xxxxxx
    (2**21, 0xF0, 3, False),  # 11110xxx + 3 x 10xxxxxx
    (2**26, 0xF8, 4, False),  # 111110xx + 4 x 10xxxxxx
    (2**31, 0xFC, 5, False),  # 1111110x + 5 x 10xxxxxx
    (2**36, 0xFE, 6, True),   # 0xFE lead; may confuse some octet streams
    (2**42, 0xFF, 7, True),   # 0xFF lead; will confuse many octet streams
)


def _encode_code_point(z, modified, extended):
    """Return the list of octet values that encode the single integer *z*."""
    if z < 0:
        # Any negative value encodes a plain zero octet, so a caller can use
        # -1 to force a real 0 into the output even in MUTF-8 mode.
        return [0]
    if z == 0 and modified:
        # MUTF-8 encodes NUL as the overlong pair C0 80.
        return [0xC0, 0x80]
    if z < 0x80:
        return [z]  # 0xxxxxxx (ASCII)
    for limit, lead, tail, xutf8_only in _MULTI_OCTET_FORMS:
        if xutf8_only and not extended:
            break  # the 7- and 8-octet forms exist only in XUTF-8
        if z < limit:
            octets = [lead + (z >> (6 * tail))]
            octets.extend(
                ((z >> shift) & 0x3F) | 0x80
                for shift in range(6 * (tail - 1), -1, -6)
            )
            return octets
    # Too large for the selected encoding: the value is passed through
    # unconverted, exactly as given.
    return [z]


def to_utf8(
        cp,
        modified=True,
        normal=None,
        extended=None,
):
    """Convert one or more Unicode code points to UTF-8, MUTF-8, or XUTF-8.

    This function reverses ``un_utf8``.  It only carries out the conversion
    arithmetic: it does not validate code points, so UTF-16 surrogate values
    and values above 0x10FFFF are encoded like any other number.

    Encoding selection
        At most one of ``modified``, ``normal`` and ``extended`` may be true.
        Because ``modified`` defaults to true, pass ``modified=False`` when
        requesting ``normal`` or ``extended``.

        * modified (MUTF-8, the default): like UTF-8, but 0 is encoded as the
          overlong pair C0 80.
        * normal (UTF-8), or no flag set at all: 0 is encoded as a single
          zero octet.
        * extended (XUTF-8): like UTF-8, with two extra forms, a 7-octet form
          (lead octet 0xFE) for values below 2**36 and an 8-octet form
          (lead octet 0xFF) for values below 2**42.

        Without ``extended``, values below 2**31 are encoded in up to
        6 octets (0x10FFFF, the largest real Unicode code point, takes 4).
        A value too large for the selected encoding is left unconverted in
        the output; building a ``bytes``, ``bytearray`` or ``str`` result
        from such a value raises an exception.

    Args:
        cp: the value(s) to encode.  The exact type decides the result type:

            * ``int``, ``float``, ``decimal.Decimal``: a single code point;
              any fractional part is truncated.  Returns ``bytes``.
            * ``str``: each character is a code point.  Returns a ``str``
              with one character per encoded octet.
            * ``bytes``, ``bytearray``: each octet (0..255) is a code point.
              Returns the same type.
            * ``list``, ``tuple``: each item is converted with ``int()`` and
              encoded.  Returns the same type holding the octet values of
              all items, concatenated.
            * ``set``, ``frozenset``: encoded like a list, then collected
              back into the same type.
            * ``dict``: only the values are converted, recursively; keys are
              kept unchanged.
        modified: select MUTF-8.  ``None`` is treated as true.
        normal: select plain UTF-8.
        extended: select XUTF-8.

    Returns:
        The encoded octets, in a container chosen as described under ``cp``.

    Raises:
        ValueError: more than one encoding was selected.
        TypeError: ``type(cp)`` is not one of the supported types.

    Note:
        Any negative value is encoded as a single zero octet, so -1 can be
        used to force a real 0 into the result even when MUTF-8 is in effect.

    Author: Phil D. Howard.  The author may be contacted by decoding the
    number 11054987560151472272755686915985840251291393453694611309
    (hint, originally given in Esperanto: try converting the number to
    binary).
    """
    if modified is None:
        modified = True  # None means "use the default"
    modified, normal, extended = bool(modified), bool(normal), bool(extended)
    if (modified, normal, extended).count(True) > 1:
        raise ValueError('multiple conversion types requested, UTF-8 vs MUTF-8 vs XUTF-8')

    ty = type(cp)

    # Containers handled by recursion.  The encoding selection is passed on
    # so nested values use the same encoding as the outer call.
    if ty is dict:
        return {k: to_utf8(v, modified, normal, extended) for k, v in cp.items()}
    if ty in (set, frozenset):
        return ty(to_utf8(list(cp), modified, normal, extended))

    # Normalize every other supported input to an iterable of numbers.
    if ty is str:
        points = [ord(ch) for ch in cp]
    elif ty in (int, float, Decimal):
        points = [cp]  # make a solo number into a 1-sequence
    elif ty in (list, tuple, bytes, bytearray):
        points = cp
    else:
        raise TypeError(f'unsupported type {ty.__name__!r}')

    octets = []
    for z in points:
        # int() truncates float/Decimal and is needed for the bit operations.
        octets.extend(_encode_code_point(int(z), modified, extended))

    if ty is str:
        return ''.join(map(chr, octets))
    if ty in (int, float, Decimal):
        return bytes(octets)
    return ty(octets)