# Source code for hbutils.encoding.decode

"""
Overview:
    Functions for easily decoding binary data into text.
"""
from typing import Optional

import chardet

# Default encoding name (not referenced by the code visible in this module).
_DEFAULT_ENCODING = 'utf-8'

# Encodings tried, in this order, when auto-detecting; these are common
# encodings for Chinese text.
_ENCODING_LIST = [
    'utf-8',
    'gbk',
    'gb2312',
    'gb18030',
    'big5',
]

# Public API of this module.
__all__ = ['auto_decode']


def auto_decode(data: bytes, encoding: Optional[str] = None) -> str:
    r"""
    Overview:
        Decode binary data to a string. When no encoding is given, the encoding is detected
        automatically by trying a list of candidate encodings one after another.

    Arguments:
        - data (:obj:`bytes`): Original binary data to be decoded.
        - encoding (:obj:`Optional[str]`): Encoding to be used. Default is ``None``, which means
            this function needs to detect the encoding automatically. Any falsy value
            (such as an empty string) is treated the same way.

    Returns:
        - str (:obj:`str`): Decoded string.

    Raises:
        - UnicodeDecodeError: In automatic mode, when none of the candidate encodings can
            decode ``data``; the error of the last failed decoding attempt is raised.
            When ``encoding`` is given explicitly, any error raised by ``bytes.decode``
            is propagated unchanged.

    Examples::

        >>> auto_decode(b'kdsfjldsjflkdsmgds')  # 'kdsfjldsjflkdsmgds'
        >>> auto_decode(b'\xd0\x94\xd0\xbe\xd0\xb1\xd1\x80\xd1\x8b\xd0\xb9 \xd0'
        ...             b'\xb2\xd0\xb5\xd1\x87\xd0\xb5\xd1\x80')  # "Добрый вечер"
        >>> auto_decode(b'\xa4\xb3\xa4\xf3\xa4\xd0\xa4\xf3\xa4\xcf')  # "こんばんは"
        >>> auto_decode(b'\xcd\xed\xc9\xcf\xba\xc3')  # "晚上好"
    """
    # An explicit encoding is used as-is; no detection and no fallback.
    if encoding:
        return data.decode(encoding)

    # The fixed candidates are tried first; chardet's guess is appended as a
    # last resort, and only when it is not already one of the candidates.
    auto_encoding = chardet.detect(data)['encoding']
    if auto_encoding and auto_encoding not in _ENCODING_LIST:
        candidates = _ENCODING_LIST + [auto_encoding]
    else:
        candidates = _ENCODING_LIST

    last_err = None
    unknown_codec_err = None
    for enc in candidates:
        try:
            return data.decode(encoding=enc)
        except UnicodeDecodeError as err:
            last_err = err
        except LookupError as err:
            # The detector may report a charset name for which Python has no
            # codec. Skip it so that the caller still receives the decode
            # error instead of an unrelated "unknown encoding" failure.
            unknown_codec_err = err

    # Prefer the decode error; fall back to the lookup error only if no
    # candidate got as far as actually decoding.
    raise last_err if last_err is not None else unknown_codec_err