# Source code for hbutils.encoding.decode

"""
Overview:
    Functions to deal with encoding binary data easily.
"""
import sys
from typing import Optional, List

import chardet

from ..collection import unique

# Single default encoding name kept at module level.
_DEFAULT_ENCODING = 'utf-8'

# Encodings tried (in this order) when the caller does not supply its own preference list.
# These are the common encodings for Chinese text.
_DEFAULT_PREFERRED_ENCODINGS = [
    'utf-8',
    'gbk',
    'gb2312',
    'gb18030',
    'big5',
]

# Names exported by ``from <this module> import *``.
__all__ = ['auto_decode']


def _decode(data: bytes, encoding: str) -> str:
    """
    Overview:
        Decode binary data with exactly one encoding, without any fallback.

    Arguments:
        - data (:obj:`bytes`): Binary data to be decoded.
        - encoding (:obj:`str`): Name of the encoding to use.

    Returns:
        - text (:obj:`str`): Decoded string.
    """
    text = data.decode(encoding)
    return text


def auto_decode(data: bytes, encoding: Optional[str] = None, prefers: Optional[List[str]] = None) -> str:
    r"""
    Overview:
        Auto decode binary data to string, the encoding mode will be automatically detected.

        When ``encoding`` is not given, candidate encodings are tried one by one in this order: \
        the entries of ``prefers``, then the interpreter's default encoding \
        (``sys.getdefaultencoding()``), then the encoding guessed by ``chardet``. \
        The result of the first candidate that decodes successfully is returned.

    Arguments:
        - data (:obj:`bytes`): Original binary data to be decoded.
        - encoding (:obj:`Optional[str]`): Encoding mode to be used, default is ``None`` which \
            means this function needs to automatically detect the encoding.
        - prefers (:obj:`Optional[List[str]]`): Preferred encodings, tried in the given order. \
            Default is ``None``, which means the module-level default list is used.

    Returns:
        - str (:obj:`str`): Decoded string.

    Raises:
        - UnicodeDecodeError: When every candidate fails. Among all the failures, the error \
            whose ``start`` position is the furthest into ``data`` is the one raised.
        - LookupError: When ``encoding`` or an entry of ``prefers`` is not a known codec name.

    Examples::

        >>> auto_decode(b'kdsfjldsjflkdsmgds')  # 'kdsfjldsjflkdsmgds'
        >>> auto_decode(b'\xd0\x94\xd0\xbe\xd0\xb1\xd1\x80\xd1\x8b\xd0\xb9 \xd0'
        ...             b'\xb2\xd0\xb5\xd1\x87\xd0\xb5\xd1\x80')  # "Добрый вечер"
        >>> auto_decode(b'\xa4\xb3\xa4\xf3\xa4\xd0\xa4\xf3\xa4\xcf')  # "こんばんは"
        >>> auto_decode(b'\xcd\xed\xc9\xcf\xba\xc3')  # "晚上好"
    """
    # An explicit encoding disables all guessing.
    if encoding:
        return _decode(data, encoding)

    if prefers is None:
        prefers = _DEFAULT_PREFERRED_ENCODINGS

    # Explicitly listed candidates first: de-duplicated, empty names dropped.
    candidates = [enc for enc in unique([*prefers, sys.getdefaultencoding()]) if enc]

    last_err = None
    for enc in candidates:
        try:
            return _decode(data, enc)
        except UnicodeDecodeError as err:
            # Keep the failure that got furthest into the data; it is the most informative one.
            if last_err is None or err.start > last_err.start:
                last_err = err

    # Content-based detection is the last resort, so it is only run once every
    # listed candidate has failed instead of on every call.
    detected = chardet.detect(data)['encoding']
    if detected and detected not in candidates:
        try:
            return _decode(data, detected)
        except LookupError:
            # The detector may report a name for which Python has no codec; treat
            # that as one more failed candidate and report the decode error below.
            pass
        except UnicodeDecodeError as err:
            if last_err is None or err.start > last_err.start:
                last_err = err

    raise last_err