当前位置: 首页>>代码示例>>Python>>正文

Python codecs.BOM_UTF16_BE属性代码示例

本文整理汇总了 Python 中 codecs.BOM_UTF16_BE 属性的典型用法代码示例。如果您正苦于以下问题：codecs.BOM_UTF16_BE 属性的具体用法是什么？codecs.BOM_UTF16_BE 怎么用？有哪些 codecs.BOM_UTF16_BE 的使用例子？那么恭喜您，这里精选的属性代码示例或许可以为您提供帮助。您也可以进一步了解该属性所在模块 codecs 的用法示例。


示例1: _detect_encoding

# 需要导入模块: import codecs [as 别名]
# 或者: from codecs import BOM_UTF16_BE [as 别名]
def _detect_encoding(self, fileid):
    """Guess the text encoding of ``fileid`` from its first line.

    The first line is read as bytes.  A leading byte-order mark decides
    the encoding; failing that, the ``encoding`` attribute of an XML
    declaration is used; failing that, ``'utf-8'`` is returned.

    :param fileid: a ``PathPointer`` (read through its ``open()`` method)
        or anything accepted by the builtin ``open()``.
    :return: the name of the detected encoding.
    """
    if isinstance(fileid, PathPointer):
        infile = fileid.open()
        try:
            s = infile.readline()
        finally:
            # Release the stream even if readline() raises.
            infile.close()
    else:
        with open(fileid, 'rb') as infile:
            s = infile.readline()

    # The UTF-32-LE BOM (FF FE 00 00) begins with the UTF-16-LE BOM
    # (FF FE), so the UTF-32 marks must be tested before the UTF-16 ones.
    bom_table = (
        (codecs.BOM_UTF32_BE, 'utf-32-be'),
        (codecs.BOM_UTF32_LE, 'utf-32-le'),
        (codecs.BOM_UTF16_BE, 'utf-16-be'),
        (codecs.BOM_UTF16_LE, 'utf-16-le'),
        (codecs.BOM_UTF8, 'utf-8'),
    )
    for bom, encoding in bom_table:
        if s.startswith(bom):
            return encoding

    # No BOM: look for an XML declaration, double- then single-quoted.
    m = re.match(br'\s*<\?xml\b.*\bencoding="([^"]+)"', s)
    if m:
        return m.group(1).decode()
    m = re.match(br"\s*<\?xml\b.*\bencoding='([^']+)'", s)
    if m:
        return m.group(1).decode()
    # No encoding found -- what should the default be?
    return 'utf-8'

示例2: get_text_contents

# 需要导入模块: import codecs [as 别名]
# 或者: from codecs import BOM_UTF16_BE [as 别名]
def get_text_contents(self):
    """Return the node contents, decoded when a known BOM leads them.

    The bytes come from ``self.get_contents()``.  If they begin with a
    UTF-8, UTF-16-LE or UTF-16-BE byte-order mark, the mark is removed
    and the remainder is passed to ``my_decode`` with the matching codec
    name.  Without a recognised mark the contents are returned untouched.
    """
    raw = self.get_contents()
    # decode() treats a leading BOM differently depending on the codec
    # and the Python version, so the mark is always cut off by hand
    # before decoding.
    known_marks = (
        (codecs.BOM_UTF8, 'utf-8'),
        (codecs.BOM_UTF16_LE, 'utf-16-le'),
        (codecs.BOM_UTF16_BE, 'utf-16-be'),
    )
    for mark, codec_name in known_marks:
        if raw.startswith(mark):
            return my_decode(raw[len(mark):], codec_name)
    return raw

示例3: guess_json_utf

# 需要导入模块: import codecs [as 别名]
# 或者: from codecs import BOM_UTF16_BE [as 别名]
def guess_json_utf(data):
    """Guess the UTF encoding of a JSON byte string.

    Only the first four bytes of ``data`` are examined.

    :param data: the encoded JSON document as bytes.
    :return: an encoding name, or ``None`` when the null-byte pattern
        matches no UTF encoding.
    """
    # JSON always starts with two ASCII characters, so detection is as
    # easy as counting the nulls and from their location and count
    # determine the encoding. Also detect a BOM, if present.
    sample = data[:4]
    # codecs.BOM32_BE is a legacy alias of the two-byte UTF-16-BE mark and
    # can never equal a four-byte sample; BOM_UTF32_BE is the right name.
    if sample in (codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE):
        return 'utf-32'     # BOM included
    if sample[:3] == codecs.BOM_UTF8:
        return 'utf-8-sig'  # BOM included, MS style (discouraged)
    if sample[:2] in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE):
        return 'utf-16'     # BOM included
    # _null, _null2 and _null3 are defined outside this block
    # (presumably one, two and three NUL bytes -- confirm).
    nullcount = sample.count(_null)
    if nullcount == 0:
        return 'utf-8'
    if nullcount == 2:
        if sample[::2] == _null2:   # 1st and 3rd are null
            return 'utf-16-be'
        if sample[1::2] == _null2:  # 2nd and 4th are null
            return 'utf-16-le'
        # Did not detect 2 valid UTF-16 ascii-range characters
    if nullcount == 3:
        if sample[:3] == _null3:
            return 'utf-32-be'
        if sample[1:] == _null3:
            return 'utf-32-le'
        # Did not detect a valid UTF-32 ascii-range character
    return None

示例4: guess_json_utf

# 需要导入模块: import codecs [as 别名]
# 或者: from codecs import BOM_UTF16_BE [as 别名]
def guess_json_utf(data):
    """Guess the UTF encoding of a JSON byte string.

    Only the first four bytes of ``data`` are examined.  The result is an
    encoding name, or ``None`` when the null-byte pattern matches no UTF
    encoding.

    :rtype: str
    """
    # JSON always starts with two ASCII characters, so detection is as
    # easy as counting the nulls and from their location and count
    # determine the encoding. Also detect a BOM, if present.
    sample = data[:4]
    if sample in (codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE):
        return 'utf-32'     # BOM included
    if sample[:3] == codecs.BOM_UTF8:
        return 'utf-8-sig'  # BOM included, MS style (discouraged)
    if sample[:2] in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE):
        return 'utf-16'     # BOM included
    # _null, _null2 and _null3 are defined outside this block
    # (presumably one, two and three NUL bytes -- confirm).
    nullcount = sample.count(_null)
    if nullcount == 0:
        return 'utf-8'
    if nullcount == 2:
        if sample[::2] == _null2:   # 1st and 3rd are null
            return 'utf-16-be'
        if sample[1::2] == _null2:  # 2nd and 4th are null
            return 'utf-16-le'
        # Did not detect 2 valid UTF-16 ascii-range characters
    if nullcount == 3:
        if sample[:3] == _null3:
            return 'utf-32-be'
        if sample[1:] == _null3:
            return 'utf-32-le'
        # Did not detect a valid UTF-32 ascii-range character
    return None

示例5: determine_encoding

# 需要导入模块: import codecs [as 别名]
# 或者: from codecs import BOM_UTF16_BE [as 别名]
def determine_encoding(self):
    """Choose the decoder for ``self.raw_buffer`` from its leading BOM.

    When the buffer is not already text, ``self.raw_decode`` and
    ``self.encoding`` are set to UTF-16-LE or UTF-16-BE if the buffer
    starts with the matching byte-order mark, and to UTF-8 otherwise.
    A text buffer leaves both attributes untouched.
    """
    # At least two bytes are needed to recognise a UTF-16 BOM.
    while not self.eof and len(self.raw_buffer) < 2:
        # NOTE(review): the loop body is missing from this excerpt;
        # upstream PyYAML refills the buffer with update_raw() here --
        # confirm against the enclosing class.
        self.update_raw()
    # type(u'') is `unicode` on Python 2 and `str` on Python 3, so the
    # check works on both without naming `unicode` directly.
    if not isinstance(self.raw_buffer, type(u'')):
        if self.raw_buffer.startswith(codecs.BOM_UTF16_LE):
            self.raw_decode = codecs.utf_16_le_decode
            self.encoding = 'utf-16-le'
        elif self.raw_buffer.startswith(codecs.BOM_UTF16_BE):
            self.raw_decode = codecs.utf_16_be_decode
            self.encoding = 'utf-16-be'
        else:
            # No UTF-16 BOM: fall back to UTF-8.
            self.raw_decode = codecs.utf_8_decode
            self.encoding = 'utf-8'

示例6: determine_encoding

# 需要导入模块: import codecs [as 别名]
# 或者: from codecs import BOM_UTF16_BE [as 别名]
def determine_encoding(self):
    """Choose the decoder for ``self.raw_buffer`` from its leading BOM.

    When the buffer holds bytes, ``self.raw_decode`` and ``self.encoding``
    are set to UTF-16-LE or UTF-16-BE if the buffer starts with the
    matching byte-order mark, and to UTF-8 otherwise.  A buffer that is
    not ``bytes`` leaves both attributes untouched.
    """
    # At least two bytes are needed to recognise a UTF-16 BOM.
    while not self.eof and (self.raw_buffer is None or len(self.raw_buffer) < 2):
        # NOTE(review): the loop body is missing from this excerpt;
        # upstream PyYAML refills the buffer with update_raw() here --
        # confirm against the enclosing class.
        self.update_raw()
    if isinstance(self.raw_buffer, bytes):
        if self.raw_buffer.startswith(codecs.BOM_UTF16_LE):
            self.raw_decode = codecs.utf_16_le_decode
            self.encoding = 'utf-16-le'
        elif self.raw_buffer.startswith(codecs.BOM_UTF16_BE):
            self.raw_decode = codecs.utf_16_be_decode
            self.encoding = 'utf-16-be'
        else:
            # No UTF-16 BOM: fall back to UTF-8.
            self.raw_decode = codecs.utf_8_decode
            self.encoding = 'utf-8'

示例7: detect_encoding

# 需要导入模块: import codecs [as 别名]
# 或者: from codecs import BOM_UTF16_BE [as 别名]
def detect_encoding(data):
    """Detect which UTF codec was used to encode the given bytes.

    The latest JSON standard (:rfc:`8259`) suggests that only UTF-8 is
    accepted. Older documents allowed 8, 16, or 32. 16 and 32 can be big
    or little endian. Some editors or libraries may prepend a BOM.

    Only the first four bytes are inspected.

    :param data: Bytes in unknown UTF encoding.
    :return: UTF encoding name
    """
    head = data[:4]

    if head[:3] == codecs.BOM_UTF8:
        return 'utf-8-sig'

    # Test every BOM before the NUL-byte shortcut so a UTF-16 mark is
    # recognised even when the bytes after it contain no NUL.  UTF-32 goes
    # first because its little-endian BOM begins with the UTF-16-LE BOM.
    if head in (codecs.BOM_UTF32_BE, codecs.BOM_UTF32_LE):
        return 'utf-32'

    if head[:2] in (codecs.BOM_UTF16_BE, codecs.BOM_UTF16_LE):
        return 'utf-16'

    if b'\x00' not in head:
        return 'utf-8'

    # No BOM: infer width and byte order from where the NUL bytes sit.
    if len(head) == 4:
        if head[:3] == b'\x00\x00\x00':
            return 'utf-32-be'

        if head[::2] == b'\x00\x00':
            return 'utf-16-be'

        if head[1:] == b'\x00\x00\x00':
            return 'utf-32-le'

        if head[1::2] == b'\x00\x00':
            return 'utf-16-le'

    if len(head) == 2:
        return 'utf-16-be' if head.startswith(b'\x00') else 'utf-16-le'

    return 'utf-8'

示例8: detectBOM

# 需要导入模块: import codecs [as 别名]
# 或者: from codecs import BOM_UTF16_BE [as 别名]
def detectBOM(self):
    """Attempts to detect at BOM at the start of the stream. If
    an encoding can be determined from the BOM return the name of the
    encoding otherwise return None"""
    bomDict = {
        codecs.BOM_UTF8: 'utf-8',
        codecs.BOM_UTF16_LE: 'utf-16le', codecs.BOM_UTF16_BE: 'utf-16be',
        codecs.BOM_UTF32_LE: 'utf-32le', codecs.BOM_UTF32_BE: 'utf-32be',
    }

    # Read the first 4 bytes (the stream is assumed to be positioned at
    # its start -- nothing here rewinds it beforehand).
    string = self.rawStream.read(4)
    assert isinstance(string, bytes)

    # Try detecting the BOM using bytes from the string
    encoding = bomDict.get(string[:3])         # UTF-8
    seek = 3
    if not encoding:
        # Need to detect UTF-32 before UTF-16
        encoding = bomDict.get(string)         # UTF-32
        seek = 4
        if not encoding:
            encoding = bomDict.get(string[:2])  # UTF-16
            seek = 2

    # Set the read position past the BOM if one was found, otherwise
    # set it to the start of the stream
    if encoding:
        self.rawStream.seek(seek)
        # lookupEncoding is defined outside this block; its result is
        # returned as-is.
        return lookupEncoding(encoding)
    self.rawStream.seek(0)
    return None

示例9: guess_json_utf

# 需要导入模块: import codecs [as 别名]
# 或者: from codecs import BOM_UTF16_BE [as 别名]
def guess_json_utf(data):
    """Guess the UTF encoding of a JSON byte string.

    Only the first four bytes of ``data`` are examined.  The result is an
    encoding name, or ``None`` when the null-byte pattern matches no UTF
    encoding.

    :rtype: str
    """
    # JSON always starts with two ASCII characters, so detection is as
    # easy as counting the nulls and from their location and count
    # determine the encoding. Also detect a BOM, if present.
    sample = data[:4]
    # codecs.BOM32_BE is a legacy alias of the two-byte UTF-16-BE mark and
    # can never equal a four-byte sample; BOM_UTF32_BE is the right name.
    if sample in (codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE):
        return 'utf-32'     # BOM included
    if sample[:3] == codecs.BOM_UTF8:
        return 'utf-8-sig'  # BOM included, MS style (discouraged)
    if sample[:2] in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE):
        return 'utf-16'     # BOM included
    # _null, _null2 and _null3 are defined outside this block
    # (presumably one, two and three NUL bytes -- confirm).
    nullcount = sample.count(_null)
    if nullcount == 0:
        return 'utf-8'
    if nullcount == 2:
        if sample[::2] == _null2:   # 1st and 3rd are null
            return 'utf-16-be'
        if sample[1::2] == _null2:  # 2nd and 4th are null
            return 'utf-16-le'
        # Did not detect 2 valid UTF-16 ascii-range characters
    if nullcount == 3:
        if sample[:3] == _null3:
            return 'utf-32-be'
        if sample[1:] == _null3:
            return 'utf-32-le'
        # Did not detect a valid UTF-32 ascii-range character
    return None

示例10: detectBOM

# 需要导入模块: import codecs [as 别名]
# 或者: from codecs import BOM_UTF16_BE [as 别名]
def detectBOM(self):
    """Attempts to detect at BOM at the start of the stream. If
    an encoding can be determined from the BOM return the name of the
    encoding otherwise return None"""
    bomDict = {
        codecs.BOM_UTF8: 'utf-8',
        codecs.BOM_UTF16_LE: 'utf-16-le', codecs.BOM_UTF16_BE: 'utf-16-be',
        codecs.BOM_UTF32_LE: 'utf-32-le', codecs.BOM_UTF32_BE: 'utf-32-be',
    }

    # Read the first 4 bytes (the stream is assumed to be positioned at
    # its start -- nothing here rewinds it beforehand).
    string = self.rawStream.read(4)
    assert isinstance(string, bytes)

    # Try detecting the BOM using bytes from the string
    encoding = bomDict.get(string[:3])         # UTF-8
    seek = 3
    if not encoding:
        # Need to detect UTF-32 before UTF-16
        encoding = bomDict.get(string)         # UTF-32
        seek = 4
        if not encoding:
            encoding = bomDict.get(string[:2])  # UTF-16
            seek = 2

    # Set the read position past the BOM if one was found, otherwise
    # set it to the start of the stream
    self.rawStream.seek(seek if encoding else 0)

    return encoding

示例11: encode_endian

# 需要导入模块: import codecs [as 别名]
# 或者: from codecs import BOM_UTF16_BE [as 别名]
def encode_endian(text, encoding, errors="strict", le=True):
    """Like text.encode(encoding) but always returns little endian/big endian
    BOMs instead of the system one.

    Args:
        text (text)
        encoding (str)
        errors (str)
        le (boolean): if little endian
    Returns:
        bytes
    Raises:
        UnicodeEncodeError
        LookupError
    """

    # Normalise aliases such as "UTF16" to the canonical codec name.
    encoding = codecs.lookup(encoding).name

    if encoding == "utf-16":
        if le:
            return codecs.BOM_UTF16_LE + text.encode("utf-16-le", errors)
        else:
            return codecs.BOM_UTF16_BE + text.encode("utf-16-be", errors)
    elif encoding == "utf-32":
        if le:
            return codecs.BOM_UTF32_LE + text.encode("utf-32-le", errors)
        else:
            return codecs.BOM_UTF32_BE + text.encode("utf-32-be", errors)
    else:
        # Any other codec is encoded as requested, with no BOM added here.
        return text.encode(encoding, errors)

示例12: has_bom

# 需要导入模块: import codecs [as 别名]
# 或者: from codecs import BOM_UTF16_BE [as 别名]
def has_bom(fn):
    """Return True if the file ``fn`` starts with a UTF-8 or UTF-16 BOM.

    :param fn: path of the file to inspect; it is opened in binary mode.
    :return: ``True`` when the first bytes are the UTF-8, UTF-16-LE or
        UTF-16-BE byte-order mark, ``False`` otherwise.
    """
    with open(fn, 'rb') as f:
        sample = f.read(4)
    # NOTE(review): the excerpt ended on a dangling ``or \``; the UTF-16-BE
    # test is restored as the missing operand -- confirm against the
    # original source.
    return sample.startswith(
        (codecs.BOM_UTF8, codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE))

示例13: get_text_contents

# 需要导入模块: import codecs [as 别名]
# 或者: from codecs import BOM_UTF16_BE [as 别名]
def get_text_contents(self):
    """
    This attempts to figure out what the encoding of the text is
    based upon the BOM bytes, and then decodes the contents so that
    it's a valid python string.

    The bytes come from ``self.get_contents()``.  Without a recognised
    BOM the contents are decoded as UTF-8, falling back to Latin-1 when
    that fails.
    """
    contents = self.get_contents()
    # The behavior of various decode() methods and functions
    # w.r.t. the initial BOM bytes is different for different
    # encodings and/or Python versions.  ('utf-8' does not strip
    # them, but has a 'utf-8-sig' which does; 'utf-16' seems to
    # strip them; etc.)  Just sidestep all the complication by
    # explicitly stripping the BOM before we decode().
    if contents[:len(codecs.BOM_UTF8)] == codecs.BOM_UTF8:
        return contents[len(codecs.BOM_UTF8):].decode('utf-8')
    if contents[:len(codecs.BOM_UTF16_LE)] == codecs.BOM_UTF16_LE:
        return contents[len(codecs.BOM_UTF16_LE):].decode('utf-16-le')
    if contents[:len(codecs.BOM_UTF16_BE)] == codecs.BOM_UTF16_BE:
        return contents[len(codecs.BOM_UTF16_BE):].decode('utf-16-be')
    try:
        return contents.decode('utf-8')
    except UnicodeDecodeError:
        try:
            return contents.decode('latin-1')
        except UnicodeDecodeError:
            # Last resort.  The keyword is ``errors``; the previous
            # ``error=`` spelling would raise TypeError instead.
            return contents.decode('utf-8', errors='backslashreplace')
