本文整理汇总了Python中codecs.BOM_UTF16_LE属性的典型用法代码示例。如果您正苦于以下问题：Python codecs.BOM_UTF16_LE属性的具体用法？Python codecs.BOM_UTF16_LE怎么用？Python codecs.BOM_UTF16_LE使用的例子？那么恭喜您，这里精选的属性代码示例或许可以为您提供帮助。您也可以进一步了解该属性所在模块codecs的用法示例。
在下文中一共展示了codecs.BOM_UTF16_LE属性的14个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: _detect_encoding
# 需要导入模块: import codecs [as 别名]
# 或者: from codecs import BOM_UTF16_LE [as 别名]
def _detect_encoding(self, fileid):
    """Guess the text encoding of a file from its first line.

    A leading byte-order mark decides the encoding.  Without a BOM, the
    ``encoding="..."`` (or ``encoding='...'``) attribute of an XML
    declaration on the first line is used.  When neither is present,
    ``'utf-8'`` is assumed.

    :param fileid: Either a ``PathPointer`` (opened through its own
        ``open()`` method) or a path accepted by the builtin ``open()``.
    :return: The name of the detected encoding.
    :rtype: str
    """
    if isinstance(fileid, PathPointer):
        # NOTE(review): presumably this stream yields bytes like the 'rb'
        # branch below -- the BOM and regex checks require it; confirm.
        s = fileid.open().readline()
    else:
        with open(fileid, 'rb') as infile:
            s = infile.readline()

    # The UTF-32 marks must be tested before the UTF-16 ones: the UTF-32-LE
    # BOM (FF FE 00 00) begins with the UTF-16-LE BOM (FF FE), so testing
    # UTF-16 first would report every UTF-32-LE file as 'utf-16-le'.
    if s.startswith(codecs.BOM_UTF32_BE):
        return 'utf-32-be'
    if s.startswith(codecs.BOM_UTF32_LE):
        return 'utf-32-le'
    if s.startswith(codecs.BOM_UTF16_BE):
        return 'utf-16-be'
    if s.startswith(codecs.BOM_UTF16_LE):
        return 'utf-16-le'
    if s.startswith(codecs.BOM_UTF8):
        return 'utf-8'

    # No BOM: fall back to the XML declaration, accepting either quote style.
    m = re.match(br'\s*<\?xml\b.*\bencoding="([^"]+)"', s)
    if m:
        return m.group(1).decode()
    m = re.match(br"\s*<\?xml\b.*\bencoding='([^']+)'", s)
    if m:
        return m.group(1).decode()

    # No encoding found -- what should the default be?
    return 'utf-8'
示例2: get_text_contents
# 需要导入模块: import codecs [as 别名]
# 或者: from codecs import BOM_UTF16_LE [as 别名]
def get_text_contents(self):
    """Return this node's contents, decoded when they start with a known BOM.

    The contents come from ``self.get_contents()``.  If they begin with a
    UTF-8, UTF-16-LE or UTF-16-BE byte-order mark, the mark is removed and
    the remainder is decoded with ``my_decode()`` using the matching
    encoding.  Contents without one of those marks are returned unchanged.
    """
    raw = self.get_contents()

    # Decoders disagree (across encodings and Python versions) about
    # whether they drop a leading BOM: 'utf-8' keeps it while 'utf-8-sig'
    # removes it, 'utf-16' appears to remove it, and so on.  Cutting the
    # BOM off ourselves before decoding avoids depending on any of that.
    #
    # TODO(2.2): once 2.3 is the floor, my_decode() can be replaced by a
    # plain .decode() call.
    bom_encodings = (
        (codecs.BOM_UTF8, 'utf-8'),
        (codecs.BOM_UTF16_LE, 'utf-16-le'),
        (codecs.BOM_UTF16_BE, 'utf-16-be'),
    )
    for bom, encoding in bom_encodings:
        if raw.startswith(bom):
            return my_decode(raw[len(bom):], encoding)
    return raw
示例3: guess_json_utf
# 需要导入模块: import codecs [as 别名]
# 或者: from codecs import BOM_UTF16_LE [as 别名]
def guess_json_utf(data):
    """Guess which UTF encoding a JSON byte string uses.

    Only the first four bytes of ``data`` are inspected.  A byte-order mark
    is recognised first.  Otherwise the number and position of NUL bytes
    decide the answer, relying on the fact that JSON always starts with two
    ASCII characters.

    :param data: The encoded JSON document.
    :return: An encoding name, or ``None`` when the NUL pattern matches no
        UTF encoding.
    """
    sample = data[:4]
    # codecs.BOM32_BE is a legacy alias for the two-byte UTF-16-BE mark, so
    # it can never equal a four-byte sample; the UTF-32-BE mark is
    # codecs.BOM_UTF32_BE.
    if sample in (codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE):
        return 'utf-32'     # BOM included
    if sample[:3] == codecs.BOM_UTF8:
        return 'utf-8-sig'  # BOM included, MS style (discouraged)
    if sample[:2] in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE):
        return 'utf-16'     # BOM included
    nullcount = sample.count(_null)
    if nullcount == 0:
        return 'utf-8'
    if nullcount == 2:
        if sample[::2] == _null2:   # 1st and 3rd are null
            return 'utf-16-be'
        if sample[1::2] == _null2:  # 2nd and 4th are null
            return 'utf-16-le'
        # Did not detect 2 valid UTF-16 ascii-range characters
    if nullcount == 3:
        if sample[:3] == _null3:
            return 'utf-32-be'
        if sample[1:] == _null3:
            return 'utf-32-le'
        # Did not detect a valid UTF-32 ascii-range character
    return None
示例4: guess_json_utf
# 需要导入模块: import codecs [as 别名]
# 或者: from codecs import BOM_UTF16_LE [as 别名]
def guess_json_utf(data):
    """
    Guess the UTF flavour of an encoded JSON document from its first bytes.

    Returns an encoding name, or ``None`` when nothing matches.

    :rtype: str
    """
    # A JSON text opens with two ASCII characters, so when there is no BOM
    # the count and position of NUL bytes in the first four bytes reveal
    # the code-unit width and the byte order.
    head = data[:4]

    # Byte-order marks take priority over the NUL heuristics.
    if head == codecs.BOM_UTF32_LE or head == codecs.BOM_UTF32_BE:
        return 'utf-32'     # BOM included
    if head[:3] == codecs.BOM_UTF8:
        return 'utf-8-sig'  # BOM included, MS style (discouraged)
    first_two = head[:2]
    if first_two == codecs.BOM_UTF16_LE or first_two == codecs.BOM_UTF16_BE:
        return 'utf-16'     # BOM included

    zeros = head.count(_null)
    if zeros == 0:
        return 'utf-8'
    if zeros == 2:
        if head[::2] == _null2:     # bytes 1 and 3 are NUL
            return 'utf-16-be'
        if head[1::2] == _null2:    # bytes 2 and 4 are NUL
            return 'utf-16-le'
        # Two NULs, but not laid out like two ASCII-range UTF-16 units.
    elif zeros == 3:
        if head[:3] == _null3:
            return 'utf-32-be'
        if head[1:] == _null3:
            return 'utf-32-le'
        # Three NULs, but not laid out like one ASCII-range UTF-32 unit.
    return None
示例5: determine_encoding
# 需要导入模块: import codecs [as 别名]
# 或者: from codecs import BOM_UTF16_LE [as 别名]
def determine_encoding(self):
    """Choose the decoder for the raw input from its byte-order mark.

    ``self.update_raw()`` is called until at least two items are buffered
    or ``self.eof`` is set.  When the buffer is not already ``unicode``,
    ``self.raw_decode`` and ``self.encoding`` are set to UTF-16-LE or
    UTF-16-BE according to the leading BOM, and to UTF-8 when there is
    none.  The method always finishes by calling ``self.update(1)``.
    """
    # Two bytes are needed to recognise a UTF-16 BOM.
    while not self.eof and len(self.raw_buffer) < 2:
        self.update_raw()

    buffered = self.raw_buffer
    if not isinstance(buffered, unicode):
        if buffered.startswith(codecs.BOM_UTF16_LE):
            decoder, name = codecs.utf_16_le_decode, 'utf-16-le'
        elif buffered.startswith(codecs.BOM_UTF16_BE):
            decoder, name = codecs.utf_16_be_decode, 'utf-16-be'
        else:
            decoder, name = codecs.utf_8_decode, 'utf-8'
        self.raw_decode = decoder
        self.encoding = name

    self.update(1)
示例6: determine_encoding
# 需要导入模块: import codecs [as 别名]
# 或者: from codecs import BOM_UTF16_LE [as 别名]
def determine_encoding(self):
    """Choose the decoder for the raw input from its byte-order mark.

    ``self.update_raw()`` is called until ``self.raw_buffer`` holds at
    least two items or ``self.eof`` is set.  When the buffer is ``bytes``,
    ``self.raw_decode`` and ``self.encoding`` are set to UTF-16-LE or
    UTF-16-BE according to the leading BOM, and to UTF-8 when there is
    none.  The method always finishes by calling ``self.update(1)``.
    """
    # Two bytes are needed to recognise a UTF-16 BOM.
    while not self.eof:
        pending = self.raw_buffer
        if pending is not None and len(pending) >= 2:
            break
        self.update_raw()

    head = self.raw_buffer
    if isinstance(head, bytes):
        if head.startswith(codecs.BOM_UTF16_LE):
            chosen = (codecs.utf_16_le_decode, 'utf-16-le')
        elif head.startswith(codecs.BOM_UTF16_BE):
            chosen = (codecs.utf_16_be_decode, 'utf-16-be')
        else:
            chosen = (codecs.utf_8_decode, 'utf-8')
        self.raw_decode, self.encoding = chosen

    self.update(1)
示例7: detect_encoding
# 需要导入模块: import codecs [as 别名]
# 或者: from codecs import BOM_UTF16_LE [as 别名]
def detect_encoding(data):
    """Work out which UTF codec produced the given bytes.

    The latest JSON standard (:rfc:`8259`) suggests that only UTF-8 is
    accepted, but older documents allowed UTF-8, UTF-16 or UTF-32, the
    latter two in either byte order, and some editors or libraries
    prepend a BOM.  Only the first four bytes are examined.

    :param data: Bytes in unknown UTF encoding.
    :return: UTF encoding name
    """
    prefix = data[:4]
    size = len(prefix)

    if prefix[:3] == codecs.BOM_UTF8:
        return 'utf-8-sig'

    # Without any NUL byte this cannot be BOM-less UTF-16/32 ASCII.
    if prefix.count(b'\x00') == 0:
        return 'utf-8'

    if prefix == codecs.BOM_UTF32_BE or prefix == codecs.BOM_UTF32_LE:
        return 'utf-32'

    bom16 = prefix[:2]
    if bom16 == codecs.BOM_UTF16_BE or bom16 == codecs.BOM_UTF16_LE:
        return 'utf-16'

    if size == 4:
        # Each entry pairs a view of the prefix with the NUL run that
        # identifies an encoding; the order of the checks matters.
        nul_patterns = (
            (prefix[:3], b'\x00\x00\x00', 'utf-32-be'),
            (prefix[::2], b'\x00\x00', 'utf-16-be'),
            (prefix[1:], b'\x00\x00\x00', 'utf-32-le'),
            (prefix[1::2], b'\x00\x00', 'utf-16-le'),
        )
        for window, nuls, name in nul_patterns:
            if window == nuls:
                return name
    elif size == 2:
        # A single UTF-16 code unit: the NUL sits on the high-byte side.
        if prefix[:1] == b'\x00':
            return 'utf-16-be'
        return 'utf-16-le'

    return 'utf-8'
示例8: detectBOM
# 需要导入模块: import codecs [as 别名]
# 或者: from codecs import BOM_UTF16_LE [as 别名]
def detectBOM(self):
    """Try to identify a byte-order mark in the raw stream.

    Four bytes are read from ``self.rawStream``.  If they start with a
    known BOM, the stream is positioned just past it and the result of
    ``lookupEncoding()`` for the matching encoding name is returned.
    Otherwise the stream is rewound to offset 0 and None is returned.
    """
    bom_names = {
        codecs.BOM_UTF8: 'utf-8',
        codecs.BOM_UTF16_LE: 'utf-16le', codecs.BOM_UTF16_BE: 'utf-16be',
        codecs.BOM_UTF32_LE: 'utf-32le', codecs.BOM_UTF32_BE: 'utf-32be'
    }

    # NOTE(review): the read starts at the stream's current position;
    # presumably callers invoke this at offset 0 -- confirm.
    head = self.rawStream.read(4)
    assert isinstance(head, bytes)

    # Candidate prefixes in detection order: UTF-8 (3 bytes), then UTF-32
    # (4 bytes), then UTF-16 (2 bytes).  UTF-32 has to come before UTF-16
    # because the UTF-32-LE mark begins with the UTF-16-LE mark.
    candidates = ((3, head[:3]), (4, head), (2, head[:2]))
    for bom_length, prefix in candidates:
        name = bom_names.get(prefix)
        if name:
            # Leave the stream positioned right after the BOM.
            self.rawStream.seek(bom_length)
            return lookupEncoding(name)

    # No BOM: rewind so that nothing is lost.
    self.rawStream.seek(0)
    return None
示例9: guess_json_utf
# 需要导入模块: import codecs [as 别名]
# 或者: from codecs import BOM_UTF16_LE [as 别名]
def guess_json_utf(data):
    """
    Guess which UTF encoding a JSON byte string uses.

    Only the first four bytes of ``data`` are inspected: a byte-order mark
    is recognised first, otherwise the number and position of NUL bytes
    decide.  Returns ``None`` when the NUL pattern matches no UTF encoding.

    :rtype: str
    """
    # JSON always starts with two ASCII characters, so detection is as
    # easy as counting the nulls and from their location and count
    # determine the encoding. Also detect a BOM, if present.
    sample = data[:4]
    # Use BOM_UTF32_BE here: codecs.BOM32_BE is a legacy alias for the
    # two-byte UTF-16-BE mark and would never match a UTF-32-BE document.
    if sample in (codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE):
        return 'utf-32'     # BOM included
    if sample[:3] == codecs.BOM_UTF8:
        return 'utf-8-sig'  # BOM included, MS style (discouraged)
    if sample[:2] in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE):
        return 'utf-16'     # BOM included
    nullcount = sample.count(_null)
    if nullcount == 0:
        return 'utf-8'
    if nullcount == 2:
        if sample[::2] == _null2:   # 1st and 3rd are null
            return 'utf-16-be'
        if sample[1::2] == _null2:  # 2nd and 4th are null
            return 'utf-16-le'
        # Did not detect 2 valid UTF-16 ascii-range characters
    if nullcount == 3:
        if sample[:3] == _null3:
            return 'utf-32-be'
        if sample[1:] == _null3:
            return 'utf-32-le'
        # Did not detect a valid UTF-32 ascii-range character
    return None
示例10: detectBOM
# 需要导入模块: import codecs [as 别名]
# 或者: from codecs import BOM_UTF16_LE [as 别名]
def detectBOM(self):
    """Try to identify a byte-order mark in the raw stream.

    Four bytes are read from ``self.rawStream``.  If they start with a
    known BOM, the stream is positioned just past it and the encoding name
    is returned.  Otherwise the stream is rewound to offset 0 and None is
    returned.
    """
    bom_to_name = {
        codecs.BOM_UTF8: 'utf-8',
        codecs.BOM_UTF16_LE: 'utf-16-le', codecs.BOM_UTF16_BE: 'utf-16-be',
        codecs.BOM_UTF32_LE: 'utf-32-le', codecs.BOM_UTF32_BE: 'utf-32-be'
    }

    # NOTE(review): the read starts at the stream's current position;
    # presumably callers invoke this at offset 0 -- confirm.
    head = self.rawStream.read(4)
    assert isinstance(head, bytes)

    # UTF-8 first (3 bytes), then UTF-32 (4 bytes) and only then UTF-16
    # (2 bytes): the UTF-32-LE mark begins with the UTF-16-LE mark.
    encoding, bom_length = bom_to_name.get(head[:3]), 3
    if not encoding:
        encoding, bom_length = bom_to_name.get(head), 4
    if not encoding:
        encoding, bom_length = bom_to_name.get(head[:2]), 2

    # Skip the BOM when one was found; otherwise go back to the start.
    self.rawStream.seek(bom_length if encoding else 0)
    return encoding
示例11: encode_endian
# 需要导入模块: import codecs [as 别名]
# 或者: from codecs import BOM_UTF16_LE [as 别名]
def encode_endian(text, encoding, errors="strict", le=True):
    """Encode ``text``, choosing the byte order of UTF-16/UTF-32 explicitly.

    Works like ``text.encode(encoding)``, except that for the generic
    "utf-16" and "utf-32" codecs the BOM and byte order follow ``le``
    rather than the system's native order.

    Args:
        text (text)
        encoding (str)
        errors (str)
        le (boolean): if little endian
    Returns:
        bytes
    Raises:
        UnicodeEncodeError
        LookupError
    """
    # Normalise aliases ("UTF16", "utf_16", ...) to the canonical name.
    canonical = codecs.lookup(encoding).name

    # canonical name -> ((LE BOM, LE codec), (BE BOM, BE codec))
    endian_variants = {
        "utf-16": (
            (codecs.BOM_UTF16_LE, "utf-16-le"),
            (codecs.BOM_UTF16_BE, "utf-16-be"),
        ),
        "utf-32": (
            (codecs.BOM_UTF32_LE, "utf-32-le"),
            (codecs.BOM_UTF32_BE, "utf-32-be"),
        ),
    }

    if canonical not in endian_variants:
        # Any other codec has no byte-order choice to make.
        return text.encode(canonical, errors)

    little, big = endian_variants[canonical]
    bom, fixed_codec = little if le else big
    return bom + text.encode(fixed_codec, errors)
示例12: iter_text_fixups
# 需要导入模块: import codecs [as 别名]
# 或者: from codecs import BOM_UTF16_LE [as 别名]
def iter_text_fixups(data, encoding):
    """Generate candidate repairs of ``data`` to try when decoding.

    The untouched ``data`` always comes first.  For ``Encoding.UTF16BE``
    one more candidate with an extra NUL byte appended follows.  For
    ``Encoding.UTF16`` three more follow: NUL appended, a UTF-16-LE BOM
    prepended, and both repairs together.
    """
    yield data

    if encoding == Encoding.UTF16BE:
        # Terminator was cut short by one byte.
        yield data + b"\x00"
        return

    if encoding == Encoding.UTF16:
        # Terminator was cut short by one byte.
        yield data + b"\x00"
        # BOM missing; such content is usually little endian.
        yield codecs.BOM_UTF16_LE + data
        # Both problems at once.
        yield codecs.BOM_UTF16_LE + data + b"\x00"
示例13: has_bom
# 需要导入模块: import codecs [as 别名]
# 或者: from codecs import BOM_UTF16_LE [as 别名]
def has_bom(fn):
    """Return True if the file ``fn`` starts with a UTF-8 or UTF-16 BOM.

    The file is opened in binary mode and only its first four bytes are
    read.
    """
    with open(fn, 'rb') as stream:
        head = stream.read(4)
    # One startswith() call covers the UTF-8 mark and both UTF-16 marks.
    recognised = (b'\xef\xbb\xbf', codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE)
    return head.startswith(recognised)
示例14: get_text_contents
# 需要导入模块: import codecs [as 别名]
# 或者: from codecs import BOM_UTF16_LE [as 别名]
def get_text_contents(self):
    """
    Return the contents from ``self.get_contents()`` decoded to text.

    The encoding is chosen from the leading BOM bytes when one is present
    (UTF-8, UTF-16-LE or UTF-16-BE).  Without a BOM the contents are tried
    as UTF-8 and then as Latin-1, so that the result is always a valid
    python string.
    """
    contents = self.get_contents()
    # The behavior of various decode() methods and functions
    # w.r.t. the initial BOM bytes is different for different
    # encodings and/or Python versions.  ('utf-8' does not strip
    # them, but has a 'utf-8-sig' which does; 'utf-16' seems to
    # strip them; etc.)  Just sidestep all the complication by
    # explicitly stripping the BOM before we decode().
    if contents[:len(codecs.BOM_UTF8)] == codecs.BOM_UTF8:
        return contents[len(codecs.BOM_UTF8):].decode('utf-8')
    if contents[:len(codecs.BOM_UTF16_LE)] == codecs.BOM_UTF16_LE:
        return contents[len(codecs.BOM_UTF16_LE):].decode('utf-16-le')
    if contents[:len(codecs.BOM_UTF16_BE)] == codecs.BOM_UTF16_BE:
        return contents[len(codecs.BOM_UTF16_BE):].decode('utf-16-be')

    # No BOM: try UTF-8 first, then fall back to Latin-1.
    try:
        return contents.decode('utf-8')
    except UnicodeDecodeError:
        pass
    try:
        return contents.decode('latin-1')
    except UnicodeDecodeError:
        # Defensive last resort.  The keyword is ``errors`` (plural);
        # spelling it ``error`` would raise TypeError instead of decoding.
        return contents.decode('utf-8', errors='backslashreplace')