本文整理汇总了Python中_sre.CODESIZE属性的典型用法代码示例。如果您正苦于以下问题:Python _sre.CODESIZE属性的具体用法?Python _sre.CODESIZE怎么用?Python _sre.CODESIZE使用的例子?那么恭喜您, 这里精选的属性代码示例或许可以为您提供帮助。您也可以进一步了解该属性所在类_sre
的用法示例。
在下文中一共展示了_sre.CODESIZE属性的9个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: _bytes_to_codes
# 需要导入模块: import _sre [as 别名]
# 或者: from _sre import CODESIZE [as 别名]
def _bytes_to_codes(b):
# Convert block indices to word array
import array
if _sre.CODESIZE == 2:
code = 'H'
else:
code = 'I'
a = array.array(code, bytes(b))
assert a.itemsize == _sre.CODESIZE
assert len(a) * a.itemsize == len(b)
return a.tolist()
示例2: _bytes_to_codes
# 需要导入模块: import _sre [as 别名]
# 或者: from _sre import CODESIZE [as 别名]
def _bytes_to_codes(b):
# Convert block indices to word array
a = memoryview(b).cast('I')
assert a.itemsize == _sre.CODESIZE
assert len(a) * a.itemsize == len(b)
return a.tolist()
示例3: _hex_code
# 需要导入模块: import _sre [as 别名]
# 或者: from _sre import CODESIZE [as 别名]
def _hex_code(code):
return '[%s]' % ', '.join('%#0*x' % (_sre.CODESIZE*2+2, x) for x in code)
示例4: _mk_bitmap
# 需要导入模块: import _sre [as 别名]
# 或者: from _sre import CODESIZE [as 别名]
def _mk_bitmap(bits):
data = []
dataappend = data.append
if _sre.CODESIZE == 2:
start = (1, 0)
else:
start = (1, 0)
m, v = start
for c in bits:
if c:
v = v + m
m = m + m
if m > MAXCODE:
dataappend(v)
m, v = start
return data
# To represent a big charset, first a bitmap of all characters in the
# set is constructed. Then, this bitmap is sliced into chunks of 256
# characters, duplicate chunks are eliminated, and each chunk is
# given a number. In the compiled expression, the charset is
# represented by a 16-bit word sequence, consisting of one word for
# the number of different chunks, a sequence of 256 bytes (128 words)
# of chunk numbers indexed by their original chunk position, and a
# sequence of chunks (16 words each).
# Compression is normally good: in a typical charset, large ranges of
# Unicode will be either completely excluded (e.g. if only cyrillic
# letters are to be matched), or completely included (e.g. if large
# subranges of Kanji match). These ranges will be represented by
# chunks of all one-bits or all zero-bits.
# Matching can be also done efficiently: the more significant byte of
# the Unicode character is an index into the chunk number, and the
# less significant byte is a bit index in the chunk (just like the
# CHARSET matching).
# In UCS-4 mode, the BIGCHARSET opcode still supports only subsets
# of the basic multilingual plane; an efficient representation
# for all of UTF-16 has not yet been developed. This means,
# in particular, that negated charsets cannot be represented as
# bigcharsets.
示例5: _optimize_unicode
# 需要导入模块: import _sre [as 别名]
# 或者: from _sre import CODESIZE [as 别名]
def _optimize_unicode(charset, fixup):
try:
import array
except ImportError:
return charset
charmap = [0]*65536
negate = 0
try:
for op, av in charset:
if op is NEGATE:
negate = 1
elif op is LITERAL:
charmap[fixup(av)] = 1
elif op is RANGE:
for i in range(fixup(av[0]), fixup(av[1])+1):
charmap[i] = 1
elif op is CATEGORY:
# XXX: could expand category
return charset # cannot compress
except IndexError:
# non-BMP characters; XXX now they should work
return charset
if negate:
if sys.maxunicode != 65535:
# XXX: negation does not work with big charsets
# XXX2: now they should work, but removing this will make the
# charmap 17 times bigger
return charset
for i in range(65536):
charmap[i] = not charmap[i]
comps = {}
mapping = [0]*256
block = 0
data = []
for i in range(256):
chunk = tuple(charmap[i*256:(i+1)*256])
new = comps.setdefault(chunk, block)
mapping[i] = new
if new == block:
block = block + 1
data = data + _mk_bitmap(chunk)
header = [block]
if _sre.CODESIZE == 2:
code = 'H'
else:
code = 'I'
# Convert block indices to byte array of 256 bytes
mapping = array.array('b', mapping).tobytes()
# Convert byte array to word array
mapping = array.array(code, mapping)
assert mapping.itemsize == _sre.CODESIZE
assert len(mapping) * mapping.itemsize == 256
header = header + mapping.tolist()
data[0:0] = header
return [(BIGCHARSET, data)]
示例6: _mk_bitmap
# 需要导入模块: import _sre [as 别名]
# 或者: from _sre import CODESIZE [as 别名]
def _mk_bitmap(bits):
data = []
dataappend = data.append
if _sre.CODESIZE == 2:
start = (1, 0)
else:
start = (1L, 0L)
m, v = start
for c in bits:
if c:
v = v + m
m = m + m
if m > MAXCODE:
dataappend(v)
m, v = start
return data
# To represent a big charset, first a bitmap of all characters in the
# set is constructed. Then, this bitmap is sliced into chunks of 256
# characters, duplicate chunks are eliminated, and each chunk is
# given a number. In the compiled expression, the charset is
# represented by a 16-bit word sequence, consisting of one word for
# the number of different chunks, a sequence of 256 bytes (128 words)
# of chunk numbers indexed by their original chunk position, and a
# sequence of chunks (16 words each).
# Compression is normally good: in a typical charset, large ranges of
# Unicode will be either completely excluded (e.g. if only cyrillic
# letters are to be matched), or completely included (e.g. if large
# subranges of Kanji match). These ranges will be represented by
# chunks of all one-bits or all zero-bits.
# Matching can be also done efficiently: the more significant byte of
# the Unicode character is an index into the chunk number, and the
# less significant byte is a bit index in the chunk (just like the
# CHARSET matching).
# In UCS-4 mode, the BIGCHARSET opcode still supports only subsets
# of the basic multilingual plane; an efficient representation
# for all of UTF-16 has not yet been developed. This means,
# in particular, that negated charsets cannot be represented as
# bigcharsets.
示例7: _optimize_unicode
# 需要导入模块: import _sre [as 别名]
# 或者: from _sre import CODESIZE [as 别名]
def _optimize_unicode(charset, fixup):
try:
import array
except ImportError:
return charset
charmap = [0]*65536
negate = 0
try:
for op, av in charset:
if op is NEGATE:
negate = 1
elif op is LITERAL:
charmap[fixup(av)] = 1
elif op is RANGE:
for i in xrange(fixup(av[0]), fixup(av[1])+1):
charmap[i] = 1
elif op is CATEGORY:
# XXX: could expand category
return charset # cannot compress
except IndexError:
# non-BMP characters
return charset
if negate:
if sys.maxunicode != 65535:
# XXX: negation does not work with big charsets
return charset
for i in xrange(65536):
charmap[i] = not charmap[i]
comps = {}
mapping = [0]*256
block = 0
data = []
for i in xrange(256):
chunk = tuple(charmap[i*256:(i+1)*256])
new = comps.setdefault(chunk, block)
mapping[i] = new
if new == block:
block = block + 1
data = data + _mk_bitmap(chunk)
header = [block]
if _sre.CODESIZE == 2:
code = 'H'
else:
code = 'I'
# Convert block indices to byte array of 256 bytes
mapping = array.array('b', mapping).tostring()
# Convert byte array to word array
mapping = array.array(code, mapping)
assert mapping.itemsize == _sre.CODESIZE
header = header + mapping.tolist()
data[0:0] = header
return [(BIGCHARSET, data)]
示例8: _mk_bitmap
# 需要导入模块: import _sre [as 别名]
# 或者: from _sre import CODESIZE [as 别名]
def _mk_bitmap(bits):
data = []
dataappend = data.append
if _sre.CODESIZE == 2:
start = (1, 0)
else:
start = (1L, 0L)
m, v = start
for c in bits:
if c:
v = v + m
m = m + m
if m > MAXCODE:
dataappend(v)
m, v = start
return data
# To represent a big charset, first a bitmap of all characters in the
# set is constructed. Then, this bitmap is sliced into chunks of 256
# characters, duplicate chunks are eliminated, and each chunk is
# given a number. In the compiled expression, the charset is
# represented by a 32-bit word sequence, consisting of one word for
# the number of different chunks, a sequence of 256 bytes (64 words)
# of chunk numbers indexed by their original chunk position, and a
# sequence of 256-bit chunks (8 words each).
# Compression is normally good: in a typical charset, large ranges of
# Unicode will be either completely excluded (e.g. if only cyrillic
# letters are to be matched), or completely included (e.g. if large
# subranges of Kanji match). These ranges will be represented by
# chunks of all one-bits or all zero-bits.
# Matching can be also done efficiently: the more significant byte of
# the Unicode character is an index into the chunk number, and the
# less significant byte is a bit index in the chunk (just like the
# CHARSET matching).
# In UCS-4 mode, the BIGCHARSET opcode still supports only subsets
# of the basic multilingual plane; an efficient representation
# for all of Unicode has not yet been developed. This means,
# in particular, that negated charsets cannot be represented as
# bigcharsets.
示例9: _optimize_unicode
# 需要导入模块: import _sre [as 别名]
# 或者: from _sre import CODESIZE [as 别名]
def _optimize_unicode(charset, fixup):
try:
import array
except ImportError:
return charset
charmap = [0]*65536
negate = 0
try:
for op, av in charset:
if op is NEGATE:
negate = 1
elif op is LITERAL:
charmap[fixup(av)] = 1
elif op is RANGE:
for i in xrange(fixup(av[0]), fixup(av[1])+1):
charmap[i] = 1
elif op is CATEGORY:
# XXX: could expand category
return charset # cannot compress
except IndexError:
# non-BMP characters
return charset
if negate:
if sys.maxunicode != 65535:
# XXX: negation does not work with big charsets
return charset
for i in xrange(65536):
charmap[i] = not charmap[i]
comps = {}
mapping = [0]*256
block = 0
data = []
for i in xrange(256):
chunk = tuple(charmap[i*256:(i+1)*256])
new = comps.setdefault(chunk, block)
mapping[i] = new
if new == block:
block = block + 1
data = data + _mk_bitmap(chunk)
header = [block]
if _sre.CODESIZE == 2:
code = 'H'
else:
code = 'I'
# Convert block indices to byte array of 256 bytes
mapping = array.array('B', mapping).tostring()
# Convert byte array to word array
mapping = array.array(code, mapping)
assert mapping.itemsize == _sre.CODESIZE
header = header + mapping.tolist()
data[0:0] = header
return [(BIGCHARSET, data)]