本文整理汇总了 Python 中 html.entities.name2codepoint.get 方法的典型用法代码示例。如果您正苦于以下问题：Python name2codepoint.get 方法的具体用法？Python name2codepoint.get 怎么用？Python name2codepoint.get 使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类 html.entities.name2codepoint 的用法示例。
在下文中一共展示了name2codepoint.get方法的8个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: decodeHtmlentities
# 需要导入模块: from html.entities import name2codepoint [as 别名]
# 或者: from html.entities.name2codepoint import get [as 别名]
def decodeHtmlentities(string):
    """Replace HTML character references in ``string`` with their characters.

    The input is first passed through ``entitiesfix``; every
    ``&name;``, ``&#NNN;`` or ``&#xHH;`` reference that can be resolved is
    then substituted. References that cannot be resolved (unknown name,
    malformed number, code point out of range) are left untouched.

    Args:
        string: Text that may contain HTML character references.

    Returns:
        The text with resolvable references replaced. On Python 2 each
        replacement is a UTF-8 encoded byte string, as before; on Python 3
        it is a ``str`` so that it can be spliced into the ``str`` input.
    """
    string = entitiesfix(string)
    # Raw string: "\d" / "\w" in a plain literal are invalid escape sequences.
    entity_re = re.compile(r"&(#?)(\d{1,5}|\w{1,8});")

    # Resolve the version-specific entity table once, not once per match.
    if PY3:
        from html.entities import name2codepoint as n2cp
    else:
        from htmlentitydefs import name2codepoint as n2cp

    def to_text(codepoint):
        char = unichr(codepoint)
        # A str pattern applied to str input requires a str replacement on
        # Python 3; returning bytes there makes re.subn raise TypeError.
        return char if PY3 else char.encode('utf-8')

    def substitute_entity(match):
        ent = match.group(2)
        if match.group(1) == "#":
            try:
                # The pattern also admits "\w" after '#', so "x27" (hex) and
                # arbitrary words reach this branch, not only decimal digits.
                if ent[:1] in ("x", "X"):
                    codepoint = int(ent[1:], 16)
                else:
                    codepoint = int(ent)
                return to_text(codepoint)
            except (ValueError, OverflowError):
                # Not a number, or not a valid code point: keep the source text.
                return match.group()
        cp = n2cp.get(ent)
        if cp:
            return to_text(cp)
        return match.group()

    return entity_re.subn(substitute_entity, string)[0]
示例2: get
# 需要导入模块: from html.entities import name2codepoint [as 别名]
# 或者: from html.entities.name2codepoint import get [as 别名]
def get(self, val, default=None):
    """Return ``str(val)`` when ``val`` is a valid term id, else ``default``.

    A value is valid when ``0 <= val < self.num_terms``.
    """
    in_range = 0 <= val < self.num_terms
    return str(val) if in_range else default
示例3: __init__
# 需要导入模块: from html.entities import name2codepoint [as 别名]
# 或者: from html.entities.name2codepoint import get [as 别名]
def __init__(self, corpus, reps):
    """
    Store the wrapped `corpus` and the requested length `reps`.

    The wrapper presents `corpus` as another corpus of length `reps` by
    repeating its documents over and over until `len(result)==reps` is
    reached. This constructor only records the two arguments.

    >>> corpus = [[(1, 0.5)], []] # 2 documents
    >>> list(RepeatCorpus(corpus, 5)) # repeat 2.5 times to get 5 documents
    [[(1, 0.5)], [], [(1, 0.5)], [], [(1, 0.5)]]

    """
    self.corpus, self.reps = corpus, reps
示例4: decode_htmlentities
# 需要导入模块: from html.entities import name2codepoint [as 别名]
# 或者: from html.entities.name2codepoint import get [as 别名]
def decode_htmlentities(text):
    """
    Decode HTML entities in text, coded as hex, decimal or named.

    Entities that cannot be decoded (unknown name, malformed number, code
    point out of range) are left in place. If the substitution as a whole
    fails, the input is returned unchanged.

    Adapted from http://github.com/sku/python-twitter-ircbot/blob/321d94e0e40d0acc92f5bf57d126b57369da70de/html_decode.py

    >>> u = u'E tu vivrai nel terrore - L&#x27;aldil&agrave; (1981)'
    >>> print(decode_htmlentities(u).encode('UTF-8'))
    E tu vivrai nel terrore - L'aldilà (1981)
    >>> print(decode_htmlentities("l&#39;eau"))
    l'eau
    >>> print(decode_htmlentities("foo &lt; bar"))
    foo < bar

    """
    def substitute_entity(match):
        ent = match.group(3)
        try:
            if match.group(1) == "#":
                # decoding by number; group 2 tells decimal from hex
                base_marker = match.group(2)
                if base_marker == '':
                    return unichr(int(ent))
                if base_marker in ('x', 'X'):
                    return unichr(int('0x' + ent, 16))
                # Unrecognised marker: keep the source text instead of
                # implicitly returning None from the callback.
                return match.group()
            # they were using a name
            cp = n2cp.get(ent)
            if cp:
                return unichr(cp)
        except (ValueError, OverflowError):
            # Malformed number or code point out of range (e.g. narrow
            # Python build): fall through and keep this entity as written
            # so the other entities in the text are still decoded.
            pass
        return match.group()

    try:
        return RE_HTML_ENTITY.sub(substitute_entity, text)
    except Exception:
        # Best effort: on any other error return the input. Unlike a bare
        # ``except:``, this lets KeyboardInterrupt/SystemExit propagate.
        return text
示例5: chunkize
# 需要导入模块: from html.entities import name2codepoint [as 别名]
# 或者: from html.entities.name2codepoint import get [as 别名]
def chunkize(corpus, chunksize, maxsize=0, as_numpy=False):
    """
    Yield successive chunks of `corpus`.

    With `maxsize > 0`, an `InputQueue` worker is started as a daemon and
    feeds a `multiprocessing.Queue` holding at most `maxsize` items; chunks
    are taken from that queue until a `None` item is received. Otherwise the
    chunks come straight from `chunkize_serial()`.

    `chunksize` must be positive. `chunksize` and `as_numpy` are forwarded
    unchanged to whichever producer is used.

    >>> for chunk in chunkize(range(10), 4): print(chunk)
    [0, 1, 2, 3]
    [4, 5, 6, 7]
    [8, 9]

    """
    assert chunksize > 0

    if not maxsize > 0:
        # Serial path: no queue, no worker.
        for piece in chunkize_serial(corpus, chunksize, as_numpy=as_numpy):
            yield piece
        return

    queue = multiprocessing.Queue(maxsize=maxsize)
    producer = InputQueue(queue, corpus, chunksize, maxsize=maxsize, as_numpy=as_numpy)
    producer.daemon = True
    producer.start()
    while True:
        # The chunk lives in a one-element list that is emptied on yield,
        # presumably so no local name keeps it referenced while waiting on
        # the next `get` -- TODO confirm.
        holder = [queue.get(block=True)]
        if holder[0] is None:
            break
        yield holder.pop()
示例6: substitute
# 需要导入模块: from html.entities import name2codepoint [as 别名]
# 或者: from html.entities.name2codepoint import get [as 别名]
def substitute(match):
    """Return the replacement text for one matched entity.

    Group 1 of `match` is ``"#"`` for a numeric reference and group 2 holds
    the number or entity name. Numeric references are converted with
    ``chr(int(...))``; names are looked up in ``n2cp``. When the lookup
    yields nothing, the matched text is returned unchanged.
    """
    is_numeric, body = match.group(1) == "#", match.group(2)
    if is_numeric:
        return chr(int(body))
    codepoint = n2cp.get(body)
    return chr(codepoint) if codepoint else match.group()
示例7: get_header_from_response
# 需要导入模块: from html.entities import name2codepoint [as 别名]
# 或者: from html.entities.name2codepoint import get [as 别名]
def get_header_from_response(url, header_to_get="", post=None, headers=None):
    """Request `url` with ``only_headers=True`` and return one response header.

    `header_to_get` is lower-cased before the lookup. `post` and `headers`
    are forwarded unchanged to ``httptools.downloadpage``. The result is
    whatever ``response.headers.get`` returns for the lower-cased name.
    """
    wanted = header_to_get.lower()
    reply = httptools.downloadpage(url, post=post, headers=headers, only_headers=True)
    return reply.headers.get(wanted)
示例8: decode_htmlentities
# 需要导入模块: from html.entities import name2codepoint [as 别名]
# 或者: from html.entities.name2codepoint import get [as 别名]
def decode_htmlentities(text):
    """
    Decode HTML entities in text, coded as hex, decimal or named.

    Entities that cannot be decoded (unknown name, malformed number, code
    point out of range) are left in place. If the substitution as a whole
    fails, the input is returned unchanged.

    Adapted from http://github.com/sku/python-twitter-ircbot/blob/321d94e0e40d0acc92f5bf57d126b57369da70de/html_decode.py

    >>> u = u'E tu vivrai nel terrore - L&#x27;aldil&agrave; (1981)'
    >>> print(decode_htmlentities(u).encode('UTF-8'))
    E tu vivrai nel terrore - L'aldilà (1981)
    >>> print(decode_htmlentities("l&#39;eau"))
    l'eau
    >>> print(decode_htmlentities("foo &lt; bar"))
    foo < bar

    """
    def substitute_entity(match):
        ent = match.group(3)
        try:
            if match.group(1) == "#":
                # decoding by number; group 2 tells decimal from hex
                base_marker = match.group(2)
                if base_marker == '':
                    return unichr(int(ent))
                if base_marker in ('x', 'X'):
                    return unichr(int('0x' + ent, 16))
                # Unrecognised marker: keep the source text instead of
                # implicitly returning None from the callback.
                return match.group()
            # they were using a name
            cp = n2cp.get(ent)
            if cp:
                return unichr(cp)
        except (ValueError, OverflowError):
            # Malformed number or code point out of range (e.g. narrow
            # Python build): fall through and keep this entity as written
            # so the other entities in the text are still decoded.
            pass
        return match.group()

    try:
        return RE_HTML_ENTITY.sub(substitute_entity, text)
    except Exception:
        # Best effort: on any other error return the input. Unlike a bare
        # ``except:``, this lets KeyboardInterrupt/SystemExit propagate.
        return text