This article collects typical usage examples of the unicodedata.decomposition method in Python. If you are wondering what unicodedata.decomposition does, how to call it, or what real-world uses look like, the curated code examples here should help. You can also explore further usage examples for the unicodedata module that the method belongs to.
Six code examples of unicodedata.decomposition are shown below, sorted by popularity by default.
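Before the examples, a quick sketch of what the method actually returns (illustrative characters chosen by us; the values match the standard Unicode character database): a space-separated string of hex code points, optionally prefixed with a formatting tag such as <compat> or <wide>, or the empty string when a character has no decomposition.

import unicodedata

print(unicodedata.decomposition('\u00e9'))  # '0065 0301' -- canonical: e + combining acute
print(unicodedata.decomposition('\ufb01'))  # '<compat> 0066 0069' -- the fi ligature
print(unicodedata.decomposition('A'))       # '' -- no decomposition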
Example 1: test_ipy2_gh357
# Required import: import unicodedata [as alias]
# Or: from unicodedata import decomposition [as alias]
def test_ipy2_gh357(self):
    """https://github.com/IronLanguages/ironpython2/issues/357"""
    import unicodedata
    if is_cli:
        self.assertEqual(unicodedata.name(u'\u4e2d'), '<CJK IDEOGRAPH, FIRST>..<CJK IDEOGRAPH, LAST>')
    else:
        self.assertEqual(unicodedata.name(u'\u4e2d'), 'CJK UNIFIED IDEOGRAPH-4E2D')
    self.assertRaises(ValueError, unicodedata.decimal, u'\u4e2d')
    self.assertEqual(unicodedata.decimal(u'\u4e2d', 0), 0)
    self.assertRaises(ValueError, unicodedata.digit, u'\u4e2d')
    self.assertEqual(unicodedata.digit(u'\u4e2d', 0), 0)
    self.assertRaises(ValueError, unicodedata.numeric, u'\u4e2d')
    self.assertEqual(unicodedata.numeric(u'\u4e2d', 0), 0)
    self.assertEqual(unicodedata.category(u'\u4e2d'), 'Lo')
    self.assertEqual(unicodedata.bidirectional(u'\u4e2d'), 'L')
    self.assertEqual(unicodedata.combining(u'\u4e2d'), 0)
    self.assertEqual(unicodedata.east_asian_width(u'\u4e2d'), 'W')
    self.assertEqual(unicodedata.mirrored(u'\u4e2d'), 0)
    self.assertEqual(unicodedata.decomposition(u'\u4e2d'), '')
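A short usage note on the pattern this test exercises (a minimal sketch for CPython 3, using the same character as above): the optional second argument to decimal, digit, and numeric is returned instead of raising ValueError, while decomposition reports "no decomposition" as an empty string rather than an exception.

import unicodedata

ch = '\u4e2d'  # a CJK ideograph with no numeric value
print(unicodedata.numeric(ch, None))   # None -- the default suppresses ValueError
print(unicodedata.decomposition(ch))   # ''   -- empty string, no exception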
Example 2: test_urlsplit_normalization
# Required import: import unicodedata [as alias]
# Or: from unicodedata import decomposition [as alias]
def test_urlsplit_normalization(self):
    # Certain characters should never occur in the netloc,
    # including under normalization.
    # Ensure that ALL of them are detected and cause an error
    illegal_chars = '/:#?@'
    hex_chars = {'{:04X}'.format(ord(c)) for c in illegal_chars}
    denorm_chars = [
        c for c in map(chr, range(128, sys.maxunicode))
        if (hex_chars & set(unicodedata.decomposition(c).split()))
           and c not in illegal_chars
    ]
    # Sanity check that we found at least one such character
    self.assertIn('\u2100', denorm_chars)
    self.assertIn('\uFF03', denorm_chars)
    for scheme in ["http", "https", "ftp"]:
        for c in denorm_chars:
            url = "{}://netloc{}false.netloc/path".format(scheme, c)
            with self.subTest(url=url, char='{:04X}'.format(ord(c))):
                with self.assertRaises(ValueError):
                    urllib.parse.urlsplit(url)
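To see why the two sanity-check characters are guaranteed to land in denorm_chars, inspect their raw decompositions (a quick illustration; values from the Unicode character database):

import unicodedata

print(unicodedata.decomposition('\u2100'))  # '<compat> 0061 002F 0063' -- contains 002F, i.e. '/'
print(unicodedata.decomposition('\uFF03'))  # '<wide> 0023'             -- contains 0023, i.e. '#'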
Example 3: test_urlsplit_normalization
# Required import: import unicodedata [as alias]
# Or: from unicodedata import decomposition [as alias]
def test_urlsplit_normalization(self):
    # Certain characters should never occur in the netloc,
    # including under normalization.
    # Ensure that ALL of them are detected and cause an error
    illegal_chars = '/:#?@'
    hex_chars = {'{:04X}'.format(ord(c)) for c in illegal_chars}
    maxunicode = 0xffff if sys.implementation.name == "ironpython" else sys.maxunicode  # https://github.com/IronLanguages/ironpython3/issues/252
    denorm_chars = [
        c for c in map(chr, range(128, maxunicode))
        if (hex_chars & set(unicodedata.decomposition(c).split()))
           and c not in illegal_chars
    ]
    # Sanity check that we found at least one such character
    self.assertIn('\u2100', denorm_chars)
    self.assertIn('\uFF03', denorm_chars)
    # https://github.com/IronLanguages/ironpython3/issues/614
    is_mono = False
    mono_issue_chars = ("\ufe13", "\ufe16", "\ufe5f")
    if sys.implementation.name == "ironpython":
        import clr
        is_mono = clr.IsMono
    for scheme in ["http", "https", "ftp"]:
        for c in denorm_chars:
            url = "{}://netloc{}false.netloc/path".format(scheme, c)
            with self.subTest(url=url, char='{:04X}'.format(ord(c))):
                if is_mono and c in mono_issue_chars:
                    urllib.parse.urlsplit(url)  # ensure we fail if this ever gets fixed
                    continue
                with self.assertRaises(ValueError):
                    urllib.parse.urlsplit(url)
Example 4: clean_unicode
# Required import: import unicodedata [as alias]
# Or: from unicodedata import decomposition [as alias]
def clean_unicode(s):
    s = s.replace("\u00ad", "")  # soft hyphen
    s = s.replace("\u2010", "-")  # hyphen
    # Some sources encode an i with an accent above using dotless i,
    # which must be converted to normal i
    s = list(s)
    for i in range(len(s) - 1):
        # Only convert for marks that combine *above* the base letter
        # (canonical combining class 230); marks below should leave
        # the dotless i untouched.
        if s[i] == "ı" and unicodedata.combining(s[i + 1]) == 230:
            s[i] = "i"
    s = "".join(s)
    # Selectively apply compatibility decomposition.
    # This converts, e.g., ﬁ to fi and ： to :, but not ² to 2.
    # Unsure: … to ...
    # More classes could be added here.
    def decompose(c):
        d = unicodedata.decomposition(c)
        if d and d.split(None, 1)[0] in ["<compat>", "<wide>", "<narrow>", "<noBreak>"]:
            return unicodedata.normalize("NFKD", c)
        else:
            return c
    s = "".join(map(decompose, s))
    # Convert combining characters when possible
    s = unicodedata.normalize("NFC", s)
    return s
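A few hypothetical inputs illustrating the selective behavior (assuming the clean_unicode defined above; the tags come from unicodedata.decomposition):

print(clean_unicode("e\ufb03cient"))  # 'efficient' -- the ffi ligature is tagged <compat>
print(clean_unicode("x\u00b2"))       # 'x²'        -- <super> is not whitelisted, so the superscript survives
print(clean_unicode("\u0131\u0301"))  # 'í'         -- dotless i + accent above, then NFC composes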
Example 5: __missing__
# Required import: import unicodedata [as alias]
# Or: from unicodedata import decomposition [as alias]
def __missing__(self, key):
    ch = self.get(key)
    if ch is not None:
        return ch
    de = unicodedata.decomposition(chr(key))
    if de:
        try:
            ch = int(de.split(None, 1)[0], 16)
        except (IndexError, ValueError):
            ch = key
    else:
        ch = key
    self[key] = ch
    return ch
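This __missing__ appears to belong to a dict subclass used as a translation table: keys and values are code points, and an unseen character falls back to the first code point of its decomposition. A minimal sketch of a hypothetical host class (the name DecompTable is ours, not from the source):

import unicodedata

class DecompTable(dict):
    # Hypothetical host for the __missing__ shown above.
    def __missing__(self, key):
        ch = self.get(key)
        if ch is not None:
            return ch
        de = unicodedata.decomposition(chr(key))
        if de:
            try:
                # The first field of a canonical decomposition is a hex code
                # point; tagged fields such as '<compat>' fail int() and fall
                # back to the key itself.
                ch = int(de.split(None, 1)[0], 16)
            except (IndexError, ValueError):
                ch = key
        else:
            ch = key
        self[key] = ch
        return ch

# str.translate() looks characters up by code point, triggering __missing__:
print("caf\u00e9".translate(DecompTable()))  # 'cafe' -- é falls back to 0065 ('e')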
Example 6: test_urlsplit_normalization
# Required import: import unicodedata [as alias]
# Or: from unicodedata import decomposition [as alias]
def test_urlsplit_normalization(self):
    # Certain characters should never occur in the netloc,
    # including under normalization.
    # Ensure that ALL of them are detected and cause an error
    illegal_chars = u'/:#?@'
    hex_chars = {'{:04X}'.format(ord(c)) for c in illegal_chars}
    denorm_chars = [
        c for c in map(unichr, range(128, sys.maxunicode))
        if (hex_chars & set(unicodedata.decomposition(c).split()))
           and c not in illegal_chars
    ]
    # Sanity check that we found at least one such character
    self.assertIn(u'\u2100', denorm_chars)
    self.assertIn(u'\uFF03', denorm_chars)
    # https://github.com/IronLanguages/ironpython3/issues/614
    is_mono = False
    mono_issue_chars = (u"\ufe13", u"\ufe16", u"\ufe5f")
    if sys.platform == "cli":
        import clr
        is_mono = clr.IsMono
    # bpo-36742: Verify port separators are ignored when they
    # existed prior to decomposition
    urlparse.urlsplit(u'http://\u30d5\u309a:80')
    with self.assertRaises(ValueError):
        urlparse.urlsplit(u'http://\u30d5\u309a\ufe1380')
        if is_mono: raise ValueError  # the call above does not raise on Mono
    if is_mono: urlparse.urlsplit(u'http://\u30d5\u309a\ufe1380')  # ensure we fail if this ever gets fixed
    for scheme in [u"http", u"https", u"ftp"]:
        for netloc in [u"netloc{}false.netloc", u"n{}user@netloc"]:
            for c in denorm_chars:
                url = u"{}://{}/path".format(scheme, netloc.format(c))
                if test_support.verbose:
                    print "Checking %r" % url
                if is_mono and c in mono_issue_chars:
                    urlparse.urlsplit(url)  # ensure we fail if this ever gets fixed
                    continue
                with self.assertRaises(ValueError):
                    urlparse.urlsplit(url)
    # check error message: invalid netloc must be formatted with repr()
    # to get an ASCII error message
    with self.assertRaises(ValueError) as cm:
        urlparse.urlsplit(u'http://example.com\uFF03@bing.com')
    self.assertEqual(str(cm.exception),
                     "netloc u'example.com\\uff03@bing.com' contains invalid characters "
                     "under NFKC normalization")
    self.assertIsInstance(cm.exception.args[0], str)
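For context on the bpo-36742 assertions above (a Python 3 illustration, since the example itself is Python 2): NFKC normalization turns U+FE13 into a ':' that was not there before, which is exactly the case urlsplit must reject.

import unicodedata

print(unicodedata.normalize('NFKC', '\u30d5\u309a:80'))       # 'プ:80' -- the ':' predates normalization: accepted
print(unicodedata.normalize('NFKC', '\u30d5\u309a\ufe1380'))  # 'プ:80' -- the ':' appears only afterwards: rejected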