本文整理汇总了Python中unicodedata.normalize方法的典型用法代码示例。如果您正苦于以下问题:Python unicodedata.normalize方法的具体用法?Python unicodedata.normalize怎么用?Python unicodedata.normalize使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块unicodedata
的用法示例。
在下文中一共展示了unicodedata.normalize方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: _tokenize
# 需要导入模块: import unicodedata [as 别名]
# 或者: from unicodedata import normalize [as 别名]
def _tokenize(self, text):
    """Split *text* into word-piece tokens.

    For uncased models the text is first NFD-normalized, stripped of
    combining marks (category 'Mn'), and lower-cased.  Punctuation and
    CJK characters are isolated with surrounding spaces, whitespace is
    collapsed, and control/invalid characters are dropped; each
    resulting word is then fed to the word-piece tokenizer.
    """
    if not self._cased:
        # Case-insensitive mode: decompose, drop combining marks, lower.
        decomposed = unicodedata.normalize('NFD', text)
        text = ''.join(
            ch for ch in decomposed if unicodedata.category(ch) != 'Mn'
        )
        text = text.lower()
    pieces = []
    for ch in text:
        if self._is_punctuation(ch) or self._is_cjk_character(ch):
            # Isolate so each becomes its own token after splitting.
            pieces.append(' ' + ch + ' ')
        elif self._is_space(ch):
            pieces.append(' ')
        elif ord(ch) in (0, 0xfffd) or self._is_control(ch):
            # NUL, replacement char, and control chars are discarded.
            continue
        else:
            pieces.append(ch)
    tokens = []
    for word in ''.join(pieces).strip().split():
        tokens.extend(self._word_piece_tokenize(word))
    return tokens
示例2: secure_filename
# 需要导入模块: import unicodedata [as 别名]
# 或者: from unicodedata import normalize [as 别名]
def secure_filename(filename):
    """Return an ASCII-only version of *filename* that is safe to use
    as a single path component on any supported platform.
    """
    if isinstance(filename, str):
        from unicodedata import normalize
        # Transliterate to ASCII, silently dropping what cannot map.
        filename = normalize('NFKD', filename).encode('ascii', 'ignore')
        filename = filename.decode('ascii')
    for sep in (os.path.sep, os.path.altsep):
        if sep:
            filename = filename.replace(sep, ' ')
    joined = '_'.join(filename.split())
    filename = str(_filename_ascii_strip_re.sub('', joined)).strip('._')
    # on nt a couple of special files are present in each folder. We
    # have to ensure that the target file is not such a filename. In
    # this case we prepend an underline
    if (os.name == 'nt' and filename
            and filename.split('.')[0].upper() in _windows_device_files):
        filename = '_' + filename
    return filename
示例3: unicodify
# 需要导入模块: import unicodedata [as 别名]
# 或者: from unicodedata import normalize [as 别名]
def unicodify(s, encoding='utf-8', norm=None):
    """Ensure string is Unicode.

    .. versionadded:: 1.31

    Decode encoded strings using ``encoding`` and normalise Unicode
    to form ``norm`` if specified.

    Args:
        s (str): String to decode. May also be Unicode.
        encoding (str, optional): Encoding to use on bytestrings.
        norm (None, optional): Normalisation form to apply to Unicode string.

    Returns:
        unicode: Decoded, optionally normalised, Unicode string.
    """
    result = s if isinstance(s, unicode) else unicode(s, encoding)
    if norm:
        from unicodedata import normalize
        result = normalize(norm, result)
    return result
示例4: fold_to_ascii
# 需要导入模块: import unicodedata [as 别名]
# 或者: from unicodedata import normalize [as 别名]
def fold_to_ascii(self, text):
    """Convert non-ASCII characters to closest ASCII equivalent.

    .. versionadded:: 1.3

    .. note:: This only works for a subset of European languages.

    :param text: text to convert
    :type text: ``unicode``
    :returns: text containing only ASCII characters
    :rtype: ``unicode``
    """
    if isascii(text):
        # Fast path: nothing to fold.
        return text
    # Substitute known replacements first, then strip remaining marks.
    replaced = ''.join(ASCII_REPLACEMENTS.get(ch, ch) for ch in text)
    folded = unicodedata.normalize('NFKD', replaced)
    return unicode(folded.encode('ascii', 'ignore'))
示例5: strdisplaywidth
# 需要导入模块: import unicodedata [as 别名]
# 或者: from unicodedata import normalize [as 别名]
def strdisplaywidth(self, s):
    """Return the number of terminal columns needed to display *s*.

    The string is NFC-normalized first (so decomposed sequences count
    as one glyph), then each character contributes its East Asian
    display width: Fullwidth ('F') and Wide ('W') characters occupy
    two columns, everything else one.

    Fix: the original returned 1 for 'F' (Fullwidth), but per Unicode
    UAX #11 Fullwidth glyphs take two cells just like Wide ones.  The
    original also had a comment for 'A' (Ambiguous) saying "go with 2"
    while returning 1; this keeps the behavior (1) and fixes the note.
    """
    def get_char_display_width(unicode_str):
        # 'F' (Fullwidth) and 'W' (Wide) occupy two terminal cells.
        # 'H' (Halfwidth), 'Na' (Narrow), 'N' (Neutral) and
        # 'A' (Ambiguous — go with 1 here) all occupy a single cell.
        if unicodedata.east_asian_width(unicode_str) in ('F', 'W'):
            return 2
        return 1

    s = unicodedata.normalize('NFC', s)
    return sum(get_char_display_width(c) for c in s)
示例6: normalizestr
# 需要导入模块: import unicodedata [as 别名]
# 或者: from unicodedata import normalize [as 别名]
def normalizestr(string):
    """Convert special characters like copyright and trademark signs
    to their ASCII names, reporting any change made.
    """
    original = string
    # Swap each known special mark for its ASCII replacement.
    for mark, ascii_repl in unicode_marks(string):
        string = string.replace(mark, ascii_repl)
    # NFKC-normalize, then transliterate what remains via unidecode.
    normalized = unicodedata.normalize('NFKC', string)
    result = unidecode(''.join(normalized).strip())
    if result != original:
        print("Fixed string: '{}'".format(result))
    return result
示例7: deaccent
# 需要导入模块: import unicodedata [as 别名]
# 或者: from unicodedata import normalize [as 别名]
def deaccent(text):
    """
    Remove accentuation from the given string. Input text is either a unicode string or utf8 encoded bytestring.

    Return input string with accents removed, as unicode.

    >>> deaccent("Šéf chomutovských komunistů dostal poštou bílý prášek")
    u'Sef chomutovskych komunistu dostal postou bily prasek'
    """
    if not isinstance(text, unicode):
        # assume utf8 for byte strings, use default (strict) error handling
        text = text.decode('utf8')
    # Decompose, drop the combining marks, then recompose.
    without_marks = [ch for ch in unicodedata.normalize("NFD", text)
                     if unicodedata.category(ch) != 'Mn']
    return unicodedata.normalize("NFC", u('').join(without_marks))
示例8: _byteify
# 需要导入模块: import unicodedata [as 别名]
# 或者: from unicodedata import normalize [as 别名]
def _byteify(data, ignore_dicts = False):
    """Recursively convert unicode strings inside *data* to ASCII bytes.

    Unicode values are NFKD-normalized and encoded with unmappable
    characters dropped; lists are converted element-wise; dict keys and
    values are converted unless *ignore_dicts* is true (which marks a
    dict that has already been processed).  Anything else is returned
    unchanged.
    """
    if isinstance(data, unicode):
        return unicodedata.normalize('NFKD', data).encode('ascii','ignore')
    if isinstance(data, list):
        converted = []
        for item in data:
            converted.append(_byteify(item, ignore_dicts=True))
        return converted
    if isinstance(data, dict) and not ignore_dicts:
        byteified = {}
        for key, value in data.iteritems():
            byteified[_byteify(key, ignore_dicts=True)] = _byteify(
                value, ignore_dicts=True)
        return byteified
    # anything else passes through in its original form
    return data
示例9: generate_rows
# 需要导入模块: import unicodedata [as 别名]
# 或者: from unicodedata import normalize [as 别名]
def generate_rows(self, dataset_schema=None, dataset_partitioning=None,
                  partition_id=None, records_limit = -1):
    """Yield one row dict per epic returned by ``self.list_epics()``.

    Each yielded dict carries the query timestamp under ``query_date``
    plus the epic's own fields; unicode values are folded to ASCII
    bytes with unmappable characters dropped.  A non-negative
    *records_limit* caps the number of rows yielded; a negative value
    means no limit.

    Fix: the empty-result log message read "Not epics." — corrected to
    "No epics.".
    """
    query_date = datetime.datetime.now()
    rows = self.list_epics()
    if len(rows) == 0:
        logging.info("No epics.")
        return
    nb = 0
    for row in rows:
        # records_limit < 0 disables the cap entirely.
        if 0 <= records_limit <= nb:
            logging.info("Reached records_limit (%i), stopping." % records_limit)
            return
        encoded_row = {"query_date": query_date}
        for key in row:
            val = row[key]
            if isinstance(val, unicode):
                # Fold to closest ASCII bytes for the dataset writer.
                val = unicodedata.normalize('NFKD', val).encode('ascii','ignore')
            encoded_row[str(key)] = val
        yield encoded_row
        nb += 1
示例10: setup
# 需要导入模块: import unicodedata [as 别名]
# 或者: from unicodedata import normalize [as 别名]
def setup(self):
    """Setup."""
    config = self.config
    self.normalize = config['normalize'].upper()
    self.convert_encoding = config['convert_encoding'].lower()
    self.errors = config['errors'].lower()
    if self.convert_encoding:
        # Resolve aliases to the codec's canonical name.
        canonical = filters.PYTHON_ENCODING_NAMES.get(
            self.default_encoding, self.default_encoding
        ).lower()
        self.convert_encoding = codecs.lookup(canonical).name

        # Don't generate content with BOMs
        needs_endianness = (
            self.convert_encoding.startswith(('utf-32', 'utf-16'))
            and not self.convert_encoding.endswith(('le', 'be'))
        )
        if needs_endianness:
            self.convert_encoding += '-le'
        if self.convert_encoding == 'utf-8-sig':
            self.convert_encoding = 'utf-8'
示例11: normalize_string
# 需要导入模块: import unicodedata [as 别名]
# 或者: from unicodedata import normalize [as 别名]
def normalize_string(text):
    ''' For theme media, do not modify unless modified in TV Tunes.
    Remove dots from the last character as windows can not have directories
    with dots at the end
    '''
    # Characters Windows forbids in paths: drop most, turn slashes into dashes.
    for char, replacement in ((":", ""), ("/", "-"), ("\\", "-"),
                              ("<", ""), (">", ""), ("*", ""),
                              ("?", ""), ('|', "")):
        text = text.replace(char, replacement)
    text = text.strip().rstrip('.')
    # Fold to closest ASCII bytes, dropping unmappable characters.
    return unicodedata.normalize('NFKD', unicode(text, 'utf-8')).encode('ascii', 'ignore')
示例12: _run_strip_accents
# 需要导入模块: import unicodedata [as 别名]
# 或者: from unicodedata import normalize [as 别名]
def _run_strip_accents(self, text):
"""Strips accents from a piece of text."""
# 这个函数去除掉text中的非间距字符
# 标准化对于任何需要以一致的方式处理Unicode文本的程序都是非常重要的。
# 当处理来自用户输入的字符串而你很难去控制编码的时候尤其如此。
# normalize() 将文本标准化,第一个参数指定字符串标准化的方式,NFD表示字符应该分解为多个组合字符表示
text = unicodedata.normalize("NFD", text)
output = []
for char in text:
# category() 返回字符在UNICODE里分类的类型
cat = unicodedata.category(char)
if cat == "Mn":
# Mark, Nonspacing 指示字符是非间距字符,这指示基字符的修改。
# https://www.fileformat.info/info/unicode/category/Mn/list.htm
continue
output.append(char)
return "".join(output)
示例13: normalize
# 需要导入模块: import unicodedata [as 别名]
# 或者: from unicodedata import normalize [as 别名]
def normalize(self, form):
    """
    Return the Unicode normal form for the strings in the Series/Index.

    For more information on the forms, see the
    :func:`unicodedata.normalize`.

    Parameters
    ----------
    form : {'NFC', 'NFKC', 'NFD', 'NFKD'}
        Unicode form

    Returns
    -------
    normalized : Series/Index of objects
    """
    import unicodedata

    def _normalize(value):
        return unicodedata.normalize(form, compat.u_safe(value))

    # Map over the parent, skipping NA values, and re-wrap the result.
    return self._wrap_result(_na_map(_normalize, self._parent))
示例14: clean_id
# 需要导入模块: import unicodedata [as 别名]
# 或者: from unicodedata import normalize [as 别名]
def clean_id(name, preserve_case=False):
    """
    Return a 'clean' dokuwiki-compliant name. Based on the cleanID() PHP function in inc/pageutils.php

    Ignores both slashes and colons as valid namespace choices (to convert slashes to colons,
    call make_dokuwiki_pagename)

    Args:
        name: Candidate page name (text; bytes fall back to no accent
            stripping).
        preserve_case: Keep original case instead of lowercasing.

    Returns:
        Sanitized name with accents stripped, disallowed characters
        replaced by single underscores, and the extension preserved.
    """
    main, ext = os.path.splitext(name)
    # remove accents: NFKD separates base characters from combining
    # marks, which (with any other non-ASCII) are then dropped
    try:
        decomposed = unicodedata.normalize("NFKD", main)
        no_accent = ''.join(c for c in decomposed if ord(c) < 0x7f)
    except TypeError:
        no_accent = main  # name was plaintext to begin with
    # recombine without any other characters
    result = re.sub(r'[^\w/:-]+', '_', no_accent) + ext
    if not preserve_case:
        result = result.lower()
    # '_' itself matches \w, so pre-existing underscores adjacent to a
    # substitution survive as runs like '__'; collapse them in one pass
    # (replaces the original "while '__' in result" workaround loop).
    result = re.sub(r'_{2,}', '_', result)
    return result
示例15: _run_strip_accents
# 需要导入模块: import unicodedata [as 别名]
# 或者: from unicodedata import normalize [as 别名]
def _run_strip_accents(self, text):
"""Strips accents from a piece of text."""
text = unicodedata.normalize("NFD", text)
output = []
for char in text:
cat = unicodedata.category(char)
if cat == "Mn":
continue
output.append(char)
return "".join(output)