本文整理汇总了Python中unicodedata.normalize方法的典型用法代码示例。如果您正苦于以下问题:Python unicodedata.normalize方法的具体用法?Python unicodedata.normalize怎么用?Python unicodedata.normalize使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块unicodedata
的用法示例。
在下文中一共展示了unicodedata.normalize方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: _tokenize
# 需要导入模块: import unicodedata [as 别名]
# 或者: from unicodedata import normalize [as 别名]
def _tokenize(self, text):
    """Split *text* into word-piece tokens.

    For uncased models the text is first NFD-normalized, stripped of
    combining marks (category 'Mn'), and lower-cased.  Punctuation and
    CJK characters are isolated with surrounding spaces, whitespace is
    collapsed, and control/invalid characters are dropped; each
    resulting word is then fed to the word-piece tokenizer.
    """
    if not self._cased:
        # Case-insensitive mode: decompose, drop combining marks, lower.
        decomposed = unicodedata.normalize('NFD', text)
        text = ''.join(
            ch for ch in decomposed if unicodedata.category(ch) != 'Mn'
        )
        text = text.lower()
    pieces = []
    for ch in text:
        if self._is_punctuation(ch) or self._is_cjk_character(ch):
            # Isolate so each becomes its own token after splitting.
            pieces.append(' ' + ch + ' ')
        elif self._is_space(ch):
            pieces.append(' ')
        elif ord(ch) in (0, 0xfffd) or self._is_control(ch):
            # NUL, replacement char, and control chars are discarded.
            continue
        else:
            pieces.append(ch)
    tokens = []
    for word in ''.join(pieces).strip().split():
        tokens.extend(self._word_piece_tokenize(word))
    return tokens
示例2: secure_filename
# 需要导入模块: import unicodedata [as 别名]
# 或者: from unicodedata import normalize [as 别名]
def secure_filename(filename):
    """Return an ASCII-only version of *filename* that is safe to use
    as a single path component on any supported platform.
    """
    if isinstance(filename, str):
        from unicodedata import normalize
        # Transliterate to ASCII, silently dropping what cannot map.
        filename = normalize('NFKD', filename).encode('ascii', 'ignore')
        filename = filename.decode('ascii')
    for sep in (os.path.sep, os.path.altsep):
        if sep:
            filename = filename.replace(sep, ' ')
    joined = '_'.join(filename.split())
    filename = str(_filename_ascii_strip_re.sub('', joined)).strip('._')
    # on nt a couple of special files are present in each folder. We
    # have to ensure that the target file is not such a filename. In
    # this case we prepend an underline
    if (os.name == 'nt' and filename
            and filename.split('.')[0].upper() in _windows_device_files):
        filename = '_' + filename
    return filename
示例3: unicodify
# 需要导入模块: import unicodedata [as 别名]
# 或者: from unicodedata import normalize [as 别名]
def unicodify(s, encoding='utf-8', norm=None):
    """Ensure string is Unicode.

    .. versionadded:: 1.31

    Decode encoded strings using ``encoding`` and normalise Unicode
    to form ``norm`` if specified.

    Args:
        s (str): String to decode. May also be Unicode.
        encoding (str, optional): Encoding to use on bytestrings.
        norm (None, optional): Normalisation form to apply to Unicode string.

    Returns:
        unicode: Decoded, optionally normalised, Unicode string.
    """
    result = s if isinstance(s, unicode) else unicode(s, encoding)
    if norm:
        from unicodedata import normalize
        result = normalize(norm, result)
    return result
示例4: fold_to_ascii
# 需要导入模块: import unicodedata [as 别名]
# 或者: from unicodedata import normalize [as 别名]
def fold_to_ascii(self, text):
    """Convert non-ASCII characters to closest ASCII equivalent.

    .. versionadded:: 1.3

    .. note:: This only works for a subset of European languages.

    :param text: text to convert
    :type text: ``unicode``
    :returns: text containing only ASCII characters
    :rtype: ``unicode``
    """
    if isascii(text):
        # Fast path: nothing to fold.
        return text
    # Substitute known replacements first, then strip remaining marks.
    replaced = ''.join(ASCII_REPLACEMENTS.get(ch, ch) for ch in text)
    folded = unicodedata.normalize('NFKD', replaced)
    return unicode(folded.encode('ascii', 'ignore'))
示例5: strdisplaywidth
# 需要导入模块: import unicodedata [as 别名]
# 或者: from unicodedata import normalize [as 别名]
def strdisplaywidth(self, s):
    """Return the number of terminal columns needed to display *s*.

    The string is NFC-normalized first (so decomposed sequences count
    as one glyph), then each character contributes its East Asian
    display width: Fullwidth ('F') and Wide ('W') characters occupy
    two columns, everything else one.

    Fix: the original returned 1 for 'F' (Fullwidth), but per Unicode
    UAX #11 Fullwidth glyphs take two cells just like Wide ones.  The
    original also had a comment for 'A' (Ambiguous) saying "go with 2"
    while returning 1; this keeps the behavior (1) and fixes the note.
    """
    def get_char_display_width(unicode_str):
        # 'F' (Fullwidth) and 'W' (Wide) occupy two terminal cells.
        # 'H' (Halfwidth), 'Na' (Narrow), 'N' (Neutral) and
        # 'A' (Ambiguous — go with 1 here) all occupy a single cell.
        if unicodedata.east_asian_width(unicode_str) in ('F', 'W'):
            return 2
        return 1

    s = unicodedata.normalize('NFC', s)
    return sum(get_char_display_width(c) for c in s)
示例6: normalizestr
# 需要导入模块: import unicodedata [as 别名]
# 或者: from unicodedata import normalize [as 别名]
def normalizestr(string):
    """Convert special characters like copyright and trademark signs
    to their ASCII names, reporting any change made.
    """
    original = string
    # Swap each known special mark for its ASCII replacement.
    for mark, ascii_repl in unicode_marks(string):
        string = string.replace(mark, ascii_repl)
    # NFKC-normalize, then transliterate what remains via unidecode.
    normalized = unicodedata.normalize('NFKC', string)
    result = unidecode(''.join(normalized).strip())
    if result != original:
        print("Fixed string: '{}'".format(result))
    return result
示例7: deaccent
# 需要导入模块: import unicodedata [as 别名]
# 或者: from unicodedata import normalize [as 别名]
def deaccent(text):
    """
    Remove accentuation from the given string. Input text is either a unicode string or utf8 encoded bytestring.

    Return input string with accents removed, as unicode.

    >>> deaccent("Šéf chomutovských komunistů dostal poštou bílý prášek")
    u'Sef chomutovskych komunistu dostal postou bily prasek'
    """
    if not isinstance(text, unicode):
        # assume utf8 for byte strings, use default (strict) error handling
        text = text.decode('utf8')
    # Decompose, drop the combining marks, then recompose.
    without_marks = [ch for ch in unicodedata.normalize("NFD", text)
                     if unicodedata.category(ch) != 'Mn']
    return unicodedata.normalize("NFC", u('').join(without_marks))
示例8: _byteify
# 需要导入模块: import unicodedata [as 别名]
# 或者: from unicodedata import normalize [as 别名]
def _byteify(data, ignore_dicts = False):
    """Recursively convert unicode strings inside *data* to ASCII bytes.

    Unicode values are NFKD-normalized and encoded with unmappable
    characters dropped; lists are converted element-wise; dict keys and
    values are converted unless *ignore_dicts* is true (which marks a
    dict that has already been processed).  Anything else is returned
    unchanged.
    """
    if isinstance(data, unicode):
        return unicodedata.normalize('NFKD', data).encode('ascii','ignore')
    if isinstance(data, list):
        converted = []
        for item in data:
            converted.append(_byteify(item, ignore_dicts=True))
        return converted
    if isinstance(data, dict) and not ignore_dicts:
        byteified = {}
        for key, value in data.iteritems():
            byteified[_byteify(key, ignore_dicts=True)] = _byteify(
                value, ignore_dicts=True)
        return byteified
    # anything else passes through in its original form
    return data
示例9: generate_rows
# 需要导入模块: import unicodedata [as 别名]
# 或者: from unicodedata import normalize [as 别名]
def generate_rows(self, dataset_schema=None, dataset_partitioning=None,
                  partition_id=None, records_limit = -1):
    """Yield one row dict per epic returned by ``self.list_epics()``.

    Each yielded dict carries the query timestamp under ``query_date``
    plus the epic's own fields; unicode values are folded to ASCII
    bytes with unmappable characters dropped.  A non-negative
    *records_limit* caps the number of rows yielded; a negative value
    means no limit.

    Fix: the empty-result log message read "Not epics." — corrected to
    "No epics.".
    """
    query_date = datetime.datetime.now()
    rows = self.list_epics()
    if len(rows) == 0:
        logging.info("No epics.")
        return
    nb = 0
    for row in rows:
        # records_limit < 0 disables the cap entirely.
        if 0 <= records_limit <= nb:
            logging.info("Reached records_limit (%i), stopping." % records_limit)
            return
        encoded_row = {"query_date": query_date}
        for key in row:
            val = row[key]
            if isinstance(val, unicode):
                # Fold to closest ASCII bytes for the dataset writer.
                val = unicodedata.normalize('NFKD', val).encode('ascii','ignore')
            encoded_row[str(key)] = val
        yield encoded_row
        nb += 1
示例10: setup
# 需要导入模块: import unicodedata [as 别名]
# 或者: from unicodedata import normalize [as 别名]
def setup(self):
    """Setup."""
    config = self.config
    self.normalize = config['normalize'].upper()
    self.convert_encoding = config['convert_encoding'].lower()
    self.errors = config['errors'].lower()
    if self.convert_encoding:
        # Resolve aliases to the codec's canonical name.
        canonical = filters.PYTHON_ENCODING_NAMES.get(
            self.default_encoding, self.default_encoding
        ).lower()
        self.convert_encoding = codecs.lookup(canonical).name

        # Don't generate content with BOMs
        needs_endianness = (
            self.convert_encoding.startswith(('utf-32', 'utf-16'))
            and not self.convert_encoding.endswith(('le', 'be'))
        )
        if needs_endianness:
            self.convert_encoding += '-le'
        if self.convert_encoding == 'utf-8-sig':
            self.convert_encoding = 'utf-8'
示例11: normalize_string
# 需要导入模块: import unicodedata [as 别名]
# 或者: from unicodedata import normalize [as 别名]
def normalize_string(text):
    ''' For theme media, do not modify unless modified in TV Tunes.
    Remove dots from the last character as windows can not have directories
    with dots at the end
    '''
    # Characters Windows forbids in paths: drop most, turn slashes into dashes.
    for char, replacement in ((":", ""), ("/", "-"), ("\\", "-"),
                              ("<", ""), (">", ""), ("*", ""),
                              ("?", ""), ('|', "")):
        text = text.replace(char, replacement)
    text = text.strip().rstrip('.')
    # Fold to closest ASCII bytes, dropping unmappable characters.
    return unicodedata.normalize('NFKD', unicode(text, 'utf-8')).encode('ascii', 'ignore')
示例12: _run_strip_accents
# 需要导入模块: import unicodedata [as 别名]
# 或者: from unicodedata import normalize [as 别名]
def _run_strip_accents(self, text):
"""Strips accents from a piece of text."""
# 这个函数去除掉text中的非间距字符
# 标准化对于任何需要以一致的方式处理Unicode文本的程序都是非常重要的。
# 当处理来自用户输入的字符串而你很难去控制编码的时候尤其如此。
# normalize() 将文本标准化,第一个参数指定字符串标准化的方式,NFD表示字符应该分解为多个组合字符表示
text = unicodedata.normalize("NFD", text)
output = []
for char in text:
# category() 返回字符在UNICODE里分类的类型
cat = unicodedata.category(char)
if cat == "Mn":
# Mark, Nonspacing 指示字符是非间距字符,这指示基字符的修改。
# https://www.fileformat.info/info/unicode/category/Mn/list.htm
continue
output.append(char)
return "".join(output)
示例13: normalize
# 需要导入模块: import unicodedata [as 别名]
# 或者: from unicodedata import normalize [as 别名]
def normalize(self, form):
    """
    Return the Unicode normal form for the strings in the Series/Index.

    For more information on the forms, see the
    :func:`unicodedata.normalize`.

    Parameters
    ----------
    form : {'NFC', 'NFKC', 'NFD', 'NFKD'}
        Unicode form

    Returns
    -------
    normalized : Series/Index of objects
    """
    import unicodedata

    def _normalize(value):
        return unicodedata.normalize(form, compat.u_safe(value))

    # Map over the parent, skipping NA values, and re-wrap the result.
    return self._wrap_result(_na_map(_normalize, self._parent))
示例14: clean_id
# 需要导入模块: import unicodedata [as 别名]
# 或者: from unicodedata import normalize [as 别名]
def clean_id(name, preserve_case=False):
    """
    Return a 'clean' dokuwiki-compliant name. Based on the cleanID() PHP function in inc/pageutils.php

    Ignores both slashes and colons as valid namespace choices (to convert slashes to colons,
    call make_dokuwiki_pagename)

    Args:
        name: Candidate page name (text; bytes fall back to no accent
            stripping).
        preserve_case: Keep original case instead of lowercasing.

    Returns:
        Sanitized name with accents stripped, disallowed characters
        replaced by single underscores, and the extension preserved.
    """
    main, ext = os.path.splitext(name)
    # remove accents: NFKD separates base characters from combining
    # marks, which (with any other non-ASCII) are then dropped
    try:
        decomposed = unicodedata.normalize("NFKD", main)
        no_accent = ''.join(c for c in decomposed if ord(c) < 0x7f)
    except TypeError:
        no_accent = main  # name was plaintext to begin with
    # recombine without any other characters
    result = re.sub(r'[^\w/:-]+', '_', no_accent) + ext
    if not preserve_case:
        result = result.lower()
    # '_' itself matches \w, so pre-existing underscores adjacent to a
    # substitution survive as runs like '__'; collapse them in one pass
    # (replaces the original "while '__' in result" workaround loop).
    result = re.sub(r'_{2,}', '_', result)
    return result
示例15: _run_strip_accents
# 需要导入模块: import unicodedata [as 别名]
# 或者: from unicodedata import normalize [as 别名]
def _run_strip_accents(self, text):
"""Strips accents from a piece of text."""
text = unicodedata.normalize("NFD", text)
output = []
for char in text:
cat = unicodedata.category(char)
if cat == "Mn":
continue
output.append(char)
return "".join(output)