Python unicodedata.combining函数代码示例

本文整理汇总了 Python 中 unicodedata.combining 函数的典型用法代码示例。如果您正苦于以下问题：Python combining 函数的具体用法？Python combining 怎么用？Python combining 使用的例子？那么恭喜您，这里精选的函数代码示例或许可以为您提供帮助。


示例1: clean_tokenize

    def clean_tokenize(self, input_text, accentmark, minimunlen, numeric, alpha, stopwords):
        """Tokenize a Spanish document and filter the resulting tokens.

        :param input_text: raw text to clean and tokenize
        :param accentmark: if falsy, accents, punctuation and non-ASCII
            characters are stripped and the text is lowercased before
            tokenizing; if truthy, only combining characters already present
            in the raw text are removed
        :param minimunlen: tokens whose length is not greater than this value
            are dropped
        :param numeric: if truthy, drop tokens for which ``isnumeric()`` holds
        :param alpha: if truthy, drop tokens that are not purely alphabetic
        :param stopwords: if truthy, drop tokens whose lowercase form is in
            ``self.ALL_STOPWORDS``
        :return: list of the tokens that passed every enabled filter
        """
        if not accentmark:
            text = (input_text.replace('\n', ' ').replace('\r', '')
                    .replace('”', '').replace('“', '').replace('.', ''))
            # NFKD splits accented letters into base letter + combining mark,
            # so dropping the combining marks removes the accents.
            nfkd_form = unicodedata.normalize('NFKD', text)
            unicode_text = u"".join(
                c for c in nfkd_form if not unicodedata.combining(c)).lower()
            # NOTE(review): `punctuation` comes from outside this method and is
            # presumably a str.translate() table -- confirm.
            clean_text = unicode_text.translate(punctuation)
            # Anything still outside ASCII is replaced by a space.
            clean_text = str(''.join(
                c if ord(c) < 128 else ' ' for c in clean_text)).lower()
            words = word_tokenize(clean_text, language='spanish')
        else:
            # No normalization here: precomposed accented letters are kept,
            # only stand-alone combining characters are removed.
            text = u"".join(c for c in input_text if not unicodedata.combining(c))
            words = word_tokenize(text, language='spanish')

        final_text = []
        for word in words:
            if len(word) <= minimunlen:
                continue
            if stopwords and word.lower() in self.ALL_STOPWORDS:
                continue
            if numeric and word.isnumeric():
                continue
            if alpha and not word.isalpha():
                continue
            final_text.append(word)

        return final_text

示例2: clean_song_data

 def clean_song_data(self, artist, title):
     """Normalize an artist/title pair for lyrics lookup.

     Both strings are lowercased, stripped of accents, rewritten with the
     regex pairs in LYRICS_ARTIST_REPLACE / LYRICS_TITLE_REPLACE, and the
     title additionally has every LYRICS_TITLE_STRIP pattern removed.

     Returns a ``(artist, title)`` tuple of the cleaned strings.
     """
     # convert to lowercase
     artist = artist.lower()
     title = title.lower()

     # remove accents: decompose, then drop the combining marks
     artist = unicodedata.normalize('NFKD', artist)
     artist = "".join(c for c in artist if not unicodedata.combining(c))
     title = unicodedata.normalize('NFKD', title)
     title = "".join(c for c in title if not unicodedata.combining(c))

     # Work on a copy so the module-level pattern list is never mutated
     # (mutating it would make the list grow on every call).
     title_strip = list(LYRICS_TITLE_STRIP)
     if self.ignore_brackets:
         # NOTE(review): the body of this branch was missing; it presumably
         # strips any bracketed suffix from the title -- confirm the pattern.
         title_strip.append(r"\(.*\)")

     # replace ampersands and the like
     for pattern, replacement in LYRICS_ARTIST_REPLACE:
         artist = re.sub(pattern, replacement, artist)
     for pattern, replacement in LYRICS_TITLE_REPLACE:
         title = re.sub(pattern, replacement, title)

     # strip things like "(live at Somewhere)", "(acoustic)", etc
     for pattern in title_strip:
         title = re.sub(pattern, '', title)

     # trim surrounding whitespace
     return (artist.strip(), title.strip())

示例3: preprocess_str

    def preprocess_str(line: str, return_mapping: bool = False) -> Union[Tuple[str, List[int], List[int]], str]:
        """Remove combining characters from ``line`` and normalize LaTeX-style quotes.

        NOTE(review): there is no ``self`` parameter although the function is
        indented like a method -- presumably it is decorated with
        ``@staticmethod`` outside the visible lines; confirm.

        Args:
            line: string to process
            return_mapping: whether to also return index mappings between the
                raw and the preprocessed line

        Returns:
            The preprocessed line. If ``return_mapping`` is true, a tuple
            ``(preprocessed, r2p, p2r)`` where ``r2p[i]`` is the index in the
            preprocessed line of raw character ``i`` (``-1`` for a removed
            combining character) and ``p2r[j]`` is the raw index of
            preprocessed character ``j``. Unused slots hold ``len(line)``.
        """
        # Both replacements keep the length unchanged, so indices computed on
        # the replaced line are valid for the caller's original line as well.
        line = line.replace("''", '" ').replace("``", '" ')
        if not return_mapping:
            return ''.join(c for c in line if not unicodedata.combining(c))

        r2p = [len(line)] * (len(line) + 1)
        p2r = [len(line)] * (len(line) + 1)
        kept = []
        for i, c in enumerate(line):
            if unicodedata.combining(c):
                r2p[i] = -1
            else:
                kept.append(c)
                r2p[i] = len(kept) - 1
                p2r[len(kept) - 1] = i
        return ''.join(kept), r2p, p2r

示例4: shave_marks_latin

def shave_marks_latin(txt):
    """Remove diacritic marks from Latin base characters only.

    The text is decomposed (NFD), combining marks that follow an ASCII
    letter are dropped, and the result is recomposed (NFC). Marks attached
    to non-Latin base characters are preserved.
    """
    norm_txt = unicodedata.normalize('NFD', txt)
    latin_base = False
    keepers = []
    for c in norm_txt:
        if unicodedata.combining(c):
            if latin_base:
                continue  # diacritic on a Latin base char: drop it
        else:
            # a non-combining char starts a new base character
            latin_base = c in string.ascii_letters
        keepers.append(c)
    shaved = ''.join(keepers)
    return unicodedata.normalize('NFC', shaved)

示例5: remove_initial_vowel

def remove_initial_vowel(word):
    """Strip a leading vowel (or diphthong) and its diacritics from ``word``.

    The word is NFKD-decomposed first. Leading characters are consumed while
    they are in ``vowels`` or are combining marks; as soon as the consumed
    prefix (ignoring combining marks) is neither in ``vowels`` nor in
    ``diphthongs``, the current remainder is returned. An empty or falsy
    input, or a word consumed entirely, yields an empty string.
    """
    if not word:
        return u''
    remaining = unicodedata.normalize('NFKD', word)
    consumed = u''
    while True:
        head = remaining[0]
        if not (head in vowels or unicodedata.combining(head)):
            return remaining
        consumed += head
        bare = u''.join(ch for ch in consumed if not unicodedata.combining(ch))
        if bare and bare not in vowels and bare not in diphthongs:
            return remaining
        if len(remaining) == 1:
            return u''
        remaining = remaining[1:]

示例6: shave_marks_latin

def shave_marks_latin(txt):
    """Remove all diacritic marks from Latin base characters.

    Marks attached to non-Latin base characters are left untouched. The
    result is returned in NFC form.
    """
    norm_txt = unicodedata.normalize('NFD', txt)
    latin_base = False
    keepers = []
    for c in norm_txt:
        if unicodedata.combining(c) and latin_base:
            continue  # Ignore diacritic on latin base char.
        keepers.append(c)
        # If it isn't combining char, it's a new base char.
        if not unicodedata.combining(c):
            latin_base = c in string.ascii_letters
    shaved = ''.join(keepers)
    return unicodedata.normalize('NFC', shaved)

示例7: shave_marks_latin

def shave_marks_latin(txt):
    """Strip diacritics from Latin letters, leaving other scripts intact.

    Works on the NFD decomposition of ``txt`` and returns the NFC
    recomposition of the characters that were kept.
    """
    process_txt = unicodedata.normalize('NFD', txt)
    keepers = []
    latin_base = False
    for char in process_txt:
        is_mark = bool(unicodedata.combining(char))
        # Only a combining mark that follows a Latin base character is
        # dropped; every other character is kept.
        if not is_mark or not latin_base:
            keepers.append(char)
        if not is_mark:
            latin_base = char in string.ascii_letters
    text = ''.join(keepers)
    return unicodedata.normalize('NFC', text)

示例8: _format_for_latex

    def _format_for_latex(self, text):
        """Sanitize ``text`` so that it can be typeset by LaTeX.

        The sanitation consists of three operations:
            1. Normalize the text to NFC, which composes diacritics where
               possible.
            2. Replace characters that have no Unicode name with a
               placeholder box.
            3. Replace non-typesettable characters with their LaTeX
               counterpart or an equivalent character.

        Parameters
        ----------
        text : str
            The text to sanitize.

        Returns
        -------
        str
            The text after sanitation.
        """
        pieces = []
        normalized_line = unicodedata.normalize('NFC', text)
        last_idx = len(normalized_line) - 1
        for idx, c in enumerate(normalized_line):
            # Combining marks are never emitted on their own; they are handled
            # when the character in front of them is processed.
            if unicodedata.combining(c) != 0:
                continue

            is_last = idx >= last_idx
            if not is_last:
                # The mapping is consulted with the *code point* of the next
                # character first, then with the current character itself.
                codepoint = ord(normalized_line[idx + 1])
                if codepoint in special_char_mapping:
                    latex_command = special_char_mapping[codepoint]
                    pieces.append(self._build_latex_replacement(latex_command, c))
                    continue

            if c in special_char_mapping:
                latex_command = special_char_mapping[c]
                pieces.append(self._build_latex_replacement(latex_command, c))
            elif not is_last and unicodedata.name(c, None) is None:
                # Unnamed character: emit a box and record the package in the
                # preamble. The final character is exempt from this check, as
                # in the original code; this keeps an unnamed trailing
                # character such as a newline intact.
                self._preamble['latexsym'] = ''
                pieces.append('□')
            else:
                pieces.append(c)

        return ''.join(pieces)

示例9: make_sortable

def make_sortable(text):
    """Return a lowercase, accent-free UTF-8 byte string usable as a sort key.

    ``text`` may be UTF-8 encoded bytes or an already decoded string. The
    text is decoded *before* lowercasing so that non-ASCII capitals (for
    example ``É``) are lowercased too, then NFKD-decomposed and stripped of
    combining marks.
    """
    if isinstance(text, bytes):
        text = text.decode('utf-8')
    normalized = unicodedata.normalize('NFKD', text.lower())
    stripped = u''.join(c for c in normalized if not unicodedata.combining(c))
    return stripped.encode('utf-8')

示例10: codepoint

def codepoint(bot, trigger):
    """Report the code point and Unicode name of a character.

    The argument (``trigger.group(2)``) is either a single character or a
    hexadecimal code point, optionally prefixed with ``U+``. Replies with an
    error and returns ``NOLIMIT`` when the argument is missing or is not a
    valid code point.
    """
    arg = (trigger.group(2) or '').strip()
    if len(arg) == 0:
        bot.reply('What code point do you want me to look up?')
        return NOLIMIT
    elif len(arg) > 1:
        if arg.startswith('U+'):
            arg = arg[2:]
        try:
            arg = unichr(int(arg, 16))
        except (ValueError, TypeError, OverflowError):
            # not hexadecimal, or outside the valid code point range
            bot.reply("That's not a valid code point.")
            return NOLIMIT

    # Hex value of the code point: at least 4 digits, upper case
    point = '%04X' % ord(arg)
    try:
        name = unicodedata.name(arg)
    except ValueError:
        message = 'U+%s (No name found)' % point
        # Send the message to the channel; it is still returned as before so
        # that existing callers keep working.
        bot.say(message)
        return message

    if not unicodedata.combining(arg):
        template = u'U+%s %s (%s)'
    else:
        # Show combining characters on a dotted circle (U+25CC). It was
        # previously written as raw UTF-8 bytes, which is mojibake in a text
        # string.
        template = u'U+%s %s (\u25cc%s)'
    bot.say(template % (point, name, arg))

示例11: about

def about(u, cp=None, name=None):
    """Format a one-line description of the character ``u``.

    ``cp`` is the code point as a hexadecimal string; when omitted it is
    derived from ``u``. ``name`` is the character name; when omitted it is
    looked up in ``cp_names`` and falls back to ``'No Name Found'``.
    Returns the formatted description, in which ``u`` is UTF-8 encoded.
    """
    global data_loaded

    ## load UnicodeData
    if not data_loaded:
        # NOTE(review): only the flag is set here; the call that actually
        # loads the data seems to be missing -- restore it from the original.
        data_loaded = True

    if cp is None:
        ## cp is not provided, we can safely grab the codepoint
        cp = ord(u)
    else:
        ## codepoint is provided but is in hexadecimal
        cp = int(cp, 16)

    if name is None:
        name = 'No Name Found'
        ## cp_names is keyed by the U+XXXX hex digits, which is how the
        ## numbers are formatted in the UnicodeData file
        search_cp = '%04X' % (cp)
        if search_cp in cp_names:
            name = cp_names[search_cp]

    ## TODO: Replace this...
    if not unicodedata.combining(u):
        template = 'U+%04X %s (%s)'
    else:
        ## combining characters are prefixed with the UTF-8 bytes of U+25CC
        template = 'U+%04X %s (\xe2\x97\x8c%s)'

    return template % (cp, name, u.encode('utf-8'))

示例12: _text_chars

    def _text_chars(self, length, truncate, text, whole_words):
        Truncates a string after a certain number of chars.
        s_len = 0
        end_index = None
        for i, char in enumerate(text):
            if unicodedata.combining(char):
                # Don't consider combining characters
                # as adding to the string length
            s_len += 1
            if end_index is None and s_len > length:
                end_index = i
            if s_len > length:
                truncated = text[:end_index or 0]

                if whole_words:
                    if not char.isspace():
                        # Current character is whitespace, find previous
                        # whole word
                        truncated = truncated.rsplit(' ', 1)[0]

                    # Remove trailing whitespace and punctuation
                    truncated = truncated.rstrip(
                        string.whitespace + string.punctuation

                # Return the truncated string
                return self.add_truncation_text(truncated, truncate)

        # Return the original string since no truncation was necessary
        return text

示例13: codepoint

def codepoint(bot, trigger):
    """Report the code point and Unicode name of a character.

    The argument (``trigger.group(2)``) is either a single character or a
    hexadecimal code point, optionally prefixed with ``U+``. Replies with an
    error and returns ``module.NOLIMIT`` when the argument is missing or is
    not a valid code point.
    """
    arg = trigger.group(2)
    if not arg:
        bot.reply('What code point do you want me to look up?')
        return module.NOLIMIT
    # Keep a whitespace-only argument as is, so that whitespace characters
    # themselves can be looked up.
    stripped = arg.strip()
    if len(stripped) > 0:
        arg = stripped
    if len(arg) > 1:
        if arg.startswith('U+'):
            arg = arg[2:]
        try:
            arg = unichr(int(arg, 16))
        except (ValueError, TypeError, OverflowError):
            # not hexadecimal, or outside the valid code point range
            bot.reply("That's not a valid code point.")
            return module.NOLIMIT

    point, name = get_codepoint_name(arg)
    if name is None:
        name = '(No name found)'

    if unicodedata.combining(arg):
        # Show combining characters on a dotted circle (U+25CC). It was
        # previously written as raw UTF-8 bytes, which is mojibake in a text
        # string.
        template = u'U+%s %s (\u25cc%s)'
    else:
        template = u'U+%s %s (%s)'

    bot.say(template % (point, name, arg))

示例14: _char_block_width

def _char_block_width(char):
    # Basic Latin, which is probably the most common case
    #if char in xrange(0x0021, 0x007e):
    #if char >= 0x0021 and char <= 0x007e:
    if 0x0021 <= char <= 0x007e:
        return 1
    # Chinese, Japanese, Korean (common)
    if 0x4e00 <= char <= 0x9fff:
        return 2
    # Hangul
    if 0xac00 <= char <= 0xd7af:
        return 2
    # Combining?
    if unicodedata.combining(uni_chr(char)):
        return 0
    # Hiragana and Katakana
    if 0x3040 <= char <= 0x309f or 0x30a0 <= char <= 0x30ff:
        return 2
    # Full-width Latin characters
    if 0xff01 <= char <= 0xff60:
        return 2
    # CJK punctuation
    if 0x3000 <= char <= 0x303e:
        return 2
    # Backspace and delete
    if char in (0x0008, 0x007f):
        return -1
    # Other control characters
    elif char in (0x0000, 0x001f):
        return 0
    # Take a guess
    return 1

示例15: artist_search

def artist_search(results, media, lang, artist_name):
  # Look up artist_name through a JSON search endpoint and score the artists
  # whose normalized name equals the normalized query.
  #
  # NOTE(review): this snippet is damaged and will not run as shown. The
  # `try:` that belongs to `except UnicodeError:` is missing, the URL format
  # string is empty, and the statement that used the `id = ...` /
  # `score = ...` keyword arguments is gone. Restore these from the original
  # source before using the function.

  # Decompose (NFKD) so that diacritics become separate combining characters.
  # Byte strings are decoded as UTF-8 first; the fallback normalizes the
  # value as is.
    artist_name = unicodedata.normalize('NFKD', artist_name.decode('utf-8'))
  except UnicodeError:
    artist_name = unicodedata.normalize('NFKD', artist_name)

  # Strip diacritics: keep only the non-combining characters.
  stripped = u''
  for i in range(len(artist_name)):
    point = artist_name[i]
    if not unicodedata.combining(point):
      stripped += point
  artist_name = stripped

  # NOTE(review): the format string is empty, so the `%` operation cannot
  # insert the quoted artist name -- the search URL needs to be restored.
  json_obj = JSON.ObjectFromURL('' % (String.Quote(artist_name)))

  # The score starts at 100 and is decremented further down, so earlier
  # results rank higher.
  score = 100
  normalized_artist_name = Core.messaging.call_external_function('com.plexapp.agents.plexmusic', 'MessageKit:NormalizeArtist', kwargs = dict(artist=artist_name))
  for artist in json_obj['artists']:

    # Require a perfect match after normalization to avoid false positives.
    normalized_artist_result = Core.messaging.call_external_function('com.plexapp.agents.plexmusic', 'MessageKit:NormalizeArtist', kwargs = dict(artist=artist['name']))
    Log('Sanity checking normalized artist: %s against Vevo result: %s' % (normalized_artist_name, normalized_artist_result))
    if normalized_artist_name == normalized_artist_result:        
        # NOTE(review): these two lines look like keyword arguments of a call
        # whose opening and closing lines are missing -- presumably the match
        # was appended to `results`; confirm.
        id = artist['urlSafeName'],
        score = score
      score = score - 1
