本文整理汇总了 Python 中 unicodedata.combining 函数的典型用法代码示例。如果您正苦于以下问题：Python combining 函数的具体用法？Python combining 怎么用？Python combining 使用的例子？那么恭喜您，这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了combining函数的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: clean_tokenize
def clean_tokenize(self, input_text, accentmark, minimunlen, numeric, alpha, stopwords):
    """
    Tokenize a document (Spanish tokenizer) and filter the resulting words.

    :param input_text: string to clean and tokenize
    :param accentmark: when falsy, the text is NFKD-decomposed, lowercased,
        run through the module-level ``punctuation`` translation table and
        reduced to ASCII before tokenizing; when truthy, only characters that
        are themselves combining marks are dropped
    :param minimunlen: only words strictly longer than this are kept
    :param numeric: when truthy, words for which ``isnumeric()`` holds are dropped
    :param alpha: when truthy, words for which ``isalpha()`` fails are dropped
    :param stopwords: when truthy, words whose lowercase form is in
        ``self.ALL_STOPWORDS`` are dropped
    :return: list of the words that passed every active filter
    """
    # NOTE(review): the extracted source lost its indentation; the filter
    # nesting below (everything gated by the length check) is a reconstruction.
    if accentmark:
        kept = u"".join([ch for ch in input_text if not unicodedata.combining(ch)])
        tokens = word_tokenize(kept, language='spanish')
    else:
        flattened = input_text.replace('\n', ' ')
        for unwanted in ('\r', '”', '“', '.'):
            flattened = flattened.replace(unwanted, '')
        decomposed = unicodedata.normalize('NFKD', flattened)
        base_only = u"".join([ch for ch in decomposed if not unicodedata.combining(ch)]).lower()
        no_punct = base_only.translate(punctuation)
        # Anything still outside ASCII becomes a space so it splits tokens.
        ascii_only = str(''.join([ch if ord(ch) < 128 else ' ' for ch in no_punct])).lower()
        tokens = word_tokenize(ascii_only, language='spanish')

    final_text = []
    for token in tokens:
        if len(token) <= minimunlen:
            continue
        if stopwords and token.lower() in self.ALL_STOPWORDS:
            continue
        if numeric and token.isnumeric():
            continue
        if alpha and not token.isalpha():
            continue
        final_text.append(token)
    return final_text
示例2: clean_song_data
def clean_song_data(self, artist, title):
    """Normalize an artist/title pair before using it in a lyrics lookup.

    Both strings are lowercased and stripped of diacritics (NFKD
    decomposition, then combining marks removed).  The module-level
    ``LYRICS_ARTIST_REPLACE`` / ``LYRICS_TITLE_REPLACE`` entries are applied
    as ``re.sub(entry[0], entry[1], ...)`` and every pattern in
    ``LYRICS_TITLE_STRIP`` is removed from the title.  When
    ``self.ignore_brackets`` is truthy, parenthesised text is removed from
    the title as well.

    :param artist: artist name
    :param title: song title
    :return: ``(artist, title)`` tuple of cleaned strings
    """
    def _strip_accents(text):
        # Decompose first so accents become separate combining marks.
        decomposed = unicodedata.normalize('NFKD', text)
        return "".join(c for c in decomposed if not unicodedata.combining(c))

    artist = _strip_accents(artist.lower())
    title = _strip_accents(title.lower())

    # Work on a copy: appending to the shared module-level list would make it
    # grow on every call and leak the bracket rule into later calls.
    strip_patterns = list(LYRICS_TITLE_STRIP)
    if self.ignore_brackets:
        strip_patterns.append(r"\(.*\)")

    # replace ampersands and the like
    for exp in LYRICS_ARTIST_REPLACE:
        artist = re.sub(exp[0], exp[1], artist)
    for exp in LYRICS_TITLE_REPLACE:
        title = re.sub(exp[0], exp[1], title)

    # strip things like "(live at Somewhere)", "(acoustic)", etc
    for exp in strip_patterns:
        title = re.sub(exp, '', title)

    # trim leading/trailing whitespace left behind by the substitutions
    title = title.strip()
    artist = artist.strip()
    return (artist, title)
示例3: preprocess_str
def preprocess_str(line: str, return_mapping: bool = False) -> Union[Tuple[str, List[int], List[int]], str]:
    """Drop combining characters from a string, optionally with index maps.

    Doubled quote sequences (two single quotes or two backticks) are first
    replaced by a double quote followed by a space; all lengths and indices
    below refer to the string after that replacement.

    Args:
        line: string to process
        return_mapping: whether to also return the index mappings

    Returns:
        The preprocessed string alone when ``return_mapping`` is false.
        Otherwise a tuple ``(preprocessed, r2p, p2r)`` where both lists have
        ``len(line) + 1`` entries pre-filled with ``len(line)``;
        ``r2p[i]`` is the preprocessed index of raw character ``i`` (``-1``
        for a dropped combining character) and ``p2r[j]`` is the raw index of
        preprocessed character ``j``.
    """
    line = line.replace("''", '" ').replace("``", '" ')
    if not return_mapping:
        return ''.join(ch for ch in line if not unicodedata.combining(ch))

    filler = len(line)
    r2p = [filler] * (filler + 1)
    p2r = [filler] * (filler + 1)
    kept = []
    for raw_idx, ch in enumerate(line):
        if unicodedata.combining(ch):
            r2p[raw_idx] = -1
            continue
        # len(kept) is the position this character will take in the output.
        r2p[raw_idx] = len(kept)
        p2r[len(kept)] = raw_idx
        kept.append(ch)
    return ''.join(kept), r2p, p2r
示例4: shave_marks_latin
def shave_marks_latin(txt):
    """Remove diacritic marks that are attached to ASCII Latin letters.

    The text is decomposed (NFD), combining marks following a base character
    from ``string.ascii_letters`` are dropped, marks on any other base are
    kept, and the result is recomposed (NFC).

    :param txt: text to process
    :return: NFC-normalized text without diacritics on Latin base letters
    """
    # Use the module-qualified call, matching the NFC call below; a bare
    # ``normalize`` is only defined if it was imported separately.
    norm_txt = unicodedata.normalize('NFD', txt)
    latin_base = False
    keepers = []
    for c in norm_txt:
        is_mark = unicodedata.combining(c)
        if is_mark and latin_base:
            continue  # diacritic on a Latin base character: drop it
        keepers.append(c)
        if not is_mark:
            # A non-combining character starts a new base.
            latin_base = c in string.ascii_letters
    shaved = ''.join(keepers)
    return unicodedata.normalize('NFC', shaved)
示例5: remove_initial_vowel
def remove_initial_vowel(word):
    """Strip the leading vowel group (and its combining marks) from a word.

    The word is NFKD-decomposed and scanned from the start while characters
    are in the module-level ``vowels`` or are combining marks.  The base
    characters consumed so far are tracked; as soon as they form something
    that is neither in ``vowels`` nor in ``diphthongs``, the remainder
    starting at the current character is returned.

    :param word: word to process
    :return: the decomposed word without its initial vowel group, or an
        empty string when the input is empty or is consumed entirely
    """
    if not word:
        return u''
    text = unicodedata.normalize('NFKD', word)
    last = len(text) - 1
    base = u''  # non-combining characters consumed so far
    pos = 0
    while text[pos] in vowels or unicodedata.combining(text[pos]):
        current = text[pos]
        if not unicodedata.combining(current):
            base += current
        if base and base not in vowels and base not in diphthongs:
            return text[pos:]
        if pos == last:
            # Every character was part of the vowel group.
            return u''
        pos += 1
    return text[pos:]
示例6: shave_marks_latin
def shave_marks_latin(txt):
    """Strip diacritics from ASCII Latin letters, leaving other scripts intact.

    Works on the NFD-decomposed text and returns the NFC recomposition of
    whatever characters were kept.
    """
    kept = []
    base_is_latin = False
    for ch in unicodedata.normalize('NFD', txt):
        if unicodedata.combining(ch):
            # A mark survives only when its base letter is not Latin.
            if not base_is_latin:
                kept.append(ch)
        else:
            # Non-combining character: keep it and remember what kind of
            # base the following marks will attach to.
            kept.append(ch)
            base_is_latin = ch in string.ascii_letters
    return unicodedata.normalize('NFC', ''.join(kept))
示例7: shave_marks_latin
def shave_marks_latin(txt):
    """Remove combining marks that follow an ASCII Latin base letter.

    :param txt: text to process
    :return: NFC-normalized text; marks on non-Latin bases are preserved
    """
    process_txt = unicodedata.normalize('NFD', txt)
    keepers = []
    latin_base = False
    for char in process_txt:
        is_mark = unicodedata.combining(char)
        # The only character ever dropped is a mark on a Latin base; the
        # original's extra elif branches were already covered by this test.
        if not (is_mark and latin_base):
            keepers.append(char)
        if not is_mark:
            # Non-combining character: it becomes the current base.
            latin_base = char in string.ascii_letters
    text = ''.join(keepers)
    return unicodedata.normalize('NFC', text)
示例8: _format_for_latex
def _format_for_latex(self, text):
    """ Function to sanitize text, so that it can be typeset by latex.

    This sanitation consists of three operations:
        1. Normalize the text to NFC.
           This compresses diacritics where possible.
        2. Replacement of unknown unicode characters with a default.
        3. Replacement of non-typesettable character with their
           latex counterpart or equivalent character.

    Parameters
    ----------
    text : str
        The text to sanitize.

    Returns
    -------
    The text after sanitation.
    """
    pieces = []
    normalized_line = unicodedata.normalize('NFC', text)
    last_idx = len(normalized_line) - 1
    for idx, c in enumerate(normalized_line):
        # Combining marks are never emitted on their own; they are handled
        # through the lookahead of the character before them.
        if unicodedata.combining(c) != 0:
            continue

        # Explicit bounds check instead of catching IndexError, so an
        # IndexError raised inside the helper or mapping is not swallowed.
        has_next = idx < last_idx
        if has_next:
            codepoint = ord(normalized_line[idx + 1])
            if codepoint in special_char_mapping:
                # The following character's code point selects the command
                # that is applied to the current character.
                latex_command = special_char_mapping[codepoint]
                pieces.append(self._build_latex_replacement(latex_command, c))
                continue

        if c in special_char_mapping:
            latex_command = special_char_mapping[c]
            pieces.append(self._build_latex_replacement(latex_command, c))
        elif has_next and unicodedata.name(c, None) is None:
            # Character without a Unicode name: emit a placeholder box and
            # register 'latexsym' in the preamble.  As in the original, the
            # final character of the text is exempt from this substitution.
            self._preamble['latexsym'] = ''
            pieces.append('□')
        else:
            pieces.append(c)
    return ''.join(pieces)
示例9: make_sortable
def make_sortable(text):
    """Return a lowercase, accent-free UTF-8 byte string usable as a sort key.

    :param text: UTF-8 encoded byte string (already-decoded text is accepted
        as well)
    :return: UTF-8 encoded bytes, lowercased, with combining marks removed
        after NFKD decomposition
    """
    if isinstance(text, bytes):
        text = text.decode('utf-8')
    # Lowercase after decoding: lower() on a byte string only affects ASCII,
    # so accented capitals would otherwise keep their case.
    text = text.lower()
    normalized = unicodedata.normalize('NFKD', text)
    text = u''.join([c for c in normalized if not unicodedata.combining(c)])
    return text.encode('utf-8')
示例10: codepoint
def codepoint(bot, trigger):
    """Bot command: report the code point and Unicode name of a character.

    The argument (``trigger.group(2)``) is either a single character or a
    hexadecimal code point, optionally prefixed with ``U+``.

    :param bot: object providing ``reply`` and ``say``
    :param trigger: match object whose group 2 holds the command argument
    :return: ``NOLIMIT`` when the argument is missing or invalid; the
        message text when the character has no Unicode name; otherwise None
    """
    # group(2) may be None when no argument was given; treat it as empty.
    arg = (trigger.group(2) or '').strip()
    if not arg:
        bot.reply('What code point do you want me to look up?')
        return NOLIMIT
    if len(arg) > 1:
        if arg.startswith('U+'):
            arg = arg[2:]
        try:
            # NOTE(review): unichr is the Python 2 builtin; this snippet
            # appears to target Python 2 - confirm before porting.
            arg = unichr(int(arg, 16))
        except (ValueError, TypeError, OverflowError):
            # Not hexadecimal, or outside the valid code point range.
            bot.reply("That's not a valid code point.")
            return NOLIMIT

    # Upper-case hex, zero-padded to at least four digits.
    point = '%04X' % ord(u'' + arg)
    try:
        name = unicodedata.name(arg)
    except ValueError:
        # Previously this message was only returned and never sent.
        message = 'U+%s (No name found)' % point
        bot.say(message)
        return message

    if not unicodedata.combining(arg):
        template = 'U+%s %s (%s)'
    else:
        # Combining marks are shown attached to the literal byte sequence
        # \xe2\x97\x8c placed before them.
        template = 'U+%s %s (\xe2\x97\x8c%s)'
    bot.say(template % (point, name, arg))
示例11: about
def about(u, cp=None, name=None):
    """Build a 'U+XXXX NAME (char)' description line for a character.

    :param u: the character to describe
    :param cp: optional code point as a hexadecimal string; when omitted it
        is taken from ``ord(u)``
    :param name: optional character name; when omitted it is looked up in
        the module-level ``cp_names`` table, falling back to 'No Name Found'
    :return: the formatted description
    """
    global data_loaded
    # Load the UnicodeData table once, on first use.
    if not data_loaded:
        load_data()
        data_loaded = True

    # A supplied code point arrives as hexadecimal text.
    cp = ord(u) if cp is None else int(cp, 16)

    if name is None:
        # cp_names is keyed by the code point formatted as upper-case hex,
        # zero-padded to four digits.
        search_cp = '%04X' % (cp)
        name = cp_names[search_cp] if search_cp in cp_names else 'No Name Found'

    ## TODO: Replace this...
    if unicodedata.combining(u):
        template = 'U+%04X %s (\xe2\x97\x8c%s)'
    else:
        template = 'U+%04X %s (%s)'
    return template % (cp, name, u.encode('utf-8'))
示例12: _text_chars
def _text_chars(self, length, truncate, text, whole_words):
    """
    Truncate ``text`` once it exceeds ``length`` visible characters.

    Combining characters do not count towards the length.  When
    ``whole_words`` is truthy the cut is moved back to the previous space
    (unless the cut already falls on whitespace) and trailing whitespace and
    punctuation are removed.  The truncated text is passed through
    ``self.add_truncation_text`` together with ``truncate``; text that fits
    is returned unchanged.
    """
    visible = 0
    for idx, ch in enumerate(text):
        # Combining characters attach to the previous one and add no width.
        if unicodedata.combining(ch):
            continue
        visible += 1
        if visible <= length:
            continue

        # First character past the limit: everything before it is kept.
        head = text[:idx]
        if whole_words:
            if not ch.isspace():
                # The cut falls inside a word, so drop the partial word.
                head = head.rsplit(' ', 1)[0]
            head = head.rstrip(string.whitespace + string.punctuation)
        return self.add_truncation_text(head, truncate)

    # Never exceeded the limit: no truncation necessary.
    return text
示例13: codepoint
def codepoint(bot, trigger):
    """Bot command: say the code point and name of the requested character.

    ``trigger.group(2)`` is either a single character or a hexadecimal code
    point with an optional ``U+`` prefix.  Returns ``module.NOLIMIT`` when
    the argument is missing or cannot be converted.
    """
    arg = trigger.group(2)
    if not arg:
        bot.reply('What code point do you want me to look up?')
        return module.NOLIMIT

    # Use the stripped form unless stripping would leave nothing.
    arg = arg.strip() or arg

    if len(arg) > 1:
        hex_digits = arg[2:] if arg.startswith('U+') else arg
        try:
            arg = unichr(int(hex_digits, 16))
        except (ValueError, TypeError):
            bot.reply("That's not a valid code point.")
            return module.NOLIMIT

    point, name = get_codepoint_name(arg)
    if name is None:
        name = '(No name found)'

    if unicodedata.combining(arg):
        # Combining marks get the \xe2\x97\x8c sequence placed before them.
        template = 'U+%s %s (\xe2\x97\x8c%s)'
    else:
        template = 'U+%s %s (%s)'
    bot.say(template % (point, name, arg))
示例14: _char_block_width
def _char_block_width(char):
    """Estimate how many terminal columns a character occupies.

    :param char: integer code point
    :return: 1 for narrow characters, 2 for the wide ranges listed below,
        0 for combining and control characters, -1 for backspace and delete
    """
    # Basic Latin, which is probably the most common case
    if 0x0021 <= char <= 0x007e:
        return 1
    # Chinese, Japanese, Korean (common)
    if 0x4e00 <= char <= 0x9fff:
        return 2
    # Hangul
    if 0xac00 <= char <= 0xd7af:
        return 2
    # Backspace and delete
    if char in (0x0008, 0x007f):
        return -1
    # Other control characters.  This is a range test: the previous tuple
    # membership only matched the two endpoints 0x00 and 0x1f.  Checked
    # before the combining lookup since these code points never combine.
    if 0x0000 <= char <= 0x001f:
        return 0
    # Combining?  Must stay ahead of the kana / CJK punctuation ranges,
    # which contain combining marks of their own.
    if unicodedata.combining(uni_chr(char)):
        return 0
    # Hiragana and Katakana
    if 0x3040 <= char <= 0x309f or 0x30a0 <= char <= 0x30ff:
        return 2
    # Full-width Latin characters
    if 0xff01 <= char <= 0xff60:
        return 2
    # CJK punctuation
    if 0x3000 <= char <= 0x303e:
        return 2
    # Take a guess
    return 1
示例15: artist_search
def artist_search(results, media, lang, artist_name):
    """Search the local Vevo service for an artist and record exact matches.

    The artist name is NFKD-decomposed and stripped of combining marks, then
    sent to the search endpoint.  Each returned artist whose normalized name
    equals the normalized query is added to ``results`` with a score that
    starts at 100 and decreases by one per match.

    :param results: container receiving ``SearchResult`` entries via ``add``
    :param media: unused here
    :param lang: unused here
    :param artist_name: artist name (UTF-8 bytes or already-decoded text)
    """
    # Decompose (NFKD), decoding from UTF-8 first when that succeeds.
    try:
        artist_name = unicodedata.normalize('NFKD', artist_name.decode('utf-8'))
    except UnicodeError:
        artist_name = unicodedata.normalize('NFKD', artist_name)

    # Strip diacritics.
    artist_name = u''.join([point for point in artist_name if not unicodedata.combining(point)])

    quoted_name = String.Quote(artist_name)
    json_obj = JSON.ObjectFromURL('http://127.0.0.1:32400/services/vevo/search?q=%s&artistsLimit=6&videosLimit=1' % (quoted_name))

    normalizer_agent = 'com.plexapp.agents.plexmusic'
    normalizer_func = 'MessageKit:NormalizeArtist'
    score = 100
    normalized_artist_name = Core.messaging.call_external_function(normalizer_agent, normalizer_func, kwargs = dict(artist=artist_name))

    for artist in json_obj['artists']:
        # Require a perfect match after normalization to avoid false positives.
        normalized_artist_result = Core.messaging.call_external_function(normalizer_agent, normalizer_func, kwargs = dict(artist=artist['name']))
        Log('Sanity checking normalized artist: %s against Vevo result: %s' % (normalized_artist_name, normalized_artist_result))
        if normalized_artist_name != normalized_artist_result:
            continue
        results.add(SearchResult(
            id = artist['urlSafeName'],
            score = score
        ))
        # NOTE(review): the decrement is assumed to belong to the match
        # branch; the extracted source lost its indentation.
        score = score - 1