本文整理汇总了Python中unicodedata.category函数的典型用法代码示例。如果您正苦于以下问题:Python category函数的具体用法?Python category怎么用?Python category使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了category函数的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: splitText
def splitText(text):
    """Split *text* into segments no longer than MAX_SEGMENT_SIZE.

    Prefers to cut at punctuation, then at whitespace, then at any
    non-alphanumeric character, and as a last resort cuts mid-word at
    the size limit. Returns the list of segments.
    """
    # Split-point predicates, tried in order of preference.
    # https://en.wikipedia.org/wiki/Unicode_character_property#General_Category
    predicates = (
        # opening/closing/initial/final/other punctuation
        lambda ch: unicodedata.category(ch) in ("Ps", "Pe", "Pi", "Pf", "Po"),
        # any separator (space, line, paragraph)
        lambda ch: unicodedata.category(ch).startswith("Z"),
        # anything that is not a letter or a number
        lambda ch: unicodedata.category(ch)[0] not in ("L", "N"),
    )
    pieces = []
    pending = __class__.cleanSpaces(text)
    while len(pending) > __class__.MAX_SEGMENT_SIZE:
        window = pending[:__class__.MAX_SEGMENT_SIZE]
        cut = None
        for pred in predicates:
            cut = __class__.findLastCharIndexMatching(window, pred)
            if cut is not None:
                break
        if cut is None:
            # no natural break found: cut at the last character
            cut = __class__.MAX_SEGMENT_SIZE - 1
        pieces.append(window[:cut + 1].rstrip())
        pending = pending[cut + 1:].lstrip(string.whitespace + string.punctuation)
    if pending:
        pieces.append(pending)
    return pieces
示例2: TokenOffsets
def TokenOffsets(string: str):
    """
    Yield the offsets of all Unicode category borders in the *string*,
    including the initial 0 and the final offset value of ``len(string)``.
    Capitalized words special case: a single upper case letter ('Lu')
    followed by lower case letters ('Ll') is treated as a single token.
    """
    if not string:
        # None or empty input: yield nothing at all.
        return
    yield 0
    prev = category(string[0])
    for idx in range(1, len(string)):
        cur = category(string[idx])
        if cur != prev:
            # Join capitalized tokens: a lone 'Lu' directly followed by
            # 'Ll' is not a border (the char before it must not be 'Lu').
            capitalized = (
                prev == 'Lu'
                and cur == 'Ll'
                and (idx == 1 or category(string[idx - 2]) != 'Lu')
            )
            if not capitalized:
                yield idx
        prev = cur
    yield len(string)
示例3: ranking
def ranking(self):
    """
    For each result, removes stopwords, ranks the word, augments the query
    and returns True if successful else False
    """
    # NOTE: Python 2 code (print statements).
    # NOTE(review): the closing print repeats the opening message —
    # possibly meant to say the indexing finished; confirm with author.
    print "Indexing results ...."
    for i in range(len(self.results)):
        result = self.results[i]
        title = result[0]
        summary = result[1]
        # Remove punctuation and create lists of words: drop every char
        # whose Unicode category starts with 'P' (all punctuation classes).
        titleWords = "".join(c for c in title if not unicodedata.category(c).startswith('P')).split()
        summaryWords = "".join(c for c in summary if not unicodedata.category(c).startswith('P')).split()
        for tw in titleWords:
            # Stopwords are skipped entirely.
            if tw.lower() in self.stopWords:
                continue
            # applyRanking(result_index, word, is_title_word, feedback_was_relevant)
            if self.user_feedback[i] == 'y':
                self.applyRanking(i, tw, True, True)
            else:
                self.applyRanking(i, tw, True, False)
        for sw in summaryWords:
            if sw.lower() in self.stopWords:
                continue
            if self.user_feedback[i] == 'y':
                self.applyRanking(i, sw, False, True)
            else:
                self.applyRanking(i, sw, False, False)
    print "Indexing results ...."
    return self.augmentQuery()
示例4: normalize_roman
def normalize_roman(string, additional=None):
    """Removes diacritics from the string and converts to lowercase.

    :param string: text to normalize.
    :param additional: optional mapping of extra single-character
        replacements; characters that appear as keys are substituted by
        their mapped value, characters that appear as values are kept
        as-is, and everything else is normalized recursively.

    >>> normalize_roman(u'Eèé')
    u'eee'
    """
    if additional:
        # list(...) keeps this working on Python 3, where dict views
        # cannot be concatenated with `+`.
        safe = list(additional.keys()) + list(additional.values())
        def gen():
            for c in string:
                if c not in safe:
                    yield normalize_roman(c)
                elif c in additional:
                    yield additional[c]
                else:
                    # c is a mapped-to value: pass it through untouched.
                    yield c
        return ''.join(gen())
    else:
        chars = []
        for c in string:
            if unicodedata.category(c) == 'Lo':
                # 'Lo' (other letters, e.g. CJK) carry no combining marks
                # worth stripping; keep them verbatim.
                chars.append(c)
            else:
                # Decompose (NFD), then drop combining marks ('Mn').
                nor = unicodedata.normalize('NFD', c)
                chars.extend(x for x in nor if unicodedata.category(x) != 'Mn')
        return ''.join(chars).lower()
示例5: test_exclude_characters_of_specific_groups
def test_exclude_characters_of_specific_groups():
    """Blacklisted categories are never generated, the rest still are."""
    strategy = characters(blacklist_categories=('Lu', 'Nd'))
    # Characters outside each blacklisted category are still reachable...
    for banned in ('Lu', 'Nd'):
        find(strategy, lambda c, banned=banned: unicodedata.category(c) != banned)
    # ...but no example may fall into a blacklisted category.
    assert_no_examples(strategy, lambda c: unicodedata.category(c) in ('Lu', 'Nd'))
示例6: consolidate_ampers
def consolidate_ampers(text: str) -> str:
    """Converts all ampersands in a text to a single one (&).

    :param text: A string which should have ampersands converted.
    :return: The text string after all ampersands have been replaced.
    """
    chosen_amper_value = "\u0026"
    # Scanning every Unicode codepoint and calling unicodedata.name() is
    # expensive, so the table of ampersand look-alikes is computed once and
    # memoized on the function object for all subsequent calls.
    amper_values = getattr(consolidate_ampers, "_amper_values", None)
    if amper_values is None:
        amper_values = dict.fromkeys(
            [chr(i) for i in range(sys.maxunicode)
             # Filter to punctuation/symbol categories first: unnamed
             # control/unassigned chars would make unicodedata.name()
             # raise ValueError.
             if (unicodedata.category(chr(i)).startswith('P')
                 or unicodedata.category(chr(i)).startswith('S'))
             and re.search(
                 r" ampersand|ampersand ", unicodedata.name(chr(i)),
                 re.IGNORECASE) is not None
             and chr(i) != chosen_amper_value]
        )
        consolidate_ampers._amper_values = amper_values
    # Change all ampersands to one type of ampersand
    for value in amper_values:
        text = text.replace(value, chosen_amper_value)
    return text
示例7: test_exclude_characters_of_specific_groups
def test_exclude_characters_of_specific_groups():
    """Blacklisted categories are never generated, the rest still are."""
    strategy = characters(blacklist_categories=("Lu", "Nd"))
    # Characters outside each blacklisted category are still reachable...
    for banned in ("Lu", "Nd"):
        find_any(strategy, lambda c, banned=banned: unicodedata.category(c) != banned)
    # ...but no example may fall into a blacklisted category.
    assert_no_examples(strategy, lambda c: unicodedata.category(c) in ("Lu", "Nd"))
示例8: tokens
def tokens(source):
    """Tokenize *source* into a stream of token objects.

    Emits NewlineToken for each newline character, WhitespaceToken for
    runs of control/separator chars, WordToken for runs of letters and
    digits, and SymbolToken for runs of one repeated other character.
    """
    def run_end(start, belongs):
        # Advance past `start` while `belongs` accepts the next char.
        end = start + 1
        while end < len(source) and belongs(source[end]):
            end += 1
        return end

    pos = 0
    while pos < len(source):
        ch = source[pos]
        major = category(ch)[0]
        if ch in NEWLINE_CHARS:
            # Newlines are emitted one at a time, never grouped.
            yield NewlineToken(source[pos])
            pos += 1
        elif major in "CZ":
            end = run_end(pos, lambda c: category(c)[0] in "CZ")
            yield WhitespaceToken(source[pos:end])
            pos = end
        elif major in "LN":
            end = run_end(pos, lambda c: category(c)[0] in "LN")
            yield WordToken(source[pos:end])
            pos = end
        else:
            # Symbols group only when the exact same char repeats.
            end = run_end(pos, lambda c: c == ch)
            yield SymbolToken(source[pos:end])
            pos = end
示例9: crear_nombre_usuario
def crear_nombre_usuario(nombre, apellidos):
    """Build a unique username from the user's name initials.

    Concatenates the initial of every word of *nombre* (first name) with
    the full first surname plus the initials of the remaining surnames,
    then appends the lowest integer suffix not already taken by an
    existing ``Gauser``.

    :param nombre: first name(s), may contain accents.
    :param apellidos: surname(s), may be empty.
    :return: the generated unique username string.
    """
    # Strip diacritics (NFD + drop combining marks), lowercase, split into words.
    nombre = ''.join(
        (c for c in unicodedata.normalize('NFD', smart_unicode(nombre)) if
         unicodedata.category(c) != 'Mn')).lower().split()
    apellidos = ''.join(
        (c for c in unicodedata.normalize('NFD', smart_unicode(apellidos)) if
         unicodedata.category(c) != 'Mn')).lower().split()
    iniciales_nombre = ''.join(parte[0] for parte in nombre)
    try:
        iniciales_apellidos = apellidos[0]
    except IndexError:
        # Users without surnames still get a username stem.
        iniciales_apellidos = 'sin'
    for ind in range(1, len(apellidos)):
        try:  # In case the user only has one surname:
            iniciales_apellidos = iniciales_apellidos + apellidos[ind][0]
        except IndexError:
            pass
    usuario = iniciales_nombre + iniciales_apellidos
    # Append the first numeric suffix that is not already in use.
    n = 1
    while True:
        username = usuario + str(n)
        try:
            Gauser.objects.get(username=username)
        except Gauser.DoesNotExist:
            return username
        n += 1
示例10: __new__
def __new__(cls,s,on_fail='die',msg=None):
    """Validate *s* against the class character/length constraints and
    create the string subclass instance; on any failure, delegate to
    ``cls.init_fail`` (behavior controlled by *on_fail*).
    """
    # Idempotent: passing an existing instance back returns it unchanged.
    if type(s) == cls: return s
    cls.arg_chk(on_fail)
    # Sanity-check the class-level character lists: single-char strings only.
    for k in cls.forbidden,cls.allowed:
        assert type(k) == list
        for ch in k: assert type(ch) == str and len(ch) == 1
    try:
        s = s.strip()
        if type(s) != str:
            # Bytes input is assumed UTF-8 encoded.
            s = s.decode('utf8')
        for ch in s:
            # Allow: (L)etter,(N)umber,(P)unctuation,(S)ymbol,(Z)space
            # Disallow: (C)ontrol,(M)combining
            # Combining characters create width formatting issues, so disallow them for now
            if unicodedata.category(ch)[0] in 'CM':
                t = { 'C':'control', 'M':'combining' }[unicodedata.category(ch)[0]]
                raise ValueError('{}: {} characters not allowed'.format(ascii(ch),t))
        me = str.__new__(cls,s)
        if cls.max_screen_width:
            # East-Asian full-width ('F') and wide ('W') chars take two columns.
            me.screen_width = len(s) + len([1 for ch in s if unicodedata.east_asian_width(ch) in ('F','W')])
            assert me.screen_width <= cls.max_screen_width,(
                'too wide (>{} screen width)'.format(cls.max_screen_width))
        else:
            assert len(s) <= cls.max_len, 'too long (>{} symbols)'.format(cls.max_len)
            assert len(s) >= cls.min_len, 'too short (<{} symbols)'.format(cls.min_len)
        # Whitelist/blacklist checks apply regardless of the width branch.
        assert not cls.allowed or set(list(s)).issubset(set(cls.allowed)),\
            'contains non-allowed symbols: {}'.format(' '.join(set(list(s)) - set(cls.allowed)))
        assert not cls.forbidden or not any(ch in s for ch in cls.forbidden),\
            "contains one of these forbidden symbols: '{}'".format("', '".join(cls.forbidden))
        return me
    except Exception as e:
        # All validation failures funnel through the class failure handler.
        return cls.init_fail(e,s)
示例11: parse
def parse(cls, string):
    """Split a version-like *string* into parts and build a ``cls``.

    '-' starts a new part, '.' starts a new component within the current
    part, and a letters/digits category switch also starts a new part.
    Numeric components are converted to int; a '.' or '-' with nothing
    after the previous separator gets an implicit "0".
    """
    from unicodedata import category
    parts = []
    prev = None
    for ch in string:
        if prev is None:
            parts.append([ch])
        elif ch == ".":
            if prev in ".-":
                # Empty component between separators counts as "0".
                parts[-1][-1] += "0"
            parts[-1].append("")
        elif ch == "-":
            if prev in ".-":
                parts[-1][-1] += "0"
            parts.append([""])
        elif prev not in ".-" and category(ch)[0] != category(prev)[0]:
            # A letters<->digits switch opens a new part.
            parts.append([ch])
        else:
            parts[-1][-1] += ch
        prev = ch
    for part in parts:
        # Convert numeric components to int, drop trailing empties.
        for idx, token in enumerate(part):
            try:
                part[idx] = int(token)
            except (ValueError, TypeError):
                pass
        while len(part) > 1 and not part[-1]:
            del part[-1]
    return cls(*map(tuple, parts))
示例12: get_match_list
def get_match_list(data, match_list, order_list=None, only_ascii=False, ignorecase=False):
    """
    Search a text string for matches against a dict mapping "ID" to a
    list of search strings:
    { "ID1" : ["String 1", "String 2", "String 3"],
      "ID2" : ["String 4", "String 5", "String 6"]
    }
    The dict must not contain the same search string under several IDs.
    The search runs in order of search-string length (longest first); when
    a string matches, it is removed from the text searched by the
    following ones, so two categories are not detected when one search
    string is a substring of another. For example, with "Idioma Español"
    and "Español": if the first appears in "Pablo sabe hablar el Idioma
    Español", it matches "Idioma Español" but not "Español", because the
    longest match takes priority.

    NOTE: Python 2 code (uses ``unicode()``).
    Returns a dynamically created class with attributes ``key`` (the
    matched ID or None) and ``index`` (its position in *order_list*, or
    None/len(order_list) when not applicable).
    """
    import unicodedata
    match_dict = dict()
    matches = []
    # Convert the text to unicode (Python 2).
    data = unicode(data, "utf8")
    # Flatten the dict to {"String 1": "ID1", "String 2": "ID1", "String 4": "ID2"}
    # with unicode keys.
    for key in match_list:
        if order_list and not key in order_list:
            raise Exception("key '%s' not in match_list" % key)
        for value in match_list[key]:
            if value in match_dict:
                raise Exception("Duplicate word in list: '%s'" % value)
            match_dict[unicode(value, "utf8")] = key
    # If ignorecase=True, upper-case both the text and the search keys.
    if ignorecase:
        data = data.upper()
        match_dict = dict((key.upper(), match_dict[key]) for key in match_dict)
    # If only_ascii=True, strip accents and Ñ (NFD-decompose, then drop
    # combining marks 'Mn') from both the text and the search keys.
    if only_ascii:
        data = ''.join((c for c in unicodedata.normalize('NFD', data) if unicodedata.category(c) != 'Mn'))
        match_dict = dict((''.join((c for c in unicodedata.normalize('NFD', key) if unicodedata.category(c) != 'Mn')),
                           match_dict[key]) for key in match_dict)
    # Sort candidates from longest to shortest and search; each hit is
    # blanked out of the text so shorter substrings cannot also match.
    for match in sorted(match_dict, key=lambda x: len(x), reverse=True):
        s = data
        for a in matches:
            s = s.replace(a, "")
        if match in s:
            matches.append(match)
    if matches:
        if order_list:
            return type("Mtch_list", (),
                        {"key": match_dict[matches[-1]], "index": order_list.index(match_dict[matches[-1]])})
        else:
            return type("Mtch_list", (), {"key": match_dict[matches[-1]], "index": None})
    else:
        if order_list:
            return type("Mtch_list", (), {"key": None, "index": len(order_list)})
        else:
            return type("Mtch_list", (), {"key": None, "index": None})
示例13: characters
def characters(self, content):
    """SAX ``characters`` handler: route character data to the HTML
    writer, honoring the current title/paragraph/definition state flags.
    """
    text = content.strip()
    if self._inTitle:
        # Title text is written verbatim unless titles are being ignored.
        if self._headerProcessed:
            if not self._ignoreTitle:
                self._writeHtml(content)
    else :
        if self._headerProcessed:
            if not self._ignoreText:
                if len(text) > 0:
                    # Emit the default title once, before any body text.
                    if not self._glossTitleWritten and not self._inTitle:
                        self._writeDefaultTitle()
                    # Open a generated paragraph when not already inside
                    # any paragraph-like container.
                    if not self._inParagraph and not self._inGeneratedPara and not self._inArticle and not self._lineGroupPara and not self._inTable:
                        self._startGeneratedPara()
                    if self._endDfn:
                        # Text immediately following a definition keyword.
                        if self._keywordTag == 'dfn':
                            # 'Pd' = dash punctuation (e.g. leading hyphen).
                            if unicodedata.category(content[0]) == 'Pd':
                                self._writeHtml(' ')
                            elif content[0] == ' ':
                                # Insert an em dash (U+2014) separator.
                                if unicodedata.category(text[0]) != 'Pd':
                                    self._writeHtml(u' \u2014')
                                else:
                                    self._writeHtml(u' \u2014 ')
                            self._writeHtml(content)
                        else: # 'h4' for fb2
                            # Drop a leading dash before writing the text.
                            if unicodedata.category(text[0]) == 'Pd':
                                text = text[1:]
                            self._writeHtml(text.strip())
                        self._endDfn = False
                    else:
                        self._writeHtml(content)
示例14: test_characters_of_specific_groups
def test_characters_of_specific_groups():
    """Whitelisted categories are generated, everything else is not."""
    strategy = characters(whitelist_categories=("Lu", "Nd"))
    # Each whitelisted category must be reachable...
    for wanted in ("Lu", "Nd"):
        find_any(strategy, lambda c, wanted=wanted: unicodedata.category(c) == wanted)
    # ...and nothing outside the whitelist may ever be produced.
    assert_no_examples(strategy, lambda c: unicodedata.category(c) not in ("Lu", "Nd"))
示例15: test_find_something_rare
def test_find_something_rare():
    """Rare 'Zs' spaces above U+3000 are findable; nothing else is."""
    space_only = characters(whitelist_categories=['Zs'], min_codepoint=12288)
    # A space-separator character exists in the restricted range...
    find(space_only, lambda c: unicodedata.category(c) == 'Zs')
    # ...and no non-'Zs' character can be produced at all.
    with pytest.raises(NoSuchExample):
        find(space_only, lambda c: unicodedata.category(c) != 'Zs')