本文整理汇总了 Python 中 unicodedata.category 方法的典型用法代码示例。如果您正苦于以下问题:Python unicodedata.category 方法的具体用法?Python unicodedata.category 怎么用?Python unicodedata.category 使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块 unicodedata 的用法示例。
在下文中一共展示了unicodedata.category方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: _tokenize
# 需要导入模块: import unicodedata [as 别名]
# 或者: from unicodedata import category [as 别名]
def _tokenize(self, text):
    """Split ``text`` into word-piece tokens.

    When ``self._cased`` is falsy the text is first NFD-normalized, stripped
    of nonspacing marks (Unicode category ``Mn``) and lower-cased.  Then, one
    character at a time:

    * punctuation and CJK characters are surrounded by spaces,
    * whitespace characters become a single space,
    * NUL, U+FFFD and control characters are dropped,
    * everything else is kept as is.

    The resulting text is split on whitespace and every word is handed to
    ``self._word_piece_tokenize``.

    :param text: the string to tokenize.
    :return: a list extended, in order, with the pieces returned by
        ``self._word_piece_tokenize`` for each word.
    """
    if not self._cased:
        text = unicodedata.normalize('NFD', text)
        text = ''.join(ch for ch in text if unicodedata.category(ch) != 'Mn')
        text = text.lower()
    # Collect fragments and join once rather than growing a string with +=.
    fragments = []
    for ch in text:
        if self._is_punctuation(ch) or self._is_cjk_character(ch):
            fragments.append(' ' + ch + ' ')
        elif self._is_space(ch):
            fragments.append(' ')
        elif ord(ch) == 0 or ord(ch) == 0xfffd or self._is_control(ch):
            continue
        else:
            fragments.append(ch)
    tokens = []
    # str.split() with no argument already ignores leading/trailing whitespace.
    for word in ''.join(fragments).split():
        tokens.extend(self._word_piece_tokenize(word))
    return tokens
示例2: normalizestr
# 需要导入模块: import unicodedata [as 别名]
# 或者: from unicodedata import category [as 别名]
def normalizestr(string):
    """Convert special characters (copyright, trademark signs, ...) to ASCII.

    Every ``(mark, replacement)`` pair yielded by ``unicode_marks(string)`` is
    substituted first.  The text is then NFKC-normalized, stripped of leading
    and trailing whitespace and passed through ``unidecode``.  If the outcome
    differs from the original input, a "Fixed string" line is printed.

    :param string: the text to convert.
    :return: the value returned by ``unidecode`` for the cleaned-up text.
    """
    original = string
    for mark, ascii_repl in unicode_marks(string):
        string = string.replace(mark, ascii_repl)
    # The previous per-character loop copied every character unchanged (its
    # category filter was commented out), so normalizing directly is equivalent.
    cleaned = unicodedata.normalize('NFKC', string).strip()
    result = unidecode(cleaned)
    if result != original:
        print("Fixed string: '{}'".format(result))
    return result
示例3: deaccent
# 需要导入模块: import unicodedata [as 别名]
# 或者: from unicodedata import category [as 别名]
def deaccent(text):
    """Remove accentuation from the given string.

    Input text is either a text string or a UTF-8 encoded bytestring.  The
    text is decomposed (NFD), nonspacing marks (category ``Mn``) are removed,
    and the remainder is recomposed (NFC).

    :param text: text string, or UTF-8 encoded ``bytes``/``bytearray``.
    :return: the input as a text string with accents removed.
    :raises UnicodeDecodeError: if a bytestring is not valid UTF-8 (strict
        error handling is used).

    >>> deaccent("Šéf chomutovských komunistů dostal poštou bílý prášek")
    'Sef chomutovskych komunistu dostal postou bily prasek'
    """
    # Test for byte strings directly: the Python 2 name ``unicode`` does not
    # exist on Python 3 and raised NameError there.
    if isinstance(text, (bytes, bytearray)):
        # assume utf8 for byte strings, use default (strict) error handling
        text = text.decode('utf8')
    norm = unicodedata.normalize("NFD", text)
    result = u''.join(ch for ch in norm if unicodedata.category(ch) != 'Mn')
    return unicodedata.normalize("NFC", result)
示例4: _is_punctuation
# 需要导入模块: import unicodedata [as 别名]
# 或者: from unicodedata import category [as 别名]
def _is_punctuation(char):
    """Checks whether `char` is a punctuation character.

    :param char: a single-character string (``ord`` is applied to it).
    :return: ``True`` if `char` is an ASCII symbol in one of the ranges
        33-47, 58-64, 91-96 or 123-126, or if its Unicode category starts
        with ``P``; ``False`` otherwise.
    """
    cp = ord(char)
    # We treat all non-letter/number ASCII as punctuation.
    # Characters such as "^", "$", and "`" are not in the Unicode
    # Punctuation class but we treat them as punctuation anyways, for
    # consistency.
    if 33 <= cp <= 47 or 58 <= cp <= 64 or 91 <= cp <= 96 or 123 <= cp <= 126:
        return True
    return unicodedata.category(char).startswith("P")
### modeling.py
示例5: _run_strip_accents
# 需要导入模块: import unicodedata [as 别名]
# 或者: from unicodedata import category [as 别名]
def _run_strip_accents(self, text):
    """Strips accents from a piece of text.

    The text is NFD-normalized and every character whose Unicode category is
    ``Mn`` (Mark, Nonspacing) is removed.

    :param text: the string to process.
    :return: the NFD-normalized text without its ``Mn`` characters.
    """
    # Normalizing matters for any program that has to treat Unicode text
    # consistently, especially for user input whose encoding is hard to
    # control.  The first argument of normalize() selects the form; NFD
    # decomposes characters into a sequence of combining characters.
    text = unicodedata.normalize("NFD", text)
    # category() returns the Unicode general category of a character.
    # "Mn" (Mark, Nonspacing) marks a nonspacing character that modifies a
    # base character:
    # https://www.fileformat.info/info/unicode/category/Mn/list.htm
    return "".join(
        char for char in text if unicodedata.category(char) != "Mn")
示例6: get_hgnc_id_from_symbol
# 需要导入模块: import unicodedata [as 别名]
# 或者: from unicodedata import category [as 别名]
def get_hgnc_id_from_symbol(gene_symbol):
    """
    Look up a gene identifier for a symbol via the Monarch Solr search service.

    The query searches for the symbol both bare and quoted, filtered to
    taxon NCBITaxon:9606 and category "gene", and takes the ``id`` of the
    first document returned.  A connection error is reported on stdout and
    treated as "not found".

    :param gene_symbol: symbol to search for
    :return: ``id`` of the first matching document, or None when nothing
        matched or the connection failed
    """
    solr_endpoint = 'https://solr.monarchinitiative.org/solr/search/select'
    query = DipperUtil._get_solr_weight_settings()
    query["q"] = "{0} \"{0}\"".format(gene_symbol)
    query["fq"] = ["taxon:\"NCBITaxon:9606\"", "category:\"gene\""]
    try:
        payload = requests.get(solr_endpoint, params=query).json()
        if payload['response']['numFound'] > 0:
            return payload['response']['docs'][0]['id']
    except requests.ConnectionError:
        print("error fetching {0}".format(solr_endpoint))
    return None
示例7: _run_strip_accents
# 需要导入模块: import unicodedata [as 别名]
# 或者: from unicodedata import category [as 别名]
def _run_strip_accents(self, text):
    """Strips accents from a piece of text, leaving Korean runs untouched.

    Runs of characters in U+AC00-U+D7A3 or U+3131-U+3163 are copied through
    unchanged; every other run is NFD-normalized and has its nonspacing marks
    (category ``Mn``) removed.
    NOTE(review): Korean is presumably exempted because NFD would decompose
    Hangul syllables into separate jamo - confirm against the tokenizer's
    vocabulary expectations.

    :param text: the string to process.
    :return: the text with accents stripped outside of Korean runs.
    """
    korean = "%s-%s%s-%s" % (chr(0xac00), chr(0xd7a3),
                             chr(0x3131), chr(0x3163))

    def strip_marks(segment):
        # Decompose, then drop the combining marks that NFD split off.
        decomposed = unicodedata.normalize("NFD", segment)
        return "".join(
            ch for ch in decomposed if unicodedata.category(ch) != "Mn")

    if not re.search("[%s]" % korean, text):
        return strip_marks(text)
    # Each run is either entirely Korean or entirely non-Korean, so testing
    # its first character is enough to classify it.
    runs = re.findall("[%s]+|[^%s]+" % (korean, korean), text)
    return "".join(
        run if re.match("[%s]" % korean, run) else strip_marks(run)
        for run in runs
    )
示例8: encode_field
# 需要导入模块: import unicodedata [as 别名]
# 或者: from unicodedata import category [as 别名]
def encode_field(s):
    """Encode ``s`` as a field, quoting it only when necessary.

    A field needs quoting when it contains a double quote, a space, or any
    character whose Unicode category starts with ``C``.  Inside a quoted
    field, characters found in ``ESCAPE_SEQUENCES2`` are replaced by their
    mapped escape sequence.

    :param s: the field text.
    :return: ``s`` unchanged when no quoting is needed, otherwise the escaped
        text wrapped in double quotes.
    """
    # Empty string converts to "".
    # NOTE(review): the returned literal carries a trailing space - confirm
    # this is intentional.
    if s == "":
        return '"" '
    # Check whether the string contains any special characters.
    special = '"' in s or ' ' in s or any(
        unicodedata.category(c)[0] == "C" for c in s)
    if not special:
        return s
    # Quoted field is needed.
    escaped = (
        ESCAPE_SEQUENCES2[c] if c in ESCAPE_SEQUENCES2 else c for c in s)
    return '"' + "".join(escaped) + '"'
示例9: remove_accents
# 需要导入模块: import unicodedata [as 别名]
# 或者: from unicodedata import category [as 别名]
def remove_accents(self, text):
    """ Normalise (normalize) unicode data in Python to remove umlauts, accents etc.

    The text is NFD-normalized and then encoded to ASCII with errors ignored,
    so combining accents and every other non-ASCII character are dropped.

    Rule 10 - When special characters appear as part of the name, paternal surname and maternal surname,
    they must be excluded for the calculation of the homonym and the verification digit.
    The characters will be interpreted, yes and only if, they are individually within the name,
    paternal surname and maternal surname. Examples:
        Roberto O’farril Carballo OACR-661121
        Rubén D’angelo Fargo DAFR-710108
        Luz Ma. Fernández Juárez FEJL-830120

    :param text: text string, or UTF-8 encoded bytes.
    :return: the ASCII-only remainder of ``text`` as ``str``.
    """
    # Decode byte strings explicitly.  The old ``unicode(text, 'utf-8')`` shim
    # was skipped on Python 3 (NameError), which left bytes input to crash in
    # unicodedata.normalize().
    if isinstance(text, bytes):
        text = text.decode('utf-8')
    text = unicodedata.normalize('NFD', text)
    ascii_only = text.encode('ascii', 'ignore')
    return str(ascii_only.decode('ascii'))
示例10: _run_strip_accents
# 需要导入模块: import unicodedata [as 别名]
# 或者: from unicodedata import category [as 别名]
def _run_strip_accents(self, text):
    """Strips accents from a piece of text.

    :param text: the string to process.
    :return: the NFD-normalized text with every character of Unicode
        category ``Mn`` (nonspacing mark) removed.
    """
    decomposed = unicodedata.normalize("NFD", text)
    # After NFD, accents are separate "Mn" characters that can be filtered out.
    return "".join(
        [ch for ch in decomposed if unicodedata.category(ch) != "Mn"])
示例11: _is_whitespace
# 需要导入模块: import unicodedata [as 别名]
# 或者: from unicodedata import category [as 别名]
def _is_whitespace(char):
    """Checks whether `char` is a whitespace character.

    :param char: a single-character string.
    :return: ``True`` for space, tab, newline, carriage return and any
        character of Unicode category ``Zs``; ``False`` otherwise.
    """
    # \t, \n, and \r are technically control characters but we treat them
    # as whitespace since they are generally considered as such.
    if char in (" ", "\t", "\n", "\r"):
        return True
    return unicodedata.category(char) == "Zs"
示例12: _is_control
# 需要导入模块: import unicodedata [as 别名]
# 或者: from unicodedata import category [as 别名]
def _is_control(char):
    """Checks whether `char` is a control character.

    :param char: a single-character string.
    :return: ``False`` for tab, newline and carriage return; otherwise
        ``True`` exactly when the Unicode category of `char` starts with ``C``.
    """
    # These are technically control characters but we count them as whitespace
    # characters.
    if char in ("\t", "\n", "\r"):
        return False
    return unicodedata.category(char).startswith("C")
示例13: _is_punctuation
# 需要导入模块: import unicodedata [as 别名]
# 或者: from unicodedata import category [as 别名]
def _is_punctuation(char):
    """Checks whether `char` is a punctuation character.

    :param char: a single-character string (``ord`` is applied to it).
    :return: ``True`` if the code point lies in one of the ASCII symbol
        ranges below or the Unicode category starts with ``P``; ``False``
        otherwise.
    """
    cp = ord(char)
    # We treat all non-letter/number ASCII as punctuation.
    # Characters such as "^", "$", and "`" are not in the Unicode
    # Punctuation class but we treat them as punctuation anyways, for
    # consistency.
    ascii_symbol_ranges = ((33, 47), (58, 64), (91, 96), (123, 126))
    if any(low <= cp <= high for low, high in ascii_symbol_ranges):
        return True
    return unicodedata.category(char).startswith("P")
示例14: text_normalize
# 需要导入模块: import unicodedata [as 别名]
# 或者: from unicodedata import category [as 别名]
def text_normalize(text):
    """Reduce ``text`` to lower-case characters drawn from ``hp.vocab``.

    Accents are stripped (NFD decomposition, then removal of category ``Mn``
    characters), the text is lower-cased, every character not matched by the
    ``hp.vocab`` character class becomes a space, and runs of spaces are
    collapsed to one.

    :param text: the string to normalize.
    :return: the normalized string.
    """
    decomposed = unicodedata.normalize('NFD', text)
    # Strip accents
    unaccented = ''.join(
        [char for char in decomposed if unicodedata.category(char) != 'Mn'])
    lowered = unaccented.lower()
    in_vocab = re.sub("[^{}]".format(hp.vocab), " ", lowered)
    return re.sub("[ ]+", " ", in_vocab)
示例15: text_normalize
# 需要导入模块: import unicodedata [as 别名]
# 或者: from unicodedata import category [as 别名]
def text_normalize(text):
    """Normalize ``text`` for use with the ``hp.vocab`` character set.

    Steps, in order: remove accents (drop ``Mn`` characters after NFD
    normalization), lower-case, replace anything outside the ``hp.vocab``
    character class with a space, and squeeze repeated spaces.

    :param text: the string to normalize.
    :return: the normalized string.
    """
    def is_not_accent(char):
        return unicodedata.category(char) != 'Mn'

    # Strip accents
    result = ''.join(
        c for c in unicodedata.normalize('NFD', text) if is_not_accent(c))
    result = result.lower()
    outside_vocab = "[^{}]".format(hp.vocab)
    result = re.sub(outside_vocab, " ", result)
    result = re.sub("[ ]+", " ", result)
    return result