This article collects typical usage examples of Python's unicodedata.name function from open-source projects. If you are wondering what unicodedata.name does, how to call it, or what real-world code that uses it looks like, the curated examples below should help.
The following presents 15 code examples of the name function, sorted by popularity by default.
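
As a quick orientation before the collected examples, here is a minimal sketch of the function itself (not taken from any of the projects below): unicodedata.name() returns the official Unicode name of a single character, raises ValueError for code points that have no name, and accepts an optional second argument to return as a default instead of raising. unicodedata.lookup() is the inverse mapping.

import unicodedata

# Name of a single character.
print(unicodedata.name(u'A'))     # LATIN CAPITAL LETTER A
print(unicodedata.name(u'あ'))    # HIRAGANA LETTER A

# Unnamed code points raise ValueError unless a default is supplied.
print(unicodedata.name(u'\x00', 'no name'))    # no name

# lookup() is the inverse: from a name back to the character.
print(unicodedata.lookup('HIRAGANA LETTER A'))    # あ

The examples below mostly use the name string as a cheap script classifier (checking prefixes such as 'CJK', 'HIRAGANA' or 'LATIN'), while the test-suite examples verify that name() and lookup() round-trip for CJK ideographs.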

Example 1: jatokenize

def jatokenize(content):
    ret_list = []
    lines = tagger.parse(content).split('\n')
    for line in lines:
        if line == "EOS":
            break
        line = line.split('\t')
        word = line[2]
        try:
            jtype = unicodedata.name(word[0])
        except:
            continue
        # Ignore single-character words that are not kanji,
        # e.g. 'ー' and '*'.
        if len(word) == 1 and jtype[0:4] != 'CJK ':
            continue
        # Ignore two-character hiragana words.
        if (len(word) == 2 and jtype[0:4] == 'HIRA'
                and unicodedata.name(word[1])[0:4] == 'HIRA'):
            continue
        if jtype[0:4] == 'LATI':
            continue
        if word.isdigit():
            continue
        # Keep nouns (名詞), verbs (動詞), adverbs (副詞) and adjectives (形容詞).
        if (line[3][:2] == '名詞' or line[3][:2] == '動詞'
                or line[3][:2] == '副詞' or line[3][:3] == '形容詞'):
            ofs.write("%s " % word)
            ret_list.append(word.encode('utf8'))
    ofs.write("\n")
    return ret_list

Example 2: data

def data(self, index, role):
    global UC_CAT_EXPAND, COL_ALIGNMENT, COL_TOOLTIPS
    (char, count) = self.chardata.get_tuple(index.row())
    if role == Qt.DisplayRole:  # request for actual data
        if 0 == index.column():
            return char
        elif 1 == index.column():
            return '0x{0:04x}'.format(ord(char))
        elif 2 == index.column():
            return count
        elif 3 == index.column():
            if char in C.NAMED_ENTITIES:
                return '&' + C.NAMED_ENTITIES[char] + ';'
            else:
                return '&#{0:d};'.format(ord(char))
        elif 4 == index.column():
            return UC_CAT_EXPAND[unicodedata.category(char).lower()]
        else:  # assuming column is 5, unicode name
            return unicodedata.name(char, 'no name?').title()
    elif role == Qt.TextAlignmentRole:
        return COL_ALIGNMENT[index.column()]
    elif (role == Qt.ToolTipRole) or (role == Qt.StatusTipRole):
        if index.column() < 5:
            return COL_TOOLTIPS[index.column()]
        # For column 5, the tooltip is the name string, because a narrow
        # column may not expose the entire name any other way.
        return unicodedata.name(char, 'no name?').title()
    # Sorry, we don't support other roles
    return None

Example 3: codepoint_simple

def codepoint_simple(arg):
    arg = arg.upper()
    r_label = re.compile('\\b' + arg.replace(' ', '.*\\b') + '\\b')
    results = []
    for cp in xrange(0xFFFF):
        u = unichr(cp)
        try:
            name = unicodedata.name(u)
        except ValueError:
            continue
        if r_label.search(name):
            results.append((len(name), u, cp, name))
    if not results:
        r_label = re.compile('\\b' + arg.replace(' ', '.*\\b'))
        for cp in xrange(0xFFFF):
            u = unichr(cp)
            try:
                name = unicodedata.name(u)
            except ValueError:
                continue
            if r_label.search(name):
                results.append((len(name), u, cp, name))
    if not results:
        return None
    length, u, cp, name = sorted(results)[0]
    return about(u, cp, name)

Example 4: _do_write

def _do_write(fname, variable, version, date, table):
    print("writing {} ..".format(fname))
    import unicodedata
    import datetime
    import string
    utc_now = datetime.datetime.now(tz=datetime.timezone.utc)
    INDENT = 4
    with open(fname, 'w') as fp:
        fp.write("# Generated: {iso_utc}\n"
                 "# Source: {version}\n"
                 "# Date: {date}\n"
                 "{variable} = (".format(iso_utc=utc_now.isoformat(),
                                         version=version,
                                         date=date,
                                         variable=variable))
        for start, end in table:
            ucs_start, ucs_end = unichr(start), unichr(end)
            hex_start, hex_end = ('0x{0:04x}'.format(start),
                                  '0x{0:04x}'.format(end))
            try:
                name_start = string.capwords(unicodedata.name(ucs_start))
            except ValueError:
                name_start = u''
            try:
                name_end = string.capwords(unicodedata.name(ucs_end))
            except ValueError:
                name_end = u''
            fp.write('\n' + (' ' * INDENT))
            fp.write('({0}, {1},),'.format(hex_start, hex_end))
            fp.write(' # {0:24s}..{1}'.format(
                name_start[:24].rstrip() or '(nil)',
                name_end[:24].rstrip()))
        fp.write('\n)\n')
    print("complete.")

Example 5: test_cjk

def test_cjk(self):
    import sys
    import unicodedata
    cases = ((0x3400, 0x4DB5),
             (0x4E00, 0x9FA5))
    if unicodedata.unidata_version >= "4.1":
        cases = ((0x3400, 0x4DB5),
                 (0x4E00, 0x9FBB),
                 (0x20000, 0x2A6D6))
    for first, last in cases:
        # Test at and inside the boundary
        for i in (first, first + 1, last - 1, last):
            charname = 'CJK UNIFIED IDEOGRAPH-%X' % i
            char = ('\\U%08X' % i).decode('unicode-escape')
            assert unicodedata.name(char) == charname
            assert unicodedata.lookup(charname) == char
        # Test outside the boundary
        for i in first - 1, last + 1:
            charname = 'CJK UNIFIED IDEOGRAPH-%X' % i
            char = ('\\U%08X' % i).decode('unicode-escape')
            try:
                unicodedata.name(char)
            except ValueError, e:
                assert e.message == 'no such name'
            raises(KeyError, unicodedata.lookup, charname)

Example 6: extractKeyword

def extractKeyword(text, word_class=["名詞", "形容詞"]):
    tmp = splitTag(text)  # extract hashtags first
    text = tmp[0]
    keywords = tmp[1]
    tagger = MeCab.Tagger('-Ochasen')
    node = tagger.parseToNode(text.encode('utf-8'))
    while node:
        try:
            if node.feature.split(',')[0] in word_class:
                #print node.surface
                # First character of the word; below, only digits, hiragana,
                # katakana, kanji and Latin letters are added to keywords.
                uniname = node.surface.decode('utf-8')[0]
                if (unicodedata.name(uniname)[0:8] == "HIRAGANA"
                        or unicodedata.name(uniname)[0:8] == "KATAKANA"
                        or unicodedata.name(uniname)[0:18] == "HALFWIDTH KATAKANA"
                        or unicodedata.name(uniname)[0:3] == "CJK"
                        or unicodedata.name(uniname)[0:5] == "LATIN"
                        or unicodedata.name(uniname)[0:5] == "DIGIT"):
                    term = node.surface.replace('*','*')
                    term = term.replace('"','”')
                    term = term.replace("'","’")
                    keywords.append(term.decode('utf-8'))
                    #print node.surface.decode('utf-8')
        except Exception as e:
            print "-"*10
            print "エラー(MeCab)"  # "MeCab error"
            print node.surface
            print str(type(e))
            print str(e.args)
            print e.message
            print str(e)
            print "-"*10
        node = node.next
    return keywords

Example 7: test_cjk

def test_cjk(self):
    import sys
    import unicodedata
    cases = ((0x3400, 0x4DB5), (0x4E00, 0x9FA5))
    if unicodedata.unidata_version >= "5":    # don't know the exact limit
        cases = ((0x3400, 0x4DB5), (0x4E00, 0x9FCB), (0x20000, 0x2A6D6), (0x2A700, 0x2B734))
    elif unicodedata.unidata_version >= "4.1":
        cases = ((0x3400, 0x4DB5), (0x4E00, 0x9FBB), (0x20000, 0x2A6D6))
    for first, last in cases:
        # Test at and inside the boundary
        for i in (first, first + 1, last - 1, last):
            charname = "CJK UNIFIED IDEOGRAPH-%X" % i
            char = ("\\U%08X" % i).decode("unicode-escape")
            assert unicodedata.name(char) == charname
            assert unicodedata.lookup(charname) == char
        # Test outside the boundary
        for i in first - 1, last + 1:
            charname = "CJK UNIFIED IDEOGRAPH-%X" % i
            char = ("\\U%08X" % i).decode("unicode-escape")
            try:
                unicodedata.name(char)
            except ValueError, e:
                assert e.message == "no such name"
            raises(KeyError, unicodedata.lookup, charname)

Example 8: clean_Ustring_fromU

def clean_Ustring_fromU(string):
    from unicodedata import name, normalize
    gClean = ''
    for ch in u''.join(string.decode('utf-8', 'ignore')):
        try:
            if name(ch).startswith('LATIN') or name(ch) == 'SPACE':
                gClean = gClean + ch
            else:  # Replace non-Latin characters with spaces.
                gClean = gClean + ' '
        except ValueError:  # The character has no name in the Unicode database.
            gClean = gClean + ' '
    try:  # Try progressively shorter slices for badly formed input documents.
        normalized_string = normalize('NFKC', gClean.lower())
    except TypeError:
        #sys.stderr.write('Bad formed string at the first attempt\n')
        try:
            range_error = 999
            normalized_string = normalize('NFKC', gClean[0:range_error].lower())  # Write up to 999 characters if available.
        except TypeError:
            #sys.stderr.write('\nThe wrong string at the second attempt: before %s words' % range_error)
            try:
                range_error = 99
                normalized_string = normalize('NFKC', gClean[0:range_error].lower())
            except TypeError:
                #sys.stderr.write('\nThe wrong string at the third attempt: before %s words' % range_error)
                try:
                    range_error = 49
                    normalized_string = normalize('NFKC', gClean[0:range_error].lower())
                except TypeError:
                    #sys.stderr.write('\nIt was not possible forming output file after three attempts. Fatally bad file')
                    normalized_string = '# Fatally bad File\n'
                    pass
    return normalized_string.split()  # Return the normalized document as a list of tokens.

Example 9: get_unicode_str

def get_unicode_str(size=10, max_char=0xFFFF, onlyNormalized=False, includeUnexisting=False):
    '''
    Generates a Unicode file name that is valid for the current OS.
    Notice: if includeUnexisting==True, it is possible that files don't get synchronized.
    '''
    if platform.system() == "Windows":
        # Exclude characters 1 through 31, as well as quote ("), less than (<),
        # greater than (>), pipe (|), backspace (\b), null (\0) and tab (\t).
        exclude = string.punctuation + u"\t" + u''.join([unichr(x) for x in range(0, 32)])
    else:
        # I guess it mainly depends on fs type
        #exclude = u"/" + u"." + u''.join([unichr(x) for x in range(0, 1)])
        exclude = u"/" + u"." + u''.join([unichr(x) for x in range(0, 32)])
    name = u""
    while len(name) < size:
        c = unichr(random.randint(0, max_char))
        if c not in exclude:
            try:
                if not includeUnexisting:
                    unicodedata.name(c)  # an unassigned code point raises ValueError here
                if onlyNormalized:
                    name = name + unicodedata.normalize('NFC', c)  # only normalized chars
                else:
                    name = name + c
            except ValueError:
                pass
    return name

Example 10: format

def format(self, stream, args):
    char = unicode(args.next())
    if len(char) != 1:
        raise TypeError("expected single character")
    if self.atsign:
        if char in python_escapes:
            stream.write('"\\%s"' % python_escapes[char])
        else:
            try:
                stream.write('u"\\N{%s}"' % unicodedata.name(char))
            except ValueError:
                stream.write(repr(char))
    else:
        if unicodedata.category(char).startswith("C"):
            try:
                stream.write(unicodedata.name(char))
            except ValueError:
                code = ord(char)
                if code in ascii_control_chars:
                    i = 1 if self.colon else 0
                    stream.write(ascii_control_chars[code][i])
                else:
                    raise FormatError("unprintable character")
        else:
            stream.write(char)

Example 11: showdict

def showdict(data, indent):
    first = True
    for key in sorted(data.keys()):
        value = data[key]
        if first:
            first = False
        else:
            print
        print " " * max(indent, 0) + "(" + key,
        # Sneaky trick: we don't want to go newline-indent over and
        # over for long sequences, i.e. cases where there is only
        # one possible follower. So we skip the newlines in those
        # cases, and tell the next-lower iteration not to do the whole
        # indent thing by passing a negative indent. We don't just
        # pass 0 or 1 because if another iteration *further down*
        # turns out not to be an only case, it will need to know
        # the right indent to pass along. So in a case like
        # R-O-{CK|LL}, the O is unique after the R, so no linefeed,
        # but then the {C|L} are not unique after the O.
        if type(value) == dict:
            if len(value) > 1:
                print ""
                showdict(value, abs(indent) + 4),
            else:
                showdict(value, -(abs(indent) + 4)),
        else:
            print " " + value.encode('utf-8'),
            if "-n" in sys.argv:
                try:
                    print unicodedata.name(value),
                except:
                    pass
        print ")",

Example 12: report_code_points

def report_code_points(char_class, code_point_list, text=''):
    '''Report all code points which have been added to or removed from a
    character class.
    '''
    for code_point in sorted(code_point_list):
        if type(code_point) == type(int()):
            print('%(char_class)s: %(text)s: %(char)s %(code_point)s %(name)s'
                  %{'text': text,
                    'char': chr(code_point),
                    'char_class': char_class,
                    'code_point': hex(code_point),
                    'name': unicodedata.name(chr(code_point), 'name unknown')})
        else:
            print(('%(char_class)s: %(text)s: '
                   + '%(char0)s → %(char1)s '
                   + '%(code_point0)s → %(code_point1)s '
                   + '%(name0)s → %(name1)s') %{
                       'text': text,
                       'char_class': char_class,
                       'char0': chr(code_point[0]),
                       'code_point0': hex(code_point[0]),
                       'name0': unicodedata.name(chr(code_point[0]), 'name unknown'),
                       'char1': chr(code_point[1]),
                       'code_point1': hex(code_point[1]),
                       'name1': unicodedata.name(chr(code_point[1]), 'name unknown')
                   })

Example 13: test_cjk

def test_cjk(self):
    import sys
    if sys.maxunicode < 0x10ffff:
        skip("requires a 'wide' python build.")
    import unicodedata
    cases = ((0x3400, 0x4DB5),
             (0x4E00, 0x9FA5))
    if unicodedata.unidata_version >= "4.1":
        cases = ((0x3400, 0x4DB5),
                 (0x4E00, 0x9FBB),
                 (0x20000, 0x2A6D6))
    for first, last in cases:
        # Test at and inside the boundary
        for i in (first, first + 1, last - 1, last):
            charname = 'CJK UNIFIED IDEOGRAPH-%X' % i
            assert unicodedata.name(unichr(i)) == charname
            assert unicodedata.lookup(charname) == unichr(i)
        # Test outside the boundary
        for i in first - 1, last + 1:
            charname = 'CJK UNIFIED IDEOGRAPH-%X' % i
            try:
                unicodedata.name(unichr(i))
            except ValueError:
                pass
            raises(KeyError, unicodedata.lookup, charname)

Example 14: safe_path

def safe_path(origtitle):
    title = safe_path_component(ftfy(origtitle))
    if len(title) == 0:
        title = origtitle = u'_'
    if title.startswith(u'-') or title.startswith(u'.'):
        title = u'_' + title
    try:
        charname = safe_path_component(unicodedata.name(origtitle[0]))
    except ValueError:
        charname = u'UNKNOWN'
    category = charname.split('_')[0]
    # some ridiculous stuff to give every article a unique name that can be
    # stored on multiple file systems and tab-completed
    if len(origtitle) == 1:
        pieces = [u'single_character', category, charname + '.json']
    else:
        try:
            charname2 = safe_path_component(unicodedata.name(origtitle[1]))
        except ValueError:
            charname2 = u'UNKNOWN'
        text_to_encode = unicodedata.normalize("NFKD", safe_path_component(title[:64]))
        finalpart = text_to_encode.encode('punycode').rstrip('-')
        pieces = [charname, charname2, finalpart + '.json']
    path = u'/'.join(pieces)
    return path

Example 15: main

def main():
    # get files
    files = []
    for i in range(1, 29):
        if i < 26:
            files.append("db/Minna_no_nihongo_1.%02d.txt" % i)
        else:
            files.append("db/Minna_no_nihongo_2.%02d.txt" % i)
    # get words from files
    words = get_words_from_files(files)
    # add words to network
    G = nx.Graph()
    for w in words:
        G.add_node(w)
        G.node[w]['furigana'] = words[w]['furigana']
        G.node[w]['meaning'] = words[w]['meaning']
        G.node[w]['chapter'] = words[w]['chapter']
    # to make statistics
    nbins, dmin, dmax = 20, 0, 1
    hist, edges = np.histogram([0], bins=nbins, range=(dmin, dmax))
    # adding edges
    words = G.nodes()
    print("Total number of words: ", len(words))
    for word1, word2 in itertools.combinations(words, 2):
        for w1 in word1:
            for w2 in word2:
                if "CJK UNIFIED" in ud.name(w1) and "CJK UNIFIED" in ud.name(w2):
                    f1, f2 = fingerprint[w1], fingerprint[w2]
                    match = SequenceMatcher(None, f1, f2, autojunk=True)
                    ratio = match.ratio()
                    # add data to histogram
                    new_hist, edges = np.histogram(ratio, bins=nbins, range=(dmin, dmax))
                    hist += new_hist
                    if ratio > 0.8:
                        # G.add_edge(word1, word2, weight=5*ratio-4) # 0.8 - 1 --> 0 - 1
                        G.add_edge(word1, word2, weight=4*ratio-3.2)  # 0.8 - 1 --> 0 - 0.8
                        break
    # plot data
    score = 0.5*(edges[1:] + edges[:-1])
    plt.plot(score, hist)
    plt.xlabel("score")
    plt.ylabel("histogram")
    plt.show()
    G = sorted(nx.connected_component_subgraphs(G), key=len, reverse=True)
    print("Total number of words connected: ", len(G[0].nodes()))
    nx.write_graphml(G[0], "kanjis.graphml", encoding='utf-8', prettyprint=True)