當前位置: 首頁>>代碼示例>>Python>>正文


Python unicodedata.category方法代碼示例

本文整理匯總了Python中unicodedata.category方法的典型用法代碼示例。如果您正苦於以下問題:Python unicodedata.category方法的具體用法?Python unicodedata.category怎麽用?Python unicodedata.category使用的例子?那麽,這裏精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在unicodedata的用法示例。


在下文中一共展示了unicodedata.category方法的15個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於係統推薦出更棒的Python代碼示例。

示例1: _tokenize

# 需要導入模塊: import unicodedata [as 別名]
# 或者: from unicodedata import category [as 別名]
def _tokenize(self, text):
        """Split raw text into word-piece tokens.

        For an uncased tokenizer the text is first accent-stripped and
        lower-cased; punctuation and CJK characters are space-padded,
        control characters dropped, and each resulting whitespace-separated
        word is run through the word-piece tokenizer.
        """
        if not self._cased:
            # NFD decomposition exposes combining marks (category "Mn")
            # so accents can be removed before lower-casing.
            decomposed = unicodedata.normalize('NFD', text)
            text = ''.join(c for c in decomposed
                           if unicodedata.category(c) != 'Mn').lower()
        pieces = []
        for ch in text:
            if self._is_punctuation(ch) or self._is_cjk_character(ch):
                # Surround with spaces so each becomes its own word.
                pieces.append(' ' + ch + ' ')
            elif self._is_space(ch):
                pieces.append(' ')
            elif ord(ch) in (0, 0xfffd) or self._is_control(ch):
                # NUL, replacement char and control chars are discarded.
                continue
            else:
                pieces.append(ch)
        tokens = []
        for word in ''.join(pieces).strip().split():
            tokens.extend(self._word_piece_tokenize(word))
        return tokens
開發者ID:CyberZHG,項目名稱:keras-bert,代碼行數:21,代碼來源:tokenizer.py

示例2: normalizestr

# 需要導入模塊: import unicodedata [as 別名]
# 或者: from unicodedata import category [as 別名]
def normalizestr(string):
    """ Converts special characters like copyright,
        trademark signs to ascii name """
    original = string
    # Substitute known unicode marks with their ASCII spellings first.
    for mark, ascii_repl in unicode_marks(string):
        string = string.replace(mark, ascii_repl)

    # NFKC-normalize, trim, then transliterate the remainder to ASCII.
    normalized = unicodedata.normalize('NFKC', string).strip()
    result = unidecode(normalized)
    if result != original:
        print("Fixed string: '{}'".format(result))
    return result
開發者ID:googlefonts,項目名稱:gftools,代碼行數:22,代碼來源:gftools-fix-ascii-fontmetadata.py

示例3: deaccent

# 需要導入模塊: import unicodedata [as 別名]
# 或者: from unicodedata import category [as 別名]
def deaccent(text):
    """
    Remove accentuation from the given string. Input text is either a unicode string or utf8 encoded bytestring.

    Return input string with accents removed, as unicode.

    >>> deaccent("Šéf chomutovských komunistů dostal poštou bílý prášek")
    u'Sef chomutovskych komunistu dostal postou bily prasek'

    """
    if not isinstance(text, unicode):
        # Byte strings are assumed to be UTF-8 (strict error handling).
        text = text.decode('utf8')
    # Decompose, drop combining marks (category "Mn"), then recompose.
    decomposed = unicodedata.normalize("NFD", text)
    stripped = (ch for ch in decomposed if unicodedata.category(ch) != 'Mn')
    return unicodedata.normalize("NFC", u('').join(stripped))
開發者ID:loretoparisi,項目名稱:word2vec-twitter,代碼行數:18,代碼來源:word2vecReaderUtils.py

示例4: _is_punctuation

# 需要導入模塊: import unicodedata [as 別名]
# 或者: from unicodedata import category [as 別名]
def _is_punctuation(char):
    """Checks whether `chars` is a punctuation character."""
    cp = ord(char)
    # We treat all non-letter/number ASCII as punctuation.
    # Characters such as "^", "$", and "`" are not in the Unicode
    # Punctuation class but we treat them as punctuation anyways, for
    # consistency.
    if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
            (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
        return True
    cat = unicodedata.category(char)
    if cat.startswith("P"):
        return True
    return False

### modeling.py 
開發者ID:CoNLL-UD-2018,項目名稱:UDPipe-Future,代碼行數:18,代碼來源:bert_wrapper.py

示例5: _run_strip_accents

# 需要導入模塊: import unicodedata [as 別名]
# 或者: from unicodedata import category [as 別名]
def _run_strip_accents(self, text):
        """Strips accents from a piece of text."""
        # 這個函數去除掉text中的非間距字符

        # 標準化對於任何需要以一致的方式處理Unicode文本的程序都是非常重要的。
        # 當處理來自用戶輸入的字符串而你很難去控製編碼的時候尤其如此。
        # normalize() 將文本標準化,第一個參數指定字符串標準化的方式,NFD表示字符應該分解為多個組合字符表示
        text = unicodedata.normalize("NFD", text)
        output = []
        for char in text:
            # category() 返回字符在UNICODE裏分類的類型
            cat = unicodedata.category(char)
            if cat == "Mn":
                #  Mark, Nonspacing 指示字符是非間距字符,這指示基字符的修改。
                # https://www.fileformat.info/info/unicode/category/Mn/list.htm
                continue
            output.append(char)
        return "".join(output) 
開發者ID:eva-n27,項目名稱:BERT-for-Chinese-Question-Answering,代碼行數:20,代碼來源:tokenization.py

示例6: get_hgnc_id_from_symbol

# 需要導入模塊: import unicodedata [as 別名]
# 或者: from unicodedata import category [as 別名]
def get_hgnc_id_from_symbol(gene_symbol):
        """
        Get HGNC curie from symbol using monarch and mygene services
        :param gene_symbol: gene symbol string to look up (e.g. "BRCA1")
        :return: the id of the top-ranked Solr document, or None when
            nothing is found or the service cannot be reached
        """
        monarch_url = 'https://solr.monarchinitiative.org/solr/search/select'
        params = DipperUtil._get_solr_weight_settings()
        # Query for the symbol both bare and quoted (exact phrase).
        params["q"] = "{0} \"{0}\"".format(gene_symbol)
        # Restrict results to human (NCBITaxon:9606) gene documents.
        params["fq"] = ["taxon:\"NCBITaxon:9606\"", "category:\"gene\""]
        gene_id = None
        try:
            monarch_request = requests.get(monarch_url, params=params)
            response = monarch_request.json()
            count = response['response']['numFound']
            if count > 0:
                # Take the first (top-ranked) document's id as the curie.
                gene_id = response['response']['docs'][0]['id']
        except requests.ConnectionError:
            # Best-effort lookup: report the failure and return None.
            print("error fetching {0}".format(monarch_url))

        return gene_id
開發者ID:monarch-initiative,項目名稱:dipper,代碼行數:23,代碼來源:DipperUtil.py

示例7: _run_strip_accents

# 需要導入模塊: import unicodedata [as 別名]
# 或者: from unicodedata import category [as 別名]
def _run_strip_accents(self, text):
        """Strips accents from a piece of text."""

        korean = "%s-%s%s-%s" % (chr(0xac00), chr(0xd7a3),
                                chr(0x3131), chr(0x3163))
        if re.search("[%s]+" % korean, text):
            return "".join(
                substr if re.search("^[%s]+$" % korean, substr)
                else self._run_strip_accents(substr)
                for substr in re.findall("[%s]+|[^%s]+" % (korean, korean), text)
            )

        text = unicodedata.normalize("NFD", text)
        output = []
        for char in text:
            cat = unicodedata.category(char)
            if cat == "Mn":
                continue
            output.append(char)
        return "".join(output) 
開發者ID:SanghunYun,項目名稱:UDA_pytorch,代碼行數:22,代碼來源:tokenization.py

示例8: encode_field

# 需要導入模塊: import unicodedata [as 別名]
# 或者: from unicodedata import category [as 別名]
def encode_field(s):
  """Encode a single field, quoting it only when necessary."""
  # The empty string must be rendered as an explicit quoted pair.
  if s == "":
    return '""  '
  # Quoting is needed when the field contains a quote, a space, or any
  # Unicode control character (general category "C*").
  needs_quoting = ('"' in s or ' ' in s or
                   any(unicodedata.category(c)[0] == "C" for c in s))
  if not needs_quoting:
    return s
  # Build the quoted form, substituting known escape sequences.
  out = ['"']
  for c in s:
    out.append(ESCAPE_SEQUENCES2.get(c, c))
  out.append('"')
  return "".join(out)
開發者ID:sustrik,項目名稱:uxy,代碼行數:25,代碼來源:base.py

示例9: remove_accents

# 需要導入模塊: import unicodedata [as 別名]
# 或者: from unicodedata import category [as 別名]
def remove_accents(self, text):
	"""Normalise unicode data to plain ASCII, removing umlauts, accents etc.

	Rule 10 - When special characters appear as part of the name, paternal
	surname and maternal surname, they must be excluded for the calculation
	of the homonym and the verification digit. The characters will be
	interpreted, yes and only if, they are individually within the name,
	paternal surname and maternal surname. Examples:

	Roberto O'farril Carballo OACR-661121
	Ruben D'angelo Fargo DAFR-710108
	Luz Ma. Fernandez Juarez FEJL-830120
	"""
	try:
		# Python 2: decode byte strings to unicode before normalizing.
		text = unicode(text, 'utf-8')
	except (TypeError, NameError):
		# Python 3 has no `unicode` builtin (or input is already text).
		pass
	# Decompose, drop everything that cannot be encoded as ASCII, and
	# come back to a text string.
	decomposed = unicodedata.normalize('NFD', text)
	ascii_only = decomposed.encode('ascii', 'ignore').decode("utf-8")
	return str(ascii_only)
開發者ID:thomgonzalez,項目名稱:pyfiscal,代碼行數:23,代碼來源:base.py

示例10: _run_strip_accents

# 需要導入模塊: import unicodedata [as 別名]
# 或者: from unicodedata import category [as 別名]
def _run_strip_accents(self, text):
        """Strips accents from a piece of text."""
        text = unicodedata.normalize("NFD", text)
        output = []
        for char in text:
            cat = unicodedata.category(char)
            if cat == "Mn":
                continue
            output.append(char)
        return "".join(output) 
開發者ID:Socialbird-AILab,項目名稱:BERT-Classification-Tutorial,代碼行數:12,代碼來源:tokenization.py

示例11: _is_whitespace

# 需要導入模塊: import unicodedata [as 別名]
# 或者: from unicodedata import category [as 別名]
def _is_whitespace(char):
    """Checks whether `chars` is a whitespace character."""
    # \t, \n, and \r are technically contorl characters but we treat them
    # as whitespace since they are generally considered as such.
    if char == " " or char == "\t" or char == "\n" or char == "\r":
        return True
    cat = unicodedata.category(char)
    if cat == "Zs":
        return True
    return False 
開發者ID:Socialbird-AILab,項目名稱:BERT-Classification-Tutorial,代碼行數:12,代碼來源:tokenization.py

示例12: _is_control

# 需要導入模塊: import unicodedata [as 別名]
# 或者: from unicodedata import category [as 別名]
def _is_control(char):
    """Checks whether `chars` is a control character."""
    # These are technically control characters but we count them as whitespace
    # characters.
    if char == "\t" or char == "\n" or char == "\r":
        return False
    cat = unicodedata.category(char)
    if cat.startswith("C"):
        return True
    return False 
開發者ID:Socialbird-AILab,項目名稱:BERT-Classification-Tutorial,代碼行數:12,代碼來源:tokenization.py

示例13: _is_punctuation

# 需要導入模塊: import unicodedata [as 別名]
# 或者: from unicodedata import category [as 別名]
def _is_punctuation(char):
    """Checks whether `chars` is a punctuation character."""
    cp = ord(char)
    # We treat all non-letter/number ASCII as punctuation.
    # Characters such as "^", "$", and "`" are not in the Unicode
    # Punctuation class but we treat them as punctuation anyways, for
    # consistency.
    if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
            (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
        return True
    cat = unicodedata.category(char)
    if cat.startswith("P"):
        return True
    return False 
開發者ID:Socialbird-AILab,項目名稱:BERT-Classification-Tutorial,代碼行數:16,代碼來源:tokenization.py

示例14: text_normalize

# 需要導入模塊: import unicodedata [as 別名]
# 或者: from unicodedata import category [as 別名]
def text_normalize(text):
    # Strip accents: NFD-decompose, then drop nonspacing marks ("Mn").
    stripped = (ch for ch in unicodedata.normalize('NFD', text)
                if unicodedata.category(ch) != 'Mn')
    text = ''.join(stripped).lower()
    # Replace anything outside the vocabulary with spaces, then collapse
    # runs of spaces into a single one.
    text = re.sub("[^{}]".format(hp.vocab), " ", text)
    return re.sub("[ ]+", " ", text)
開發者ID:Kyubyong,項目名稱:dc_tts,代碼行數:10,代碼來源:data_load.py

示例15: text_normalize

# 需要導入模塊: import unicodedata [as 別名]
# 或者: from unicodedata import category [as 別名]
def text_normalize(text):
    # Remove accents by dropping combining marks from the NFD form.
    chars = [c for c in unicodedata.normalize('NFD', text)
             if unicodedata.category(c) != 'Mn']
    text = ''.join(chars).lower()
    # Out-of-vocabulary characters become spaces; repeats are squeezed.
    text = re.sub("[^{}]".format(hp.vocab), " ", text)
    text = re.sub("[ ]+", " ", text)
    return text
開發者ID:KinglittleQ,項目名稱:GST-Tacotron,代碼行數:10,代碼來源:Data.py


注:本文中的unicodedata.category方法示例由純淨天空整理自Github/MSDocs等開源代碼及文檔管理平台,相關代碼片段篩選自各路編程大神貢獻的開源項目,源碼版權歸原作者所有,傳播和使用請參考對應項目的License;未經允許,請勿轉載。