本文整理汇总了Python中unicodedata.category函数的典型用法代码示例。如果您正苦于以下问题:Python category函数的具体用法?Python category怎么用?Python category使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了category函数的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: splitText
def splitText(text):
    """Split *text* into segments no longer than MAX_SEGMENT_SIZE.

    Prefers to cut at punctuation, then at whitespace, then at any
    non-alphanumeric character, and as a last resort cuts mid-word at
    the size limit. Returns the list of segments.
    """
    # Split-point predicates, tried in order of preference.
    # https://en.wikipedia.org/wiki/Unicode_character_property#General_Category
    predicates = (
        # opening/closing/initial/final/other punctuation
        lambda ch: unicodedata.category(ch) in ("Ps", "Pe", "Pi", "Pf", "Po"),
        # any separator (space, line, paragraph)
        lambda ch: unicodedata.category(ch).startswith("Z"),
        # anything that is not a letter or a number
        lambda ch: unicodedata.category(ch)[0] not in ("L", "N"),
    )
    pieces = []
    pending = __class__.cleanSpaces(text)
    while len(pending) > __class__.MAX_SEGMENT_SIZE:
        window = pending[:__class__.MAX_SEGMENT_SIZE]
        cut = None
        for pred in predicates:
            cut = __class__.findLastCharIndexMatching(window, pred)
            if cut is not None:
                break
        if cut is None:
            # no natural break found: cut at the last character
            cut = __class__.MAX_SEGMENT_SIZE - 1
        pieces.append(window[:cut + 1].rstrip())
        pending = pending[cut + 1:].lstrip(string.whitespace + string.punctuation)
    if pending:
        pieces.append(pending)
    return pieces
示例2: TokenOffsets
def TokenOffsets(string: str):
    """
    Yield the offsets of all Unicode category borders in the *string*,
    including the initial 0 and the final offset value of ``len(string)``.
    Capitalized words special case: a single upper case letter ('Lu')
    followed by lower case letters ('Ll') is treated as a single token.
    """
    if not string:
        # None or empty input: yield nothing at all.
        return
    yield 0
    prev = category(string[0])
    for idx in range(1, len(string)):
        cur = category(string[idx])
        if cur != prev:
            # Join capitalized tokens: a lone 'Lu' directly followed by
            # 'Ll' is not a border (the char before it must not be 'Lu').
            capitalized = (
                prev == 'Lu'
                and cur == 'Ll'
                and (idx == 1 or category(string[idx - 2]) != 'Lu')
            )
            if not capitalized:
                yield idx
        prev = cur
    yield len(string)
示例3: ranking
def ranking(self):
    """
    For each result, removes stopwords, ranks the word, augments the query
    and returns True if successful else False
    """
    # NOTE: Python 2 code (print statements).
    # NOTE(review): the closing print repeats the opening message —
    # possibly meant to say the indexing finished; confirm with author.
    print "Indexing results ...."
    for i in range(len(self.results)):
        result = self.results[i]
        title = result[0]
        summary = result[1]
        # Remove punctuation and create lists of words: drop every char
        # whose Unicode category starts with 'P' (all punctuation classes).
        titleWords = "".join(c for c in title if not unicodedata.category(c).startswith('P')).split()
        summaryWords = "".join(c for c in summary if not unicodedata.category(c).startswith('P')).split()
        for tw in titleWords:
            # Stopwords are skipped entirely.
            if tw.lower() in self.stopWords:
                continue
            # applyRanking(result_index, word, is_title_word, feedback_was_relevant)
            if self.user_feedback[i] == 'y':
                self.applyRanking(i, tw, True, True)
            else:
                self.applyRanking(i, tw, True, False)
        for sw in summaryWords:
            if sw.lower() in self.stopWords:
                continue
            if self.user_feedback[i] == 'y':
                self.applyRanking(i, sw, False, True)
            else:
                self.applyRanking(i, sw, False, False)
    print "Indexing results ...."
    return self.augmentQuery()
示例4: normalize_roman
def normalize_roman(string, additional=None):
    """Removes diacritics from the string and converts to lowercase.

    :param string: text to normalize.
    :param additional: optional mapping of extra single-character
        replacements; characters that appear as keys are substituted by
        their mapped value, characters that appear as values are kept
        as-is, and everything else is normalized recursively.

    >>> normalize_roman(u'Eèé')
    u'eee'
    """
    if additional:
        # list(...) keeps this working on Python 3, where dict views
        # cannot be concatenated with `+`.
        safe = list(additional.keys()) + list(additional.values())
        def gen():
            for c in string:
                if c not in safe:
                    yield normalize_roman(c)
                elif c in additional:
                    yield additional[c]
                else:
                    # c is a mapped-to value: pass it through untouched.
                    yield c
        return ''.join(gen())
    else:
        chars = []
        for c in string:
            if unicodedata.category(c) == 'Lo':
                # 'Lo' (other letters, e.g. CJK) carry no combining marks
                # worth stripping; keep them verbatim.
                chars.append(c)
            else:
                # Decompose (NFD), then drop combining marks ('Mn').
                nor = unicodedata.normalize('NFD', c)
                chars.extend(x for x in nor if unicodedata.category(x) != 'Mn')
        return ''.join(chars).lower()
示例5: test_exclude_characters_of_specific_groups
def test_exclude_characters_of_specific_groups():
    """Blacklisted categories are never generated, the rest still are."""
    strategy = characters(blacklist_categories=('Lu', 'Nd'))
    # Characters outside each blacklisted category are still reachable...
    for banned in ('Lu', 'Nd'):
        find(strategy, lambda c, banned=banned: unicodedata.category(c) != banned)
    # ...but no example may fall into a blacklisted category.
    assert_no_examples(strategy, lambda c: unicodedata.category(c) in ('Lu', 'Nd'))
示例6: consolidate_ampers
def consolidate_ampers(text: str) -> str:
    """Converts all ampersands in a text to a single one (&).

    :param text: A string which should have ampersands converted.
    :return: The text string after all ampersands have been replaced.
    """
    chosen_amper_value = "\u0026"
    # Scanning every Unicode codepoint and calling unicodedata.name() is
    # expensive, so the table of ampersand look-alikes is computed once and
    # memoized on the function object for all subsequent calls.
    amper_values = getattr(consolidate_ampers, "_amper_values", None)
    if amper_values is None:
        amper_values = dict.fromkeys(
            [chr(i) for i in range(sys.maxunicode)
             # Filter to punctuation/symbol categories first: unnamed
             # control/unassigned chars would make unicodedata.name()
             # raise ValueError.
             if (unicodedata.category(chr(i)).startswith('P')
                 or unicodedata.category(chr(i)).startswith('S'))
             and re.search(
                 r" ampersand|ampersand ", unicodedata.name(chr(i)),
                 re.IGNORECASE) is not None
             and chr(i) != chosen_amper_value]
        )
        consolidate_ampers._amper_values = amper_values
    # Change all ampersands to one type of ampersand
    for value in amper_values:
        text = text.replace(value, chosen_amper_value)
    return text
示例7: test_exclude_characters_of_specific_groups
def test_exclude_characters_of_specific_groups():
    """Blacklisted categories are never generated, the rest still are."""
    strategy = characters(blacklist_categories=("Lu", "Nd"))
    # Characters outside each blacklisted category are still reachable...
    for banned in ("Lu", "Nd"):
        find_any(strategy, lambda c, banned=banned: unicodedata.category(c) != banned)
    # ...but no example may fall into a blacklisted category.
    assert_no_examples(strategy, lambda c: unicodedata.category(c) in ("Lu", "Nd"))
示例8: tokens
def tokens(source):
    """Tokenize *source* into a stream of token objects.

    Emits NewlineToken for each newline character, WhitespaceToken for
    runs of control/separator chars, WordToken for runs of letters and
    digits, and SymbolToken for runs of one repeated other character.
    """
    def run_end(start, belongs):
        # Advance past `start` while `belongs` accepts the next char.
        end = start + 1
        while end < len(source) and belongs(source[end]):
            end += 1
        return end

    pos = 0
    while pos < len(source):
        ch = source[pos]
        major = category(ch)[0]
        if ch in NEWLINE_CHARS:
            # Newlines are emitted one at a time, never grouped.
            yield NewlineToken(source[pos])
            pos += 1
        elif major in "CZ":
            end = run_end(pos, lambda c: category(c)[0] in "CZ")
            yield WhitespaceToken(source[pos:end])
            pos = end
        elif major in "LN":
            end = run_end(pos, lambda c: category(c)[0] in "LN")
            yield WordToken(source[pos:end])
            pos = end
        else:
            # Symbols group only when the exact same char repeats.
            end = run_end(pos, lambda c: c == ch)
            yield SymbolToken(source[pos:end])
            pos = end
示例9: crear_nombre_usuario
def crear_nombre_usuario(nombre, apellidos):
    """Build a unique username from the user's name initials.

    Concatenates the initial of every word of *nombre* (first name) with
    the full first surname plus the initials of the remaining surnames,
    then appends the lowest integer suffix not already taken by an
    existing ``Gauser``.

    :param nombre: first name(s), may contain accents.
    :param apellidos: surname(s), may be empty.
    :return: the generated unique username string.
    """
    # Strip diacritics (NFD + drop combining marks), lowercase, split into words.
    nombre = ''.join(
        (c for c in unicodedata.normalize('NFD', smart_unicode(nombre)) if
         unicodedata.category(c) != 'Mn')).lower().split()
    apellidos = ''.join(
        (c for c in unicodedata.normalize('NFD', smart_unicode(apellidos)) if
         unicodedata.category(c) != 'Mn')).lower().split()
    iniciales_nombre = ''.join(parte[0] for parte in nombre)
    try:
        iniciales_apellidos = apellidos[0]
    except IndexError:
        # Users without surnames still get a username stem.
        iniciales_apellidos = 'sin'
    for ind in range(1, len(apellidos)):
        try:  # In case the user only has one surname:
            iniciales_apellidos = iniciales_apellidos + apellidos[ind][0]
        except IndexError:
            pass
    usuario = iniciales_nombre + iniciales_apellidos
    # Append the first numeric suffix that is not already in use.
    n = 1
    while True:
        username = usuario + str(n)
        try:
            Gauser.objects.get(username=username)
        except Gauser.DoesNotExist:
            return username
        n += 1
示例10: __new__
def __new__(cls,s,on_fail='die',msg=None):
    """Validate *s* against the class character/length constraints and
    create the string subclass instance; on any failure, delegate to
    ``cls.init_fail`` (behavior controlled by *on_fail*).
    """
    # Idempotent: passing an existing instance back returns it unchanged.
    if type(s) == cls: return s
    cls.arg_chk(on_fail)
    # Sanity-check the class-level character lists: single-char strings only.
    for k in cls.forbidden,cls.allowed:
        assert type(k) == list
        for ch in k: assert type(ch) == str and len(ch) == 1
    try:
        s = s.strip()
        if type(s) != str:
            # Bytes input is assumed UTF-8 encoded.
            s = s.decode('utf8')
        for ch in s:
            # Allow: (L)etter,(N)umber,(P)unctuation,(S)ymbol,(Z)space
            # Disallow: (C)ontrol,(M)combining
            # Combining characters create width formatting issues, so disallow them for now
            if unicodedata.category(ch)[0] in 'CM':
                t = { 'C':'control', 'M':'combining' }[unicodedata.category(ch)[0]]
                raise ValueError('{}: {} characters not allowed'.format(ascii(ch),t))
        me = str.__new__(cls,s)
        if cls.max_screen_width:
            # East-Asian full-width ('F') and wide ('W') chars take two columns.
            me.screen_width = len(s) + len([1 for ch in s if unicodedata.east_asian_width(ch) in ('F','W')])
            assert me.screen_width <= cls.max_screen_width,(
                'too wide (>{} screen width)'.format(cls.max_screen_width))
        else:
            assert len(s) <= cls.max_len, 'too long (>{} symbols)'.format(cls.max_len)
            assert len(s) >= cls.min_len, 'too short (<{} symbols)'.format(cls.min_len)
        # Whitelist/blacklist checks apply regardless of the width branch.
        assert not cls.allowed or set(list(s)).issubset(set(cls.allowed)),\
            'contains non-allowed symbols: {}'.format(' '.join(set(list(s)) - set(cls.allowed)))
        assert not cls.forbidden or not any(ch in s for ch in cls.forbidden),\
            "contains one of these forbidden symbols: '{}'".format("', '".join(cls.forbidden))
        return me
    except Exception as e:
        # All validation failures funnel through the class failure handler.
        return cls.init_fail(e,s)
示例11: parse
def parse(cls, string):
    """Split a version-like *string* into parts and build a ``cls``.

    '-' starts a new part, '.' starts a new component within the current
    part, and a letters/digits category switch also starts a new part.
    Numeric components are converted to int; a '.' or '-' with nothing
    after the previous separator gets an implicit "0".
    """
    from unicodedata import category
    parts = []
    prev = None
    for ch in string:
        if prev is None:
            parts.append([ch])
        elif ch == ".":
            if prev in ".-":
                # Empty component between separators counts as "0".
                parts[-1][-1] += "0"
            parts[-1].append("")
        elif ch == "-":
            if prev in ".-":
                parts[-1][-1] += "0"
            parts.append([""])
        elif prev not in ".-" and category(ch)[0] != category(prev)[0]:
            # A letters<->digits switch opens a new part.
            parts.append([ch])
        else:
            parts[-1][-1] += ch
        prev = ch
    for part in parts:
        # Convert numeric components to int, drop trailing empties.
        for idx, token in enumerate(part):
            try:
                part[idx] = int(token)
            except (ValueError, TypeError):
                pass
        while len(part) > 1 and not part[-1]:
            del part[-1]
    return cls(*map(tuple, parts))
示例12: get_match_list
def get_match_list(data, match_list, order_list=None, only_ascii=False, ignorecase=False):
    """
    Search a text string for matches against a dict mapping "ID" to a
    list of search strings:
    { "ID1" : ["String 1", "String 2", "String 3"],
      "ID2" : ["String 4", "String 5", "String 6"]
    }
    The dict must not contain the same search string under several IDs.
    The search runs in order of search-string length (longest first); when
    a string matches, it is removed from the text searched by the
    following ones, so two categories are not detected when one search
    string is a substring of another. For example, with "Idioma Español"
    and "Español": if the first appears in "Pablo sabe hablar el Idioma
    Español", it matches "Idioma Español" but not "Español", because the
    longest match takes priority.

    NOTE: Python 2 code (uses ``unicode()``).
    Returns a dynamically created class with attributes ``key`` (the
    matched ID or None) and ``index`` (its position in *order_list*, or
    None/len(order_list) when not applicable).
    """
    import unicodedata
    match_dict = dict()
    matches = []
    # Convert the text to unicode (Python 2).
    data = unicode(data, "utf8")
    # Flatten the dict to {"String 1": "ID1", "String 2": "ID1", "String 4": "ID2"}
    # with unicode keys.
    for key in match_list:
        if order_list and not key in order_list:
            raise Exception("key '%s' not in match_list" % key)
        for value in match_list[key]:
            if value in match_dict:
                raise Exception("Duplicate word in list: '%s'" % value)
            match_dict[unicode(value, "utf8")] = key
    # If ignorecase=True, upper-case both the text and the search keys.
    if ignorecase:
        data = data.upper()
        match_dict = dict((key.upper(), match_dict[key]) for key in match_dict)
    # If only_ascii=True, strip accents and Ñ (NFD-decompose, then drop
    # combining marks 'Mn') from both the text and the search keys.
    if only_ascii:
        data = ''.join((c for c in unicodedata.normalize('NFD', data) if unicodedata.category(c) != 'Mn'))
        match_dict = dict((''.join((c for c in unicodedata.normalize('NFD', key) if unicodedata.category(c) != 'Mn')),
                           match_dict[key]) for key in match_dict)
    # Sort candidates from longest to shortest and search; each hit is
    # blanked out of the text so shorter substrings cannot also match.
    for match in sorted(match_dict, key=lambda x: len(x), reverse=True):
        s = data
        for a in matches:
            s = s.replace(a, "")
        if match in s:
            matches.append(match)
    if matches:
        if order_list:
            return type("Mtch_list", (),
                        {"key": match_dict[matches[-1]], "index": order_list.index(match_dict[matches[-1]])})
        else:
            return type("Mtch_list", (), {"key": match_dict[matches[-1]], "index": None})
    else:
        if order_list:
            return type("Mtch_list", (), {"key": None, "index": len(order_list)})
        else:
            return type("Mtch_list", (), {"key": None, "index": None})
示例13: characters
def characters(self, content):
    """SAX ``characters`` handler: route character data to the HTML
    writer, honoring the current title/paragraph/definition state flags.
    """
    text = content.strip()
    if self._inTitle:
        # Title text is written verbatim unless titles are being ignored.
        if self._headerProcessed:
            if not self._ignoreTitle:
                self._writeHtml(content)
    else :
        if self._headerProcessed:
            if not self._ignoreText:
                if len(text) > 0:
                    # Emit the default title once, before any body text.
                    if not self._glossTitleWritten and not self._inTitle:
                        self._writeDefaultTitle()
                    # Open a generated paragraph when not already inside
                    # any paragraph-like container.
                    if not self._inParagraph and not self._inGeneratedPara and not self._inArticle and not self._lineGroupPara and not self._inTable:
                        self._startGeneratedPara()
                    if self._endDfn:
                        # Text immediately following a definition keyword.
                        if self._keywordTag == 'dfn':
                            # 'Pd' = dash punctuation (e.g. leading hyphen).
                            if unicodedata.category(content[0]) == 'Pd':
                                self._writeHtml(' ')
                            elif content[0] == ' ':
                                # Insert an em dash (U+2014) separator.
                                if unicodedata.category(text[0]) != 'Pd':
                                    self._writeHtml(u' \u2014')
                                else:
                                    self._writeHtml(u' \u2014 ')
                            self._writeHtml(content)
                        else: # 'h4' for fb2
                            # Drop a leading dash before writing the text.
                            if unicodedata.category(text[0]) == 'Pd':
                                text = text[1:]
                            self._writeHtml(text.strip())
                        self._endDfn = False
                    else:
                        self._writeHtml(content)
示例14: test_characters_of_specific_groups
def test_characters_of_specific_groups():
    """Whitelisted categories are generated, everything else is not."""
    strategy = characters(whitelist_categories=("Lu", "Nd"))
    # Each whitelisted category must be reachable...
    for wanted in ("Lu", "Nd"):
        find_any(strategy, lambda c, wanted=wanted: unicodedata.category(c) == wanted)
    # ...and nothing outside the whitelist may ever be produced.
    assert_no_examples(strategy, lambda c: unicodedata.category(c) not in ("Lu", "Nd"))
示例15: test_find_something_rare
def test_find_something_rare():
    """Rare 'Zs' spaces above U+3000 are findable; nothing else is."""
    space_only = characters(whitelist_categories=['Zs'], min_codepoint=12288)
    # A space-separator character exists in the restricted range...
    find(space_only, lambda c: unicodedata.category(c) == 'Zs')
    # ...and no non-'Zs' character can be produced at all.
    with pytest.raises(NoSuchExample):
        find(space_only, lambda c: unicodedata.category(c) != 'Zs')