Python regex.UNICODE Attribute Code Examples

This article collects typical usage examples of the regex.UNICODE attribute in Python, gathered from open-source projects. If you are wondering what regex.UNICODE does or how to use it, the curated examples below should help. You can also explore further usage examples from the regex module.


The following presents 15 code examples of the regex.UNICODE attribute, sorted by popularity by default.
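
Before the examples, a minimal sketch (not taken from any of the projects below) of what the flag does:

import regex

# With regex.UNICODE, classes such as \w match letters beyond ASCII.
# In Python 3 this is already the default for str patterns, so the flag
# mainly documents intent and matters for bytes patterns.
pattern = regex.compile(r"\w+", flags=regex.UNICODE)
print(pattern.findall("café 北京 naïve"))  # ['café', '北京', 'naïve']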

Example 1: __init__

# Required import: import regex [as alias]
# Or: from regex import UNICODE [as alias]
def __init__(self, **kwargs):
        """
        Args:
            annotators: None or empty set (only tokenizes).
            substitutions: if true, normalizes some token types (e.g. quotes).
        """
        self._regexp = regex.compile(
            '(?P<digit>%s)|(?P<title>%s)|(?P<abbr>%s)|(?P<neg>%s)|(?P<hyph>%s)|'
            '(?P<contr1>%s)|(?P<alphanum>%s)|(?P<contr2>%s)|(?P<sdquote>%s)|'
            '(?P<edquote>%s)|(?P<ssquote>%s)|(?P<esquote>%s)|(?P<dash>%s)|'
'(?P<ellipses>%s)|(?P<punct>%s)|(?P<nonws>%s)' %
            (self.DIGIT, self.TITLE, self.ABBRV, self.NEGATION, self.HYPHEN,
             self.CONTRACTION1, self.ALPHA_NUM, self.CONTRACTION2,
             self.START_DQUOTE, self.END_DQUOTE, self.START_SQUOTE,
             self.END_SQUOTE, self.DASH, self.ELLIPSES, self.PUNCT,
             self.NON_WS),
            flags=regex.IGNORECASE + regex.UNICODE + regex.MULTILINE
        )
        if len(kwargs.get('annotators', {})) > 0:
            logger.warning('%s only tokenizes! Skipping annotators: %s' %
                           (type(self).__name__, kwargs.get('annotators')))
        self.annotators = set()
        self.substitutions = kwargs.get('substitutions', True) 
Developer: thunlp, Project: OpenQA, Lines: 25, Source: regexp_tokenizer.py

Example 2: _phrase_to_regex

# Required import: import regex [as alias]
# Or: from regex import UNICODE [as alias]
def _phrase_to_regex(phrase):
    # Treat whitespace between words as meaning anything other than alphanumeric
    # characters.
    pattern = r"[^\w--_]+".join(regex.escape(word) for word in phrase.split())
    # Treat spaces at the beginning or end of the phrase as matching any
    # whitespace character. This makes it easy to select stuff like non-breaking
    # space, which occurs frequently in browsers.
    # TODO Support newlines. Note that these are frequently implemented as
    # separate text nodes in the accessibility tree, so the obvious
    # implementation would not work well.
    if phrase == " ":
        pattern = r"\s"
    else:
        if phrase.startswith(" "):
            pattern = r"\s" + pattern
        if phrase.endswith(" "):
            pattern = pattern + r"\s"
    # Only match at boundaries of alphanumeric sequences if the phrase ends
    # are alphanumeric.
    if regex.search(r"^[\w--_]", phrase, regex.VERSION1 | regex.UNICODE):
        pattern = r"(?<![\w--_])" + pattern
    if regex.search(r"[\w--_]$", phrase, regex.VERSION1 | regex.UNICODE):
        pattern = pattern + r"(?![\w--_])"
    return pattern 
Developer: dictation-toolbox, Project: dragonfly, Lines: 26, Source: utils.py
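
A hypothetical usage sketch (the snippet below is an assumption, not part of the dragonfly source): because the pattern relies on the set-subtraction class [^\w--_], the returned string must be compiled with VERSION1 semantics, as the boundary checks inside the function already do.

import regex

pattern = _phrase_to_regex("hello world")
# A non-breaking space between the words still matches, since the gap
# between words is generalized to "one or more non-alphanumeric characters".
match = regex.search(pattern, "say hello\u00a0world now", regex.VERSION1 | regex.UNICODE)
print(bool(match))  # True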

Example 3: __init__

# Required import: import regex [as alias]
# Or: from regex import UNICODE [as alias]
def __init__(self, opt: Opt, shared: TShared = None):
        """
        Initialize the BPE module.

        :param opt:
            options
        :param shared:
            shared dictionary
        """
        super().__init__(opt, shared)
        if not SUBWORD_BPE_INSTALLED:
            raise RuntimeError(
                "Please run \"pip install 'git+https://github.com/rsennrich"
                "/subword-nmt.git#egg=subword-nmt'\""
            )
        if not opt.get('dict_file'):
            raise RuntimeError('--dict-file is mandatory.')

        self.splitter = re.compile(r'\w+|[^\w\s]', re.UNICODE)

        self.codecs = f"{opt['dict_file']}.codecs"
        if os.path.exists(self.codecs):
            self._load_from_codecs() 
Developer: facebookresearch, Project: ParlAI, Lines: 25, Source: bpe.py

Example 4: sdm_sim

# Required import: import regex [as alias]
# Or: from regex import UNICODE [as alias]
def sdm_sim(headlines, bodies):
    def similarity(headline, body):
        clean_headline = clean(headline)
        clean_body = clean(body)
        fullClient = retinasdk.FullClient("e8bf8de0-fe52-11e6-b22d-93a4ae922ff1", apiServer="http://api.cortical.io/rest", retinaName="en_associative")

        # Strip CJK characters, which the English retina cannot process.
        RE = re.compile(u'[⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]', re.UNICODE)
        clean_body = RE.sub(u'', clean_body)
        clean_body = clean_body.encode('utf8', 'ignore')
        clean_body = clean_body.decode('utf8', 'ignore')
        # str.replace returns a new string; the original code discarded the result.
        clean_body = clean_body.replace("0x6e", " ")
        comp_with_stop_words = fullClient.compare('[{"text": "'+clean_headline+'"}, {"text": "'+clean_body+'"}]')
        sim = comp_with_stop_words.cosineSimilarity

        features = []
        features.append(sim)
        return features
    x = []
    for i, (headline, body) in tqdm(enumerate(zip(headlines, bodies))):
        x.append(similarity(headline, body))
    return x
Developer: UKPLab, Project: coling2018_fake-news-challenge, Lines: 27, Source: feature_engineering.py

Example 5: remove_punctuation

# Required import: import regex [as alias]
# Or: from regex import UNICODE [as alias]
def remove_punctuation(text: str, marks=None) -> str:
    """
    Remove punctuation from ``text`` by removing all instances of ``marks``.

    Args:
        text (str): Urdu text
        marks (str): If specified, remove only the characters in this string,
            e.g. ``marks=',;:'`` removes commas, semi-colons, and colons.
            Otherwise, all punctuation marks are removed.
    Returns:
        str: returns a ``str`` object containing normalized text.
    Note:
        When ``marks=None``, Python's built-in :meth:`str.translate()` is
        used to remove punctuation; otherwise, a regular expression is used
        instead. The former's performance is about 5-10x faster.
    Examples:
        >>> from urduhack.preprocessing import remove_punctuation
        >>> output = remove_punctuation("کر ؟ سکتی ہے۔")
        کر سکتی ہے

    """
    if marks:
        return re.sub('[{}]+'.format(re.escape(marks)), '', text, flags=re.UNICODE)

    return text.translate(PUNCTUATION_TRANSLATE_UNICODE) 
Developer: urduhack, Project: urduhack, Lines: 27, Source: util.py
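
PUNCTUATION_TRANSLATE_UNICODE is not defined in the snippet above; a plausible reconstruction (an assumption, not the actual urduhack source) is a str.translate() table that maps every Unicode punctuation code point to None:

import sys
import unicodedata

# Hypothetical table: delete every code point whose Unicode category
# starts with "P" (all punctuation classes).
PUNCTUATION_TRANSLATE_UNICODE = {
    cp: None
    for cp in range(sys.maxunicode + 1)
    if unicodedata.category(chr(cp)).startswith("P")
}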

Example 6: _set_splitters

# Required import: import regex [as alias]
# Or: from regex import UNICODE [as alias]
def _set_splitters(self, settings=None):
        splitters = {
            'wordchars': set(),  # The ones that split string only if they are not surrounded by letters from both sides
            'capturing': set(),  # The ones that are not filtered out from tokens after split
        }
        splitters['capturing'] |= set(ALWAYS_KEEP_TOKENS)

        wordchars = self._get_wordchars(settings)
        skip = set(self.info.get('skip', [])) | splitters['capturing']
        for token in skip:
            if not re.match(r'^\W+$', token, re.UNICODE):
                continue
            if token in wordchars:
                splitters['wordchars'].add(token)

        self._splitters = splitters 
Developer: scrapinghub, Project: dateparser, Lines: 18, Source: locale.py

Example 7: regex_match_score

# Required import: import regex [as alias]
# Or: from regex import UNICODE [as alias]
def regex_match_score(prediction, pattern):
    """Check if the prediction matches the given regular expression."""
    try:
        compiled = re.compile(
            pattern,
            flags=re.IGNORECASE + re.UNICODE + re.MULTILINE
        )
    except BaseException:
        logger.warning('Regular expression failed to compile: %s' % pattern)
        return False
    return compiled.match(prediction) is not None 
Developer: HKUST-KnowComp, Project: MnemonicReader, Lines: 13, Source: utils.py

Example 8: has_answer

# Required import: import regex [as alias]
# Or: from regex import UNICODE [as alias]
def has_answer(args, answer, t):
    global PROCESS_TOK
    text = []
    for i in range(len(t)):
        text.append(t[i].lower())
    res_list = []
    if (args.dataset == "CuratedTrec"):
        try:
            ans_regex = re.compile("(%s)"%answer[0], flags=re.IGNORECASE + re.UNICODE)
        except:
            return False, res_list
        paragraph = " ".join(text)
        answer_new = ans_regex.findall(paragraph)
        for a in answer_new:
            single_answer = normalize(a[0])
            single_answer = PROCESS_TOK.tokenize(single_answer)
            single_answer = single_answer.words(uncased=True)
            for i in range(0, len(text) - len(single_answer) + 1):
                if single_answer == text[i: i + len(single_answer)]:
                    res_list.append((i, i+len(single_answer)-1))
    else:
        for a in answer:
            single_answer = " ".join(a).lower()
            single_answer = normalize(single_answer)
            single_answer = PROCESS_TOK.tokenize(single_answer)
            single_answer = single_answer.words(uncased=True)
            for i in range(0, len(text) - len(single_answer) + 1):
                if single_answer == text[i: i + len(single_answer)]:
                    res_list.append((i, i+len(single_answer)-1))
    if (len(res_list)>0):
        return True, res_list
    else:
        return False, res_list 
Developer: thunlp, Project: OpenQA, Lines: 35, Source: main.py

Example 9: __init__

# Required import: import regex [as alias]
# Or: from regex import UNICODE [as alias]
def __init__(self, **kwargs):
        """
        Args:
            annotators: None or empty set (only tokenizes).
        """
        self._regexp = regex.compile(
            '(%s)|(%s)' % (self.ALPHA_NUM, self.NON_WS),
            flags=regex.IGNORECASE + regex.UNICODE + regex.MULTILINE
        )
        if len(kwargs.get('annotators', {})) > 0:
            logger.warning('%s only tokenizes! Skipping annotators: %s' %
                           (type(self).__name__, kwargs.get('annotators')))
        self.annotators = set() 
Developer: thunlp, Project: OpenQA, Lines: 15, Source: simple_tokenizer.py

Example 10: regex_match

# Required import: import regex [as alias]
# Or: from regex import UNICODE [as alias]
def regex_match(text, pattern):
    """Test if a regex pattern is contained within a text."""
    try:
        pattern = re.compile(
            pattern,
            flags=re.IGNORECASE + re.UNICODE + re.MULTILINE,
        )
    except BaseException:
        return False
    return pattern.search(text) is not None 
Developer: ailabstw, Project: justcopy-backend, Lines: 12, Source: eval.py

Example 11: clean_name

# Required import: import regex [as alias]
# Or: from regex import UNICODE [as alias]
def clean_name(self):
        name = self.cleaned_data["name"].strip()
        if name.lower() == "flagged":
            raise forms.ValidationError(_("Reserved label name"))

        elif len(name) > Label.MAX_NAME_LEN:
            raise forms.ValidationError(_("Label name must be %d characters or less") % Label.MAX_NAME_LEN)

        # first character must be a word char
        elif not regex.match(r"\w", name[0], flags=regex.UNICODE):
            raise forms.ValidationError(_("Label name must start with a letter or digit"))
        return name 
Developer: rapidpro, Project: casepro, Lines: 14, Source: forms.py

Example 12: matches

# Required import: import regex [as alias]
# Or: from regex import UNICODE [as alias]
def matches(self, message):
        text = normalize(message.text)

        def keyword_check(w):
            return lambda: bool(regex.search(r"\b" + w + r"\b", text, flags=regex.UNICODE | regex.V0))

        checks = [keyword_check(keyword) for keyword in self.keywords]

        return self.quantifier.evaluate(checks) 
Developer: rapidpro, Project: casepro, Lines: 11, Source: models.py

Example 13: replace_tippi

# Required import: import regex [as alias]
# Or: from regex import UNICODE [as alias]
def replace_tippi(cls, text):
        import regex
        # Rewrite the Gurmukhi gemination sign (ੱ) as an explicit conjunct:
        # class-initial consonant + virama (੍) + the following consonant.
        # Replacement strings use raw literals so \g<1> is not treated as
        # an (invalid) string escape.
        text = regex.sub(r"ੱ([ਕਖ])", r"ਕ੍\g<1>", text, flags=regex.UNICODE)
        text = regex.sub(r"ੱ([ਗਘ])", r"ਗ੍\g<1>", text)
        text = regex.sub(r"ੱ([ਚਛ])", r"ਚ੍\g<1>", text)
        text = regex.sub(r"ੱ([ਜਝ])", r"ਜ੍\g<1>", text)
        text = regex.sub(r"ੱ([ਟਠ])", r"ਟ੍\g<1>", text)
        text = regex.sub(r"ੱ([ਡਢ])", r"ਡ੍\g<1>", text)
        text = regex.sub(r"ੱ([ਤਥ])", r"ਤ੍\g<1>", text)
        text = regex.sub(r"ੱ([ਦਧ])", r"ਦ੍\g<1>", text)
        text = regex.sub(r"ੱ([ਪਫ])", r"ਪ੍\g<1>", text)
        text = regex.sub(r"ੱ([ਬਭ])", r"ਬ੍\g<1>", text)
        # Nasals, semivowels, sibilants, etc. double themselves.
        text = regex.sub(r"ੱ([ਯਰਲਵਸ਼ਸਹਙਞਣਨਮਜ਼ੜਫ਼])", r"\g<1>੍\g<1>", text)
        return text
Developer: sanskrit-coders, Project: indic_transliteration, Lines: 16, Source: northern.py
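
A hypothetical call (the class name Scheme is invented for illustration; in indic_transliteration this is a classmethod on a scheme class): the gemination sign is expanded into an explicit doubled consonant.

print(Scheme.replace_tippi("ਸੱਚ"))  # ਸਚ੍ਚ  (ੱਚ → ਚ੍ਚ)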

Example 14: __init__

# Required import: import regex [as alias]
# Or: from regex import UNICODE [as alias]
def __init__(self, **kwargs):
        """
        Args:
            annotators: None or empty set (only tokenizes).
        """
        self._regexp = regex.compile(
            '(%s)|(%s)' % (self.ALPHA_NUM, self.NON_WS),
            flags=regex.IGNORECASE + regex.UNICODE + regex.MULTILINE,
        )
        if len(kwargs.get('annotators', {})) > 0:
            logger.warning(
                '%s only tokenizes! Skipping annotators: %s'
                % (type(self).__name__, kwargs.get('annotators'))
            )
        self.annotators = set() 
Developer: facebookresearch, Project: ParlAI, Lines: 17, Source: simple_tokenizer.py

Example 15: __init__

# Required import: import regex [as alias]
# Or: from regex import UNICODE [as alias]
def __init__(self, **kwargs):
        """
        Args:
            annotators: None or empty set (only tokenizes).
            substitutions: if true, normalizes some token types (e.g. quotes).
        """
        self._regexp = regex.compile(
            '(?P<digit>%s)|(?P<title>%s)|(?P<abbr>%s)|(?P<neg>%s)|(?P<hyph>%s)|'
            '(?P<contr1>%s)|(?P<alphanum>%s)|(?P<contr2>%s)|(?P<sdquote>%s)|'
            '(?P<edquote>%s)|(?P<ssquote>%s)|(?P<esquote>%s)|(?P<dash>%s)|'
            '(?P<ellipses>%s)|(?P<punct>%s)|(?P<nonws>%s)'
            % (
                self.DIGIT,
                self.TITLE,
                self.ABBRV,
                self.NEGATION,
                self.HYPHEN,
                self.CONTRACTION1,
                self.ALPHA_NUM,
                self.CONTRACTION2,
                self.START_DQUOTE,
                self.END_DQUOTE,
                self.START_SQUOTE,
                self.END_SQUOTE,
                self.DASH,
                self.ELLIPSES,
                self.PUNCT,
                self.NON_WS,
            ),
            flags=regex.IGNORECASE + regex.UNICODE + regex.MULTILINE,
        )
        if len(kwargs.get('annotators', {})) > 0:
            logger.warning(
                '%s only tokenizes! Skipping annotators: %s'
                % (type(self).__name__, kwargs.get('annotators'))
            )
        self.annotators = set()
        self.substitutions = kwargs.get('substitutions', True) 
Developer: facebookresearch, Project: ParlAI, Lines: 40, Source: regexp_tokenizer.py


Note: the regex.UNICODE attribute examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are drawn from open-source projects contributed by their respective developers; copyright remains with the original authors, and distribution and use should follow each project's license. Do not reproduce without permission.