Python regex.escape方法代码示例

本文整理汇总了 Python 中 regex.escape 方法的典型用法代码示例。如果您正苦于以下问题：Python regex.escape 方法的具体用法？Python regex.escape 怎么用？Python regex.escape 使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块 regex 的用法示例。


示例1: expand

# 需要导入模块: import regex [as 别名]
# 或者: from regex import escape [as 别名]
def expand(self, s):
        """Replace every defined name occurring in ``s`` with its definition.

        The alternation regex and the parallel list of substitutions are
        built from ``self._defs`` on the first call and cached on
        ``self._regex`` / ``self._substs`` for later calls.

        Args:
            s: Text to expand.

        Returns:
            ``s`` itself when there are no definitions; otherwise ``s`` with
            each occurrence of a defined name replaced by its definition.
        """
        if not self._defs:
            return s
        if self._regex is None:
            # First use: give every definition its own named group so the
            # replacement callback can tell which name matched.
            substs = []
            alternatives = []
            for i, (name, definition) in enumerate(self._defs.items()):
                substs.append(definition)
                alternatives.append(
                    '(?P<mwic{i}>{name})'.format(i=i, name=re.escape(name))
                )
            self._regex = re.compile('|'.join(alternatives))
            self._substs = substs
        assert self._regex is not None
        assert self._substs is not None
        substs = self._substs

        def replace(match):
            # Group ``mwic<i>`` corresponds to ``substs[i]``.
            for i, subst in enumerate(substs):
                if match.group('mwic{i}'.format(i=i)) is not None:
                    return subst
            assert False  # no coverage
        return self._regex.sub(replace, s)

示例2: _phrase_to_regex

# 需要导入模块: import regex [as 别名]
# 或者: from regex import escape [as 别名]
def _phrase_to_regex(phrase):
    """Build a regex pattern string that matches ``phrase`` loosely.

    Words are matched literally; the gaps between them may be any run of
    characters that are not alphanumeric. A single-space phrase matches one
    whitespace character, and a leading or trailing space in a longer phrase
    matches one whitespace character at that end.

    The pattern uses the ``[\\w--_]`` set-difference syntax; the searches in
    this function pass ``regex.VERSION1`` for it, so callers presumably need
    the same flag when compiling the result.

    Args:
        phrase: The phrase to convert.

    Returns:
        The pattern as a string (not compiled).
    """
    # Treat whitespace between words as meaning anything other than alphanumeric
    # characters.
    pattern = r"[^\w--_]+".join(regex.escape(word) for word in phrase.split())
    # Treat spaces at the beginning or end of the phrase as matching any
    # whitespace character. This makes it easy to select stuff like non-breaking
    # space, which occurs frequently in browsers.
    # TODO Support newlines. Note that these are frequently implemented as
    # separate text nodes in the accessibility tree, so the obvious
    # implementation would not work well.
    if phrase == " ":
        pattern = r"\s"
    else:
        # These checks must not run for the single-space phrase, otherwise
        # it would expand to three whitespace characters.
        if phrase.startswith(" "):
            pattern = r"\s" + pattern
        if phrase.endswith(" "):
            pattern = pattern + r"\s"
    # Only match at boundaries of alphanumeric sequences if the phrase ends
    # are alphanumeric.
    if regex.search(r"^[\w--_]", phrase, regex.VERSION1 | regex.UNICODE):
        pattern = r"(?<![\w--_])" + pattern
    if regex.search(r"[\w--_]$", phrase, regex.VERSION1 | regex.UNICODE):
        pattern = pattern + r"(?![\w--_])"
    return pattern

示例3: exclude_words

# 需要导入模块: import regex [as 别名]
# 或者: from regex import escape [as 别名]
def exclude_words(phrasegrams, words):
    """Given a list of words, excludes those from the keys of the phrase dictionary.

    A key is dropped when any of its components contains one of ``words`` as
    a whole ``_``-delimited token (alone, at the start, at the end, or in
    the middle).

    Args:
        phrasegrams: Mapping whose keys are iterables of byte strings.
        words: Words to exclude.

    Returns:
        A new dict holding only the entries whose keys contain none of
        ``words``.
    """
    words = list(words)
    if not words:
        # An empty alternation compiles to a pattern that matches every
        # string, which would wrongly discard all entries.
        return dict(phrasegrams)

    alternatives = []
    for word in words:
        we = regex.escape(word)
        alternatives.append("^" + we + "$|^" + we + "_|_" + we + "$|_" + we + "_")
    word_reg = regex.compile("|".join(alternatives))

    new_phrasegrams = {}
    for gram in tqdm(phrasegrams):
        # any() stops at the first offending component.
        excluded = any(
            word_reg.search(sub_gram.decode("unicode_escape", "ignore")) is not None
            for sub_gram in gram
        )
        if not excluded:
            new_phrasegrams[gram] = phrasegrams[gram]
    return new_phrasegrams

# Generating word grams. 

示例4: remove_punctuation

# 需要导入模块: import regex [as 别名]
# 或者: from regex import escape [as 别名]
def remove_punctuation(text: str, marks=None) -> str:
    """
    Remove punctuation from ``text`` by removing all instances of ``marks``.

    Args:
        text (str): Urdu text
        marks (str): If specified, remove only the characters in this string,
            e.g. ``marks=',;:'`` removes commas, semi-colons, and colons.
            Otherwise, all punctuation marks are removed.

    Returns:
        str: returns a ``str`` object containing normalized text.

    Note:
        When ``marks=None``, Python's built-in :meth:`str.translate()` is
        used to remove punctuation; otherwise, a regular expression is used
        instead. The former's performance is about 5-10x faster.

    Examples:
        >>> from urduhack.preprocessing import remove_punctuation
        >>> output = remove_punctuation("کر ؟ سکتی ہے۔")
        کر سکتی ہے
    """
    if marks:
        # Escape the marks so characters such as ``]``, ``^`` and ``-`` are
        # taken literally inside the character class.
        return re.sub('[{}]+'.format(re.escape(marks)), '', text, flags=re.UNICODE)

    return text.translate(PUNCTUATION_TRANSLATE_UNICODE)

示例5: test_expression_diff

# 需要导入模块: import regex [as 别名]
# 或者: from regex import escape [as 别名]
def test_expression_diff(self) -> None:
        """Check ``--diff`` output for expression.py against expression.diff.

        The source is dumped to a temporary file, the tool is invoked on it
        with ``--diff``, and the file-name/timestamp header in the output is
        replaced by ``DETERMINISTIC_HEADER`` before comparing.
        """
        source, _ = read_data("expression.py")
        expected, _ = read_data("expression.diff")
        tmp_file = Path(black.dump_to_file(source))
        diff_header = re.compile(
            rf"{re.escape(str(tmp_file))}\t\d\d\d\d-\d\d-\d\d "
            r"\d\d:\d\d:\d\d\.\d\d\d\d\d\d \+\d\d\d\d"
        )
        try:
            result = BlackRunner().invoke(black.main, ["--diff", str(tmp_file)])
            self.assertEqual(result.exit_code, 0)
        finally:
            # Remove the temporary copy even if the run or the assertion fails.
            tmp_file.unlink()
        actual = result.output
        actual = diff_header.sub(DETERMINISTIC_HEADER, actual)
        actual = actual.rstrip() + "\n"  # the diff output has a trailing space
        if expected != actual:
            # Keep the actual output on disk so the expectation can be updated.
            dump = black.dump_to_file(actual)
            msg = (
                "Expected diff isn't equal to the actual. If you made changes to"
                " expression.py and this is an anticipated difference, overwrite"
                f" tests/data/expression.diff with {dump}"
            )
            self.assertEqual(expected, actual, msg)

示例6: search_citing_sentences

# 需要导入模块: import regex [as 别名]
# 或者: from regex import escape [as 别名]
def search_citing_sentences(aid, txt, match):
    """Collect the search hits for every reference key in ``match``.

    The text is whitespace-normalized and split into sentences; each
    non-empty key is then searched for in three forms: ``\\cite<key>``,
    ``\\refs{<key>}``, and the bare key when no ``bibitem``/``lref`` occurs
    earlier in the sentence.

    Args:
        aid: Identifier printed alongside each key for progress output.
        txt: Full text to search.
        match: Mapping whose keys are the reference keys to look for.

    Returns:
        A set with the accumulated results of ``search_citation``.
    """
    # str.split() with no argument already treats newlines as whitespace, so
    # one pass collapses every whitespace run to a single space.
    txt = ' '.join(txt.split())
    sentences = split_sentences(txt)
    founds = set()
    for r in match.keys():
        if not r:
            continue
        regexp_list = [regex.escape('\\cite%s' % r),
                       regex.escape('\\refs{%s}' % r),
                       r'(?<!(bibitem|lref).*?)' + regex.escape('%s' % r)]
        print(aid, r)
        for regexp in regexp_list:
            results = search_citation(sentences, regexp)
            print("Regex: '{0!s}', Found: {1:d}".format(regexp, len(results)))
            if len(results):
                # NOTE(review): the body of this branch was missing from the
                # source; merging the hits into ``founds`` is assumed because
                # ``founds`` is the returned set - confirm.
                founds.update(results)
    print("_" * 50)
    return founds

示例7: _replace_escape_bytes

# 需要导入模块: import regex [as 别名]
# 或者: from regex import escape [as 别名]
def _replace_escape_bytes(self, value):
        """Decode backslash escape sequences in a byte string.

        ``self._lex_escape_re`` locates each escape; its group 1 holds a
        single-character escape, group 2 an octal number and group 3 a
        hexadecimal number.

        Args:
            value: Byte string that may contain escape sequences.

        Returns:
            The byte string with every escape replaced by the byte(s) it
            denotes.
        """
        # Replacement for each single-character escape. A backslash followed
        # by a newline contributes nothing to the output.
        single_char = {
            b"\n": b"",
            b"\\": b"\\",
            b"'": b"'",
            b"\"": b"\"",
            b"a": b"\a",
            b"b": b"\b",
            b"f": b"\f",
            b"n": b"\n",
            b"r": b"\r",
            b"t": b"\t",
            b"v": b"\v",
        }

        chunks = []
        offset = 0
        while offset < len(value):
            match = self._lex_escape_re.search(value, offset)
            if match is None:
                # Append the remaining of the string
                chunks.append(value[offset:])
                break

            # Append the part of string before match
            chunks.append(value[offset:match.start()])
            offset = match.end()

            # Process the escape
            if match.group(1) is not None: # single-char
                # Characters outside the table produce no output.
                chunks.append(single_char.get(match.group(1), b""))
            elif match.group(2) is not None: # oct
                chunks.append(byte(int(match.group(2), 8)))
            elif match.group(3) is not None: # hex
                chunks.append(byte(int(match.group(3), 16)))

        return b"".join(chunks)

示例8: escape_string

# 需要导入模块: import regex [as 别名]
# 或者: from regex import escape [as 别名]
def escape_string(self):
        """Escape all special characters in a string

        The current state is converted to ``str`` and passed through
        ``re.escape``; the result is stored back into ``self.state``.

        Returns:
            Chepy: The Chepy object.
        """
        self.state = re.escape(self._convert_to_str())
        return self

示例9: extract

# 需要导入模块: import regex [as 别名]
# 或者: from regex import escape [as 别名]
def extract(s: str, entities: Iterable[str], useregex=False, ignorecase=True) -> Iterable[str]:
    """Yield every occurrence of any of ``entities`` found in ``s``.

    Args:
        s: Text to scan.
        entities: Strings to look for. With ``useregex`` they are used as
            regular expressions verbatim; otherwise they are matched
            literally, except that each space matches any whitespace run.
        useregex: Treat ``entities`` as regular expressions.
        ignorecase: Match case-insensitively.

    Yields:
        The matched text of each occurrence, in order of appearance.
    """
    # Escape the pieces around each space and rejoin with ``\s+``. Relying on
    # ``re.escape`` to backslash-escape the space itself only works on
    # Python < 3.7.
    alternatives = [
        e if useregex else r"\s+".join(map(re.escape, e.split(' ')))
        for e in entities
    ]
    if not alternatives:
        # An empty alternation would match the empty string everywhere.
        return

    pattern = re.compile(r"|".join(alternatives), re.I if ignorecase else 0)
    for m in pattern.finditer(s):
        yield m.group(0)

示例10: __findeqtagpairspans

# 需要导入模块: import regex [as 别名]
# 或者: from regex import escape [as 别名]
def __findeqtagpairspans(
        s: str,
        tag: str,
        useregex: bool = False
    ) -> Iterable[Tuple[Tuple[int, int], ...]]:
    """Yield span triples for text enclosed by two identical tags.

    Args:
        s: Text to scan.
        tag: The delimiter; matched literally unless ``useregex`` is true.
        useregex: Treat ``tag`` as a regular expression.

    Yields:
        ``(open_span, content_span, close_span)`` for each pair, where the
        closing tag must repeat the exact text the opening tag matched.
    """
    tagpattern = tag if useregex else re.escape(tag)
    pairpattern = r"(?P<__open>{})(?P<__content>.*?)(?P<__close>\1)".format(tagpattern)
    for found in re.finditer(pairpattern, s):
        yield tuple(found.span(group) for group in ("__open", "__content", "__close"))

示例11: __findtagpairspans

# 需要导入模块: import regex [as 别名]
# 或者: from regex import escape [as 别名]
def __findtagpairspans(
        s: str,
        tag: str, closetag: Optional[str] = None,
        useregex: bool = False
    ) -> Iterable[Tuple[Tuple[int, int], ...]]:
    """Yield span triples for text enclosed by an opening and a closing tag.

    Args:
        s: Text to scan.
        tag: Opening tag; matched literally unless ``useregex`` is true.
        closetag: Closing tag. When omitted or equal to ``tag`` the work is
            delegated to ``__findeqtagpairspans``.
        useregex: Treat both tags as regular expressions.

    Yields:
        ``(open_span, content_span, close_span)`` for each pair. A closing
        tag is paired with the most recent unmatched opening tag, so inner
        pairs are yielded before the pairs that enclose them. Closing tags
        with no pending opening tag are ignored.
    """
    if closetag is None or tag == closetag:
        yield from __findeqtagpairspans(s, tag, useregex=useregex)
        # Stop here: the code below needs a distinct, non-None closing tag.
        return

    if not useregex:
        tag = re.escape(tag)
        closetag = re.escape(closetag)

    retags = re.compile(r"(?P<__open>{})|(?P<__close>{})".format(tag, closetag))

    # Stack of spans of opening tags that have not been closed yet.
    startspans = []

    for match in retags.finditer(s):
        opengroup = match.group("__open")
        if opengroup:
            startspans.append(match.span())
            continue

        closegroup = match.group("__close")
        if closegroup and startspans:
            startspan = startspans.pop()
            endspan = match.span()

            yield (startspan, (startspan[1], endspan[0]), endspan)

示例12: __init__

# 需要导入模块: import regex [as 别名]
# 或者: from regex import escape [as 别名]
def __init__(self, charset: Union[Dict[str, Sequence[int]], Sequence[str], str]) -> None:
        """
        Builds a codec converting between graphemes/code points and integer
        label sequences.

        charset may either be a string, a list or a dict. In the first case
        each code point will be assigned a label, in the second case each
        string in the list will be assigned a label, and in the final case each
        key string will be mapped to the value sequence of integers. In the
        first two cases labels will be assigned automatically.

        As 0 is the blank label in a CTC output layer, output labels and input
        dictionaries are/should be 1-indexed.

        Args:
            charset (unicode, list, dict): Input character set.
        """
        if isinstance(charset, dict):
            self.c2l = charset
        else:
            # Assign labels automatically, starting at 1, in sorted order.
            self.c2l = {k: [v] for v, k in enumerate(sorted(charset), start=1)}
        # map integer labels to code points because regex only works with strings
        self.l2c = {}  # type: Dict[str, str]
        for k, v in self.c2l.items():
            self.l2c[''.join(chr(c) for c in v)] = k

        # sort prefixes for c2l regex
        self.c2l_regex = regex.compile(r'|'.join(regex.escape(x) for x in sorted(self.c2l.keys(), key=len, reverse=True)))
        # sort prefixes for l2c regex
        self.l2c_regex = regex.compile(r'|'.join(regex.escape(x) for x in sorted(self.l2c.keys(), key=len, reverse=True)))

示例13: read_xsampa_table

# 需要导入模块: import regex [as 别名]
# 或者: from regex import escape [as 别名]
def read_xsampa_table(self):
        """Load ``data/ipa-xsampa.csv`` from the package resources.

        Returns:
            A ``(regex, mapping)`` pair: ``mapping`` sends the second CSV
            column to the first (presumably X-SAMPA to IPA, going by the
            file name), and ``regex`` is an alternation of the escaped
            mapping keys with longer keys placed first.
        """
        csv_path = pkg_resources.resource_filename(
            __name__, os.path.join('data', 'ipa-xsampa.csv'))
        with open(csv_path, 'rb') as handle:
            xs2ipa = {row[1]: row[0] for row in csv.reader(handle, encoding='utf-8')}
        # Longest keys first so the alternation prefers the longest symbol.
        longest_first = sorted(xs2ipa.keys(), key=len, reverse=True)
        xs_regex = re.compile('|'.join(map(re.escape, longest_first)))
        return xs_regex, xs2ipa

示例14: _construct_split_regex

# 需要导入模块: import regex [as 别名]
# 或者: from regex import escape [as 别名]
def _construct_split_regex(self):
        """Compile and cache the regex that splits text around a known word.

        The pattern has three groups: the text before the word, the word
        itself (any entry of ``self._get_sorted_words_from_cache()``), and
        the text after it. Unless ``self._no_word_spacing`` is set, the word
        must be delimited on each side by the string boundary, a non-word
        character, an underscore or a digit.

        The compiled regex is stored under the settings registry key and
        ``self.info['name']``.
        """
        known_words_group = "|".join(map(re.escape, self._get_sorted_words_from_cache()))
        if self._no_word_spacing:
            pattern = r"^(.*?)({})(.*)$".format(known_words_group)
        else:
            pattern = r"^(.*?(?:\A|\W|_|\d))({})((?:\Z|\W|_|\d).*)$".format(known_words_group)
        # NOTE(review): the receiver of this cache write was missing from the
        # source; ``self._split_regex_cache`` is an assumption - confirm
        # against the class definition.
        self._split_regex_cache.setdefault(
            self._settings.registry_key, {})[self.info['name']] = \
            re.compile(pattern, re.UNICODE | re.IGNORECASE)

示例15: convert

# 需要导入模块: import regex [as 别名]
# 或者: from regex import escape [as 别名]
def convert(self, newstart: str) -> None:
        """Convert to another list type by replacing starting pattern.

        Every captured ``pattern`` span of the current match is overwritten
        with ``newstart``, and ``self.pattern`` is set to the escaped form
        of ``newstart``.
        """
        current = self._match
        base = current.start()
        # Walk the spans back to front so earlier offsets stay valid while
        # the text is being edited.
        for begin, end in reversed(current.spans('pattern')):
            self[begin - base:end - base] = newstart
        self.pattern = escape(newstart)
