

Python regex.finditer Method Code Examples

This article collects typical code examples of the regex.finditer method in Python. If you are wondering what regex.finditer does, or how to use it in practice, the curated examples below should help. You can also browse further usage examples from the regex module.


The following presents 15 code examples of the regex.finditer method, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code samples.
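
Before the individual examples, here is a minimal self-contained sketch of what regex.finditer returns, namely an iterator of match objects. It assumes only that the third-party regex package (pip install regex) is installed; the pattern and input string are illustrative:

import regex

for m in regex.finditer(r"(?P<word>\w+)", "lorem ipsum"):
    print(m.group("word"), m.span())
# lorem (0, 5)
# ipsum (6, 11)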

Example 1: test_partial_parse

# Required module: import regex [as alias]
# Or: from regex import finditer [as alias]
import datetime

import pytest
import regex

# Import paths assumed from the ctparse project layout (comtravo/ctparse):
from ctparse.partial_parse import PartialParse
from ctparse.types import RegexMatch, Time

def test_partial_parse() -> None:
    match_a = regex.match("(?<R1>a)", "ab")
    match_b = next(regex.finditer("(?<R2>b)", "ab"))

    pp = PartialParse.from_regex_matches(
        (RegexMatch(1, match_a), RegexMatch(2, match_b))
    )

    assert len(pp.prod) == 2
    assert len(pp.rules) == 2

    assert isinstance(pp.score, float)

    def mock_rule(ts: datetime.datetime, a: Time) -> Time:
        return Time()

    pp2 = pp.apply_rule(
        datetime.datetime(day=1, month=1, year=2015), mock_rule, "mock_rule", (0, 1)
    )

    assert pp != pp2

    with pytest.raises(ValueError):
        PartialParse((), ()) 
Author: comtravo | Project: ctparse | Lines: 26 | Source: test_partialparse.py

Example 2: map_to_dogol_prime

# Required module: import regex [as alias]
# Or: from regex import finditer [as alias]
# Note: this is a method of panphon's Distance class; self.fm is its FeatureTable.
def map_to_dogol_prime(self, s):
        """Map a string to Dogolpolsky' classes

        Args:
            s (unicode): IPA word

        Returns:
            (unicode): word with all segments collapsed to D' classes
        """
        segs = []
        for seg in self.fm.seg_regex.finditer(s):
            fts = self.fm.fts(seg.group(0))
            for mask, label in self.dogol_prime:
                if fts >= mask:
                    segs.append(label)
                    break
        return ''.join(segs) 
Author: dmort27 | Project: panphon | Lines: 19 | Source: distance.py
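
A hedged usage sketch for the method above, assuming it is exposed on the Distance class in panphon.distance as the source file suggests; the IPA input is illustrative:

import panphon.distance

dst = panphon.distance.Distance()
# Collapse each IPA segment of the word to its Dolgopolsky class label
print(dst.map_to_dogol_prime("mata"))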

Example 3: compile_regex_from_str

# Required module: import regex [as alias]
# Or: from regex import finditer [as alias]
# Note: panphon imports the third-party regex module under the name re
# (import regex as re), so re.finditer below is regex.finditer.
def compile_regex_from_str(self, ft_str):
        """Given a string describing features masks for a sequence of segments,
        return a regex matching the corresponding strings.

        Args:
            ft_str (str): feature masks, each enclosed in square brackets, in
            which the features are delimited by any standard delimiter.

        Returns:
           Pattern: regular expression pattern equivalent to `ft_str`
        """

        sequence = []
        for m in re.finditer(r'\[([^]]+)\]', ft_str):
            ft_mask = fts(m.group(1))
            segs = self.all_segs_matching_fts(ft_mask)
            sub_pat = '({})'.format('|'.join(segs))
            sequence.append(sub_pat)
        pattern = ''.join(sequence)
        regex = re.compile(pattern)
        return regex 
Author: dmort27 | Project: panphon | Lines: 23 | Source: _panphon.py
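
A hedged usage sketch, assuming the method above lives on panphon's FeatureTable as the source file suggests; the feature-mask string is illustrative:

import panphon

ft = panphon.FeatureTable()
# A pattern for a voiced segment followed by a nasal segment
pat = ft.compile_regex_from_str("[+voi][+nas]")
print(bool(pat.match("am")))  # True, provided 'a' is [+voi] and 'm' is [+nas] in the table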

Example 4: neutralize_with_all_envs

# Required module: import regex [as alias]
# Or: from regex import finditer [as alias]
# Note: re below must be the third-party regex module (import regex as re);
# the standard library re does not support overlapped=True.
def neutralize_with_all_envs(trans, env_filters):
    string = ''.join(trans.with_word_boundaries())
    for env_filter in env_filters:
        pattern = env_filter.generate_regular_expression()
        for match in re.finditer(pattern, string, overlapped=True):
            # Neutralize the segment at the MID position with a placeholder
            mid_index = match.start('MID')
            string = string[:mid_index] + '-' + string[mid_index + 1:]
    return string


# This function is weirdly named. It should really be something like
# average_minpair_fl
# It has also been changed so as to have two "relativizer" options:
# one to words containing the relevant segments and one to all
# words in the corpus (though it basically does the calculation
# by calling the above minpair_fl() function). 
Author: PhonologicalCorpusTools | Project: CorpusTools | Lines: 26 | Source: functional_load.py
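
The overlapped=True keyword used above exists only in the third-party regex module, not in the standard library's re; a minimal sketch of what it changes:

import regex

print([m.span() for m in regex.finditer(r"aa", "aaaa")])
# [(0, 2), (2, 4)]  (non-overlapping, as with the standard re module)
print([m.span() for m in regex.finditer(r"aa", "aaaa", overlapped=True)])
# [(0, 2), (1, 3), (2, 4)]  (every position where the pattern fits)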

Example 5: get_arxiv_meta_archive

# Required module: import regex [as alias]
# Or: from regex import finditer [as alias]
import tarfile

import regex

# clean_line() is a helper defined elsewhere in autosum_arxiv.py

def get_arxiv_meta_archive(aid):
    title = ''
    authors = []
    jref = ''
    txt = ''
    with tarfile.open("./kddcup2003/hep-th-abs.tar.gz", "r:gz") as t:
        for m in t.getmembers():
            if m.name.find(aid) != -1:
                txt = t.extractfile(m).read().decode("utf-8", errors="replace")  # tar members are bytes; decode before matching with str patterns
                break
    for m in regex.finditer(r'Title:\s+(.*)(?=Author)', txt, regex.S):
        title = clean_line(m.group(1))
        break
    for m in regex.finditer(r'Authors?:\s+(.*)(?=Comment)', txt, regex.S):
        a = clean_line(m.group(1))
        authors = regex.split(r'(?:,\s*(?:and\s+)?|\s+and\s+)', a)
        break
    for m in regex.finditer(r'Journal-ref:\s+(.*?)(?=\\\\)', txt, regex.S):
        jref = clean_line(m.group(1))
        break

    return title, authors, jref 
Author: soodoku | Project: autosum | Lines: 24 | Source: autosum_arxiv.py
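
A self-contained sketch of the lookahead-based field extraction used above; the abstract text is fabricated for illustration, and the regex.S flag lets . match newlines:

import regex

abstract = "Title: A Toy Paper\nAuthors: A. Author and B. Writer\nComments: 5 pages\n"
m = next(regex.finditer(r'Title:\s+(.*)(?=Author)', abstract, regex.S))
print(m.group(1).strip())  # A Toy Paper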

Example 6: _split_matches

# Required module: import regex [as alias]
# Or: from regex import finditer [as alias]
# Note: the regex parameter below is a precompiled pattern object, not the module.
def _split_matches(self, regex, node, token_class="regular", repl=None, split_named_subgroups=True, delete_whitespace=False):
        boundaries = []
        split_groups = split_named_subgroups and len(regex.groupindex) > 0
        group_numbers = sorted(regex.groupindex.values())
        for m in regex.finditer(node.value.text):
            if split_groups:
                for g in group_numbers:
                    if m.span(g) != (-1, -1):
                        boundaries.append((m.start(g), m.end(g), None))
            else:
                if repl is None:
                    boundaries.append((m.start(), m.end(), None))
                else:
                    boundaries.append((m.start(), m.end(), m.expand(repl)))
        self._split_on_boundaries(node, boundaries, token_class, delete_whitespace=delete_whitespace) 
Author: tsproisl | Project: SoMaJo | Lines: 17 | Source: tokenizer.py
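
A minimal sketch of the groupindex attribute that _split_matches relies on to detect named subgroups; the pattern is illustrative:

import regex

pat = regex.compile(r"(?P<day>\d{2})\.(?P<month>\d{2})")
print(pat.groupindex)                   # maps subgroup names to numbers, e.g. {'day': 1, 'month': 2}
print(sorted(pat.groupindex.values()))  # [1, 2], as computed for group_numbers above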

Example 7: _split_emojis

# Required module: import regex [as alias]
# Or: from regex import finditer [as alias]
# Note: SoMaJo imports the regex module as re; \X and \p{...} require it.
def _split_emojis(self, node, token_class="emoticon"):
        boundaries = []
        for m in re.finditer(r"\X", node.value.text):
            if m.end() - m.start() > 1:
                if re.search(r"[\p{Extended_Pictographic}\p{Emoji_Presentation}\uFE0F]", m.group()):
                    boundaries.append((m.start(), m.end(), None))
            else:
                if re.search(r"[\p{Extended_Pictographic}\p{Emoji_Presentation}]", m.group()):
                    boundaries.append((m.start(), m.end(), None))
        self._split_on_boundaries(node, boundaries, token_class) 
Author: tsproisl | Project: SoMaJo | Lines: 12 | Source: tokenizer.py
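
A minimal sketch of the regex-only \X pattern used above, which matches one extended grapheme cluster rather than one code point; the sample string is illustrative:

import regex

text = "e\u0301\U0001F44D\U0001F3FD"  # 'e' + combining accent, then thumbs-up + skin-tone modifier
for m in regex.finditer(r"\X", text):
    print(repr(m.group()), m.end() - m.start())
# Both clusters span 2 code points; only the emoji cluster is pictographic,
# so only it would be split off as an emoticon token by the method above.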

Example 8: _split_set

# Required module: import regex [as alias]
# Or: from regex import finditer [as alias]
def _split_set(self, regex, node, items, token_class="regular", ignore_case=False):
        boundaries = []
        for m in regex.finditer(node.value.text):
            instance = m.group(0)
            if ignore_case:
                instance = instance.lower()
            if instance in items:
                boundaries.append((m.start(), m.end(), None))
        self._split_on_boundaries(node, boundaries, token_class) 
Author: tsproisl | Project: SoMaJo | Lines: 11 | Source: tokenizer.py

Example 9: _split_left

# Required module: import regex [as alias]
# Or: from regex import finditer [as alias]
def _split_left(self, regex, node):
        boundaries = []
        prev_end = 0
        for m in regex.finditer(node.value.text):
            boundaries.append((prev_end, m.start(), None))
            prev_end = m.start()
        self._split_on_boundaries(node, boundaries, token_class=None, lock_match=False) 
Author: tsproisl | Project: SoMaJo | Lines: 9 | Source: tokenizer.py

Example 10: _split_abbreviations

# Required module: import regex [as alias]
# Or: from regex import finditer [as alias]
def _split_abbreviations(self, token_dll, split_multipart_abbrevs=True):
        """Turn instances of abbreviations into tokens."""
        self._split_all_matches(self.single_letter_ellipsis, token_dll, "abbreviation")
        self._split_all_matches(self.and_cetera, token_dll, "abbreviation")
        self._split_all_matches(self.str_abbreviations, token_dll, "abbreviation")
        self._split_all_matches(self.nr_abbreviations, token_dll, "abbreviation")
        self._split_all_matches(self.single_token_abbreviation, token_dll, "abbreviation")
        self._split_all_matches(self.single_letter_abbreviation, token_dll, "abbreviation")
        self._split_all_matches(self.ps, token_dll, "abbreviation")

        for t in token_dll:
            if t.value.markup or t.value._locked:
                continue
            boundaries = []
            for m in self.abbreviation.finditer(t.value.text):
                instance = m.group(0)
                if split_multipart_abbrevs and self.multipart_abbreviation.fullmatch(instance):
                    start, end = m.span(0)
                    s = start
                    for i, c in enumerate(instance, start=1):
                        if c == ".":
                            boundaries.append((s, start + i, None))
                            s = start + i
                else:
                    boundaries.append((m.start(), m.end(), None))
            self._split_on_boundaries(t, boundaries, "abbreviation") 
Author: tsproisl | Project: SoMaJo | Lines: 28 | Source: tokenizer.py

Example 11: extract

# Required module: import regex [as alias]
# Or: from regex import finditer [as alias]
from typing import Iterable

import regex as re  # extratools aliases the regex module as re (assumed from this page's topic)

def extract(s: str, entities: Iterable[str], useregex=False, ignorecase=True) -> Iterable[str]:
    # Build one alternation over all entities; literal entities are escaped,
    # and their spaces are relaxed to match any whitespace run.
    pattern = r"\b(?:{})\b".format(r"|".join(
        e if useregex else re.escape(e).replace(' ', r"\s+") for e in entities
    ))
    for m in re.compile(pattern, re.I if ignorecase else 0).finditer(s):
        yield m.group(0)
Author: chuanconggao | Project: extratools | Lines: 10 | Source: strtools.py
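
A usage sketch for the extract() generator above; the import path is assumed from the source file name:

from extratools.strtools import extract

print(list(extract("I like New York and Boston.", ["new york", "boston"])))
# ['New York', 'Boston']  (case-insensitive by default, spaces match any whitespace run)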

Example 12: __findeqtagpairspans

# Required module: import regex [as alias]
# Or: from regex import finditer [as alias]
from typing import Iterable, Tuple

import regex as re

def __findeqtagpairspans(
        s: str,
        tag: str,
        useregex: bool = False
    ) -> Iterable[Tuple[Tuple[int, int], ...]]:
    for match in re.finditer(r"(?P<__open>{})(?P<__content>.*?)(?P<__close>\1)".format(tag if useregex else re.escape(tag)), s):
        yield (match.span("__open"), match.span("__content"), match.span("__close")) 
Author: chuanconggao | Project: extratools | Lines: 9 | Source: strtools.py

Example 13: __findtagpairspans

# Required module: import regex [as alias]
# Or: from regex import finditer [as alias]
# Note: relies on __findeqtagpairspans from Example 12 for the equal-tag case.
from typing import Iterable, Optional, Tuple

import regex as re

def __findtagpairspans(
        s: str,
        tag: str, closetag: Optional[str] = None,
        useregex: bool = False
    ) -> Iterable[Tuple[Tuple[int, int], ...]]:
    if closetag is None or tag == closetag:
        yield from __findeqtagpairspans(s, tag, useregex=useregex)
        return

    if not useregex:
        tag = re.escape(tag)
        closetag = re.escape(closetag)

    retags = re.compile(r"(?P<__open>{})|(?P<__close>{})".format(tag, closetag))

    startspans = []

    for match in retags.finditer(s):
        opengroup = match.group("__open")
        if opengroup:
            startspans.append(match.span())
            continue

        closegroup = match.group("__close")
        if closegroup and startspans:
            startspan = startspans.pop()
            endspan = match.span()

            yield (startspan, (startspan[1], endspan[0]), endspan) 
Author: chuanconggao | Project: extratools | Lines: 31 | Source: strtools.py
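
A self-contained sketch of the stack-based pairing idea in __findtagpairspans, re-implemented inline (the helper itself is name-mangled) with parentheses as the open and close tags:

import regex as re

retags = re.compile(r"(?P<__open>\()|(?P<__close>\))")
startspans = []
for match in retags.finditer("(a(b)c)"):
    if match.group("__open"):
        startspans.append(match.span())
    elif startspans:
        print(startspans.pop(), match.span())
# innermost pairs are emitted first: (2, 3) (4, 5), then (0, 1) (6, 7)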

Example 14: getFirstDateFromText

# Required module: import regex [as alias]
# Or: from regex import finditer [as alias]
# Note: patterns, num(), isDefinitelyNotDate() and datetime_from_dict() are
# defined elsewhere in date-extractor's __init__.py.
def getFirstDateFromText(text, debug=False, default_hour=0, default_minute=0, default_second=0, return_precision=False):
    #print("starting getFirstDateFromText")
    global patterns

    for match in regex.finditer(patterns['date_compiled'], text):
        #print("\nmatch is", match.group(0))
        #print("\nmatch.index is", ([item for item in match.groupdict().items() if item[1]]))
        if not isDefinitelyNotDate(match.group(0)):
            match = dict((k, num(v)) for k, v in match.groupdict().items() if num(v))
            return datetime_from_dict(match, debug, default_hour, default_minute, default_second, return_precision)
    #print "finishing getFirstDateFromText"

# the date of a webpage, like a blog or article, will often be the first date mentioned 
Author: DanielJDufour | Project: date-extractor | Lines: 15 | Source: __init__.py

Example 15: test_init

# Required module: import regex [as alias]
# Or: from regex import finditer [as alias]
# Note: RegexMatch comes from ctparse.types, as in Example 1.
def test_init(self):
        m = next(regex.finditer(r"(?P<R1>match me)", "xxx match me xxx"))
        r = RegexMatch(1, m)
        self.assertEqual(r.mstart, 4)
        self.assertEqual(r.mend, 12)
        self.assertEqual(len(r), 8)
        self.assertEqual(r._text, "match me")
        self.assertEqual(repr(r), "RegexMatch[4-12]{1:match me}")
        self.assertEqual(r.nb_str(), "RegexMatch[]{1:match me}") 
Author: comtravo | Project: ctparse | Lines: 11 | Source: test_types.py


Note: The regex.finditer examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are drawn from open-source projects contributed by their respective developers, and copyright remains with the original authors. Consult each project's license before redistributing or reusing the code; do not reproduce this compilation without permission.