This article collects typical code examples of the regex.finditer method in Python. If you have been wondering what exactly regex.finditer does, how to call it, or what idiomatic uses look like, the curated examples below should help. You can also explore the regex module further for the related methods it provides.
The following presents 15 code examples of the regex.finditer method, sorted by popularity by default.
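As a quick orientation before the examples, here is a minimal sketch of what regex.finditer yields. The regex package mirrors re.finditer while adding extensions such as Unicode property classes like \p{L}:

import regex

# finditer returns an iterator of match objects; each exposes
# group(), start(), end() and span(), just like re.finditer.
for m in regex.finditer(r"(?P<word>\p{L}+)", "hello world"):
    print(m.group("word"), m.span())
# hello (0, 5)
# world (6, 11)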
Example 1: test_partial_parse
# Required module: import regex [as alias]
# Or: from regex import finditer [as alias]
# This test also relies on datetime, pytest, and the ctparse library's
# PartialParse, RegexMatch and Time, imported elsewhere in the test module.
def test_partial_parse() -> None:
    match_a = regex.match("(?<R1>a)", "ab")
    match_b = next(regex.finditer("(?<R2>b)", "ab"))
    pp = PartialParse.from_regex_matches(
        (RegexMatch(1, match_a), RegexMatch(2, match_b))
    )
    assert len(pp.prod) == 2
    assert len(pp.rules) == 2
    assert isinstance(pp.score, float)

    def mock_rule(ts: datetime.datetime, a: Time) -> Time:
        return Time()

    pp2 = pp.apply_rule(
        datetime.datetime(day=1, month=1, year=2015), mock_rule, "mock_rule", (0, 1)
    )
    assert pp != pp2

    with pytest.raises(ValueError):
        PartialParse((), ())
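Note the (?<R1>a) syntax above: the regex package accepts this shorter named-group form in addition to the standard (?P<R1>a). A tiny standalone check:

import regex

# (?<name>...) is regex-module shorthand for (?P<name>...)
m = next(regex.finditer("(?<R2>b)", "ab"))
print(m.group("R2"), m.span())  # b (1, 2)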
Example 2: map_to_dogol_prime
# Required module: import regex [as alias]
# Or: from regex import finditer [as alias]
def map_to_dogol_prime(self, s):
    """Map a string to Dogolpolsky' classes

    Args:
        s (unicode): IPA word

    Returns:
        (unicode): word with all segments collapsed to D' classes
    """
    segs = []
    # self.fm.seg_regex matches one IPA segment at a time
    for seg in self.fm.seg_regex.finditer(s):
        fts = self.fm.fts(seg.group(0))
        # Assign the first Dogolpolsky' class whose feature mask the segment satisfies
        for mask, label in self.dogol_prime:
            if fts >= mask:
                segs.append(label)
                break
    return ''.join(segs)
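The segment pattern itself belongs to panphon's FeatureTable and is not shown here. As a rough illustration of the finditer-over-segments idea, here is a hypothetical, much-simplified stand-in for seg_regex:

import regex

# Toy segment pattern (assumption, not panphon's real one):
# a base letter plus any combining marks counts as one segment.
toy_seg_regex = regex.compile(r"\p{L}\p{M}*")
# 'a' plus a combining tilde is treated as a single segment
print([m.group(0) for m in toy_seg_regex.finditer("na\u0303ta")])
# ['n', 'ã', 't', 'a']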
Example 3: compile_regex_from_str
# Required module: import regex [as alias]
# Or: from regex import finditer [as alias]
def compile_regex_from_str(self, ft_str):
    """Given a string describing feature masks for a sequence of segments,
    return a regex matching the corresponding strings.

    Args:
        ft_str (str): feature masks, each enclosed in square brackets, in
            which the features are delimited by any standard delimiter.

    Returns:
        Pattern: regular expression pattern equivalent to `ft_str`
    """
    sequence = []
    # Each [...] group in ft_str describes one segment's feature mask
    for m in re.finditer(r'\[([^]]+)\]', ft_str):
        ft_mask = fts(m.group(1))  # fts parses a feature mask (defined elsewhere)
        segs = self.all_segs_matching_fts(ft_mask)
        # All segments matching the mask become one alternation group
        sub_pat = '({})'.format('|'.join(segs))
        sequence.append(sub_pat)
    pattern = ''.join(sequence)
    regex = re.compile(pattern)
    return regex
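The bracket-parsing loop is the part that uses finditer; run in isolation it behaves like this (fts and all_segs_matching_fts come from panphon and are not reproduced here):

import regex as re

ft_str = "[+syl -son][+cons]"
# Each bracketed group becomes one feature-mask string
print([m.group(1) for m in re.finditer(r"\[([^]]+)\]", ft_str)])
# ['+syl -son', '+cons']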
Example 4: neutralize_with_all_envs
# Required module: import regex [as alias]
# Or: from regex import finditer [as alias]
def neutralize_with_all_envs(trans, env_filters):
    # overlapped=True requires the regex module (re is regex aliased here);
    # the standard library's re does not support overlapping matches.
    string = ''.join(trans.with_word_boundaries())
    length = len(string)
    for env_filter in env_filters:
        pattern = env_filter.generate_regular_expression()
        for match in re.finditer(pattern, string, overlapped=True):
            mid_index = match.start('MID')
            # Replace the character at the MID position with a dash
            temp = ''
            for i in range(length):
                if i == mid_index:
                    s = '-'
                else:
                    s = string[i]
                temp += s
            string = temp
    return string

# This function is weirdly named. It should really be something like
# average_minpair_fl.
# It has also been changed so as to have two "relativizer" options:
# one to words containing the relevant segments and one to all
# words in the corpus (though it basically does the calculation
# by calling the above minpair_fl() function).
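The overlapped=True keyword in Example 4's matching loop is the key regex-module feature here; stdlib re has no equivalent. A minimal demonstration:

import regex

# Overlapping matches may share characters; stdlib re would return only (0, 3)
print([m.span() for m in regex.finditer(r"aba", "ababa", overlapped=True)])
# [(0, 3), (2, 5)]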
Example 5: get_arxiv_meta_archive
# Required module: import regex [as alias]
# Or: from regex import finditer [as alias]
def get_arxiv_meta_archive(aid):
    title = ''
    authors = []
    jref = ''
    txt = ''
    with tarfile.open("./kddcup2003/hep-th-abs.tar.gz", "r:gz") as t:
        for m in t.getmembers():
            if m.name.find(aid) != -1:
                # read() returns bytes; decode (encoding assumed to be UTF-8)
                # so the str patterns below can match
                txt = t.extractfile(m).read().decode("utf-8", errors="replace")
                break
    # regex.S makes '.' match newlines; the lookaheads delimit each field
    for m in regex.finditer(r'Title:\s+(.*)(?=Author)', txt, regex.S):
        title = clean_line(m.group(1))
        break
    for m in regex.finditer(r'Authors?:\s+(.*)(?=Comment)', txt, regex.S):
        a = clean_line(m.group(1))
        # Split author lists on commas and/or "and"
        authors = regex.split(r'(?:,\s*(?:and\s+)?|\s+and\s+)', a)
        break
    for m in regex.finditer(r'Journal-ref:\s+(.*?)(?=\\\\)', txt, regex.S):
        jref = clean_line(m.group(1))
        break
    return title, authors, jref
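The field extraction relies on regex.S (DOTALL) so that '.' can cross the line breaks inside a wrapped field, with a lookahead stopping the capture where the next field begins. A toy abstract (clean_line, which the real code uses to normalize whitespace, is not shown):

import regex

txt = "Title: A Toy\n  Paper\nAuthors: X\nComments: 5 pages\n"
# With regex.S the title may span several lines; the lookahead
# stops the capture just before the "Authors" field.
m = next(regex.finditer(r"Title:\s+(.*)(?=Author)", txt, regex.S))
print(" ".join(m.group(1).split()))  # A Toy Paper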
Example 6: _split_matches
# Required module: import regex [as alias]
# Or: from regex import finditer [as alias]
def _split_matches(self, regex, node, token_class="regular", repl=None, split_named_subgroups=True, delete_whitespace=False):
    boundaries = []
    # Split on named subgroups only if the pattern actually defines some
    split_groups = split_named_subgroups and len(regex.groupindex) > 0
    group_numbers = sorted(regex.groupindex.values())
    for m in regex.finditer(node.value.text):
        if split_groups:
            for g in group_numbers:
                # A span of (-1, -1) means the group did not participate
                if m.span(g) != (-1, -1):
                    boundaries.append((m.start(g), m.end(g), None))
        else:
            if repl is None:
                boundaries.append((m.start(), m.end(), None))
            else:
                # Store the expanded replacement template alongside the span
                boundaries.append((m.start(), m.end(), m.expand(repl)))
    self._split_on_boundaries(node, boundaries, token_class, delete_whitespace=delete_whitespace)
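The (-1, -1) check matters because in an alternation only one branch participates in any given match:

import regex

pat = regex.compile(r"(?P<num>\d+)|(?P<word>[a-z]+)")
for m in pat.finditer("abc 42"):
    # The non-participating group reports a span of (-1, -1)
    print({name: m.span(name) for name in pat.groupindex})
# {'num': (-1, -1), 'word': (0, 3)}
# {'num': (4, 6), 'word': (-1, -1)}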
Example 7: _split_emojis
# Required module: import regex [as alias]
# Or: from regex import finditer [as alias]
def _split_emojis(self, node, token_class="emoticon"):
    boundaries = []
    # \X matches one extended grapheme cluster (a regex-module extension)
    for m in re.finditer(r"\X", node.value.text):
        if m.end() - m.start() > 1:
            # Multi-codepoint clusters: also accept the variation selector U+FE0F
            if re.search(r"[\p{Extended_Pictographic}\p{Emoji_Presentation}\uFE0F]", m.group()):
                boundaries.append((m.start(), m.end(), None))
        else:
            if re.search(r"[\p{Extended_Pictographic}\p{Emoji_Presentation}]", m.group()):
                boundaries.append((m.start(), m.end(), None))
    self._split_on_boundaries(node, boundaries, token_class)
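\X is another regex-only feature: it matches one extended grapheme cluster, so a base character plus its combining marks arrives as a single match. A quick illustration:

import regex

# "e" followed by a combining acute accent is one grapheme, two code points
for m in regex.finditer(r"\X", "e\u0301x"):
    print(repr(m.group()), m.end() - m.start())
# 'é' 2
# 'x' 1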
Example 8: _split_set
# Required module: import regex [as alias]
# Or: from regex import finditer [as alias]
def _split_set(self, regex, node, items, token_class="regular", ignore_case=False):
    boundaries = []
    for m in regex.finditer(node.value.text):
        instance = m.group(0)
        if ignore_case:
            instance = instance.lower()
        # Only split on matches that are members of the given set
        if instance in items:
            boundaries.append((m.start(), m.end(), None))
    self._split_on_boundaries(node, boundaries, token_class)
Example 9: _split_left
# Required module: import regex [as alias]
# Or: from regex import finditer [as alias]
def _split_left(self, regex, node):
    boundaries = []
    prev_end = 0
    # Collect the spans to the left of each match
    for m in regex.finditer(node.value.text):
        boundaries.append((prev_end, m.start(), None))
        prev_end = m.start()
    self._split_on_boundaries(node, boundaries, token_class=None, lock_match=False)
Example 10: _split_abbreviations
# Required module: import regex [as alias]
# Or: from regex import finditer [as alias]
def _split_abbreviations(self, token_dll, split_multipart_abbrevs=True):
    """Turn instances of abbreviations into tokens."""
    self._split_all_matches(self.single_letter_ellipsis, token_dll, "abbreviation")
    self._split_all_matches(self.and_cetera, token_dll, "abbreviation")
    self._split_all_matches(self.str_abbreviations, token_dll, "abbreviation")
    self._split_all_matches(self.nr_abbreviations, token_dll, "abbreviation")
    self._split_all_matches(self.single_token_abbreviation, token_dll, "abbreviation")
    self._split_all_matches(self.single_letter_abbreviation, token_dll, "abbreviation")
    self._split_all_matches(self.ps, token_dll, "abbreviation")
    for t in token_dll:
        if t.value.markup or t.value._locked:
            continue
        boundaries = []
        for m in self.abbreviation.finditer(t.value.text):
            instance = m.group(0)
            if split_multipart_abbrevs and self.multipart_abbreviation.fullmatch(instance):
                # Split a multipart abbreviation like "z.B." at each period
                start, end = m.span(0)
                s = start
                for i, c in enumerate(instance, start=1):
                    if c == ".":
                        boundaries.append((s, start + i, None))
                        s = start + i
            else:
                boundaries.append((m.start(), m.end(), None))
        self._split_on_boundaries(t, boundaries, "abbreviation")
Example 11: extract
# Required module: import regex [as alias]
# Or: from regex import finditer [as alias]
def extract(s: str, entities: Iterable[str], useregex=False, ignorecase=True) -> Iterable[str]:
    # re.escape turns a space into backslash-space, so replacing the
    # space with "s+" yields r"\s+": multi-word entities then match
    # across any run of whitespace.
    for m in re.compile(
            r"\b(?:{})\b".format(r"|".join(
                e if useregex else re.escape(e).replace(' ', r"s+") for e in entities
            )),
            re.I if ignorecase else 0
    ).finditer(s):
        yield m.group(0)
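A usage sketch (assuming re is the regex module here, as the import comments indicate, though stdlib re would behave the same for this call):

# Multi-word entities match across any whitespace, case-insensitively
print(list(extract("Visit New\tYork or paris.", ["New York", "Paris"])))
# ['New\tYork', 'paris']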
Example 12: __findeqtagpairspans
# Required module: import regex [as alias]
# Or: from regex import finditer [as alias]
def __findeqtagpairspans(
        s: str,
        tag: str,
        useregex: bool = False
) -> Iterable[Tuple[Tuple[int, int], ...]]:
    # \1 backreferences the opening tag, so open and close must be identical
    for match in re.finditer(r"(?P<__open>{})(?P<__content>.*?)(?P<__close>\1)".format(tag if useregex else re.escape(tag)), s):
        yield (match.span("__open"), match.span("__content"), match.span("__close"))
Example 13: __findtagpairspans
# Required module: import regex [as alias]
# Or: from regex import finditer [as alias]
def __findtagpairspans(
        s: str,
        tag: str, closetag: Optional[str] = None,
        useregex: bool = False
) -> Iterable[Tuple[Tuple[int, int], ...]]:
    if closetag is None or tag == closetag:
        # Identical open and close tags are handled by the helper above
        yield from __findeqtagpairspans(s, tag, useregex=useregex)
        return
    if not useregex:
        tag = re.escape(tag)
        closetag = re.escape(closetag)
    retags = re.compile(r"(?P<__open>{})|(?P<__close>{})".format(tag, closetag))
    # Keep a stack of opening-tag spans to pair up nested tags
    startspans = []
    for match in retags.finditer(s):
        opengroup = match.group("__open")
        if opengroup:
            startspans.append(match.span())
            continue
        closegroup = match.group("__close")
        if closegroup and startspans:
            startspan = startspans.pop()
            endspan = match.span()
            yield (startspan, (startspan[1], endspan[0]), endspan)
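Because opening spans are kept on a stack, nested pairs are emitted innermost first:

# Innermost pair first: (c) is yielded before (b(c)d)
print(list(__findtagpairspans("a(b(c)d)e", "(", ")")))
# [((3, 4), (4, 5), (5, 6)), ((1, 2), (2, 7), (7, 8))]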
Example 14: getFirstDateFromText
# Required module: import regex [as alias]
# Or: from regex import finditer [as alias]
def getFirstDateFromText(text, debug=False, default_hour=0, default_minute=0, default_second=0, return_precision=False):
    global patterns
    for match in regex.finditer(patterns['date_compiled'], text):
        if not isDefinitelyNotDate(match.group(0)):
            # Keep only the named groups whose values parse as numbers
            # (num, isDefinitelyNotDate and datetime_from_dict are defined elsewhere)
            match = dict((k, num(v)) for k, v in match.groupdict().items() if num(v))
            return datetime_from_dict(match, debug, default_hour, default_minute, default_second, return_precision)

# The date of a webpage, like a blog or article, will often be the first date mentioned.
Example 15: test_init
# Required module: import regex [as alias]
# Or: from regex import finditer [as alias]
def test_init(self):
    # Runs inside a unittest.TestCase; RegexMatch is the class under test
    m = next(regex.finditer(r"(?P<R1>match me)", "xxx match me xxx"))
    r = RegexMatch(1, m)
    # mstart/mend are the match's offsets within the original string
    self.assertEqual(r.mstart, 4)
    self.assertEqual(r.mend, 12)
    self.assertEqual(len(r), 8)
    self.assertEqual(r._text, "match me")
    self.assertEqual(repr(r), "RegexMatch[4-12]{1:match me}")
    self.assertEqual(r.nb_str(), "RegexMatch[]{1:match me}")