This article collects typical usage examples of the regex.findall method in Python. If you have been wondering what exactly regex.findall does, how to call it, and what it looks like in practice, the curated examples below should help. You can also explore the regex module that provides this method.
The following presents 15 code examples of regex.findall, sorted by popularity by default.
Example 1: _load_search_pattern
# Required module: import regex [as an alias]
# Or: from regex import findall [as an alias]
def _load_search_pattern(self):
    self.type_mapper = {}
    py_regex_pattern = self.pattern
    while True:
        # Find all types specified in the grok patterns
        m = re.findall(r'%{(\w+):(\w+):(\w+)}', py_regex_pattern)
        for n in m:
            self.type_mapper[n[1]] = n[2]
        # Replace %{pattern_name:custom_name} (or %{pattern_name:custom_name:type})
        # with the corresponding regex and a named group
        py_regex_pattern = re.sub(
            r'%{(\w+):(\w+)(?::\w+)?}',
            lambda m: "(?P<" + m.group(2) + ">" + self.predefined_patterns[m.group(1)].regex_str + ")",
            py_regex_pattern)
        # Replace %{pattern_name} with the corresponding regex
        py_regex_pattern = re.sub(
            r'%{(\w+)}',
            lambda m: "(" + self.predefined_patterns[m.group(1)].regex_str + ")",
            py_regex_pattern)
        if re.search(r'%{\w+(:\w+)?}', py_regex_pattern) is None:
            break
    self.regex_obj = re.compile(py_regex_pattern)
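To see what one expansion round does, here is a minimal self-contained sketch; the predefined dictionary and its INT pattern are hypothetical stand-ins for self.predefined_patterns:
>>> import regex as re
>>> predefined = {'INT': r'[+-]?\d+'}  # hypothetical predefined pattern table
>>> re.sub(r'%{(\w+):(\w+)(?::\w+)?}',
...        lambda m: "(?P<" + m.group(2) + ">" + predefined[m.group(1)] + ")",
...        'value=%{INT:num:int}')
'value=(?P<num>[+-]?\\d+)'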
Example 2: _test_display_output
# Required module: import regex [as an alias]
# Or: from regex import findall [as an alias]
def _test_display_output(self, image_mode):
    """
    Test display data output with the given image_mode.
    """
    with testing_utils.tempdir() as tmpdir:
        data_path = tmpdir
        os.makedirs(os.path.join(data_path, 'ImageTeacher'))
        opt = {
            'task': 'integration_tests:ImageTeacher',
            'datapath': data_path,
            'image_mode': image_mode,
            'display_verbose': True,
        }
        output = testing_utils.display_data(opt)
        train_labels = re.findall(r"\[labels\].*\n", output[0])
        valid_labels = re.findall(r"\[eval_labels\].*\n", output[1])
        test_labels = re.findall(r"\[eval_labels\].*\n", output[2])
        for i, lbls in enumerate([train_labels, valid_labels, test_labels]):
            self.assertGreater(len(lbls), 0, 'DisplayData failed')
            self.assertEqual(len(lbls), len(set(lbls)), output[i])
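The label extraction boils down to matching whole "[labels]" lines; a minimal sketch with assumed display output:
>>> import regex as re
>>> out = "[labels]: hi\n[text]: hello\n[labels]: bye\n"
>>> re.findall(r"\[labels\].*\n", out)
['[labels]: hi\n', '[labels]: bye\n']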
Example 3: ascii_emoticons
# Required module: import regex [as an alias]
# Or: from regex import findall [as an alias]
def ascii_emoticons(index, question, answer):
    global valid_emoticon
    valid_emoticon = False
    # Disabled
    if score_settings['ascii_emoticon_modifier_value'] is None:
        return 0
    # Split into words (tokens)
    tokens = answer.split()
    # Mark every token whose ratio of non-alphanumeric characters exceeds the threshold
    score = [1 if len(token) > 1 and len(re.findall('[^a-zA-Z0-9]', token)) / len(token) >
                  score_settings['ascii_emoticon_non_char_to_all_chars_ratio'] else 0
             for token in tokens]
    # Count marked tokens not directly preceded by another marked token, then apply
    # the modifier ('i' is used here to avoid shadowing the 'index' parameter)
    score = sum([1 if (i > 0 and score[i - 1] == 0 and value == 1) or (i == 0 and value == 1) else 0
                 for i, value in enumerate(score)]) * score_settings['ascii_emoticon_modifier_value']
    if score:
        valid_emoticon = True
    return score
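The per-token test counts non-alphanumeric characters with findall; for a typical emoticon the ratio is high:
>>> import regex as re
>>> token = ':-)'
>>> len(re.findall('[^a-zA-Z0-9]', token)) / len(token)
1.0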
Example 4: tokenize_field
# Required module: import regex [as an alias]
# Or: from regex import findall [as an alias]
def tokenize_field(value):
    """
    Extract normalized tokens from a field.
    Args:
        value (str): The field value.
    Returns:
        list: The cleaned tokens.
    """
    # Extract tokens: runs of two or more Unicode letters.
    tokens = regex.findall(r'\p{L}{2,}', value.lower())
    # Remove articles and 'and'.
    tokens = [t for t in tokens if t not in [
        'a', 'an', 'the', 'and',
    ]]
    return tokens
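A usage sketch; the expected output follows from the pattern and stop list above:
>>> tokenize_field('The Quick Brown Fox and the Lazy Dog')
['quick', 'brown', 'fox', 'lazy', 'dog']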
Example 5: unicode_chrs_by_lang
# Required module: import regex [as an alias]
# Or: from regex import findall [as an alias]
def unicode_chrs_by_lang(self, lang: str):
    """Detect language-specific characters.
    Detect characters from various Unicode script code points. Examples
    of scripts are Common, Arabic, Armenian, Bengali, Bopomofo, Braille,
    Buhid, Canadian_Aboriginal, Cherokee, Cyrillic, Devanagari, Ethiopic,
    Georgian, Greek, Gujarati, Gurmukhi, Han, Hangul, Hanunoo, Hebrew,
    Hiragana, Inherited, Kannada, Katakana, Khmer, Lao, Latin, Limbu,
    Malayalam, Mongolian, Myanmar, Ogham, Oriya, Runic, Sinhala, Syriac,
    Tagalog, Tagbanwa, TaiLe, Tamil, Telugu, Thaana, Thai, Tibetan, and Yi,
    but other script names should work as well.
    Args:
        lang (str): Required. A string value identifying the language.
    Returns:
        Chepy: The Chepy object.
    """
    self.state = re.findall(r"\p{" + lang + "}", self._convert_to_str())
    return self
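The \p{...} script property is supported by the regex module (the stdlib re is not); a minimal sketch:
>>> import regex as re
>>> re.findall(r"\p{Greek}", "alpha α beta β")
['α', 'β']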
Example 6: count_occurances
# Required module: import regex [as an alias]
# Or: from regex import findall [as an alias]
def count_occurances(self, regex: str, case_sensitive: bool = False):
    """Count occurrences of the regex.
    Counts the number of times the provided pattern occurs.
    Args:
        regex (str): Required. Regex string to search for.
        case_sensitive (bool, optional): Whether the search should be
            case sensitive. Defaults to False.
    Returns:
        Chepy: The Chepy object.
    Examples:
        >>> Chepy("AABCDADJAKDJHKSDAJSDdaskjdhaskdjhasdkja").count_occurances("ja").output
        2
    """
    if case_sensitive:
        r = re.compile(regex)
    else:
        r = re.compile(regex, re.IGNORECASE)
    self.state = len(r.findall(self._convert_to_str()))
    return self
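Under the hood this is just findall on a compiled pattern; a standalone sketch:
>>> import regex as re
>>> len(re.compile("ja", re.IGNORECASE).findall("Jam jar"))
2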
Example 7: extract_strings
# Required module: import regex [as an alias]
# Or: from regex import findall [as an alias]
def extract_strings(self, length: int = 4):
    """Extract strings from the state.
    Args:
        length (int, optional): Minimum length of a string. Defaults to 4.
    Returns:
        Chepy: The Chepy object.
    Examples:
        >>> Chepy("tests/files/hello").load_file().extract_strings().o
        [
            b'__PAGEZERO',
            b'__TEXT',
            b'__text',
            b'__TEXT',
            b'__stubs',
            b'__TEXT',
            ...
        ]
    """
    # Match runs of at least `length` printable ASCII characters
    pattern = b"[^\x00-\x1F\x7F-\xFF]{" + str(length).encode() + b",}"
    self.state = re.findall(pattern, self._convert_to_bytes())
    return self
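The character class excludes control bytes and high bytes, so findall returns printable runs; a minimal sketch:
>>> import regex as re
>>> re.findall(b"[^\x00-\x1F\x7F-\xFF]{4,}", b"\x00\x01hello\xffworld!\x02hi")
[b'hello', b'world!']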
Example 8: search_ctf_flags
# Required module: import regex [as an alias]
# Or: from regex import findall [as an alias]
def search_ctf_flags(self, prefix: str, postfix: str = r".+?\{*\}"):
    """Search for CTF-style flags.
    By default this assumes that the flag format is similar
    to something like picoCTF{some_flag}.
    Args:
        prefix (str): Prefix of the flag, like `picoCTF`.
        postfix (str, optional): Regex for the remainder of the flag.
            Defaults to r".+?\{*\}".
    Returns:
        Chepy: The Chepy object.
    Examples:
        >>> Chepy("tests/files/flags").read_file().search_ctf_flags("pico").get_by_index(0)
        picoCTF{r3source_pag3_f1ag}
    """
    self.state = re.findall(prefix + postfix, self._convert_to_str(), re.IGNORECASE)
    return self
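The prefix and postfix are simply concatenated into one pattern; a standalone sketch:
>>> import regex as re
>>> re.findall("pico" + r".+?\{*\}", "see picoCTF{r3source_pag3_f1ag} here", re.IGNORECASE)
['picoCTF{r3source_pag3_f1ag}']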
Example 9: get_nested_select
# Required module: import regex [as an alias]
# Or: from regex import findall [as an alias]
def get_nested_select(self):
    """Get tuple groups in a query like::
        Select {
            ([Time].[Time].[Day].[2010].[Q2 2010].[May 2010].[May 19,2010],
             [Geography].[Geography].[Continent].[Europe],
             [Measures].[Amount]),
            ([Time].[Time].[Day].[2010].[Q2 2010].[May 2010].[May 17,2010],
             [Geography].[Geography].[Continent].[Europe],
             [Measures].[Amount])
        }
    out::
        ['[Time].[Time].[Day].[2010].[Q2 2010].[May 2010].[May 19,2010],\
          [Geography].[Geography].[Continent].[Europe],[Measures].[Amount]',
         '[Time].[Time].[Day].[2010].[Q2 2010].[May 2010].[May 17,2010],\
          [Geography].[Geography].[Continent].[Europe],[Measures].[Amount]']
    :return: All groups as a list of strings.
    """
    return regex.findall(r"\(([^()]+)\)", self.mdx_query)
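The pattern captures the contents of each innermost pair of parentheses; a minimal sketch:
>>> import regex as re
>>> re.findall(r"\(([^()]+)\)", "Select { ([A].[B], [C]), ([D], [E]) }")
['[A].[B], [C]', '[D], [E]']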
Example 10: compile_regex_from_str
# Required module: import regex [as an alias]
# Or: from regex import findall [as an alias]
def compile_regex_from_str(self, pat):
    """Given a string describing feature masks for a sequence of segments,
    return a compiled regex matching the corresponding strings.
    Args:
        pat (str): feature masks, each enclosed in square brackets, in
            which the features are delimited by any standard delimiter.
    Returns:
        Pattern: regular expression pattern equivalent to `pat`
    """
    s2n = {'-': -1, '0': 0, '+': 1}
    seg_res = []
    # Each bracketed group is one feature mask, e.g. [+voi -nas]
    for mat in re.findall(r'\[[^]]+\]+', pat):
        ft_mask = {k: s2n[v] for (v, k) in re.findall(r'([+-])(\w+)', mat)}
        segs = self.all_segs_matching_fts(ft_mask)
        seg_res.append('({})'.format('|'.join(segs)))
    regexp = ''.join(seg_res)
    return re.compile(regexp)
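The inner findall pulls (sign, feature) pairs out of each mask; a standalone sketch with illustrative feature names:
>>> import regex as re
>>> pairs = re.findall(r'([+-])(\w+)', '[+voi -nas]')
>>> pairs
[('+', 'voi'), ('-', 'nas')]
>>> {k: {'-': -1, '0': 0, '+': 1}[v] for (v, k) in pairs}
{'voi': 1, 'nas': -1}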
Example 11: tokenize
# Required module: import regex [as an alias]
# Or: from regex import findall [as an alias]
def tokenize(self, text):
    bpe_tokens = []
    list_starts, str_starts = [], []
    basic_tokens = text if isinstance(text, list) else [text]
    for i, basic_token in enumerate(basic_tokens):
        num_subtokens = 0
        # Prefix a space to every list item after the first, so BPE treats it
        # as word-initial
        basic_token = basic_token if (i == 0 or not isinstance(text, list)) else (
            ' ' + basic_token)
        for token in re.findall(self.bpe.pat, basic_token):
            token = ''.join(self.bpe.byte_encoder[b] for b in token.encode('utf-8'))
            sub_tokens = [bpe_token for bpe_token in self.bpe.bpe(token).split(' ')]
            bpe_tokens.extend(sub_tokens)
            # Mark which subtokens begin a new word
            str_starts += [True] + [False] * (len(sub_tokens) - 1)
            num_subtokens += len(sub_tokens)
        list_starts += [True] + [False] * (num_subtokens - 1)
    word_starts = list_starts if isinstance(text, list) else str_starts
    assert len(bpe_tokens) == len(word_starts)
    return bpe_tokens, word_starts
Example 12: encode
# Required module: import regex [as an alias]
# Or: from regex import findall [as an alias]
def encode(self, text):
    bpe_tokens = []
    bpe_token_original = []
    for token in re.findall(self.pat, text):
        token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
        # Handle out-of-vocabulary subtokens by mapping them to the "empty" id
        for bpe_token in self.bpe(token).split(' '):
            if bpe_token in self.encoder:
                bpe_tokens.append(self.encoder[bpe_token])
            else:
                bpe_tokens.append(self.encoder["empty"])
        bpe_token_original.extend(bpe_token for bpe_token in self.bpe(token).split(' '))
    return bpe_tokens, bpe_token_original
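In GPT-2-style tokenizers, self.pat is typically the byte-level BPE pre-tokenization pattern sketched below (an assumption about this codebase, not confirmed by the excerpt):
>>> import regex as re
>>> pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
>>> re.findall(pat, "Hello world!")
['Hello', ' world', '!']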
Example 13: encode
# Required module: import regex [as an alias]
# Or: from regex import findall [as an alias]
def encode(self, text):
    indices = []
    for token in re.findall(self.token_pattern, text):
        token = bytearray(token.encode('utf-8'))
        chars = ''.join(self.byte_encoder[code] for code in token)
        indices += [self.token_dict[token] for token in self.get_bpe(chars)]
    return indices
Example 14: encode
# Required module: import regex [as an alias]
# Or: from regex import findall [as an alias]
def encode(self, text):
    bpe_tokens = []
    for token in re.findall(self.pat, text):
        token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
        bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))
    return bpe_tokens
Example 15: encode
# Required module: import regex [as an alias]
# Or: from regex import findall [as an alias]
def encode(self, text):
    bpe_tokens = []
    for token in re.findall(self.pat, text):
        token = "".join(self.byte_encoder[b] for b in token.encode("utf-8"))
        bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(" "))
    return bpe_tokens