This article collects typical usage examples of Python's regex.compile method. If you have been wondering exactly what regex.compile does, how to call it, or what real code that uses it looks like, the hand-picked examples below may help. You can also explore further usage examples for the regex module that the method belongs to.
The following shows 15 code examples of the regex.compile method, sorted by popularity by default.
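Before the project-specific examples, here is a minimal, self-contained sketch (not drawn from any of the projects below) of the basic workflow they all share: compile a pattern once with regex.compile, reuse the compiled object, and read named captures back with groupdict().

import regex

# Compile once, then reuse the compiled pattern object for many searches.
parser_re = regex.compile(r'(?P<key>\w+)=(?P<value>\w+)')

match = parser_re.search('retries=3')
if match:
    print(match.groupdict())  # {'key': 'retries', 'value': '3'}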
Example 1: umi_histogram
# Required import: import regex [as an alias]
# Or: from regex import compile [as an alias]
def umi_histogram(fastq):
    ''' Counts the number of reads for each UMI
    Expects formatted fastq files.
    '''
    annotations = detect_fastq_annotations(fastq)
    re_string = construct_transformed_regex(annotations)
    parser_re = re.compile(re_string)
    counter = collections.Counter()
    for read in read_fastq(fastq):
        match = parser_re.search(read).groupdict()
        counter[match['MB']] += 1
    for bc, count in counter.most_common():
        sys.stdout.write('{}\t{}\n'.format(bc, count))
Example 2: exact_barcode_filter
# Required import: import regex [as an alias]
# Or: from regex import compile [as an alias]
def exact_barcode_filter(chunk, bc1, bc2, bc3, re_string=None):
    if not re_string:
        re_string = '(.*):CELL_(?P<CB>.*):UMI_(.*)\\n(.*)\\n\\+\\n(.*)\\n'
    parser_re = re.compile(re_string)
    kept = []
    for read in chunk:
        match = parser_re.search(read).groupdict()
        cb1 = match['CB']
        if bc3:
            cb1, cb2, cb3 = cb1.split("-")
        elif bc2:
            cb1, cb2 = cb1.split("-")
        if cb1 not in bc1:
            continue
        if bc2 and cb2 not in bc2:
            continue
        if bc3 and cb3 not in bc3:
            continue
        kept.append(read)
    return kept
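A hypothetical call could look like the following; the read string, the barcode whitelists, and the use of the default re_string are all made up here for illustration:

# Hypothetical usage: one synthetic read with a two-part CELL barcode (AAACCC-TTTGGG).
read = "@r1:CELL_AAACCC-TTTGGG:UMI_CATCAT\nACGTACGT\n+\nFFFFFFFF\n"
kept = exact_barcode_filter([read], bc1={"AAACCC"}, bc2={"TTTGGG"}, bc3=None)
# kept == [read]: both barcode halves are in their whitelists, so the read is retained.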
Example 3: __init__
# Required import: import regex [as an alias]
# Or: from regex import compile [as an alias]
def __init__(self, **kwargs):
    """
    Args:
        annotators: None or empty set (only tokenizes).
        substitutions: if true, normalizes some token types (e.g. quotes).
    """
    self._regexp = regex.compile(
        '(?P<digit>%s)|(?P<title>%s)|(?P<abbr>%s)|(?P<neg>%s)|(?P<hyph>%s)|'
        '(?P<contr1>%s)|(?P<alphanum>%s)|(?P<contr2>%s)|(?P<sdquote>%s)|'
        '(?P<edquote>%s)|(?P<ssquote>%s)|(?P<esquote>%s)|(?P<dash>%s)|'
        '(?P<ellipses>%s)|(?P<punct>%s)|(?P<nonws>%s)' %
        (self.DIGIT, self.TITLE, self.ABBRV, self.NEGATION, self.HYPHEN,
         self.CONTRACTION1, self.ALPHA_NUM, self.CONTRACTION2,
         self.START_DQUOTE, self.END_DQUOTE, self.START_SQUOTE,
         self.END_SQUOTE, self.DASH, self.ELLIPSES, self.PUNCT,
         self.NON_WS),
        flags=regex.IGNORECASE + regex.UNICODE + regex.MULTILINE
    )
    if len(kwargs.get('annotators', {})) > 0:
        logger.warning('%s only tokenizes! Skipping annotators: %s' %
                       (type(self).__name__, kwargs.get('annotators')))
    self.annotators = set()
    self.substitutions = kwargs.get('substitutions', True)
Example 4: __init__
# Required import: import regex [as an alias]
# Or: from regex import compile [as an alias]
def __init__(self, vocab_file, merges_file, errors='replace', special_tokens=None, max_len=None):
    self.max_len = max_len if max_len is not None else int(1e12)
    self.encoder = json.load(open(vocab_file))
    self.decoder = {v: k for k, v in self.encoder.items()}
    self.errors = errors  # how to handle errors in decoding
    self.byte_encoder = bytes_to_unicode()
    self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
    bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
    bpe_merges = [tuple(merge.split()) for merge in bpe_data]
    self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
    self.cache = {}
    # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
    self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
    self.special_tokens = {}
    self.special_tokens_decoder = {}
    self.set_special_tokens(special_tokens)
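The pre-tokenization pattern above depends on the Unicode property classes \p{L} and \p{N}, which is the main reason the third-party regex module is used here instead of the standard library re. A small illustrative check of the pattern on its own (the input sentence is invented):

import regex as re

pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
print(pat.findall("I'll pay 20 euros, naïvely."))
# ['I', "'ll", ' pay', ' 20', ' euros', ',', ' naïvely', '.']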
Example 5: get_versions
# Required import: import regex [as an alias]
# Or: from regex import compile [as an alias]
async def get_versions(self, req, orig_version):
    exclude = set(self.exclude)
    vals = {key: val
            for key, val in self.vals.items()
            if key not in exclude}
    link_pattern = replace_named_capture_group(self.link_pattern_compiled, vals)
    link_re = re.compile(link_pattern)
    result = []
    for url in self.releases_urls:
        parser = HrefParser(link_re)
        parser.feed(await req.get_text_from_url(url))
        for match in parser.get_matches():
            match["link"] = urljoin(url, match["href"])
            match["releases_url"] = url
            match["vals"] = vals
            result.append(match)
    return result
Example 6: _load_search_pattern
# Required import: import regex [as an alias]
# Or: from regex import compile [as an alias]
def _load_search_pattern(self):
    self.type_mapper = {}
    py_regex_pattern = self.pattern
    while True:
        # Finding all types specified in the groks
        m = re.findall(r'%{(\w+):(\w+):(\w+)}', py_regex_pattern)
        for n in m:
            self.type_mapper[n[1]] = n[2]
        # replace %{pattern_name:custom_name} (or %{pattern_name:custom_name:type})
        # with regex and regex group name
        py_regex_pattern = re.sub(r'%{(\w+):(\w+)(?::\w+)?}',
                                  lambda m: "(?P<" + m.group(2) + ">" + self.predefined_patterns[m.group(1)].regex_str + ")",
                                  py_regex_pattern)
        # replace %{pattern_name} with regex
        py_regex_pattern = re.sub(r'%{(\w+)}',
                                  lambda m: "(" + self.predefined_patterns[m.group(1)].regex_str + ")",
                                  py_regex_pattern)
        if re.search(r'%{\w+(:\w+)?}', py_regex_pattern) is None:
            break
    self.regex_obj = re.compile(py_regex_pattern)
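To see the substitution step in isolation, here is a minimal standalone sketch; the two-entry pattern table is invented and merely stands in for self.predefined_patterns, which the real class loads from pattern files:

import regex as re

# Hypothetical miniature pattern table standing in for self.predefined_patterns.
predefined = {"INT": r"[+-]?\d+", "WORD": r"\w+"}

grok = "%{WORD:method} took %{INT:ms:int} ms"
expanded = re.sub(r'%{(\w+):(\w+)(?::\w+)?}',
                  lambda m: "(?P<" + m.group(2) + ">" + predefined[m.group(1)] + ")",
                  grok)
print(expanded)
# (?P<method>\w+) took (?P<ms>[+-]?\d+) ms
print(re.search(expanded, "login took 42 ms").groupdict())
# {'method': 'login', 'ms': '42'}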
Example 7: expect_regex
# Required import: import regex [as an alias]
# Or: from regex import compile [as an alias]
def expect_regex(self, pattern):
    """Read until matches pattern or timeout."""
    # inspired by pexpect/pty_spawn and pexpect/expect.py expect_loop
    end_time = time.time() + self.timeout
    buf = ''
    prog = regex.compile(pattern)
    while (end_time - time.time()) > 0.0:
        # switch to nonblocking read
        reads, _, _ = select.select([self.fd], [], [], end_time - time.time())
        if len(reads) > 0:
            try:
                buf = remove_ansi_escape_sequences(buf + self.read())
            except EOFError:
                assert prog.match(buf) is not None, \
                    'output was:\n%s\nexpect regex pattern:\n%s' % (buf, pattern)
            if prog.match(buf):
                return True
        else:
            # do not eat up CPU when waiting for the timeout to expire
            time.sleep(self.timeout / 10)
    assert prog.match(buf) is not None, \
        'output was:\n%s\nexpect regex pattern:\n%s' % (buf, pattern)
Example 8: __init__
# Required import: import regex [as an alias]
# Or: from regex import compile [as an alias]
def __init__(self, vocab_file, merges_file, errors='replace', special_tokens=None, max_len=None):
    self.max_len = max_len if max_len is not None else int(1e12)
    self.encoder = json.load(open(vocab_file))
    self.decoder = {v: k for k, v in self.encoder.items()}
    self.errors = errors  # how to handle errors in decoding
    bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
    bpe_merges = [tuple(merge.split()) for merge in bpe_data]
    self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
    self.cache = {}
    # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
    self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
    self.special_tokens = {}
    self.special_tokens_decoder = {}
    self.set_special_tokens(special_tokens)
Example 9: __init__
# Required import: import regex [as an alias]
# Or: from regex import compile [as an alias]
def __init__(self, is_tuple=False, language="de_CMC"):
    """Create a SentenceSplitter object. If the tokenized paragraphs
    contain token classes or extra info, set is_tuple=True.
    """
    self.is_tuple = is_tuple
    # full stop, ellipsis, exclamation and question marks
    self.sentence_ending_punct = re.compile(r"^(?:\.+|…+\.*|[!?]+)$")
    self.opening_punct = re.compile(r"^(?:['\"¿¡\p{Pi}\p{Ps}–—]|-{2,})$")
    self.closing_punct = re.compile(r"^(?:['\"\p{Pf}\p{Pe}])$")
    # International quotes: «» “” ‹› ‘’
    # German quotes: »« „“ ›‹ ‚‘
    self.problematic_quotes = set(['"'])
    if language == "de" or language == "de_CMC":
        # German opening quotes [»›] have category Pf
        # German closing quotes [“‘«‹] have category Pi
        self.problematic_quotes = set(['"', "»", "«", "›", "‹", "“", "‘"])
    self.eos_abbreviations = utils.read_abbreviation_file("eos_abbreviations.txt")
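The character classes above rely on Unicode punctuation categories (\p{Pi} initial quotes, \p{Ps} open brackets, \p{Pf} and \p{Pe} for their closing counterparts), which the regex module supports. A quick illustrative check of the two compiled classes outside the class:

import regex as re

opening_punct = re.compile(r"^(?:['\"¿¡\p{Pi}\p{Ps}–—]|-{2,})$")
closing_punct = re.compile(r"^(?:['\"\p{Pf}\p{Pe}])$")
print(bool(opening_punct.match("«")), bool(opening_punct.match("(")), bool(opening_punct.match("--")))
# True True True
print(bool(closing_punct.match("»")), bool(closing_punct.match(")")))
# True True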
Example 10: __init__
# Required import: import regex [as an alias]
# Or: from regex import compile [as an alias]
def __init__(self, pattern1=None, pattern2=None, pattern3=None):
    if pattern1 is None:
        pattern1 = UPPER
    if pattern2 is None:
        pattern2 = LOWER
    if pattern3 is None:
        pattern3 = PUNCT
    self.pattern1 = \
        [(regex.compile(beta_regex, flags=regex.VERSION1), repl)
         for (beta_regex, repl) in pattern1]
    self.pattern2 = \
        [(regex.compile(beta_regex, flags=regex.VERSION1), repl)
         for (beta_regex, repl) in pattern2]
    self.pattern3 = \
        [(regex.compile(beta_regex, flags=regex.VERSION1), repl)
         for (beta_regex, repl) in pattern3]
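Compiled (pattern, replacement) pairs like these are typically applied one after another with sub(); here is a minimal sketch with made-up patterns, since the real UPPER/LOWER/PUNCT tables are not shown in this example:

import regex

# Hypothetical replacement table in the same (pattern, repl) shape as UPPER/LOWER/PUNCT.
pairs = [(regex.compile(p, flags=regex.VERSION1), r)
         for p, r in [(r"\bcolour\b", "color"), (r"\bgrey\b", "gray")]]

text = "a grey colour swatch"
for pattern, repl in pairs:
    text = pattern.sub(repl, text)
print(text)  # a gray color swatch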
Example 11: exclude_words
# Required import: import regex [as an alias]
# Or: from regex import compile [as an alias]
def exclude_words(phrasegrams, words):
    """Given a list of words, excludes those from the keys of the phrase dictionary."""
    new_phrasergrams = {}
    words_re_list = []
    for word in words:
        we = regex.escape(word)
        words_re_list.append("^" + we + "$|^" + we + "_|_" + we + "$|_" + we + "_")
    word_reg = regex.compile(r"" + "|".join(words_re_list))
    for gram in tqdm(phrasegrams):
        valid = True
        for sub_gram in gram:
            if word_reg.search(sub_gram.decode("unicode_escape", "ignore")) is not None:
                valid = False
                break
        if not valid:
            continue
        if valid:
            new_phrasergrams[gram] = phrasegrams[gram]
    return new_phrasergrams


# Generating word grams.
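A hypothetical call might look as follows; the phrase dictionary is invented, with tuple-of-bytes keys as the decode() call above implies:

# Hypothetical phrase dictionary with made-up scores.
phrasegrams = {(b"new", b"york"): 0.9, (b"data", b"set"): 0.7}
print(exclude_words(phrasegrams, ["data"]))
# {(b'new', b'york'): 0.9} -- any gram containing "data" is dropped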
Example 12: __init__
# Required import: import regex [as an alias]
# Or: from regex import compile [as an alias]
def __init__(self, vocab_file, merges_file, errors='replace', bos_token="<s>", eos_token="</s>", sep_token="</s>",
             cls_token="<s>", unk_token="<unk>", pad_token='<pad>', mask_token='<mask>', **kwargs):
    super(RobertaTokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token,
                                           sep_token=sep_token, cls_token=cls_token, pad_token=pad_token,
                                           mask_token=mask_token, **kwargs)
    self.encoder = json.load(open(vocab_file, encoding="utf-8"))
    self.decoder = {v: k for k, v in self.encoder.items()}
    self.errors = errors  # how to handle errors in decoding
    self.byte_encoder = bytes_to_unicode()
    self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
    bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
    bpe_merges = [tuple(merge.split()) for merge in bpe_data]
    self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
    self.cache = {}
    # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
    self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
Example 13: __init__
# Required import: import regex [as an alias]
# Or: from regex import compile [as an alias]
def __init__(self, vocab_file, merges_file, errors='replace', unk_token="<|endoftext|>",
             bos_token="<|endoftext|>", eos_token="<|endoftext|>", **kwargs):
    super(GPT2Tokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs)
    self.encoder = json.load(open(vocab_file))
    self.decoder = {v: k for k, v in self.encoder.items()}
    self.errors = errors  # how to handle errors in decoding
    self.byte_encoder = bytes_to_unicode()
    self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
    bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
    bpe_merges = [tuple(merge.split()) for merge in bpe_data]
    self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
    self.cache = {}
    # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
    self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
Example 14: __init__
# Required import: import regex [as an alias]
# Or: from regex import compile [as an alias]
def __init__(self, vocab_file, merges_file, errors='replace', unk_token="<|endoftext|>",
             bos_token="<|endoftext|>", eos_token="<|endoftext|>", **kwargs):
    super(GPT2Tokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs)
    self.max_len_single_sentence = self.max_len  # no default special tokens - you can update this value if you add special tokens
    self.max_len_sentences_pair = self.max_len  # no default special tokens - you can update this value if you add special tokens
    self.encoder = json.load(open(vocab_file, encoding="utf-8"))
    self.decoder = {v: k for k, v in self.encoder.items()}
    self.errors = errors  # how to handle errors in decoding
    self.byte_encoder = bytes_to_unicode()
    self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
    bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
    bpe_merges = [tuple(merge.split()) for merge in bpe_data]
    self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
    self.cache = {}
    # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
    self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
Example 15: contains_sender_names
# Required import: import regex [as an alias]
# Or: from regex import compile [as an alias]
def contains_sender_names(sender):
    '''Returns a function that searches for the sender's name or parts of it.

    >>> feature = contains_sender_names("Sergey N. Obukhov <xxx@example.com>")
    >>> feature("Sergey Obukhov")
    1
    >>> feature("BR, Sergey N.")
    1
    >>> feature("Sergey")
    1
    >>> contains_sender_names("<serobnic@mail.ru>")("Serobnic")
    1
    >>> contains_sender_names("<serobnic@mail.ru>")("serobnic")
    1
    '''
    names = '( |$)|'.join(flatten_list([[e, e.capitalize()]
                                        for e in extract_names(sender)]))
    names = names or sender
    if names != '':
        return binary_regex_search(re.compile(names))
    return lambda s: 0