

Python regex.compile method: code examples

This article collects typical usage examples of the regex.compile method in Python. If you are wondering what exactly regex.compile does and how it is used in practice, the curated code examples below may help; you can also explore further usage examples from the regex package. Note that many of the projects shown here import the regex package under the alias re (import regex as re), so re.compile in those snippets refers to regex.compile.


The following presents 15 code examples of the regex.compile method, sorted by popularity by default.
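
Before the project-specific examples, here is a minimal, self-contained sketch (not taken from any of the projects below) showing the basic pattern: compile once, then reuse the compiled object. The pattern and input string are purely illustrative.

import regex

# Compile once, reuse the compiled pattern object; named groups come back via groupdict().
pattern = regex.compile(r"(?P<key>\w+)=(?P<value>\w+)")

match = pattern.search("timeout=30")
if match:
    print(match.groupdict())  # {'key': 'timeout', 'value': '30'}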

Example 1: umi_histogram

# Required import: import regex [as an alias]
# Or: from regex import compile [as an alias]
def umi_histogram(fastq):
    ''' Counts the number of reads for each UMI

    Expects formatted fastq files.
    '''
    annotations = detect_fastq_annotations(fastq)
    re_string = construct_transformed_regex(annotations)
    parser_re = re.compile(re_string)

    counter = collections.Counter()
    for read in read_fastq(fastq):
        match = parser_re.search(read).groupdict()
        counter[match['MB']] += 1

    for bc, count in counter.most_common():
        sys.stdout.write('{}\t{}\n'.format(bc, count)) 
Author: vals, Project: umis, Lines: 18, Source file: umis.py
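
As an illustration of the groupdict() lookup in Example 1, here is a standalone sketch that assumes read names carrying CELL_/UMI_ annotations like those in Example 2 below; the pattern and the read names are hypothetical, and detect_fastq_annotations/construct_transformed_regex are project helpers not shown on this page.

import collections
import regex as re

# Hypothetical pattern and read names, only to illustrate the groupdict() lookup.
parser_re = re.compile(r"(.*):CELL_(?P<CB>.*):UMI_(?P<MB>.*)")
counter = collections.Counter()
for name in ["r1:CELL_AAAC:UMI_TTGCA", "r2:CELL_AAAC:UMI_TTGCA", "r3:CELL_GGTT:UMI_CCAAT"]:
    counter[parser_re.search(name).groupdict()["MB"]] += 1
print(counter.most_common())  # [('TTGCA', 2), ('CCAAT', 1)]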

Example 2: exact_barcode_filter

# Required import: import regex [as an alias]
# Or: from regex import compile [as an alias]
def exact_barcode_filter(chunk, bc1, bc2, bc3, re_string=None):
    if not re_string:
        re_string = '(.*):CELL_(?P<CB>.*):UMI_(.*)\\n(.*)\\n\\+\\n(.*)\\n'
    parser_re = re.compile(re_string)
    kept = []
    for read in chunk:
        match = parser_re.search(read).groupdict()
        cb1 = match['CB']
        if bc3:
            cb1, cb2, cb3 = cb1.split("-")
        elif bc2:
            cb1, cb2 = cb1.split("-")
        if cb1 not in bc1:
            continue
        if bc2 and cb2 not in bc2:
            continue
        if bc3 and cb3 not in bc3:
            continue
        kept.append(read)
    return kept 
Author: vals, Project: umis, Lines: 22, Source file: barcodes.py

Example 3: __init__

# Required import: import regex [as an alias]
# Or: from regex import compile [as an alias]
def __init__(self, **kwargs):
        """
        Args:
            annotators: None or empty set (only tokenizes).
            substitutions: if true, normalizes some token types (e.g. quotes).
        """
        self._regexp = regex.compile(
            '(?P<digit>%s)|(?P<title>%s)|(?P<abbr>%s)|(?P<neg>%s)|(?P<hyph>%s)|'
            '(?P<contr1>%s)|(?P<alphanum>%s)|(?P<contr2>%s)|(?P<sdquote>%s)|'
            '(?P<edquote>%s)|(?P<ssquote>%s)|(?P<esquote>%s)|(?P<dash>%s)|'
            '(?P<ellipses>%s)|(?P<punct>%s)|(?P<nonws>%s)' %
            (self.DIGIT, self.TITLE, self.ABBRV, self.NEGATION, self.HYPHEN,
             self.CONTRACTION1, self.ALPHA_NUM, self.CONTRACTION2,
             self.START_DQUOTE, self.END_DQUOTE, self.START_SQUOTE,
             self.END_SQUOTE, self.DASH, self.ELLIPSES, self.PUNCT,
             self.NON_WS),
            flags=regex.IGNORECASE + regex.UNICODE + regex.MULTILINE
        )
        if len(kwargs.get('annotators', {})) > 0:
            logger.warning('%s only tokenizes! Skipping annotators: %s' %
                           (type(self).__name__, kwargs.get('annotators')))
        self.annotators = set()
        self.substitutions = kwargs.get('substitutions', True) 
Author: thunlp, Project: OpenQA, Lines: 25, Source file: regexp_tokenizer.py

Example 4: __init__

# Required import: import regex [as an alias]
# Or: from regex import compile [as an alias]
def __init__(self, vocab_file, merges_file, errors='replace', special_tokens=None, max_len=None):
        self.max_len = max_len if max_len is not None else int(1e12)
        self.encoder = json.load(open(vocab_file))
        self.decoder = {v:k for k,v in self.encoder.items()}
        self.errors = errors # how to handle errors in decoding
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v:k for k, v in self.byte_encoder.items()}
        bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
        bpe_merges = [tuple(merge.split()) for merge in bpe_data]
        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
        self.cache = {}

        # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
        self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")

        self.special_tokens = {}
        self.special_tokens_decoder = {}
        self.set_special_tokens(special_tokens) 
Author: 649453932, Project: Bert-Chinese-Text-Classification-Pytorch, Lines: 20, Source file: tokenization_gpt2.py
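
To make the pre-tokenization step concrete, this standalone sketch applies the same regular expression used above to an arbitrary sample string and prints the resulting pieces.

import regex as re

# The same pre-tokenization pattern used by the GPT-2-style tokenizers on this page.
pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")

print(pat.findall("Hello world's 123!"))
# ['Hello', ' world', "'s", ' 123', '!']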

Example 5: get_versions

# Required import: import regex [as an alias]
# Or: from regex import compile [as an alias]
def get_versions(self, req, orig_version):
        exclude = set(self.exclude)
        vals = {key: val
                for key, val in self.vals.items()
                if key not in exclude}
        link_pattern = replace_named_capture_group(self.link_pattern_compiled, vals)
        link_re = re.compile(link_pattern)
        result = []
        for url in self.releases_urls:
            parser = HrefParser(link_re)
            parser.feed(await req.get_text_from_url(url))
            for match in parser.get_matches():
                match["link"] = urljoin(url, match["href"])
                match["releases_url"] = url

                match["vals"] = vals
                result.append(match)
        return result 
Author: bioconda, Project: bioconda-utils, Lines: 20, Source file: hosters.py

Example 6: _load_search_pattern

# Required import: import regex [as an alias]
# Or: from regex import compile [as an alias]
def _load_search_pattern(self):
        self.type_mapper = {}
        py_regex_pattern = self.pattern
        while True:
            # Find all of the types specified in the grok patterns
            m = re.findall(r'%{(\w+):(\w+):(\w+)}', py_regex_pattern)
            for n in m:
                self.type_mapper[n[1]] = n[2]
            # Replace %{pattern_name:custom_name} (or %{pattern_name:custom_name:type})
            # with the corresponding regex and a named regex group

            py_regex_pattern = re.sub(r'%{(\w+):(\w+)(?::\w+)?}',
                lambda m: "(?P<" + m.group(2) + ">" + self.predefined_patterns[m.group(1)].regex_str + ")",
                py_regex_pattern)

            # Replace %{pattern_name} with the corresponding regex
            py_regex_pattern = re.sub(r'%{(\w+)}',
                lambda m: "(" + self.predefined_patterns[m.group(1)].regex_str + ")",
                py_regex_pattern)

            if re.search(r'%{\w+(:\w+)?}', py_regex_pattern) is None:
                break

        self.regex_obj = re.compile(py_regex_pattern) 
Author: garyelephant, Project: pygrok, Lines: 26, Source file: pygrok.py
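
The core substitution idea above can be shown in isolation: a %{NAME:field} placeholder is rewritten into a named capture group with re.sub and a callback. PATTERNS below is a toy stand-in for pygrok's predefined pattern table, not the real set.

import regex as re

# Toy stand-in for pygrok's predefined pattern table (assumption, not the real set).
PATTERNS = {"WORD": r"\w+", "INT": r"\d+"}

grok = "user=%{WORD:user} code=%{INT:status}"
py_pattern = re.sub(
    r"%{(\w+):(\w+)}",
    lambda m: "(?P<" + m.group(2) + ">" + PATTERNS[m.group(1)] + ")",
    grok,
)
print(py_pattern)
# user=(?P<user>\w+) code=(?P<status>\d+)
print(re.search(py_pattern, "user=alice code=200").groupdict())
# {'user': 'alice', 'status': '200'}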

Example 7: expect_regex

# Required import: import regex [as an alias]
# Or: from regex import compile [as an alias]
def expect_regex(self, pattern):
        """Read until matches pattern or timeout."""
        # Inspired by pexpect/pty_spawn and pexpect/expect.py expect_loop
        end_time = time.time() + self.timeout
        buf = ''
        prog = regex.compile(pattern)
        while (end_time - time.time()) > 0.0:
            # switch to nonblocking read
            reads, _, _ = select.select([self.fd], [], [], end_time - time.time())
            if len(reads) > 0:
                try:
                    buf = remove_ansi_escape_sequences(buf + self.read())
                except EOFError:
                    assert prog.match(buf) is not None, \
                        'output was:\n%s\nexpect regex pattern:\n%s' % (buf, pattern)
                if prog.match(buf):
                    return True
            else:
                # do not eat up CPU when waiting for the timeout to expire
                time.sleep(self.timeout/10)
        assert prog.match(buf) is not None, \
            'output was:\n%s\nexpect regex pattern:\n%s' % (buf, pattern) 
Author: CITGuru, Project: PyInquirer, Lines: 24, Source file: helpers.py

Example 8: __init__

# Required import: import regex [as an alias]
# Or: from regex import compile [as an alias]
def __init__(self, vocab_file, merges_file, errors='replace', special_tokens=None, max_len=None):
        self.max_len = max_len if max_len is not None else int(1e12)
        self.encoder = json.load(open(vocab_file))
        self.decoder = {v: k for k, v in self.encoder.items()}
        self.errors = errors  # how to handle errors in decoding
        bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
        bpe_merges = [tuple(merge.split()) for merge in bpe_data]
        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
        self.cache = {}

        # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
        self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")

        self.special_tokens = {}
        self.special_tokens_decoder = {}
        self.set_special_tokens(special_tokens) 
Author: ftarlaci, Project: GPT2sQA, Lines: 18, Source file: tokenization.py

Example 9: __init__

# Required import: import regex [as an alias]
# Or: from regex import compile [as an alias]
def __init__(self, is_tuple=False, language="de_CMC"):
        """Create a SentenceSplitter object. If the tokenized paragraphs
        contain token classes or extra info, set is_tuple=True.

        """
        self.is_tuple = is_tuple
        # full stop, ellipsis, exclamation and question marks
        self.sentence_ending_punct = re.compile(r"^(?:\.+|…+\.*|[!?]+)$")
        self.opening_punct = re.compile(r"^(?:['\"¿¡\p{Pi}\p{Ps}–—]|-{2,})$")
        self.closing_punct = re.compile(r"^(?:['\"\p{Pf}\p{Pe}])$")
        # International quotes: «» “” ‹› ‘’
        # German quotes: »« „“ ›‹ ‚‘
        self.problematic_quotes = set(['"'])
        if language == "de" or language == "de_CMC":
            # German opening quotes [»›] have category Pf
            # German closing quotes [“‘«‹] have category Pi
            self.problematic_quotes = set(['"', "»", "«", "›", "‹", "“", "‘"])
        self.eos_abbreviations = utils.read_abbreviation_file("eos_abbreviations.txt") 
Author: tsproisl, Project: SoMaJo, Lines: 20, Source file: sentence_splitter.py
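
A quick way to see what the Unicode-category classes above match is to test a few quote characters directly. This sketch only reuses the two patterns from the example and is not part of SoMaJo.

import regex as re

# The opening/closing punctuation patterns from the example above, tested on a few characters.
opening = re.compile(r"^(?:['\"¿¡\p{Pi}\p{Ps}–—]|-{2,})$")
closing = re.compile(r"^(?:['\"\p{Pf}\p{Pe}])$")

print(bool(opening.match("«")))  # True: « has Unicode category Pi
print(bool(closing.match("»")))  # True: » has Unicode category Pf
print(bool(opening.match('"')), bool(closing.match('"')))  # True True: '"' both opens and closes, hence "problematic"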

Example 10: __init__

# Required import: import regex [as an alias]
# Or: from regex import compile [as an alias]
def __init__(self, pattern1=None, pattern2=None, pattern3=None):
        if pattern1 is None:
            pattern1 = UPPER
        if pattern2 is None:
            pattern2 = LOWER
        if pattern3 is None:
            pattern3 = PUNCT
        self.pattern1 = \
            [(regex.compile(beta_regex, flags=regex.VERSION1), repl)
             for (beta_regex, repl) in pattern1]
        self.pattern2 = \
            [(regex.compile(beta_regex, flags=regex.VERSION1), repl)
             for (beta_regex, repl) in pattern2]
        self.pattern3 = \
            [(regex.compile(beta_regex, flags=regex.VERSION1), repl)
             for (beta_regex, repl) in pattern3] 
Author: cltk, Project: cltk, Lines: 18, Source file: beta_to_unicode.py

Example 11: exclude_words

# Required import: import regex [as an alias]
# Or: from regex import compile [as an alias]
def exclude_words(phrasegrams, words):
    """Given a list of words, excludes those from the keys of the phrase dictionary."""
    new_phrasergrams = {}
    words_re_list = []
    for word in words:
        we = regex.escape(word)
        words_re_list.append("^" + we + "$|^" + we + "_|_" + we + "$|_" + we + "_")
    word_reg = regex.compile(r""+"|".join(words_re_list))
    for gram in tqdm(phrasegrams):
        valid = True
        for sub_gram in gram:
            if word_reg.search(sub_gram.decode("unicode_escape", "ignore")) is not None:
                valid = False
                break
            if not valid:
                continue
        if valid:
            new_phrasergrams[gram] = phrasegrams[gram]
    return new_phrasergrams


# Generating word grams. 
Author: materialsintelligence, Project: mat2vec, Lines: 24, Source file: phrase2vec.py
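
The boundary regex built in the loop above can be exercised on its own; the excluded words and n-gram keys below are made up for illustration.

import regex

words = ["the", "of"]
words_re_list = []
for word in words:
    we = regex.escape(word)
    words_re_list.append("^" + we + "$|^" + we + "_|_" + we + "$|_" + we + "_")
word_reg = regex.compile("|".join(words_re_list))

print(bool(word_reg.search("the_cat")))       # True: "the" starts the n-gram
print(bool(word_reg.search("state_of_art")))  # True: "of" is an inner token
print(bool(word_reg.search("theory_cat")))    # False: "the" is only a substring of "theory"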

Example 12: __init__

# Required import: import regex [as an alias]
# Or: from regex import compile [as an alias]
def __init__(self, vocab_file, merges_file, errors='replace', bos_token="<s>", eos_token="</s>", sep_token="</s>",
                 cls_token="<s>", unk_token="<unk>", pad_token='<pad>', mask_token='<mask>', **kwargs):
        super(RobertaTokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token,
                                               sep_token=sep_token, cls_token=cls_token, pad_token=pad_token,
                                               mask_token=mask_token, **kwargs)

        self.encoder = json.load(open(vocab_file, encoding="utf-8"))
        self.decoder = {v: k for k, v in self.encoder.items()}
        self.errors = errors  # how to handle errors in decoding
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
        bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
        bpe_merges = [tuple(merge.split()) for merge in bpe_data]
        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
        self.cache = {}

        # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
        self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") 
Author: linhaow, Project: TextClassify, Lines: 20, Source file: tokenization_roberta.py

Example 13: __init__

# Required import: import regex [as an alias]
# Or: from regex import compile [as an alias]
def __init__(self, vocab_file, merges_file, errors='replace', unk_token="<|endoftext|>",
                 bos_token="<|endoftext|>", eos_token="<|endoftext|>", **kwargs):
        super(GPT2Tokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs)

        self.encoder = json.load(open(vocab_file))
        self.decoder = {v:k for k,v in self.encoder.items()}
        self.errors = errors # how to handle errors in decoding
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v:k for k, v in self.byte_encoder.items()}
        bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
        bpe_merges = [tuple(merge.split()) for merge in bpe_data]
        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
        self.cache = {}

        # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
        self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") 
Author: linhaow, Project: TextClassify, Lines: 18, Source file: tokenization_gpt2.py

Example 14: __init__

# Required import: import regex [as an alias]
# Or: from regex import compile [as an alias]
def __init__(self, vocab_file, merges_file, errors='replace', unk_token="<|endoftext|>",
                 bos_token="<|endoftext|>", eos_token="<|endoftext|>", **kwargs):
        super(GPT2Tokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs)
        self.max_len_single_sentence = self.max_len # no default special tokens - you can update this value if you add special tokens
        self.max_len_sentences_pair = self.max_len # no default special tokens - you can update this value if you add special tokens

        self.encoder = json.load(open(vocab_file, encoding="utf-8"))
        self.decoder = {v: k for k, v in self.encoder.items()}
        self.errors = errors  # how to handle errors in decoding
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
        bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
        bpe_merges = [tuple(merge.split()) for merge in bpe_data]
        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
        self.cache = {}

        # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
        self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") 
Author: plkmo, Project: NLP_Toolkit, Lines: 20, Source file: tokenization_gpt2.py

Example 15: contains_sender_names

# Required import: import regex [as an alias]
# Or: from regex import compile [as an alias]
def contains_sender_names(sender):
    '''Returns a function that searches for the sender's name or a part of it.

    >>> feature = contains_sender_names("Sergey N.  Obukhov <xxx@example.com>")
    >>> feature("Sergey Obukhov")
    1
    >>> feature("BR, Sergey N.")
    1
    >>> feature("Sergey")
    1
    >>> contains_sender_names("<serobnic@mail.ru>")("Serobnic")
    1
    >>> contains_sender_names("<serobnic@mail.ru>")("serobnic")
    1
    '''
    names = '( |$)|'.join(flatten_list([[e, e.capitalize()]
                                        for e in extract_names(sender)]))
    names = names or sender
    if names != '':
        return binary_regex_search(re.compile(names))
    return lambda s: 0 
Author: mailgun, Project: talon, Lines: 23, Source file: helpers.py
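
binary_regex_search is a talon helper that is not shown on this page; a minimal stand-in consistent with the doctest above (returning 1 on a match and 0 otherwise) might look like this. The helper body is an assumption, not talon's actual implementation.

import regex as re

def binary_regex_search(prog):
    # Hypothetical equivalent of talon's helper: 1 if the pattern matches anywhere, else 0.
    return lambda s: 1 if prog.search(s) else 0

feature = binary_regex_search(re.compile("Sergey( |$)|Obukhov( |$)"))
print(feature("BR, Sergey N."), feature("nobody"))  # 1 0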


Note: The regex.compile method examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets are selected from open-source projects contributed by various developers, and copyright of the source code remains with the original authors. Please consult the corresponding project's License before distributing or using the code; do not reproduce this article without permission.