

Python regex.compile Function Code Examples

This article collects typical usage examples of the regex.compile function in Python, gathered from open-source code. If you have been wondering what regex.compile does, how to call it, or what real-world usage looks like, the hand-picked examples below should help.


The following shows 15 code examples of the compile function, ordered roughly by popularity.

Example 1: __init__

 def __init__(self, start=None, end=None, void=None, structs=None):
     self.start = start if start else re.compile(r"<(\w+).*?(?<!/)>")
     self.end = end if end else re.compile(r"</(\w+)>")
     self.void = void if void else re.compile(r"<(\w+).*?/>")
     self.stags = set()
     self.etags = set()
     self.vtags = set()
Author: dlukes, Project: pyvert, Lines: 7, Source: _pyvert.py
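A quick sketch of what the three default patterns match (test strings are mine; given this page's topic, `re` in the snippet is presumably the third-party `regex` module imported under that name):

import regex as re

start = re.compile(r"<(\w+).*?(?<!/)>")
end = re.compile(r"</(\w+)>")
void = re.compile(r"<(\w+).*?/>")

print(start.match('<doc id="1">').group(1))  # 'doc'  -- opening tag
print(end.match('</doc>').group(1))          # 'doc'  -- closing tag
print(void.match('<meta/>').group(1))        # 'meta' -- the (?<!/) lookbehind keeps `start` from matching this one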

Example 2: clean_line

def clean_line(line):
    line = strip_nikkud(line)
    replace_dict = {u'[.:\?]': u'', u'[”״]': u'"', u'[’׳]': u"'"} #note put \. in the file/ how can i check if it is right?
    line = multiple_replace(line, replace_dict, using_regex=True)
    # line = re.sub(u'[:\?]', '', line)
    # line = re.sub(u'”', u'"', line)
    reg_parentheses = re.compile(u'\((.*?)\)')
    reg_brackets = re.compile(u'\[(.*?)\]')
    in_per = reg_parentheses.search(line)
    in_bra = reg_brackets.search(line)
    reg_ayyen_tur = re.compile(u'''ו?(עיין|עי'|ע"ש) בטור''')
    reg_lo_manu = re.compile(u'''(?P<a>(\u05d0\u05da )?\u05dc\u05d0 \u05de\u05e0(.*?))(\u05e1\u05de"?\u05d2|\u05e8\u05de\u05d1"?\u05dd|\u05d8\u05d5\u05e8|\n)''')
    line = re.sub(u'\[.*?אלפס.*?\]', u'', line)
    line = re.sub(u'טור ו?שו"ע', u'טוש"ע', line)
    f_ayyen = re.search(reg_ayyen_tur, line)
    f_lo_manu = re.search(reg_lo_manu, line)

    if f_ayyen:
        line = line[:f_ayyen.start()]
    if f_lo_manu:
        line = re.sub(f_lo_manu.group('a'), u"", line)
    if in_per:
        if in_bra:
            clean = re.sub(reg_brackets, ur'\1', line)  # brackets are always correct
            clean = re.sub(reg_parentheses, '', clean)
        else:
            clean = re.sub(reg_parentheses, ur'\1', line)
    elif in_bra:
        clean = re.sub(reg_brackets, ur'\1', line)  # brackets are always correct
    else:
        clean = line
    return clean
Author: JonMosenkis, Project: Sefaria-Data, Lines: 32, Source: ein_parser.py
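The parenthesis/bracket logic above, isolated into a runnable sketch (sample line and expected output are mine; `strip_nikkud` and `multiple_replace` are project helpers not shown here):

import regex as re

reg_parentheses = re.compile(r'\((.*?)\)')
reg_brackets = re.compile(r'\[(.*?)\]')
line = '[קיים] (ספק)'
clean = re.sub(reg_brackets, r'\1', line)   # brackets are always correct: keep their contents
clean = re.sub(reg_parentheses, '', clean)  # parenthesised text is dropped
print(clean)  # קיים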

Example 3: readConfigFile

def readConfigFile (
	source		# pathname to config file to read
	):
	# Purpose: read the configuration file at 'source', parse it,
	#	store values in a dictionary
	# Returns: the dictionary parsed from 'source'
	# Assumes: 'source' exists
	# Effects: reads from the file system
	# Throws: IOError if there are problems reading

	fp = open (source, 'r')
	lines = fp.readlines ()
	fp.close ()

	ignore_line = regex.compile ('[ \t]*#')		# comment line
	data_line = regex.compile ('[ \t]*'
				'\([^ \t]+\)'
				'[ \t]*\(.*\)')	
	dict = {}

	for line in lines:
		if ignore_line.match (line) == -1:
			if data_line.match (line) != -1:
				(parameter, value) = data_line.group (1,2)
				dict [string.upper (parameter)] = value
	return dict
Author: mgijax, Project: websql, Lines: 26, Source: config.py
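Note that this example targets the long-removed Python 1.x `regex` module, where `match()` returned the match length (or -1 on failure) and groups were written `\(...\)`. A rough equivalent with the modern `re` module might look like this (my sketch, mirroring the behaviour above):

import re

def read_config_file(source):
    ignore_line = re.compile(r'[ \t]*#')                  # comment line
    data_line = re.compile(r'[ \t]*([^ \t]+)[ \t]*(.*)')  # parameter, value
    config = {}
    with open(source) as fp:
        for line in fp:
            if not ignore_line.match(line):
                m = data_line.match(line)
                if m:
                    config[m.group(1).upper()] = m.group(2)
    return config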

Example 4: sample1

def sample1(filename, aft=None, fore=None, top=None, home=None):
    doc = SeriesDocument('HTMLgen.rc')
    doc.goprev,doc.gonext,doc.gotop,doc.gohome = aft,fore,top,home
    doc.background = '../image/texturec.jpg'
    doc.banner = ('../image/historic.gif', 472, 60)
    doc.author = '1776 Thomas Jefferson'
    doc.email = '[email protected]'
    doc.logo = ('../image/eagle21.gif', 64, 54)
    # parse Declaration of Independence
    re_hline = regex.compile('^--+$')
    re_title = regex.compile('^Title:\(.*$\)')
    font2 = Font(size='+2')
    s = open(os.path.join(datadir, 'DoI.txt')).read()
    paragraphs = regsub.split(s, '\n\([\t ]*\n\)+')
    for para in paragraphs:
        if not para: continue
        if re_title.search(para) > -1:
            doc.title = re_title.group(1)
        elif re_hline.search(para) > -1:
            doc.append(HR())
        else:
            p = Paragraph( para )
            # using \` to match beginning of paragraph
            # ^ won't work because it'll match all the newlines
            n = p.markup('\`\(\w\)', font2, reg_type='regex')
            doc.append(p)
    doc.write(os.path.join(htmldir, filename))
Author: daveray, Project: soardoc, Lines: 27, Source: HTMLtest.py

Example 5: updateline

def updateline(file, key, value, casefold = 1):
	try:
		f = open(file, 'r')
		lines = f.readlines()
		f.close()
	except IOError:
		lines = []
	pat = key + ':\(.*\)\n'
	if casefold:
		prog = regex.compile(pat, regex.casefold)
	else:
		prog = regex.compile(pat)
	if value is None:
		newline = None
	else:
		newline = '%s: %s\n' % (key, value)  # trailing \n keeps the rewritten file line-based
	for i in range(len(lines)):
		line = lines[i]
		if prog.match(line) == len(line):
			if newline is None:
				del lines[i]
			else:
				lines[i] = newline
			break
	else:
		if newline is not None:
			lines.append(newline)
	tempfile = file + '~'  # not defined in the excerpt; presumably a temp path like this
	f = open(tempfile, 'w')
	for line in lines:
		f.write(line)
	f.close()
	os.rename(tempfile, file)  # presumably renamed over the original afterwards
Author: asottile, Project: ancient-pythons, Lines: 31, Source: mhlib.py
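Like examples 3 and 4, this is the ancient Python 1.x `regex` module: `regex.casefold` was a compilation flag and `prog.match(line)` returned the match length. Just the pattern setup, redone for the modern `re` module (my assumption, not from mhlib):

import re

def compile_key_pattern(key, casefold=True):
    flags = re.IGNORECASE if casefold else 0
    # the old code required the match to span the whole line
    # (prog.match(line) == len(line)); use fullmatch() for that today
    return re.compile(re.escape(key) + r':(.*)\n', flags)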

Example 6: test_post

 def test_post(title, body, user_name, site, is_answer, body_is_summary):
     result = []
     for rule in FindSpam.rules:
         body_to_check = body
         if rule['stripcodeblocks']:
             body_to_check = regex.sub("<pre>.*?</pre>", "", body, flags=regex.DOTALL)
             body_to_check = regex.sub("<code>.*?</code>", "", body_to_check, flags=regex.DOTALL)
         if rule['all'] != (site in rule['sites']):
             matched_title = regex.compile(rule['regex'], regex.UNICODE).findall(title)
             matched_username = regex.compile(rule['regex'], regex.UNICODE).findall(user_name)
             matched_body = regex.compile(rule['regex'], regex.UNICODE).findall(body_to_check)
             if matched_title and rule['title']:
                 try:
                     if getattr(FindSpam, "%s" % rule['validation_method'])(matched_title):
                         result.append(rule['reason'])
                 except KeyError:  # There is no special logic for this rule
                     result.append(rule['reason'].replace("{}", "title"))
             if matched_username and rule['username']:
                 try:
                     if getattr(FindSpam, "%s" % rule['validation_method'])(matched_username):
                         result.append(rule['reason'])
                 except KeyError:  # There is no special logic for this rule
                     result.append(rule['reason'].replace("{}", "username"))
             if matched_body and rule['body'] and (not body_is_summary or rule['body_summary']):
                 type_of_post = "answer" if is_answer else "body"
                 try:
                     if getattr(FindSpam, "%s" % rule['validation_method'])(matched_body):
                         result.append(rule['reason'].replace("{}", type_of_post))
                 except KeyError:  # There is no special logic for this rule
                     result.append(rule['reason'].replace("{}", type_of_post))
     return result
Author: JC3, Project: SmokeDetector, Lines: 31, Source: findspam.py

Example 7: tlg_plaintext_cleanup

def tlg_plaintext_cleanup(text, rm_punctuation=False, rm_periods=False):
    """Remove and substitute post-processing for Greek TLG text.
    TODO: Surely more junk to pull out. Please submit bugs!
    TODO: {.+?}|\(.+?\) working?
    TODO: This is a rather slow now, help in speeding up welcome.
    """
    remove_comp = regex.compile(r'-\n|«|»|<|>|\.\.\.|‘|’|_|{.+?}|\(.+?\)|[a-zA-Z0-9]', flags=regex.VERSION1)
    text = remove_comp.sub('', text)

    new_text = None
    if rm_punctuation:
        new_text = ''
        punctuation = [',', '·', ':', '"', "'", '?', '-', '!', '*', '[', ']', '{', '}']
        if rm_periods:
            punctuation += ['.', ';']
        for char in text:
            # second try at rming some punctuation; merge with above regex
            if char in punctuation:
                pass
            else:
                new_text += char
    if new_text:
        text = new_text

    # replace line breaks w/ space
    replace_comp = regex.compile(r'\n')
    text = replace_comp.sub(' ', text)

    comp_space = regex.compile(r'\s+')
    text = comp_space.sub(' ', text)

    return text
Author: jfaville, Project: cltk, Lines: 32, Source: formatter.py
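An illustrative call (the input string is invented, not TLG data):

text = 'κατὰ {τίτλος} τὸν λόγον...\nδεύτερος στίχος'
print(tlg_plaintext_cleanup(text, rm_punctuation=True))
# κατὰ τὸν λόγον δεύτερος στίχος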

Example 8: setliteral

 def setliteral(self, tag):
     self.literal = 1
     re = "%s%s[%s]*%s" % (ETAGO, tag, string.whitespace, TAGC)
     if self._normfunc is string.lower:
         self._lit_etag_re = regex.compile(re, regex.casefold)
     else:
         self._lit_etag_re = regex.compile(re)
Author: ashumeow, Project: grail, Lines: 7, Source: SGMLLexer.py

Example 9: __init__

    def __init__(self, src, javaFlag=0):
        Doxy2SWIG.__init__(self, src, javaFlag)
        """ Turns on the title, brief description and detailed description markup.
        Turn them off when inside member documentatation.

        """
        self.FilterTitle = True
        self.sitkClassName=''
        self.EmptyText = False
        # compiled regular expressions
        # common formula types in xml version of documentation
        self.dollarFormula = re.compile("^\\$(.+)\\$$")
        self.arrayFormula = re.compile("^\\\\\\[(.+)\\\\\\]$")
        # more complex formula layout, that breaks R documentation
        # checks.
        self.mathstuff1 = re.compile(r"\\begin\{array\}\{[^}]+\}")
        self.mathstuff2 = re.compile(r"\\begin\{array\}")
        self.mathstuff3 = re.compile(r"\\end\{array\}")
        # a complex recursive regular expression, to deal with formula
        # inside mbox and text structures
        self.mathstuff4 = regex.compile(r"\\mbox({((?>[^}{]*(?1)?)*)})", flags=regex.V1)
        self.mathstuff5 = regex.compile(r"\\text({((?>[^}{]*(?1)?)*)})", flags=regex.V1)
        # the special doxygen tags - note - not greedy
        self.mathstuff6 = re.compile(r"\\f\$(.+?)\\f\$")
        # alignment tags
        self.mathstuff7 = re.compile(r" & ")
Author: kaspermarstal, Project: SimpleElastix, Lines: 26, Source: doxy2swig.py
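The recursive pattern is worth a closer look: `(?1)` re-enters capture group 1, so arbitrarily nested braces stay balanced, which the stdlib `re` cannot do. A standalone check (example mine):

import regex

mbox = regex.compile(r"\\mbox({((?>[^}{]*(?1)?)*)})", flags=regex.V1)
m = mbox.search(r"\mbox{outer {inner} text}")
print(m.group(2))  # outer {inner} text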

Example 10: all_caps_text

def all_caps_text(s, site):
    s = regex.sub("<[^>]*>", "", s)   # remove HTML tags
    s = regex.sub("&\w+;", "", s)     # remove HTML entities
    if len(s) <= 150 and regex.compile(ur"SQL|\b(ERROR|PHP|QUERY|ANDROID|CASE|SELECT|HAVING|COUNT|GROUP|ORDER BY|INNER|OUTER)\b").search(s):
        return False, ""   # common words in non-spam all-caps titles
    if len(s) >= 25 and regex.compile(ur"^(?=.*\p{upper})\P{lower}*$", regex.UNICODE).search(s):
        return True, "All in caps"
Author: rekire, Project: SmokeDetector, Lines: 7, Source: findspam.py
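Hypothetical inputs (mine) exercising both branches; note the function implicitly returns None when neither condition fires:

print(all_caps_text(u"BUY CHEAP PILLS RIGHT NOW!!!", "example.com"))
# (True, 'All in caps')
print(all_caps_text(u"SELECT COUNT(*) FROM USERS", "example.com"))
# (False, '') -- common technical keywords are whitelisted in all-caps titles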

Example 11: _replace_for

    def _replace_for(self, text, nested_position, keyword_number=1):
        """
        Finds and replace the % for: ... % endfor loops
        of the mail.template. It will create keyword records for
        each loop found.
        :param text: mail.template text
        :param nested_position: counts how nested if the current pass
        :param keyword_number: counts how many for we found
        :return: simplified text without the if code, keywords found
        """
        # Regex for finding text wrapped in loops
        loop_regex = r'(% for .*?:$)(.*?)(% endfor)'
        ul_loop_regex = r'(?:<ul[^<]*?)(% for .*?:$)(.*?)(% endfor)(.*?</ul>)'

        # First scan for ul_loops
        for_pattern = re.compile(ul_loop_regex, flags=re.DOTALL | re.MULTILINE)
        simple_text, found_keywords = self._replace_for_type(
            text, nested_position, keyword_number, 'for_ul', for_pattern)
        keyword_number += len(found_keywords)

        # Then scan for regular loops
        for_pattern = re.compile(loop_regex, flags=re.DOTALL | re.MULTILINE)
        simple_text, keywords = self._replace_for_type(
            simple_text, nested_position, keyword_number, 'for', for_pattern)
        found_keywords |= keywords

        return simple_text, found_keywords
Author: maxime-beck, Project: compassion-modules, Lines: 27, Source: communication_revision.py
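A demonstration of `loop_regex` against a toy Mako-style template (template text invented; the same pattern works with the stdlib `re` or the `regex` module):

import re

loop_regex = r'(% for .*?:$)(.*?)(% endfor)'
template = "% for child in children:\n  Hello ${child.name}\n% endfor"
m = re.search(loop_regex, template, flags=re.DOTALL | re.MULTILINE)
print(m.group(1))  # % for child in children:
print(m.group(2))  # the loop body, newlines included (DOTALL lets . cross lines)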

Example 12: __init__

 def __init__(self, directory_name):
     self.directory = directory_name
     self.unigram_frequency = Counter()
     self.trigrams = dict()
     self.trigram_load_pattern = re2.compile(r'^([^ ]*) ([^ ]*) ([^\t]*)\t(\d*)')
     self.middle_token_pattern = re2.compile(r'^\p{posix_alnum}*$', re2.UNICODE)
     super(FileScorer, self).__init__()
Author: o76923, Project: PyGTM, Lines: 7, Source: SimScorer.py
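Given this page's topic, `re2` here is presumably the `regex` module under an alias; the stdlib `re` does not support `\p{...}` property classes. A quick check of the `middle_token_pattern` property (example mine):

import regex

token = regex.compile(r'^\p{posix_alnum}*$', regex.UNICODE)
print(bool(token.match('café1')))      # True  -- Unicode letters and digits qualify
print(bool(token.match('two words')))  # False -- the space is not alphanumeric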

Example 13: _reload_allowed_list_file

    def _reload_allowed_list_file(self):
        '''(Re)loads the list of rules for non-segment borders, i.e. rules
        that prevent splitting at an otherwise possible segment border
        (unless a forcing rule overrides the stop rule). Stop rules are
        pairs: the first rule is matched against the segment to the left,
        the second against the segment to the right. The filename is given
        in __init__(); the default file is "./data/stop_list".

        See the __init__() and segment() functions for more about the
        algorithm. ATTENTION: verbose regexps are used.'''

        with open(self._allowed_list_filename, 'r') as f:
            _filedata = f.readlines()
        
        self._allowed_regexps = list()
        _rule_left = ''
        _rule_right = ''

        for i in range(len(_filedata)):
            # rules must be specified in correct order: first left, then right
            if _filedata[i].startswith('LEFT:'):
                _rule_left = regex.compile(_filedata[i][5:], regex.VERBOSE)
            elif _filedata[i].startswith('RIGHT:'):
                _rule_right = regex.compile(_filedata[i][6:], regex.VERBOSE)
                self._allowed_regexps.append((_rule_left, _rule_right))
                _rule_left = ''
                _rule_right = ''
            else:
                # everything else is ignored
                continue
Author: kristiank, Project: Lausestaja, Lines: 30, Source: ortographicsegmenter.py

Example 14: __init__

    def __init__(self):
        # These attributes are set by the parse method
        self.doc = None
        self.para = None
        self.current_string = None
        self.flow = None

        self.stateMachine = StateMachine()
        self.stateMachine.add_state("PARA", self._para)
        self.stateMachine.add_state("ESCAPE", self._escape)
        self.stateMachine.add_state("END", None, end_state=1)
        self.stateMachine.add_state("ANNOTATION-START", self._annotation_start)
        self.stateMachine.add_state("CITATION-START", self._citation_start)
        self.stateMachine.add_state("BOLD-START", self._bold_start)
        self.stateMachine.add_state("ITALIC-START", self._italic_start)
        self.stateMachine.add_state("CODE-START", self._code_start)
        self.stateMachine.add_state("QUOTES-START", self._quotes_start)
        self.stateMachine.add_state("INLINE-INSERT", self._inline_insert)
        self.stateMachine.add_state("CHARACTER-ENTITY", self._character_entity)
        self.stateMachine.set_start("PARA")
        self.patterns = {
            'escape': re.compile(r'\\', re.U),
            'escaped-chars': re.compile(r'[\\\(\{\}\[\]_\*,\.\*`"&]', re.U),
            'annotation': re.compile(
                r'(?<!\\)\{(?P<text>.*?)(?<!\\)\}(\(\s*(?P<type>\S*?\s*[^\\"\']?)(["\'](?P<specifically>.*?)["\'])??\s*(\((?P<namespace>\w+)\))?\s*(~(?P<language>[\w-]+))?\))?', re.U),
            'bold': re.compile(r'\*(?P<text>((?<=\\)\*|[^\*])*)(?<!\\)\*', re.U),
            'italic': re.compile(r'_(?P<text>((?<=\\)_|[^_])*)(?<!\\)_', re.U),
            'code': re.compile(r'`(?P<text>(``|[^`])*)`', re.U),
            'quotes': re.compile(r'"(?P<text>((?<=\\)"|[^"])*)(?<!\\)"', re.U),
            'inline-insert': re.compile(r'>\((?P<attributes>.*?)\)', re.U),
            'character-entity': re.compile(r'&(\#[0-9]+|#[xX][0-9a-fA-F]+|[\w]+);'),
            'citation': re.compile(r'(\[\s*\*(?P<id>\S+)(\s+(?P<id_extra>.+?))?\])|(\[\s*\#(?P<name_name>\S+)(\s+(?P<extra>.+?))?\])|(\[\s*(?P<citation>.*?)\])', re.U)
        }
Author: dustinrb, Project: sam, Lines: 33, Source: samparser.py
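To see one of these patterns in action, here is the 'bold' pattern against a sample string (example mine):

import re

bold = re.compile(r'\*(?P<text>((?<=\\)\*|[^\*])*)(?<!\\)\*', re.U)
print(bold.search('this is *important* text').group('text'))  # important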

Example 15: clean_line

def clean_line(line):
    line = strip_nikkud(line)
    line = re.sub(u':', '', line)
    reg_parentheses = re.compile(u'\((.*?)\)')
    reg_brackets = re.compile(u'\[(.*?)\]')
    in_per = reg_parentheses.search(line)
    in_bra = reg_brackets.search(line)
    reg_ayyen_tur = re.compile(u'''ו?(עיין|עי'|ע"ש) בטור''')
    line = re.sub(u'\[.*?אלפס.*?\]', u'', line)
    line = re.sub(u'טור ו?שו"ע', u'טוש"ע', line)
    pos = re.search(reg_ayyen_tur, line)

    if pos:
        line = line[:pos.start()]

    if in_per:
        if in_bra:
            clean = re.sub(reg_brackets, ur'\1', line)  # brackets are always correct
            clean = re.sub(reg_parentheses, '', clean)
        else:
            clean = re.sub(reg_parentheses, ur'\1', line)
    elif in_bra:
        clean = re.sub(reg_brackets, ur'\1', line)  # brackets are always correct
    else:
        clean = line
    return clean
Author: JonMosenkis, Project: Sefaria-Data, Lines: 26, Source: basic_ein_parser.py


Note: the regex.compile examples in this article were collected by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are drawn from open-source projects by their original contributors, and copyright remains with the original authors; consult the corresponding project's license before using or redistributing the code, and do not republish without permission.