本文整理汇总了Python中regex.compile函数的典型用法代码示例。如果您正苦于以下问题:Python compile函数的具体用法?Python compile怎么用?Python compile使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了compile函数的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: __init__
def __init__(self, start=None, end=None, void=None, structs=None):
    """Set up the tag-recognition patterns for this matcher.

    Any of *start*, *end*, *void* may be a pre-compiled pattern;
    falsy arguments fall back to the default SGML-ish tag regexes.
    *structs* is accepted but not used here.
    """
    # Default recognisers: start tags (not self-closing), end tags,
    # and void/self-closing tags, each capturing the tag name.
    self.start = start or re.compile(r"<(\w+).*?(?<!/)>")
    self.end = end or re.compile(r"</(\w+)>")
    self.void = void or re.compile(r"<(\w+).*?/>")
    # Tag names observed so far, one set per category.
    self.stags = set()
    self.etags = set()
    self.vtags = set()
示例2: clean_line
def clean_line(line):
    """Clean one line of Hebrew commentary text.

    Strips vocalisation, normalises quote characters, removes
    bracketed "Alfasi" references, rewrites Tur/Shulchan-Arukh
    citations to the standard abbreviation, truncates at "see the
    Tur" cross-references, and finally unwraps brackets/parentheses:
    bracketed text is always kept (unwrapped); parenthesised text is
    kept only when the line has no bracketed text.

    NOTE(review): Python 2 source (``ur''`` literals); depends on the
    external helpers strip_nikkud() and multiple_replace().
    """
    line = strip_nikkud(line)
    # Pattern -> replacement map: drop '.', ':', '?' and normalise
    # Hebrew gershayim/geresh to ASCII double/single quotes.
    replace_dict = {u'[.:\?]': u'', u'[”״]': u'"', u'[’׳]': u"'"} #note put \. in the file/ how can i check if it is right?
    line = multiple_replace(line, replace_dict, using_regex=True)
    # line = re.sub(u'[:\?]', '', line)
    # line = re.sub(u'”', u'"', line)
    reg_parentheses = re.compile(u'\((.*?)\)')
    reg_brackets = re.compile(u'\[(.*?)\]')
    # Search BEFORE the substitutions below so the bracket/paren
    # decision reflects the original line.
    in_per = reg_parentheses.search(line)
    in_bra = reg_brackets.search(line)
    # "ve-ayen ba-Tur" cross-reference: everything from the match on
    # is discarded further down.
    reg_ayyen_tur = re.compile(u'''ו?(עיין|עי'|ע"ש) בטור''')
    # "(akh) lo manu ..." run, terminated by Semag/Rambam/Tur/newline.
    reg_lo_manu = re.compile(u'''(?P<a>(\u05d0\u05da )?\u05dc\u05d0 \u05de\u05e0(.*?))(\u05e1\u05de"?\u05d2|\u05e8\u05de\u05d1"?\u05dd|\u05d8\u05d5\u05e8|\n)''')
    line = re.sub(u'\[.*?אלפס.*?\]', u'', line)
    line = re.sub(u'טור ו?שו"ע', u'טוש"ע', line)
    f_ayyen = re.search(reg_ayyen_tur, line)
    f_lo_manu = re.search(reg_lo_manu, line)
    if f_ayyen:
        # Truncate at the start of the cross-reference.
        line = line[:f_ayyen.start()]
    if f_lo_manu:
        # NOTE(review): the matched text is reused as a regex pattern;
        # metacharacters in it would misbehave -- confirm intended.
        line = re.sub(f_lo_manu.group('a'), u"", line)
    if in_per:
        if in_bra:
            clean = re.sub(reg_brackets, ur'\1', line) # brackets are always correct
            clean = re.sub(reg_parentheses, '', clean)
        else:
            clean = re.sub(reg_parentheses, ur'\1', line)
    elif in_bra:
        clean = re.sub(reg_brackets, ur'\1', line) # brackets are always correct
    else:
        clean = line
    return clean
示例3: readConfigFile
def readConfigFile (
    source  # pathname to config file to read
    ):
    """Read and parse the configuration file at *source*.

    Each non-comment line is split into a first whitespace-delimited
    token (the parameter, upper-cased) and the rest of the line (the
    value). Lines whose first non-blank character is '#' are ignored.

    Returns: dict mapping UPPER-CASED parameter name -> value string.
    Assumes: *source* exists.
    Throws:  IOError/OSError if there are problems reading.

    Modernised from the removed Python-1.x ``regex`` module (whose
    ``match`` returned -1 on failure) to :mod:`re`; also uses a
    context manager and no longer shadows the builtin ``dict``.
    """
    import re
    with open(source, 'r') as fp:
        lines = fp.readlines()
    ignore_line = re.compile(r'[ \t]*#')                    # comment line
    data_line = re.compile(r'[ \t]*([^ \t]+)[ \t]*(.*)')    # "param value"
    config = {}
    for line in lines:
        if ignore_line.match(line):
            continue
        m = data_line.match(line)
        if m:
            parameter, value = m.group(1, 2)
            config[parameter.upper()] = value
    return config
示例4: sample1
def sample1(filename, aft=None, fore=None, top=None, home=None):
    """Build an HTMLgen SeriesDocument from the Declaration of
    Independence text and write it to *filename* under ``htmldir``.

    aft/fore/top/home are optional prev/next/top/home navigation
    targets copied onto the document.

    NOTE(review): legacy Python 1.x-era code -- relies on the
    long-removed ``regex``/``regsub`` modules (search returns -1 on
    failure, groups use \\(..\\)) and on HTMLgen names
    (SeriesDocument, Font, HR, Paragraph) plus module globals
    ``datadir``/``htmldir`` defined elsewhere.
    """
    doc = SeriesDocument('HTMLgen.rc')
    doc.goprev,doc.gonext,doc.gotop,doc.gohome = aft,fore,top,home
    doc.background = '../image/texturec.jpg'
    doc.banner = ('../image/historic.gif', 472, 60)
    doc.author = '1776 Thomas Jefferson'
    doc.email = '[email protected]'
    doc.logo = ('../image/eagle21.gif', 64, 54)
    # parse Declaration of Independence
    re_hline = regex.compile('^--+$')           # horizontal-rule lines
    re_title = regex.compile('^Title:\(.*$\)')  # "Title: ..." header
    font2 = Font(size='+2')
    s = open(os.path.join(datadir, 'DoI.txt')).read()
    # split on runs of blank lines (old-style \(..\) group syntax)
    paragraphs = regsub.split(s, '\n\([\t ]*\n\)+')
    for para in paragraphs:
        if not para: continue
        if re_title.search(para) > -1:
            doc.title = re_title.group(1)
        elif re_hline.search(para) > -1:
            doc.append(HR())
        else:
            p = Paragraph( para )
            # using \` to match beginning of paragraph
            # ^ won't work because it'll match all the newlines
            n = p.markup('\`\(\w\)', font2, reg_type='regex')
            doc.append(p)
    doc.write(os.path.join(htmldir, filename))
示例5: updateline
def updateline(file, key, value, casefold = 1):
    """Update, insert, or delete the "key: value" line in *file*.

    The first line matching ``key:<anything>`` is replaced by
    ``"key: value"`` (or deleted when *value* is None). If no line
    matches and *value* is not None, the new line is appended.
    *casefold* (default true) makes the key match case-insensitive.

    Bug fixes versus the original: the result is written back to
    *file* (the original wrote to the undefined name ``tempfile``),
    the rewritten/appended line keeps its trailing newline, and the
    removed Python-1.x ``regex`` module is replaced by :mod:`re`.
    """
    import re
    try:
        with open(file, 'r') as f:
            lines = f.readlines()
    except IOError:
        lines = []  # missing file: start from scratch
    # NOTE(review): as in the original, *key* is interpolated into the
    # pattern unescaped, so regex metacharacters in it act as a pattern.
    # The trailing \n makes the pattern match only complete lines
    # (the original required match length == len(line)).
    pat = key + r':(.*)\n'
    prog = re.compile(pat, re.IGNORECASE if casefold else 0)
    if value is None:
        newline = None  # request to delete the matching line
    else:
        newline = '%s: %s\n' % (key, value)
    for i in range(len(lines)):
        if prog.match(lines[i]):
            if newline is None:
                del lines[i]
            else:
                lines[i] = newline
            break
    else:
        if newline is not None:
            lines.append(newline)
    with open(file, 'w') as f:
        f.writelines(lines)
示例6: test_post
def test_post(title, body, user_name, site, is_answer, body_is_summary):
    """Run every FindSpam rule against one post.

    Returns a list of reason strings, one per rule that matched the
    title, username, or body. For body matches the reason's "{}"
    placeholder is filled with "answer" or "body".

    Perf fix: each rule's pattern is now compiled once instead of
    three times (once per field) per rule.
    """
    result = []
    for rule in FindSpam.rules:
        body_to_check = body
        if rule['stripcodeblocks']:
            # Code blocks routinely contain rule-triggering text; drop
            # them before matching.
            body_to_check = regex.sub("<pre>.*?</pre>", "", body, flags=regex.DOTALL)
            body_to_check = regex.sub("<code>.*?</code>", "", body_to_check, flags=regex.DOTALL)
        if rule['all'] != (site in rule['sites']):
            # Compile once, reuse for all three fields.
            compiled = regex.compile(rule['regex'], regex.UNICODE)
            matched_title = compiled.findall(title)
            matched_username = compiled.findall(user_name)
            matched_body = compiled.findall(body_to_check)
            # The KeyError below is raised by rule['validation_method']
            # when the rule defines no extra validation hook.
            if matched_title and rule['title']:
                try:
                    if getattr(FindSpam, "%s" % rule['validation_method'])(matched_title):
                        result.append(rule['reason'])
                except KeyError:  # There is no special logic for this rule
                    result.append(rule['reason'].replace("{}", "title"))
            if matched_username and rule['username']:
                try:
                    if getattr(FindSpam, "%s" % rule['validation_method'])(matched_username):
                        result.append(rule['reason'])
                except KeyError:  # There is no special logic for this rule
                    result.append(rule['reason'].replace("{}", "username"))
            if matched_body and rule['body'] and (not body_is_summary or rule['body_summary']):
                type_of_post = "answer" if is_answer else "body"
                try:
                    if getattr(FindSpam, "%s" % rule['validation_method'])(matched_body):
                        result.append(rule['reason'].replace("{}", type_of_post))
                except KeyError:  # There is no special logic for this rule
                    result.append(rule['reason'].replace("{}", type_of_post))
    return result
示例7: tlg_plaintext_cleanup
def tlg_plaintext_cleanup(text, rm_punctuation=False, rm_periods=False):
    """Remove and substitute post-processing for Greek TLG text.

    Strips editorial markup, Latin letters and digits, optionally
    strips punctuation (and periods/ano teleia when *rm_periods*),
    then collapses all whitespace runs to single spaces.

    TODO: Surely more junk to pull out. Please submit bugs!

    Perf fix: the original built the punctuation-stripped string with
    char-by-char ``+=`` concatenation (quadratic); replaced with one
    ``''.join`` pass over a set.
    """
    remove_comp = regex.compile(r'-\n|«|»|<|>|\.\.\.|‘|’|_|{.+?}|\(.+?\)|[a-zA-Z0-9]', flags=regex.VERSION1)
    text = remove_comp.sub('', text)
    if rm_punctuation:
        punctuation = {',', '·', ':', '"', "'", '?', '-', '!', '*', '[', ']', '{', '}'}
        if rm_periods:
            punctuation |= {'.', ';'}
        new_text = ''.join(c for c in text if c not in punctuation)
        # Preserve the original quirk: if stripping left nothing at
        # all, keep the unstripped text.
        if new_text:
            text = new_text
    # replace line breaks w/ space, then collapse whitespace runs
    replace_comp = regex.compile(r'\n')
    text = replace_comp.sub(' ', text)
    comp_space = regex.compile(r'\s+')
    text = comp_space.sub(' ', text)
    return text
示例8: setliteral
def setliteral(self, tag):
    """Enter literal (CDATA-like) mode: input is passed through
    untouched until the end tag for *tag* is recognised."""
    self.literal = 1
    # End-tag recogniser built from the SGML delimiter constants:
    # "</tag<whitespace>*>".  (Renamed from `re`, which shadowed the
    # conventional module name.)
    pattern = "%s%s[%s]*%s" % (ETAGO, tag, string.whitespace, TAGC)
    if self._normfunc is string.lower:
        # Parser normalises to lower case -> match end tag casefolded.
        self._lit_etag_re = regex.compile(pattern, regex.casefold)
    else:
        self._lit_etag_re = regex.compile(pattern)
示例9: __init__
def __init__(self, src, javaFlag=0):
    """Create the converter and pre-compile the formula regexes.

    Turns on the title, brief description and detailed description
    markup; these are turned off when inside member documentation.

    Fix: this text was previously a no-op bare string *after* the
    super call; it is now the method docstring.
    """
    Doxy2SWIG.__init__(self, src, javaFlag)
    self.FilterTitle = True
    self.sitkClassName = ''
    self.EmptyText = False
    # compiled regular expressions
    # common formula types in xml version of documentation
    self.dollarFormula = re.compile("^\\$(.+)\\$$")
    self.arrayFormula = re.compile("^\\\\\\[(.+)\\\\\\]$")
    # more complex formula layout, that breaks R documentation
    # checks.
    self.mathstuff1 = re.compile(r"\\begin\{array\}\{[^}]+\}")
    self.mathstuff2 = re.compile(r"\\begin\{array\}")
    self.mathstuff3 = re.compile(r"\\end\{array\}")
    # a complex recursive regular expression (the regex module's (?1)
    # subpattern recursion), to deal with formulae nested inside
    # \mbox{...} and \text{...} structures
    self.mathstuff4 = regex.compile(r"\\mbox({((?>[^}{]*(?1)?)*)})", flags=regex.V1)
    self.mathstuff5 = regex.compile(r"\\text({((?>[^}{]*(?1)?)*)})", flags=regex.V1)
    # the special doxygen tags - note - not greedy
    self.mathstuff6 = re.compile(r"\\f\$(.+?)\\f\$")
    # alignment tags
    self.mathstuff7 = re.compile(r" & ")
示例10: all_caps_text
def all_caps_text(s, site):
s = regex.sub("<[^>]*>", "", s) # remove HTML tags
s = regex.sub("&\w+;", "", s) # remove HTML entities
if len(s) <= 150 and regex.compile(ur"SQL|\b(ERROR|PHP|QUERY|ANDROID|CASE|SELECT|HAVING|COUNT|GROUP|ORDER BY|INNER|OUTER)\b").search(s):
return False, "" # common words in non-spam all-caps titles
if len(s) >= 25 and regex.compile(ur"^(?=.*\p{upper})\P{lower}*$", regex.UNICODE).search(s):
return True, "All in caps"
示例11: _replace_for
def _replace_for(self, text, nested_position, keyword_number=1):
    """Find and replace the ``% for ...: ... % endfor`` loops of the
    mail.template, creating a keyword record for each loop found.

    :param text: mail.template text
    :param nested_position: counts how nested the current pass is
    :param keyword_number: counts how many for-loops we found
    :return: simplified text without the loop code, keywords found
    """
    # Patterns: loops wrapped in <ul>...</ul> versus plain loops.
    plain_loop = r'(% for .*?:$)(.*?)(% endfor)'
    ul_loop = r'(?:<ul[^<]*?)(% for .*?:$)(.*?)(% endfor)(.*?</ul>)'
    flags = re.DOTALL | re.MULTILINE
    # Pass 1: <ul>-wrapped loops take priority.
    simple_text, found_keywords = self._replace_for_type(
        text, nested_position, keyword_number, 'for_ul',
        re.compile(ul_loop, flags=flags))
    keyword_number += len(found_keywords)
    # Pass 2: the remaining regular loops.
    simple_text, extra_keywords = self._replace_for_type(
        simple_text, nested_position, keyword_number, 'for',
        re.compile(plain_loop, flags=flags))
    found_keywords |= extra_keywords
    return simple_text, found_keywords
示例12: __init__
def __init__(self, directory_name):
    """Initialise the scorer for the corpus under *directory_name*."""
    self.directory = directory_name
    # unigram -> occurrence count
    self.unigram_frequency = Counter()
    # trigram table; presumably populated by a later load step -- TODO confirm
    self.trigrams = dict()
    # parses "w1 w2 w3\tcount" lines from a trigram data file
    self.trigram_load_pattern = re2.compile(r'^([^ ]*) ([^ ]*) ([^\t]*)\t(\d*)')
    # middle tokens must be purely alphanumeric; \p{posix_alnum} is a
    # `regex`-package POSIX class (re2 is presumably `import regex as re2`)
    self.middle_token_pattern = re2.compile(r'^\p{posix_alnum}*$', re2.UNICODE)
    super(FileScorer, self).__init__()
示例13: _reload_allowed_list_file
def _reload_allowed_list_file(self):
    '''(Re)loads the list of rules for non-segment borders, i.e. rules
    that stop a possible segment border from being split (unless forced
    by a forcing rule specified for the stop rule). Stop rules come in
    pairs: the first pattern is matched against the segment to the
    left, the second against the segment to the right. The filename is
    given in __init__(); the default file is "./data/stop_list".
    See __init__() and segment() for more about the algorithm.
    ATTENTION: note that verbose regexps are used.'''
    with open(self._allowed_list_filename, 'r') as f:
        raw_lines = f.readlines()
    self._allowed_regexps = list()
    left_pattern = ''
    right_pattern = ''
    # Rules must appear in order: LEFT: first, then its RIGHT: partner.
    for raw in raw_lines:
        if raw.startswith('LEFT:'):
            left_pattern = regex.compile(raw[5:], regex.VERBOSE)
        elif raw.startswith('RIGHT:'):
            right_pattern = regex.compile(raw[6:], regex.VERBOSE)
            # A RIGHT: line completes the pair; store it and reset.
            self._allowed_regexps.append((left_pattern, right_pattern))
            left_pattern = ''
            right_pattern = ''
        # Everything else is ignored.
示例14: __init__
def __init__(self):
    """Set up the inline-markup parser: a state machine whose states
    handle each markup construct, plus the compiled token patterns."""
    # These attributes are set by the parse method
    self.doc = None
    self.para = None
    self.current_string = None
    self.flow = None
    # One state per inline construct; END is the sole terminal state.
    self.stateMachine = StateMachine()
    self.stateMachine.add_state("PARA", self._para)
    self.stateMachine.add_state("ESCAPE", self._escape)
    self.stateMachine.add_state("END", None, end_state=1)
    self.stateMachine.add_state("ANNOTATION-START", self._annotation_start)
    self.stateMachine.add_state("CITATION-START", self._citation_start)
    self.stateMachine.add_state("BOLD-START", self._bold_start)
    self.stateMachine.add_state("ITALIC-START", self._italic_start)
    self.stateMachine.add_state("CODE-START", self._code_start)
    self.stateMachine.add_state("QUOTES-START", self._quotes_start)
    self.stateMachine.add_state("INLINE-INSERT", self._inline_insert)
    self.stateMachine.add_state("CHARACTER-ENTITY", self._character_entity)
    self.stateMachine.set_start("PARA")
    # Token recognisers; (?<!\\) guards make backslash-escaped
    # delimiters inert.
    self.patterns = {
        'escape': re.compile(r'\\', re.U),
        'escaped-chars': re.compile(r'[\\\(\{\}\[\]_\*,\.\*`"&]', re.U),
        'annotation': re.compile(
            r'(?<!\\)\{(?P<text>.*?)(?<!\\)\}(\(\s*(?P<type>\S*?\s*[^\\"\']?)(["\'](?P<specifically>.*?)["\'])??\s*(\((?P<namespace>\w+)\))?\s*(~(?P<language>[\w-]+))?\))?', re.U),
        'bold': re.compile(r'\*(?P<text>((?<=\\)\*|[^\*])*)(?<!\\)\*', re.U),
        'italic': re.compile(r'_(?P<text>((?<=\\)_|[^_])*)(?<!\\)_', re.U),
        'code': re.compile(r'`(?P<text>(``|[^`])*)`', re.U),
        'quotes': re.compile(r'"(?P<text>((?<=\\)"|[^"])*)(?<!\\)"', re.U),
        'inline-insert': re.compile(r'>\((?P<attributes>.*?)\)', re.U),
        'character-entity': re.compile(r'&(\#[0-9]+|#[xX][0-9a-fA-F]+|[\w]+);'),
        'citation': re.compile(r'(\[\s*\*(?P<id>\S+)(\s+(?P<id_extra>.+?))?\])|(\[\s*\#(?P<name_name>\S+)(\s+(?P<extra>.+?))?\])|(\[\s*(?P<citation>.*?)\])', re.U)
    }
示例15: clean_line
def clean_line(line):
    """Clean one line of Hebrew commentary text (simpler variant of
    the other clean_line example: no quote normalisation and no
    "lo manu" handling).

    Strips vocalisation and colons, removes bracketed "Alfasi"
    references, rewrites Tur/Shulchan-Arukh citations to the standard
    abbreviation, truncates at "see the Tur" cross-references, then
    unwraps brackets/parentheses: bracketed text is always kept
    (unwrapped); parenthesised text is kept only when the line has no
    bracketed text.

    NOTE(review): Python 2 source (``ur''`` literals); depends on the
    external helper strip_nikkud().
    """
    line = strip_nikkud(line)
    line = re.sub(u':', '', line)
    reg_parentheses = re.compile(u'\((.*?)\)')
    reg_brackets = re.compile(u'\[(.*?)\]')
    # Search BEFORE the substitutions below so the bracket/paren
    # decision reflects the original line.
    in_per = reg_parentheses.search(line)
    in_bra = reg_brackets.search(line)
    # "ve-ayen ba-Tur" cross-reference; line is truncated at the match.
    reg_ayyen_tur = re.compile(u'''ו?(עיין|עי'|ע"ש) בטור''')
    line = re.sub(u'\[.*?אלפס.*?\]', u'', line)
    line = re.sub(u'טור ו?שו"ע', u'טוש"ע', line)
    pos = re.search(reg_ayyen_tur, line)
    if pos:
        line = line[:pos.start()]
    if in_per:
        if in_bra:
            clean = re.sub(reg_brackets, ur'\1', line) # brackets are always correct
            clean = re.sub(reg_parentheses, '', clean)
        else:
            clean = re.sub(reg_parentheses, ur'\1', line)
    elif in_bra:
        clean = re.sub(reg_brackets, ur'\1', line) # brackets are always correct
    else:
        clean = line
    return clean