本文整理汇总了Python中regex.match函数的典型用法代码示例。如果您正苦于以下问题:Python match函数的具体用法?Python match怎么用?Python match使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了match函数的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: __init__
def __init__(self, text, lv=None, lc=None, vc=None):
    """Initialize from text plus a language-variety specifier.

    The variety may be given either as ``lv`` (a string in "xxx-000"
    format: 3-letter code, dash, 3+ digits) or as the pair ``lc``
    (3-letter ISO 639 code) and ``vc`` (variety number).

    Raises:
        TypeError: if ``text`` is not a str, or no variety was supplied.
        ValueError: if ``lv``, ``lc`` or ``vc`` is malformed.
    """
    if isinstance(text, Ex):
        # Copy text/lc/vc from another Ex instance.
        # NOTE(review): control still falls through to the lv/lc checks
        # below, so copying from an Ex without also passing lv or lc+vc
        # raises TypeError — confirm this is intended.
        self._text = text.text
        self._lc = text.lc
        self._vc = text.vc
    elif not isinstance(text, str):
        raise TypeError("text must be string")
    else:
        self._text = text
    if lv:
        if not re.match(r'^[a-z]{3}-[0-9]{3,}$', lv):
            raise ValueError("lv must be in the format xxx-000")
        self._lc = lv[:3]
        self._vc = int(lv[4:])
    elif lc and (vc is not None):  # `is not None`: 0 is a valid vc
        lc = lc.lower()
        if not re.match(r'^[a-z]{3}$', lc):
            raise ValueError("lc must be a 3-letter ISO 639 code")
        self._lc = lc
        try:
            vc = int(vc)
        except ValueError:
            raise ValueError("vc must be a positive integer")
        if vc < 0:
            # NOTE(review): zero is accepted (lv "xxx-000" yields vc == 0),
            # so "positive" in the message effectively means "non-negative".
            raise ValueError("vc must be a positive integer")
        self._vc = vc
    else:
        raise TypeError("{cls} requires lv".format(cls=self.__class__.__name__))
示例2: get_next_document
def get_next_document(h):
    """Read and return the next <doc ...>...</doc> block from byte stream *h*.

    Lines are UTF-8-decoded and stripped; blank lines are skipped. The
    opening <doc ...> tag gets several attribute fix-ups (forum flag,
    urldomain/tld extraction, broken attr values). Returns the list of
    lines for one document, or None at end of input.
    """
    while True:
        l = h.readline()
        if not l:
            # End of stream: no further document.
            doc = None
            break
        l = l.decode('utf-8').strip()
        if not l:
            continue
        if re.match(u'^<doc ', l, re.UNICODE):
            # Fix _unk_.
            l = re.sub(r'_unk_', 'unknown', l)
            # Forum detection. (Fix: re.UNICODE was previously passed as
            # re.sub's positional `count` argument, not as `flags`.)
            if re.match(RE_FORUM, l, re.UNICODE):
                l = re.sub(u'>$', r' forum="1">', l, flags=re.UNICODE)
            else:
                l = re.sub(u'>$', r' forum="0">', l, flags=re.UNICODE)
            # Host and tld extraction.
            l = re.sub(r'( url="https{0,1}://)([^/]+)\.([a-z]{2,4})(|/|%)([^"]*")', r'\1\2.\3\4\5 urldomain="\2.\3" tld="\3"', l)
            # Fix some known problems in doc attr values.
            l = re.sub(r'=" +"', r'="unknown"', l)  # fix: attr=" "
            l = re.sub(r'="([^"]+)\\" ', r'="\1" ', l)  # fix: attr="val\"
            doc = [l]
        else:
            # append() mutates the current list in place; the original
            # `doc = doc + [l]` rebuilt it every line (O(n^2) per doc).
            # NOTE(review): as before, a stream that does not start with a
            # <doc ...> line leaves `doc` unbound here and raises.
            doc.append(l)
            if re.match(u'^</doc>', l, re.UNICODE):
                break
    return doc
示例3: guess_split
def guess_split(majiribun, reading):
    """Guess which hiragana run in *reading* belongs to each kanji in
    *majiribun* (kanji-kana mixed text).

    Builds a greedy and a non-greedy capture regex over the reading;
    returns {kanji: reading_part} when both agree, None when the split
    is ambiguous (greedy != non-greedy) or the reading does not match.
    """
    kanjis = []
    matchreg_greedy = ''
    matchreg_nongreedy = ''
    for char in majiribun:
        if kanji_re.match(char):
            kanjis.append(char)
            # Raw strings: '\p' is an invalid escape in a plain literal
            # (SyntaxWarning in modern Python). \p{Hiragana} needs the
            # third-party `regex` module.
            matchreg_greedy += r"(\p{Hiragana}+)"
            matchreg_nongreedy += r"(\p{Hiragana}+?)"
        else:
            matchreg_greedy += re.escape(char)
            matchreg_nongreedy += re.escape(char)
    m = re.match(matchreg_greedy + '$', reading)
    if not m:
        return None
    yomis = m.groups()
    yomis_nongreedy = re.match(matchreg_nongreedy + '$', reading).groups()
    if yomis != yomis_nongreedy:
        # Greedy and non-greedy splits disagree -> ambiguous.
        return None
    return dict(zip(kanjis, yomis))
示例4: process_file
def process_file(file_path, tagger, idf_doc_count, idf_table, threshold, maximum_words):
    """
    Takes the uploaded file, detects its type (plain text, alto XML, zip)
    and calls a parsing function accordingly. If everything succeeds it
    returns keywords and 200 code, returns an error otherwise.
    """
    file_info = magic.from_file(file_path)
    lines = []
    # Fix: the parentheses must be escaped — unescaped they form a regex
    # group, so the pattern never matched libmagic's literal
    # "UTF-8 Unicode (with BOM) text" and BOM files fell through to the
    # plain UTF-8 branch without BOM stripping.
    if re.match(r"^UTF-8 Unicode \(with BOM\) text", file_info):
        lines = lines_from_txt_file(file_path, encoding='utf-8-sig')
    elif re.match("^UTF-8 Unicode", file_info):
        lines = lines_from_txt_file(file_path, encoding='utf-8')
    elif re.match("^ASCII text", file_info):
        # ASCII is a subset of UTF-8, so decode the same way.
        lines = lines_from_txt_file(file_path, encoding='utf-8')
    elif re.match('^XML 1.0 document', file_info) and \
            (file_path.endswith('.alto') or file_path.endswith('.xml')):
        lines = lines_from_alto_file(file_path)
    elif re.match('^Zip archive data', file_info):
        lines = lines_from_zip_file(file_path)
    else:
        # Fix: key was misspelled "eror", inconsistent with the
        # "Empty file" response below.
        return {"error": "Unsupported file type: {}".format(file_info)}, 400
    if not lines:
        return {"error": "Empty file"}, 400
    return keywords.get_keywords(lines, tagger, idf_doc_count, idf_table, threshold, maximum_words), 200
示例5: __init__
def __init__(self, room, s, negative=True):
    """Parse ban/filter expressions such as '/5m jid [email protected]' or
    'nick exp regexp' (roughly glux-style syntax).

    Sets self.end_time, self.reason, self.by_jid, self.regexp and
    self.value from the parsed string. Raises ValueError on an empty
    target, NickNotFound / NoJID when a bare nick cannot be resolved,
    and MyRegexpError on an invalid 'exp' pattern.
    """
    self.room = room
    self.negative = negative
    # fetch_time() strips a leading duration token like '/5m' and
    # returns (end_time, remainder).
    self.end_time, s = fetch_time(s)
    # '||' separates the target expression from an optional reason text.
    if s.count('||'): s, self.reason = s[:s.find('||')].strip(), s[s.find('||')+2:].strip()
    else: s, self.reason = s.strip(), ''
    if s.lower().startswith('jid '):
        self.by_jid = True
        s = s[4:].lower()
        if not s: raise ValueError
    elif s.lower().startswith('nick '):
        self.by_jid = False
        s = s[5:]
        if not s: raise ValueError
    else:
        # Bare nick: resolve it to a real JID via the room roster and
        # finish here (the 'exp ' handling below is skipped on purpose).
        self.by_jid = True
        self.regexp = False
        item = room.get(s, None)
        if item:
            # NOTE(review): jid == realjid is treated as "real JID not
            # available" — presumably an anonymous room; confirm.
            if item.jid == item.realjid: raise NoJID(item.jid)
            else: self.value = item.realjid.lower()
        else: raise NickNotFound(s)
        return
    # Only reached for the 'jid '/'nick ' prefixes: an 'exp ' marker
    # means the remaining value is a regular expression.
    if s.lower().startswith('exp '):
        self.regexp = True
        s = s[4:]
        # Validate the pattern by running it once against a dummy JID.
        # NOTE(review): the bare except also swallows non-regex errors.
        try: regex.match(s, '[email protected]')
        except: raise MyRegexpError(s)
    else: self.regexp = False
    self.value = s
示例6: testRegex
def testRegex(self):
    """Exercise match() over literal, anchored and starred patterns."""
    cases = [
        (1, "foo", "foobar"),     # literal, beginning of string
        (1, "oba", "foobar"),     # literal, middle of string
        (0, "obo", "foobar"),     # no match
        (1, "^fo", "foobar"),     # start anchor at start
        (0, "^bar", "foobar"),    # start anchor must not match mid-string
        (1, "bar$", "foobar"),    # end anchor at end
        (0, "foo$", "foobar"),    # end anchor must not match mid-string
        (1, "fo*b", "foobar"),    # star on a present char
        (1, "fooa*b", "foobar"),  # star matching zero occurrences
        (1, "a*foo", "foobar"),   # leading star matching zero occurrences
    ]
    for expected, pattern, text in cases:
        self.assertEqual(expected, match(pattern, text))
示例7: parseaddr
def parseaddr(address):
    """Split an RFC-822 style address into a (name, address) tuple.

    NOTE(review): this is ancient code built on the long-removed
    Python 1.x ``regex`` module, whose search()/match() return integer
    positions/lengths (-1 / failure on no match) and whose default
    emacs-style syntax treats '(', ')', '<', '>' as LITERAL characters
    — so '(.*)' below really searches for a parenthesized comment.
    It also uses ``string.strip`` from the old ``string`` module.
    It will not run on any modern Python without porting to ``re``.
    """
    # This is probably not perfect
    address = string.strip(address)
    # Case 1: part of the address is in <[email protected]> form.
    pos = regex.search('<.*>', address)
    if pos >= 0:
        # pos is the integer start of the '<...>' part.
        name = address[:pos]
        address = address[pos:]
        # length is the integer length of the matched '<...>' prefix.
        length = regex.match('<.*>', address)
        name = name + address[length:]
        address = address[:length]
    else:
        # Case 2: part of the address is in (comment) form
        pos = regex.search('(.*)', address)
        if pos >= 0:
            name = address[pos:]
            address = address[:pos]
            length = regex.match('(.*)', name)
            address = address + name[length:]
            name = name[:length]
        else:
            # Case 3: neither. Only an address
            name = ''
    name = string.strip(name)
    address = string.strip(address)
    # Strip the surrounding angle brackets / comment parentheses.
    if address and address[0] == '<' and address[-1] == '>':
        address = address[1:-1]
    if name and name[0] == '(' and name[-1] == ')':
        name = name[1:-1]
    return name, address
示例8: test_yaml
def test_yaml(md_filepath):
    """Validate a markdown file's YAML front matter: every required field
    present, links and dates correctly formatted, no unknown headers."""
    filestring = md_filepath.read()
    reg = regex.compile(r'^---(.*?)---',flags=regex.DOTALL)
    match = regex.search(reg, filestring)
    if not match: pytest.skip('No YAML header')
    yaml_text = match.group(1)
    # safe_load: yaml.load() without an explicit Loader is deprecated
    # (an error in PyYAML >= 6) and can construct arbitrary Python
    # objects from the document; metadata needs only plain types.
    parsed_yaml = yaml.safe_load(yaml_text)
    for requirement in requirements:
        req = requirements[requirement]
        if req['required']:
            assert requirement in parsed_yaml, 'YAML metadata missing required element: ' + requirement
        if req['type'] == 'link':
            # Check external links have balanced brackets
            regexp = regex.compile(r'\[(.*)\]\((.*)\)')
            assert regex.match(regexp,parsed_yaml[requirement]), 'YAML metadata formatting error: ' + requirement
        if req['type'] == 'date' and requirement in parsed_yaml:
            try:
                # parse() raises ValueError on unparseable dates;
                # the parsed value itself is not used.
                d = parse(str(parsed_yaml[requirement]))
            except ValueError:
                assert False, 'YAML metadata formatting error: ' + requirement + ' date parse failed.'
            regexp = regex.compile(r'20[0-9]{2}-[0-9]{2}-[0-9]{2}')
            assert regex.match(regexp,str(parsed_yaml[requirement])), 'YAML metadata formatting error: ' + requirement + ' should use the format YYYY-MM-DD.'
    for header in parsed_yaml:
        assert header in requirements, 'YAML metadata header ' + header + ' is not a valid metadata type.'
示例9: faiordict2contigorder
def faiordict2contigorder(file_name, file_format):
    '''Takes either a .fai or .dict file, and return a contig order dictionary, i.e., chrom_seq['chr1'] == 0'''
    assert file_format in ('fai', 'dict')
    contig_sequence = []
    with open(file_name) as gfile:
        line_i = gfile.readline().rstrip('\n')
        while line_i:
            # Fix: reset per line. Previously, a non-@SQ line in a .dict
            # file either hit an unbound contig_match (NameError when the
            # file starts with @HD) or reused the match from the previous
            # iteration, appending duplicate contigs.
            contig_match = None
            if file_format == 'fai':
                contig_match = re.match(r'([^\t]+)\t', line_i)
            elif file_format == 'dict':
                if line_i.startswith('@SQ'):
                    contig_match = re.match(r'@SQ\tSN:([^\t]+)\tLN:', line_i)
            if contig_match:
                # some .fai files have space after the contig for descriptions.
                contig_i = contig_match.groups()[0].split(' ')[0]
                contig_sequence.append(contig_i)
            line_i = gfile.readline().rstrip('\n')
    # Map each contig name to its 0-based order of appearance.
    chrom_seq = {contig_i: n for n, contig_i in enumerate(contig_sequence)}
    return chrom_seq
示例10: acroize_heading
def acroize_heading(m):
    """Rewrite a heading match so its leading section number is replaced
    by (or merged with) the document's acronym, wrapped in an
    <span class="acro"> element.

    NOTE(review): reads the module-level `text` dict — confirm its scope.
    """
    acro = text.get('acronym')
    if not acro:
        # No acronym configured: leave the matched heading untouched.
        return m[0]
    heading = m[2]
    if not heading:
        return acro
    # Split the heading into its number (optionally an en-dash range
    # like "3–5", optionally dot-terminated) and the remaining text.
    m2 = regex.match(r'(\d+(?:–(\d+))?)(?:\.)?\s*(.*)$', heading)
    if not m2:
        # Fix: h_num was previously left unbound on this path, raising
        # NameError at the comparison below whenever the heading had no
        # leading number.
        h_num = None
        h_text = heading
    else:
        h_num = m2[1]
        h_text = m2[3]
    # Split the acronym into prefix + trailing number/range.
    # (Consistency: use the already-bound `acro` instead of re-reading
    # text['acronym'] — same value.)
    # NOTE(review): m3 is None when the acronym has no trailing digits,
    # which crashes below — confirm acronyms always end in a number.
    m3 = regex.match(r'(.*?)(\d+(?:–(\d+))?)$', acro)
    acro_prefix = m3[1]
    acro_num = m3[2]
    if acro_num == h_num:
        heading = h_text
    elif '–' in acro_num and h_num:
        # Acronym carries a range: narrow it to the heading's number.
        acro = acro_prefix + h_num
        heading = h_text
    new_heading = f'<span class="acro">{acro}</span>{": " if h_text else ""}{h_text}'
    return f'{m[1]}{new_heading}'
示例11: process_lines
def process_lines(lines, NONBREAKING_PREFIX):
    """Group input lines into text blocks and run each through do_it_for().

    A block ends at a tag line (re_tag) or a blank line; the accumulated
    text plus the terminating line are processed, and a literal "<P>\\n"
    is emitted when a blank line follows accumulated text.
    """
    out_text = ''
    text = ""
    # Hoisted and raw: the blank-line pattern was built from a non-raw
    # '^\s*$' literal and matched up to twice per input line.
    blank_re = regex.compile(r'^\s*$')
    for line in lines:
        line = line.strip()
        is_tag = re_tag.match(line) is not None
        is_blank = blank_re.match(line) is not None
        if is_tag or is_blank:
            # End of a block: process what has accumulated so far.
            out_text += do_it_for(text, line, NONBREAKING_PREFIX)
            if is_blank and text:
                # Blank line after accumulated text marks a paragraph break.
                out_text += "<P>\n"
            text = ""
        else:
            # Accumulate the line, space-separated.
            text += line + " "
    # do the leftover text
    if text:
        out_text += do_it_for(text, "", NONBREAKING_PREFIX)
    return out_text
示例12: create_activation
def create_activation(data, labels, standard_cols, group_labels=[]):
    """Build a database.Activation from one row of extracted table data.

    data: the row's cell values; labels: the column labels;
    standard_cols: per-column standard attribute name or None.
    Python 2 code (uses ``unicode``). Validation problems are appended to
    activation.problems rather than raised.

    NOTE(review): ``group_labels=[]`` is a mutable default — safe only
    as long as no caller mutates the activation.groups it ends up in.
    """
    activation = database.Activation()
    for i, col in enumerate(data):
        # Cast to integer or float if appropriate
        # if regex.match('[-\d]+$', col):
        #     col = int(col)
        # elif regex.match('[-\d\.]+$', col):
        #     col = float(col)
        # Set standard attributes if applicable and do validation where appropriate.
        # Generally, validation will not prevent a bad value from making it into the
        # activation object, but it will flag any potential issues using the "problem" column.
        if standard_cols[i] is not None:
            sc = standard_cols[i]
            # Validate XYZ columns: Should only be integers (and possible trailing decimals).
            # If they're not, keep only leading numbers. The exception is that ScienceDirect
            # journals often follow the minus sign with a space (e.g., - 35), which we strip.
            if regex.match('[xyz]$', sc):
                # Re-join a minus sign separated from its digits by whitespace.
                m = regex.match('(-)\s+(\d+\.*\d*)$', col)
                if m:
                    col = "%s%s" % (m.group(1), m.group(2))
                if not regex.match('(-*\d+)\.*\d*$', col):
                    logging.debug("Value %s in %s column is not valid" % (col, sc))
                    activation.problems.append("Value in %s column is not valid" % sc)
                    # col = regex.search('(-*\d+)', col).group(1)
                    # NOTE(review): bails out of the whole row on the first
                    # invalid coordinate, skipping remaining columns and the
                    # groups assignment — confirm this early return is intended.
                    return activation
                col = (float(col))
            elif sc == 'region':
                if not regex.search('[a-zA-Z]', col):
                    logging.debug("Value in region column is not a string")
                    activation.problems.append("Value in region column is not a string")
            setattr(activation, sc, col)
        # Always include all columns in record
        activation.add_col(labels[i], col)
        # Handle columns with multiple coordinates (e.g., 45;12;-12).
        # Assume that any series of 3 numbers in a non-standard column
        # reflects coordinates. Will fail if there are leading numbers!!!
        # Also need to remove space between minus sign and numbers; some ScienceDirect
        # journals leave a gap.
        # NOTE(review): this tests the INDEX i for membership among the
        # VALUES of standard_cols — presumably `standard_cols[i] is None`
        # was meant; it works only because non-standard slots hold None
        # and indices never collide with names. Confirm.
        if not i in standard_cols:
            cs = '([\-\.\s]*\d{1,3})'
            m = regex.search('%s[,;\s]+%s[,;\s]+%s' % (cs, cs, cs), unicode(col).strip())
            if m:
                # Normalize "- 35" -> "-35" in each captured coordinate.
                x, y, z = [regex.sub('-\s+', '-', c) for c in [m.group(1), m.group(2), m.group(3)]]
                logger.info("Found multi-coordinate column: %s\n...and extracted: %s, %s, %s" % (col, x, y, z))
                activation.set_coords(x, y, z)
    activation.groups = group_labels
    return activation
示例13: parse_line
def parse_line(line, perv_url):
    """Parse one 'name1-name2-tags[,url]' line into its components.

    Expects two Cyrillic "First Last" names, a dot-separated tag list and
    a URL (or *perv_url* carried over from a previous line). Returns
    (name_1, name_2, link_types, url). Raises ValueError with an
    STR_* code string on any format violation.

    NOTE(review): Python 2 code — the `filter(...)` truthiness test below
    only works where filter returns a list, and the quoted-out block uses
    `basestring`.
    """
    if not line or len(line.strip()) == 0:
        raise ValueError("STR_EMPTY")
    line = line.strip()
    # Format: <name 1>-<name 2>-<tag.tag...>[,<url>]
    spt = line.split('-')
    if len(spt) == 3:
        name_1 = spt[0]
        name_2 = spt[1]
        attrs = spt[2]
        attrs_spt = attrs.split(',')
        # Either an explicit URL, or exactly one attr plus a carried-over URL.
        if not (len(attrs_spt) == 2 or (len(attrs_spt) == 1 and perv_url)):
            raise ValueError("STR_ENTRY_EMPTY")
        # Both names must be two Cyrillic words.
        if not name_1 \
            or not name_2 \
            or not regex.match("^["+_cryllic+"\s]+$", name_1)\
            or not regex.match("^["+_cryllic+"\s]+$", name_2)\
            or len(name_1.split(' ')) != 2\
            or len(name_2.split(' ')) != 2:
            raise ValueError("STR_NAME_FORMAT")
        if name_1 == name_2:
            raise ValueError("STR_SAME_NAMES")
        # An explicit URL together with a carried-over one is an error.
        if len(attrs_spt) == 2 and perv_url:
            raise ValueError("STR_TAG_FORMAT")
        # Dot-separated Cyrillic tags, no leading/trailing dot.
        if not regex.match("^(?!\.)["+_cryllic+"\.]+(?<!\.)$", attrs_spt[0]):
            raise ValueError("STR_TAG_FORMAT")
        link_types = attrs_spt[0].split('.')
        # NOTE(review): relies on py2 filter() returning a list; on py3
        # a filter object is always truthy and this would always raise.
        if filter(lambda x: not x, link_types):
            raise ValueError("STR_TAG_FORMAT")
        # Reject duplicated tags, reporting each one once.
        arr = collections.Counter(link_types)
        doubled_tags = set(i for i in arr if arr[i]>1)
        if len(doubled_tags) != 0:
            raise ValueError("STR_TAG_DOUBLED:" + ",".join(doubled_tags))
        url = attrs_spt[1] if len(attrs_spt) == 2 else perv_url
        if not regex.match("http://[\w\.]+/[\w]+$", url):
            raise ValueError("STR_LINK_FORMAT")
        # Disabled: similar-name / unknown-tag validation against `es`.
        """
        sim_names = list(es.get_similar_names([name_1, name_2]))
        if isinstance(sim_names[0], basestring):
            raise ValueError(u"STR_SIMILAR_NAME:{},{}".format(name_1,sim_names[0]))
        if isinstance(sim_names[1], basestring):
            raise ValueError(u"STR_SIMILAR_NAME:{},{}".format(name_2,sim_names[1]))
        tags = filter(lambda x: not x[1], zip(link_types, es.check_tags(link_types)))
        if len(tags) != 0:
            raise ValueError(u"STR_TAG_NOT_FOUND:{}".format(",".join(map(lambda x: x[0], tags))))
        """
        return (name_1, name_2, link_types, url)
    else:
        raise ValueError("STR_FORMAT")
示例14: test_zero_or_one
def test_zero_or_one(self):
    """'a?' must accept zero or one 'a' following the literal 'b'."""
    compiled = regex.build_regex("ba?")
    # Both the one-'a' and the zero-'a' forms should match.
    for candidate in ("ba", "b"):
        self.assertTrue(regex.match(compiled, candidate))
    # 'aa' lacks the mandatory leading 'b'.
    self.assertFalse(regex.match(compiled, "aa"))
示例15: process_marked_lines
def process_marked_lines(lines, markers, return_flags=[False, -1, -1]):
    """Run regexes against message's marked lines to strip quotations.
    Return all but the last quoted segment if it exists.
    >>> mark_message_lines(['Hello', 'From: [email protected]', '', '> Hi', 'tsem'])
    ['Hello']
    Also returns return_flags.
    return_flags = [were_lines_deleted, first_deleted_line,
                    last_deleted_line]

    NOTE(review): ``return_flags=[...]`` is a mutable default used as an
    out-parameter via slice assignment — all callers that omit it share
    one list. Also note two regex modules are in play: ``re`` and
    ``re_orig``.
    """
    # Pre-process marker sequence
    # if there are no splitter there should be no markers. However, allow markers if more than 3!
    if 's' not in markers and not re.search('(me*){3}', markers):
        markers = markers.replace('m', 't')
    # Look for forwards (don't remove anything on a forward)
    # if there is an f before the first split, then it's a forward.
    if re.match('[te]*f', markers):
        return_flags[:] = [False, -1, -1]
        return lines
    # Remove last quoted segment
    # match from the end of the markers list
    # NOTE(review): .replace() above implies `markers` is a str, but
    # .reverse() exists only on lists — one of the two must be wrong for
    # the actual marker type the callers pass; confirm (a str would need
    # markers[::-1] here).
    markers.reverse()
    # match for unmarked quote following split
    quotation = re.match(r'e*(te*)+(se*)+', markers)
    if not quotation:
        # match for inline replies
        # NOTE(review): this early return leaves `markers` reversed for
        # the caller, unlike the normal path which reverses it back below.
        if re_orig.match(r'e*[mfts]*((te*)+(me*)+)+[mfts]*((se*)+|(me*){2,})', markers):
            return_flags[:] = [False, -1, -1]
            return lines
        # match for normal reply with quote
        quotation = re_orig.match(r'e*(me*)+[mefts]*((se*)+|(me*){2,})', markers)
        if not quotation:
            # match for normal reply with quote and signature below quote
            if re.match(r'e*(te*)+(me*)+.*(s)+e*(te*)+', markers):
                quotation = re.match(r'e*(te*)+(me*)+.*(s)+', markers)
    # Restore original marker order before translating match offsets.
    markers.reverse()
    # If quotation, return it
    if quotation:
        # Translate offsets in the reversed sequence back to line indices.
        start = len(markers) - quotation.end() + 1
        end = len(markers) - quotation.start() - 1
        return_flags[:] = True, start, end
        return lines[:start] + lines[end:]
    return_flags[:] = [False, -1, -1]
    return lines