This article collects typical usage examples of the re.sub function from Python's re module. If you have been wondering exactly how to use re.sub, or are looking for working examples of it, the hand-picked code samples below may help.
Fifteen code examples of the sub function are shown below, taken from real projects and ordered by popularity by default.
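Before the project examples, here is a minimal sketch (not taken from any of the projects below) of the basic call: re.sub(pattern, repl, string, count=0, flags=0) returns a copy of string in which every non-overlapping match of pattern is replaced by repl, which may be a plain string or a function of the match object. The shorter sketches later in this page assume the same import of re.
import re

# Collapse runs of whitespace into single spaces.
print(re.sub(r"\s+", " ", "hello \t  world"))        # hello world

# repl can be a callable that receives each match object.
print(re.sub(r"\d+", lambda m: str(int(m.group()) * 2), "3 apples, 12 pears"))  # 6 apples, 24 pears

# count limits the number of replacements.
print(re.sub("-", "_", "a-b-c", count=1))            # a_b-c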
Example 1: _parse_productions
def _parse_productions(self):
    """
    Parse the current contents of the textwidget buffer, to create
    a list of productions.
    """
    productions = []

    # Get the text, normalize it, and split it into lines.
    text = self._textwidget.get('1.0', 'end')
    text = re.sub(self.ARROW, '->', text)
    text = re.sub('\t', ' ', text)
    lines = text.split('\n')

    # Convert each line to a CFG production
    for line in lines:
        line = line.strip()
        if line == '': continue

        productions += parse_cfg_production(line)
        #if line.strip() == '': continue
        #if not CFGEditor._PRODUCTION_RE.match(line):
        #    raise ValueError('Bad production string %r' % line)
        #
        #(lhs_str, rhs_str) = line.split('->')
        #lhs = Nonterminal(lhs_str.strip())
        #rhs = []
        #def parse_token(match, rhs=rhs):
        #    token = match.group()
        #    if token[0] in "'\"": rhs.append(token[1:-1])
        #    else: rhs.append(Nonterminal(token))
        #    return ''
        #CFGEditor._TOKEN_RE.sub(parse_token, rhs_str)
        #
        #productions.append(Production(lhs, *rhs))

    return productions
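The two re.sub calls in this example only normalize the buffer before it is split into lines. A small sketch of the same normalization in isolation; the arrow pattern here is a hypothetical stand-in, since the real value of self.ARROW is not shown in the snippet above:
ARROW = u'\u2192'   # hypothetical stand-in for self.ARROW
text = u"S \u2192 NP\tVP\nVP \u2192 V NP"
text = re.sub(ARROW, '->', text)
text = re.sub('\t', ' ', text)
print(text.split('\n'))   # ['S -> NP VP', 'VP -> V NP']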
Example 2: getPanelInfo
def getPanelInfo(self, doc, strXPath):
    try:
        npos = doc.text_content().find(strXPath)
        if npos == -1:
            return ""
        strContent = doc.text_content()[npos:-1]
        npos = strContent.find("})")
        if npos == -1:
            return ""
        strContent = strContent[0:npos+1]
        strContent = strContent[strContent.find("\"html\":\"")+8:-4]
        if "v2" in self.xpathType:
            strContent = strContent.decode('unicode-escape')
        strContent = re.sub(r"(\\n)*(\\t)*(\\ /)*(\\)*", "", strContent)
        strContent = re.sub(r"\\/", "/", strContent)
        if strContent:
            strContent = strContent.replace("&lt;", "<").replace("&gt;", ">").replace("nbsp;", "")
        else:
            return ""
    except Exception:
        s = sys.exc_info()
        msg = (u"getPanelInfo Error %s happened on line %d" % (s[1], s[2].tb_lineno))
        logger.error(msg)
        return ""
    return strContent
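The two substitutions undo JSON-style escaping in the extracted HTML fragment. A sketch on made-up input; note that the trailing (\\)* group in the first pattern already removes every backslash, so the second re.sub finds nothing left to replace here:
frag = r'<div class=\"item\">\n\t<a href=\"http:\/\/example.com\">x<\/a>'
frag = re.sub(r"(\\n)*(\\t)*(\\ /)*(\\)*", "", frag)
frag = re.sub(r"\\/", "/", frag)
print(frag)   # <div class="item"><a href="http://example.com">x</a>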
Example 3: GetBook
def GetBook(self, book):
    self.footnotes = []
    self.content = []
    counter = 1
    plainBook = unicodeToPlain(book)
    while True:
        url = 'http://www.biblia.deon.pl/otworz.php'
        values = {'ksiega': book.encode('iso8859_2'),
                  'rozdzial': str(counter)}
        data = urllib.urlencode(values)
        response = urllib2.urlopen(urllib2.Request(url, data)).read()
        doc = html.fromstring(response)
        if counter == 1:
            BookTitle = (doc.findall('.//span[@style="font-size:22px;"]')[0])
            self.content.append(re.sub(r'</span>', r'</div>', re.sub(r'<span style=\"font-size:22px;\"', r'<br><br><a name="K' + plainBook + r'"></a><div class="tytul"', html.tostring(BookTitle))))
            ChaptersInBook = len(doc.findall('.//select[@name="rozdzial"]/option'))
        else:
            self.content.append('<br><br>')
        plainPrefix = plainBook + str(counter)
        self.content.append('<div class="numer">' + str(counter) + '</div>')
        Book.GetContent(self, doc.xpath('//div[@class="tresc"]')[0], plainPrefix)
        Book.GetFootnotes(self, doc.xpath('//td[@width="150"]/table/tr[5]/td/div[1]')[0], plainPrefix, unicodeToReference(book) + ' ' + str(counter))
        if counter == ChaptersInBook:
            self.content.append('<br><br>' + "".join(self.footnotes))
            break
        counter += 1
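The nested re.sub calls rewrite the scraped title span into the book page's own markup. Roughly, with a hypothetical plainBook value and a hard-coded title string standing in for html.tostring(BookTitle):
plainBook = "Rdz"   # hypothetical plain book name
title_html = '<span style="font-size:22px;">Ksiega Rodzaju</span>'
out = re.sub(r'</span>', r'</div>',
             re.sub(r'<span style=\"font-size:22px;\"',
                    r'<br><br><a name="K' + plainBook + r'"></a><div class="tytul"',
                    title_html))
print(out)   # <br><br><a name="KRdz"></a><div class="tytul">Ksiega Rodzaju</div>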
Example 4: extract_bow_v2_features
def extract_bow_v2_features(train, test, test_contains_labels=False):
    '''
    Performs feature extraction for another simple tfidf model used for
    ensembling purposes.
    '''
    s_data = []
    s_labels = []
    t_data = []
    t_labels = []
    stemmer = PorterStemmer()
    for i, row in train.iterrows():
        s = (" ").join(["q" + z for z in BeautifulSoup(train["search_term"][i], "lxml").get_text(" ").split(" ")]) + " " + (" ").join(["z" + z for z in BeautifulSoup(train.product_title[i], "lxml").get_text(" ").split(" ")]) + " " + BeautifulSoup(train.product_description[i], "lxml").get_text(" ")
        s = re.sub("[^a-zA-Z0-9]", " ", s)
        s = (" ").join([stemmer.stem(z) for z in s.split(" ")])
        s_data.append(s)
        s_labels.append(str(train["relevance"][i]))
    for i, row in test.iterrows():
        s = (" ").join(["q" + z for z in BeautifulSoup(test["search_term"][i], "lxml").get_text().split(" ")]) + " " + (" ").join(["z" + z for z in BeautifulSoup(test.product_title[i], "lxml").get_text().split(" ")]) + " " + BeautifulSoup(test.product_description[i], "lxml").get_text()
        s = re.sub("[^a-zA-Z0-9]", " ", s)
        s = (" ").join([stemmer.stem(z) for z in s.split(" ")])
        t_data.append(s)
        if test_contains_labels:
            t_labels.append(str(test["relevance"][i]))
    return (s_data, s_labels, t_data, t_labels)
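The cleanup inside each loop is a common re.sub idiom: replace everything that is not alphanumeric with a space, then stem whatever tokens survive. A sketch of just that step, with PorterStemmer from NLTK assumed, as in the script above:
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()
s = "Stainless-Steel Screws, #8 x 1-1/4 in."
s = re.sub("[^a-zA-Z0-9]", " ", s)
s = " ".join(stemmer.stem(z) for z in s.split(" ") if z)
print(s)   # e.g. "stainless steel screw 8 x 1 1 4 in"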
Example 5: gen_xkcd_sub
def gen_xkcd_sub(msg, hook=False):
    # http://xkcd.com/1288/
    substitutions = {
        'witnesses': 'these dudes I know',
        'allegedly': 'kinda probably',
        'new study': 'tumblr post',
        'rebuild': 'avenge',
        'space': 'SPAAAAAACCCEEEEE',
        'google glass': 'virtual boy',
        'smartphone': 'pokedex',
        'electric': 'atomic',
        'senator': 'elf-lord',
        'car': 'cat',
        'election': 'eating contest',
        'congressional leaders': 'river spirits',
        'homeland security': 'homestar runner',
        'could not be reached for comment': 'is guilty and everyone knows it'
    }
    # http://xkcd.com/1031/
    substitutions['keyboard'] = 'leopard'
    # http://xkcd.com/1418/
    substitutions['force'] = 'horse'
    output = msg
    if not hook or random() < 0.001 or True:
        for text, replacement in substitutions.items():
            if text in output:
                output = re.sub(r"\b%s\b" % text, replacement, output)
        output = re.sub(r'(.*)(?:-ass )(.*)', r'\1 ass-\2', output)
    if msg == output:
        return None if hook else msg
    else:
        return output
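The \b word boundaries in the pattern are what keep the substitutions from firing inside longer words. A hypothetical call, assuming the function above and its imports (re, random) are in scope:
print(gen_xkcd_sub("A new study says the election hinges on one senator."))
# A tumblr post says the eating contest hinges on one elf-lord.

# Without the \b anchors, 'car' would also match inside 'carpet':
print(re.sub(r"\bcar\b", "cat", "park the car on the carpet"))   # park the cat on the carpet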
Example 6: classifyText
def classifyText(text, params):
    start_time = params.my_time()

    # clean
    try: text = params.cleaner.clean_html(text)
    except: pass
    text = re.sub('<.*?>', ' ', text)
    text = re.sub('\s+', ' ', text)
    text = text.lower()

    # Tokenize
    tokens = re.findall('[a-z]+', text)

    # Remove stop words
    tokens_2 = []
    for t in tokens:
        if (not t in params.stopword_list): tokens_2.append(t)
    # print tokens_2

    # Stem
    stems = []
    for t in tokens_2:
        stem = params.porterStemmer.stem(t, 0, len(t)-1)
        stems.append(stem)

    z = 0  # params.linear_classifier['{{intercept}}']+.6
    for s in stems:
        if s in params.linear_classifier:
            # print s, params.linear_classifier[s]
            z += params.linear_classifier[s]

    end_time = params.my_time()
    return (z < 0, [start_time, end_time, len(stems), z, 1/(1+math.exp(-z)), int(z > 0)])
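The first two substitutions are the usual strip-tags / collapse-whitespace pair; in isolation:
text = "<p>Hello   <b>World</b></p>\n<div>again</div>"
text = re.sub('<.*?>', ' ', text)
text = re.sub('\s+', ' ', text)
print(re.findall('[a-z]+', text.lower()))   # ['hello', 'world', 'again']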
Example 7: htmlify
def htmlify(self, text):
    t = text.strip()
    #t = xml.sax.saxutils.escape(t)
    t = "<p>%s</p>" % t
    t = re.sub('\n\n+', '</p><p>', t)
    t = re.sub('\n', '<br>', t)
    return t
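For example, blank lines become paragraph breaks and single newlines become <br> tags:
t = "first paragraph\n\nsecond paragraph\nstill second"
t = "<p>%s</p>" % t.strip()
t = re.sub('\n\n+', '</p><p>', t)
t = re.sub('\n', '<br>', t)
print(t)   # <p>first paragraph</p><p>second paragraph<br>still second</p>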
Example 8: convert_corpus
def convert_corpus(filepath, mapping, alignment, begin="xxBeGiN142xx", end="xxEnD142xx"):
    general_corpus = ''
    with open(filepath, 'rb') as f:
        general_corpus = re.sub('(' + begin + '\W+)+', ' . ', f.read())
        general_corpus = re.sub('\n+', ' this_is_n3wline ', general_corpus)
    corpus = []
    for token in general_corpus.split():
        if token.strip() == '.':
            if len(corpus) > 0:
                if '\n' not in corpus[-1]:
                    # If the token is punctuation assign a random punctuation.
                    corpus[-1] = corpus[-1] + random.choice(['.', '.', '.', ',', ',', ',', '!', '?'])
        elif token.strip() == 'this_is_n3wline':
            corpus[-1] = corpus[-1] + '.\n\n'
        elif alignment[token] in mapping:
            if len(corpus) > 0 and re.search('[\n\.!?]', corpus[-1]):
                corpus.append(mapping[alignment[token]].capitalize().strip())
            else:
                corpus.append(mapping[alignment[token]].strip())
    corpus[0] = corpus[0].capitalize()
    output = ' '.join(corpus)
    output = re.sub(r' +', ' ', output)
    output = re.sub(r'\n+ ', '\n\n', output)
    return output
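The two closing substitutions just tidy the joined text: runs of spaces collapse to one, and a newline followed by a leftover space becomes a clean paragraph break. For example:
output = "word  word .\n\n Next sentence"
output = re.sub(r' +', ' ', output)
output = re.sub(r'\n+ ', '\n\n', output)
print(repr(output))   # 'word word .\n\nNext sentence'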
Example 9: getCategoryUrl
def getCategoryUrl(site="", url=""):
    catDb = openTable(tableName=global_setting['catTable'])
    r = session.get(url)
    if not r.text:
        return False
    soup = BeautifulSoup(r.text)
    for level1 in soup.select('.classify_books'):
        curLevel1 = level1.select('.classify_title')[0].text
        curLevel1 = re.sub('\s', '', curLevel1)
        for level2 in level1.select('.classify_kind'):
            curLevel2 = level2.select('.classify_kind_name')[0].text
            curLevel2 = re.sub('\s', '', curLevel2)
            for level3 in level2.select('ul li a'):
                #curLevel3 = re.sub('\s', '', level3.text)
                curLevel3 = level3.text.strip()
                curlUrl = level3['href']
                retFind = re.findall(r'\/cp(.*)\.html', curlUrl)
                if retFind:
                    curCatID = retFind[0]
                    catType = 'book'
                else:
                    retFind = re.findall(r'\/cid(.*)\.html', curlUrl)
                    if retFind:
                        curCatID = retFind[0]
                        catType = 'nonbook'
                if retFind:
                    if catDb.find({'catId': curCatID}).count() > 0:
                        logger.debug('category %s exists, skip\n' % (curCatID))
                    else:
                        catDb.insert({'catId': curCatID, 'level1': curLevel1, 'level2': curLevel2, 'level3': curLevel3, 'catUrl': curlUrl, 'catType': catType, 'site': site})
    return True
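Here re.sub('\s', '', ...) simply removes all whitespace, including newlines, from the scraped category labels, e.g.:
curLevel1 = "  文学  \n  艺术  "
print(re.sub('\s', '', curLevel1))   # 文学艺术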
Example 10: obfuscate_codeblocks
def obfuscate_codeblocks(source):
    """Method for obfuscating codeblocks contents.

    It can often be useful to temporarily obfuscate codeblock contents in order to
    safely perform some tasks and then re-introduce them.

    Parameters
    ----------
    source : str
        string (as single stream) containing the source

    Returns
    -------
    protected_contents : list
        list of str containing the contents of codeblocks
    str
        source with codeblocks contents obfuscated and replaced by a safe placeholder

    >>> source = '``` my code block ``` other contents'
    >>> prot, ob_source = obfuscate_codeblocks(source)
    >>> prot[0][2]
    '``` my code block ```'
    >>> ob_source
    '$PROTECTED-1 other contents'
    """
    obfuscate_source = source
    protected_contents = []
    for match in re.finditer(__regex_codeblock__, obfuscate_source):
        protected_contents.append([match.start(), match.end(), match.group()])
        obfuscate_source = re.sub(__regex_codeblock__, '$PROTECTED-' + str(len(protected_contents)), obfuscate_source, 1)
    for match in re.finditer(__regex_codeblock_html__, obfuscate_source):
        protected_contents.append([match.start(), match.end(), match.group()])
        obfuscate_source = re.sub(__regex_codeblock_html__, '$PROTECTED-' + str(len(protected_contents)), obfuscate_source, 1)
    return protected_contents, obfuscate_source
Example 11: makeIdentifier
def makeIdentifier(self, string):
    # Collapse whitespace, normalize unicode, strip punctuation, and
    # join the remaining words with underscores.
    string = re.sub(r"\s+", " ", string.strip())
    string = unicodedata.normalize('NFKD', safeEncode(string))
    string = re.sub(r"['\"!@#$&%^*\(\)_+\.,;:/]", "", string)
    string = re.sub(r"[_ ]+", "_", string)
    string = string.strip('_')
    return string.strip().lower()
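Leaving out the project-specific safeEncode/unicodedata step, the re.sub pipeline on its own behaves roughly like this (a sketch, using the punctuation class as reconstructed above):
s = "  The Dark  Side: of the Moon!  "
s = re.sub(r"\s+", " ", s.strip())                  # collapse whitespace
s = re.sub(r"['\"!@#$&%^*\(\)_+\.,;:/]", "", s)     # drop punctuation
s = re.sub(r"[_ ]+", "_", s)                        # spaces -> underscores
print(s.strip('_').lower())                         # the_dark_side_of_the_moon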
Example 12: _sanitize
def _sanitize(self, data):
    retv = ''
    if data.find('\x1b') != -1:
        tmp = filter(lambda x: x in string.printable, data)
        retv += re.sub('(\{|\}|\*|\%)', '', re.sub('\[[0-9\;]+m', '', tmp))
        return retv
    return data
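A quick check of the ANSI-stripping behaviour (a sketch; the string.printable filter already drops the ESC byte itself, and the Python 2 filter() call above returns a string, so the Python 3 equivalent joins explicitly):
import string
data = "\x1b[31mERROR\x1b[0m {critical}"
tmp = ''.join(c for c in data if c in string.printable)              # drops the ESC bytes
print(re.sub('(\{|\}|\*|\%)', '', re.sub('\[[0-9\;]+m', '', tmp)))   # ERROR critical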
Example 13: _clean_text
def _clean_text(self, text):
    """ Cleans up text before we make it into an HTML tree:
        1. Nukes <![CDATA stuff.
        2. Nukes XML encoding declarations
        3. Replaces </br> with <br/>
        4. Nukes invalid bytes in input
        5. ?
    """
    # Remove <![CDATA because it causes breakage in lxml.
    text = re.sub(r"<!\[CDATA\[", u"", text)
    text = re.sub(r"\]\]>", u"", text)

    # Remove <?xml> declaration in Unicode objects, because it causes an error:
    # "ValueError: Unicode strings with encoding declaration are not supported."
    # Note that the error only occurs if the <?xml> tag has an "encoding"
    # attribute, but we remove it in all cases, as there's no downside to
    # removing it. This moves our encoding detection to chardet, rather than
    # lxml.
    if isinstance(text, unicode):
        text = re.sub(r"^\s*<\?xml\s+.*?\?>", "", text)

    # Fix </br>
    text = re.sub("</br>", "<br/>", text)

    # Fix invalid bytes (http://stackoverflow.com/questions/8733233/filtering-out-certain-bytes-in-python)
    text = re.sub(u"[^\u0020-\uD7FF\u0009\u000A\u000D\uE000-\uFFFD\u10000-\u10FFFF]+", "", text)

    return text
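For instance, the CDATA removal and the </br> fix on a small snippet:
raw = '<![CDATA[<p>hello</p>]]> line one</br>line two'
cleaned = re.sub(r"<!\[CDATA\[", "", raw)
cleaned = re.sub(r"\]\]>", "", cleaned)
cleaned = re.sub("</br>", "<br/>", cleaned)
print(cleaned)   # <p>hello</p> line one<br/>line two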
Example 14: sendGPS
def sendGPS(self, head, cmd):
    maxcnt = 100
    string = head + cmd
    res = ""
    print "GPS SEND: '%s'" % string
    self.dev.flushInput()
    self.dev.write(string + "\r\n")
    for j in xrange(maxcnt):
        res = self.dev.readline()
        if len(res) > 0:
            res = re.sub("^\s+", "", res)
            res = re.sub("\s+$", "", res)
            print "RAW GPS REPLY: '%s'" % res
            pos = res.find(head)
            if pos != -1:
                res = res[pos:].split("*")[0]
                print "GPS REPLY: '%s'" % res
                return res
        else:
            print "ZERO REPLY"
            return None
    print "sendGPS: FAILED: '%s'" % res
Example 15: main
def main():
    cur_dir = os.path.dirname(__file__)
    os.chdir(os.path.join(cur_dir, ".."))
    modules = sys.argv[1:]

    if not modules:
        modules = ['django_evolution']

    p = subprocess.Popen(['pyflakes'] + modules,
                         stderr=subprocess.PIPE,
                         stdout=subprocess.PIPE,
                         close_fds=True)

    contents = p.stdout.readlines()

    # Read in the exclusions file
    exclusions = {}
    fp = open(os.path.join(cur_dir, "pyflakes.exclude"), "r")

    for line in fp.readlines():
        if not line.startswith("#"):
            exclusions[line.rstrip()] = 1

    fp.close()

    # Now filter the output against the exclusions
    for line in contents:
        line = line.rstrip()
        test_line = re.sub(r':[0-9]+:', r':*:', line, 1)
        test_line = re.sub(r'line [0-9]+', r'line *', test_line)

        if test_line not in exclusions:
            print line
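The two substitutions turn a concrete pyflakes warning into a wildcard form so it can be matched against the exclusions file. For example, on a made-up warning line:
line = "django_evolution/models.py:12: 'os' imported but unused"
test_line = re.sub(r':[0-9]+:', r':*:', line, 1)
test_line = re.sub(r'line [0-9]+', r'line *', test_line)
print(test_line)   # django_evolution/models.py:*: 'os' imported but unused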