本文整理匯總了Python中bs4.UnicodeDammit.lower方法的典型用法代碼示例。如果您正苦於以下問題:Python UnicodeDammit.lower方法的具體用法?Python UnicodeDammit.lower怎麼用?Python UnicodeDammit.lower使用的例子?那麼,這裡精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在類bs4.UnicodeDammit的用法示例。
在下文中一共展示了UnicodeDammit.lower方法的4個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Python代碼示例。
示例1: corpus_generator
# 需要導入模塊: from bs4 import UnicodeDammit [as 別名]
# 或者: from bs4.UnicodeDammit import lower [as 別名]
def corpus_generator(self):
    """Yield the corpus one whitespace-delimited token at a time.

    The file at ``self.corpus_path`` is read in binary mode. Each line is
    stripped, converted to text with ``UnicodeDammit`` and skipped when the
    result is empty. When ``self.lower`` is truthy the text is lower-cased
    before it is split. Progress is logged every 100000 non-blank lines.
    """
    nonblank_seen = 0
    with open(self.corpus_path, 'rb') as corpus_file:
        for raw_line in corpus_file:
            text = UnicodeDammit(raw_line.strip()).unicode_markup
            if not text:
                # Blank (or undecodable) line: nothing to emit or count.
                continue
            if self.lower:
                text = text.lower()
            nonblank_seen += 1
            if nonblank_seen % 100000 == 0:
                logging.info('Read {} nonblank lines'.format(nonblank_seen))
            for token in re.split(r'\s+', text):
                yield token
示例2: document_generator
# 需要導入模塊: from bs4 import UnicodeDammit [as 別名]
# 或者: from bs4.UnicodeDammit import lower [as 別名]
def document_generator(path, lower=False):
    """Yield each non-blank line of a file as a list of tokens.

    This is the default document reader: the file at ``path`` holds one
    document per line with tokens separated by whitespace. Any callable that
    yields lists of tokens can be substituted for it.

    Lines are read as bytes, stripped, and converted to text with
    BeautifulSoup's ``UnicodeDammit``; lines that come back empty are skipped.
    When ``lower`` is truthy the text is lower-cased before splitting.
    Progress is logged every 100000 non-blank lines.
    """
    nonblank_seen = 0
    with open(path, 'rb') as doc_file:
        for raw_line in doc_file:
            text = UnicodeDammit(raw_line.strip()).unicode_markup
            if not text:
                continue
            if lower:
                text = text.lower()
            nonblank_seen += 1
            if nonblank_seen % 100000 == 0:
                logging.info('Read {} nonblank lines'.format(nonblank_seen))
            yield re.split(r'\s+', text)
示例3: clean_google_title
# 需要導入模塊: from bs4 import UnicodeDammit [as 別名]
# 或者: from bs4.UnicodeDammit import lower [as 別名]
def clean_google_title(self, title):
    """Normalise a Google result title into readable and comparable forms.

    Args:
        title: The raw title. It is decoded with ``UnicodeDammit`` before
            any processing.

    Returns:
        A ``(readable_title, compact_title, has_dot)`` tuple:

        * ``readable_title`` - the title with ``<...>`` tags, filler
          characters and HTML entities removed and surrounding whitespace
          stripped.
        * ``compact_title`` - ``readable_title`` with every non-word
          character removed, lower-cased.
        * ``has_dot`` - True when the title contains an ellipsis
          ("..." or "…").
    """
    decoded = UnicodeDammit(title).unicode_markup

    # Search the decoded text rather than the raw argument so that a bytes
    # title cannot raise TypeError against the str pattern. For a str title
    # the decoded text is expected to equal the input (UnicodeDammit passes
    # str through), so the result is unchanged.
    has_dot = re.search(r"(\.\.\.|…)", decoded) is not None

    # clean step 1
    # BUGFIX: don't remove [xxx]. eg: "OQL[C++]: Ext...'
    cleaned = re.sub(r"(<(.*?)>)", "", decoded)

    # clean step 2, here title is readable
    # NOTE(review): the first alternative below is rendered as a plain space
    # in this copy; it may originally have been a non-breaking space or an
    # HTML entity - confirm against the upstream source.
    cleaned = re.sub(r"( |►|…)", "", cleaned)
    cleaned = re.sub(r"(&#.+?;|&.+?;)", "", cleaned)
    readable_title = cleaned.strip()

    # Shrink, only word characters left
    compact_title = re.sub(r"\W", "", readable_title).lower()
    return (readable_title, compact_title, has_dot)
示例4: on_pubmsg
# 需要導入模塊: from bs4 import UnicodeDammit [as 別名]
# 或者: from bs4.UnicodeDammit import lower [as 別名]
def on_pubmsg(self, c, e):
    """Handle a public message and run the first reaction that matches.

    Args:
        c: Unused here (presumably the IRC connection - confirm with the
            framework that invokes this handler).
        e: The message event. ``e.source.nick`` is the sender, ``e.target``
            the destination and ``e.arguments[0]`` the message text.

    Reactions, in order:
        1. ``<bot nick>: <command>`` is forwarded to ``self.do_command``.
        2. Nicks containing ``'zeltoph'`` are ignored entirely.
        3. Nicks listed in ``settings.VIPS`` are kicked with that probability.
        4. Table flips, chaos-star notes, ``.wiki`` lookups, ``wat?``,
           ``hail eris``, Gandhi spelling, facebook / pr0gramm links,
           the ``moin`` counter and ``eta`` registration.
        5. The title of every URL in the message is posted.
        6. Odd capitalisations of "AfRA" are occasionally corrected.
    """
    nick = e.source.nick
    target = e.target if is_channel(e.target) else nick

    def reply(msg):
        self.send(target, msg)

    def dm(msg):
        self.send(nick, msg)

    line = UnicodeDammit(e.arguments[0]).unicode_markup
    log(' \033[37m{}→{}\033[0m'.format(nick, line))

    # "<nick>: command" addresses the bot directly. Lower-case both sides so
    # the check also works when the bot's own nick contains capitals.
    a = line.split(":", 1)
    if len(a) > 1 and a[0].lower() == self.nick.lower():
        self.do_command(e, a[1].strip().lower(), nick, target, reply, dm)
        return

    # zeltofilter
    if 'zeltoph' in nick:
        return

    # Per-nick kick probability; unlisted nicks get 0 and are never kicked.
    kick_probability = settings.VIPS.get(nick, 0)
    if random() < kick_probability:
        self.kick(nick)

    if re.match(r'.*┻━┻.*', line):
        reply('┬─┬ノ(ಠ_ಠノ)')
        return

    # The nick is escaped because IRC nicks may contain regex metacharacters
    # such as "[", "]", "|" or "^".
    match = re.match(
        r'^({} *:)? *chaos-?([☆★☼☀*]|sternchen) *: ?(.*)$'.format(re.escape(self.nick)),
        line)
    if match:
        newcs = match.group(3)
        self.chaossternchen.append(newcs)
        self.sendchan('Chaos-☆ Nr. {} notiert: {}'.format(len(self.chaossternchen), newcs))
        return

    if line.startswith('.wiki '):
        wikipage = line[len('.wiki '):].strip()
        # Only plain page names are looked up; anything else is rejected.
        if re.match(r'^[-_+\w]+$', wikipage):
            wikiurl = 'http://afra-berlin.de/dokuwiki/doku.php?id={}'.format(wikipage)
            if 'Dieses Thema existiert noch nicht' in requests.get(wikiurl).text:
                reply("I'm sorry, I can't find a wiki page with that name.")
            else:
                reply(wikiurl)
        else:
            reply('Try to troll somebot else.')
        return

    if line == 'wat?':
        reply("I don't have a clue.")
        return

    if re.match(r'^hail eris[.!]* ', line.lower()):
        reply("All Hail Discordia!")
        return

    # Kick on the first Gandhi-like word that is not spelled "gandhi".
    for _1, match, _2 in re.findall(r'(^|\s)?(gh?ah?nh?dh?ih?)(\s|$)?', line, re.IGNORECASE):
        if not re.match(r'(^|\s)?gandhi(\s|$)?', match, re.IGNORECASE):
            self.kick(nick, "It's spelled Gandhi")
            return

    if re.search(r'https?://[-a-z0-9.]*facebook.com', line.lower()):
        reply('A facebook link? srsly? Get some self-respect!')
        return

    match = re.search(r'https?://pr0gramm.com/#(newest/\*/[0-9/]*)', line.lower())
    if match:
        reply('Fixed that pr0gramm link for you: http://pr0gramm.com/static/'+match.group(1))
        return

    # Join in on the fifth consecutive "moin"; any other line resets the run.
    if line == 'moin':
        self.moincount += 1
        if self.moincount == 5:
            reply('moin')
        return
    else:
        self.moincount = 0

    # Slice the prefix-stripped text, not the raw line, so that any number of
    # leading '.', '!' or '#' characters is handled (e.g. "..eta 5pm").
    unprefixed = line.lstrip('.!#')
    if unprefixed.startswith('eta '):
        eta = unprefixed[len('eta '):].strip()
        with self.db as db:
            # An empty ETA only clears the sender's previous entry.
            db.execute("DELETE FROM etas WHERE nick=?", (nick,))
            if eta:
                db.execute("INSERT INTO etas VALUES (DATETIME('now'), ?, ?)", (nick, eta))
        dm('ETA registered. Thanks!')
        return

    for url, *_ in re.findall(URL_REGEX, line.lower()):
        res = requests.get(url)
        if res.status_code == requests.codes.ok:
            soup = BeautifulSoup(res.text)
            # Pages without a <title> (or with an empty one) have nothing to
            # announce; skip them instead of failing on a missing attribute.
            title = soup.title
            if title is not None and title.string:
                reply(title.string)

    for _1, match, _2 in re.findall(r'(^|\s)(afra)(\s|$)', line, re.IGNORECASE):
        if match != 'AfRA' and match != 'afra' and random() < 0.1:
            reply("I'm sure you meant AfRA, not "+match)
            return