This article collects typical usage examples of the Python method bs4.UnicodeDammit.detwingle. If you are wondering what UnicodeDammit.detwingle does, how to call it, or what real-world uses look like, the curated code samples below should help. You can also read further about the class it belongs to, bs4.UnicodeDammit.
The following shows 15 code examples of the UnicodeDammit.detwingle method, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
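Before the project examples, here is a minimal sketch of the method's contract, based on the Beautiful Soup documentation rather than on any of the projects below: detwingle() is a class method that takes a byte string mixing UTF-8 with embedded Windows-1252 and returns a byte string that is consistently UTF-8.

from bs4 import UnicodeDammit

# Bytes that mix two encodings: a UTF-8 snowman plus a Windows-1252 curly quote.
mixed = u"\N{SNOWMAN}".encode("utf8") + u"\N{LEFT DOUBLE QUOTATION MARK}".encode("windows-1252")
fixed = UnicodeDammit.detwingle(mixed)  # returns bytes, now pure UTF-8
text = fixed.decode("utf8")             # safe to decode once detwingled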
Example 1: learn
# Required module import: from bs4 import UnicodeDammit [as alias]
# Or: from bs4.UnicodeDammit import detwingle [as alias]
def learn(self, name, phrase, channel):
    name = self.aliases.resolve(name)
    if name not in self.users:
        self.users[name] = True
    if "password" in phrase:
        return
    phrase = phrase.split(" ")
    phrase = filter(lambda x: x and "http" not in x and "ftp:" not in x and x[0] != ".", phrase)
    now = datetime.datetime.utcnow()
    documents = []
    for i in range(len(phrase) + 1):
        seed = UnicodeDammit.detwingle(phrase[i-1] if i > 0 else "")
        answer = UnicodeDammit.detwingle(phrase[i] if i < len(phrase) else "")
        documents.append({
            "name": name,
            "seed": seed,
            "answer": answer,
            "added": now,
            "random": random.random()
        })
    yield self.db.insert(documents, safe=True)
Example 2: ramble
# Required module import: from bs4 import UnicodeDammit [as alias]
# Or: from bs4.UnicodeDammit import detwingle [as alias]
def ramble(self, name=None, seed=""):
    if name:
        name = self.aliases.resolve(name)
        if name not in self.users:
            returnValue("")
    message = []
    if seed:
        seed = UnicodeDammit.detwingle(seed)
        chunk = seed
        while chunk and len(" ".join(message)) < 300:
            message.append(chunk)
            chunk = yield self.prev(name, chunk)
        message.reverse()
    chunk = yield self.next(name, seed)
    while chunk and len(" ".join(message)) < 300:
        message.append(chunk)
        chunk = yield self.next(name, chunk)
    if not chunk and len(" ".join(message)) < 30:
        chunk = yield self.next(name, chunk)
    response = (" ".join(message)).decode("utf8")
    if seed and response == seed.decode("utf8"):
        response = yield self.ramble(name)
    returnValue(response)
Example 3: __init__
# Required module import: from bs4 import UnicodeDammit [as alias]
# Or: from bs4.UnicodeDammit import detwingle [as alias]
def __init__(self, url):  # logs info, warning, error, critical, debug events.
    '''
    Description: This is the class constructor; it takes a simple URL as input and parses it based on RFC 1738.
    Status: In Progress.
    Usage: This will be used by the connection manager and the active/passive scanner to extract URL variables.
    '''
    self.url = UnicodeDammit.detwingle(url, 'UTF-8')
    self.defaultHttpsPort = 443
    self.defaultHttpPort = 80
    urlLogger.logInfo("--- Package: UrlManager - Module: UrlHandler Class: urlHandler Initiated ---")
Example 4: selectdir
# Required module import: from bs4 import UnicodeDammit [as alias]
# Or: from bs4.UnicodeDammit import detwingle [as alias]
def selectdir(geturl):
    r = scraper.get(geturl, stream=True, verify=False, proxies=proxystring, allow_redirects=True)
    rt = UnicodeDammit.detwingle(r.text)
    html = BeautifulSoup(rt.decode('utf-8'), "html.parser")
    if debug == 1:
        orenc = str(html.original_encoding)
        print('\n\033[40m\033[35;1mORIGINAL ENCODING: %s \033[0m\n' % orenc)
    findlinks = html.findAll('a')
    dirlist = []
    for link in findlinks:
        b = link.get('href')
        if not re.match(r'^((\.\.)?\/)$', str(b)):
            if re.search(r'^(.*)(\/)$', str(b)):
                dirlist.append(b)
    p = urlparse(geturl)
    part = p.path.split('/')[-1]
    path = p.path.rstrip(part)
    if '/' not in path[:1]:
        path = '/' + path
    urlfqdn = p.scheme + '://' + p.netloc
    parent = urlfqdn + path
    i = 0
    dirtotal = len(dirlist)
    if dirtotal > 0:
        print('\nFOUND %d DIRECTORIES: \n' % dirtotal)
        while i < dirtotal:
            sel = i + 1
            print(str(sel) + ' - ' + str(dirlist[i]))
            i += 1
        print('')
        lim = dirtotal + 1
        matchtop = r'^(%s)(\/)?$' % urlfqdn
        if not re.match(matchtop, geturl):
            print('0 - BACK TO PARENT DIRECTORY \n')
            startsel = '0-%d' % dirtotal
        else:
            startsel = '1-%d' % dirtotal
        selectdir = raw_input('make a selection [%s] --> ' % startsel)
        if not int(selectdir) in range(0, lim):
            selectdir = raw_input('invalid entry. please enter a selection %s --> ' % startsel)
        if selectdir == '0':
            geturl = parent
            subcont = 0
        else:
            n = int(selectdir) - 1
            usedir = dirlist[n]
            geturl = parent + usedir
            subcont = 1
    else:
        print('\nNO DIRECTORIES FOUND. using current directory.. \n')
        subcont = 0
        geturl = parent + part
    return geturl, subcont, parent
Example 5: formatForReddit
# Required module import: from bs4 import UnicodeDammit [as alias]
# Or: from bs4.UnicodeDammit import detwingle [as alias]
def formatForReddit(self, feedEntry, postType, subreddit, raw):
    if 'content' in feedEntry:
        content = feedEntry['content'][0]['value']
    elif 'description' in feedEntry:
        content = feedEntry.description
    else:
        content = ''
    logging.debug(content)
    parser = EveRssHtmlParser()
    title = feedEntry['title']
    # some feeds like Twitter are raw so the parser hates it.
    if (raw):
        regex_of_url = '(https?:\/\/[\dA-z\.-]+\.[A-z\.]{2,6}[\/\w&=#\.\-\?]*)'
        title = re.sub(regex_of_url, '', title)
        clean_content = content.replace(' pic.twitter.com', ' http://pic.twitter.com')
        clean_content = re.sub(regex_of_url, '<a href="\\1">link</a>', clean_content)
        clean_content = UnicodeDammit.detwingle(clean_content)
        #logging.info(clean_content)
        u = UnicodeDammit(clean_content,
                          smart_quotes_to='html',
                          is_html=False)
        # fix twitter putting ellipses on the end
        content = u.unicode_markup.replace(unichr(8230), ' ...')
        logging.debug('.....')
    if "tumblr.com" in content:
        # Replace with larger images (hopefully such images exist)
        content = content.replace('_500.', '_1280.')
    # Added the .replace because the parser does something funny to them and
    # removes them before I can handle them
    content = content.replace('&nbsp;', ' ')
    content = content.replace('•', '*').replace('·', '*')
    content = content.replace('“', '\'').replace('”', '\'')
    content = re.sub('( [ ]+)', ' ', content)
    parser.feed(content)
    parser.comments[0] = '%s\n\n%s' % (feedEntry['link'], parser.comments[0])
    parser.comments[-1] += self.config['signature']
    if 'author' in feedEntry:
        author = '~' + feedEntry['author'].replace('@', ' at ')
    else:
        author = ''
    return {'comments': parser.comments,
            'link': feedEntry['link'],
            'subreddit': subreddit,
            'title': '[%s] %s %s' % (postType, title, author)}
Example 6: normalize
# Required module import: from bs4 import UnicodeDammit [as alias]
# Or: from bs4.UnicodeDammit import detwingle [as alias]
def normalize(s):
    if isinstance(s, unicode):
        return s
    try:
        u = s.decode("utf8")
    except:
        try:
            u = (s[:-1]).decode("utf8")
        except:
            try:
                u = UnicodeDammit.detwingle(s).decode("utf8")
            except:
                u = UnicodeDammit(s, ["utf8", "windows-1252"]).unicode_markup
    return u
Example 7: getpage
# Required module import: from bs4 import UnicodeDammit [as alias]
# Or: from bs4.UnicodeDammit import detwingle [as alias]
def getpage(cfurl):
    r = scraper.get(cfurl, stream=True, verify=False, proxies=proxystring, allow_redirects=True)
    if 'text' in r.headers.get('Content-Type'):
        rt = UnicodeDammit.detwingle(r.text)
        html = BeautifulSoup(rt.decode('utf-8'), "html.parser")
        print('\r\n--------------------------------------------------------\r\n')
        if debug == 1:
            orenc = str(html.original_encoding)
            print('\n\033[40m\033[35;1mORIGINAL ENCODING: %s \033[0m\n' % orenc)
        bs = html.prettify(formatter=None)
        print(bs)
        print('\r\n--------------------------------------------------------\r\n')
    else:
        found = -1
    if debug == 1:
        print('\n\033[34mDEBUG: finished list length: \033[37;1m%d \033[0m\n' % len(finished))
Example 8: format
# Required module import: from bs4 import UnicodeDammit [as alias]
# Or: from bs4.UnicodeDammit import detwingle [as alias]
def format(self, script):
    dammit = UnicodeDammit.detwingle(script)
    soup = BeautifulSoup(dammit, from_encoding="utf8")
    header = soup.find('subtitle_script')
    header = "[Script Info]\nTitle: "+header['title']+"\nScriptType: v4.00+\nWrapStyle: "+header['wrap_style']+"\nPlayResX: 624\nPlayResY: 366\nScaledBorderAndShadow: yes\nYCbCr Matrix: TV.709\n\n"
    styles = "[V4+ Styles]\nFormat: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding\n"
    events = "\n[Events]\nFormat: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text\n"
    stylelist = soup.findAll('style')
    eventlist = soup.findAll('event')
    for style in stylelist:
        styles += "Style: " + style['name'] + "," + style['font_name'] + "," + style['font_size'] + "," + style['primary_colour'] + "," + style['secondary_colour'] + "," + style['outline_colour'] + "," + style['back_colour'] + "," + style['bold'] + "," + style['italic'] + "," + style['underline'] + "," + style['strikeout'] + "," + style['scale_x'] + "," + style['scale_y'] + "," + style['spacing'] + "," + style['angle'] + "," + style['border_style'] + "," + style['outline'] + "," + style['shadow'] + "," + style['alignment'] + "," + style['margin_l'] + "," + style['margin_r'] + "," + style['margin_v'] + "," + style['encoding'] + "\n"
    for event in eventlist:
        events += "Dialogue: 0,"+event['start']+","+event['end']+","+event['style']+","+event['name']+","+event['margin_l']+","+event['margin_r']+","+event['margin_v']+","+event['effect']+","+event['text']+"\n"
    formattedSubs = header+styles+events
    return formattedSubs
Example 9: to_unicode
# Required module import: from bs4 import UnicodeDammit [as alias]
# Or: from bs4.UnicodeDammit import detwingle [as alias]
def to_unicode(data, is_html=False, detwingle=False, verbose=True,
               lang=None):
    """Converts everything to unicode."""
    dammit = UnicodeDammit(data, is_html=is_html)
    if detwingle and dammit.original_encoding == 'windows-1252':
        new_data = UnicodeDammit.detwingle(data)
        dammit = UnicodeDammit(new_data, is_html=is_html)
    if verbose:
        sys.stderr.write("Original encoding (via BS): %s\n" %
                         (dammit.original_encoding))
    if lang is None:
        return dammit.unicode_markup
    if lang == 'auto':
        lang = _guess_lang_from_data(dammit.unicode_markup, is_html=is_html)
        if verbose:
            sys.stderr.write("Detected language: %s\n" % (lang))
    return _to_unicode_chared(data, lang, verbose=verbose)
Example 10: unicode_dammit_example
# Required module import: from bs4 import UnicodeDammit [as alias]
# Or: from bs4.UnicodeDammit import detwingle [as alias]
def unicode_dammit_example():
    # Install the 'chardet' or 'cchardet' Python libraries for better guesses
    ### Take a string with unknown encoding and make the string Unicode
    weirdass_string = "Sacr\xc3\xa9 bleu!"
    dammit = UnicodeDammit(weirdass_string)
    print "Original Word with weird encoding:", weirdass_string
    print "Dammit Print:", (dammit.unicode_markup)
    print "Dammit Type:", (dammit.original_encoding)
    ### Take a doc with mostly UTF-8 encoding (and misc encodings due to mult
    # data sources) and convert to UTF-8 Unicode with .Dammit.detwingle()
    snowmen = (u"\N{SNOWMAN}" * 3)
    quote = (u"\N{LEFT DOUBLE QUOTATION MARK}I like snowmen!\N{RIGHT DOUBLE QUOTATION MARK}")
    doc = snowmen.encode("utf8") + quote.encode("windows-1252")
    # So now we have one doc with two encodings in it, printing is a mess
    #print "Weird Decoding doc with utf8:", doc  # messed up, won't print
    #print (doc.decode("windows-1252"))  # So messed up it doesn't even print
    # Decoding with UnicodeDammit.detwingle() converts the string to pure UTF-8
    new_doc = UnicodeDammit.detwingle(doc)
    print new_doc.decode("utf8")
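As a small follow-up sketch (not part of the original example; it assumes the doc variable built above is still in scope), the effect of detwingle() can be made explicit: the mixed byte string refuses to decode as UTF-8 until it has been detwingled.

try:
    doc.decode("utf8")                    # the stray Windows-1252 bytes raise here
except UnicodeDecodeError:
    print("mixed document is not valid UTF-8")
print(UnicodeDammit.detwingle(doc).decode("utf8"))  # decodes cleanly after detwingle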
Example 11: to_unicode
# Required module import: from bs4 import UnicodeDammit [as alias]
# Or: from bs4.UnicodeDammit import detwingle [as alias]
def to_unicode(data, is_html=False, detwingle=False, verbose=False,
               lang=None):
    """ Produce unicode from text of unknown encoding.
    Input: bytestring """
    dammit = UnicodeDammit(data, is_html=is_html)
    if detwingle and dammit.original_encoding == 'windows-1252':
        new_data = UnicodeDammit.detwingle(data)
        dammit = UnicodeDammit(new_data, is_html=is_html)
    if verbose:
        sys.stderr.write("Original encoding (via BS): %s\n" %
                         (dammit.original_encoding))
    if lang is None:
        return dammit.unicode_markup
    if lang == 'auto':
        lang = TextSanitizer.guess_lang_from_data(
            dammit.unicode_markup, is_html=is_html)
        if verbose:
            sys.stderr.write("Detected language: %s\n" % (lang))
    return TextSanitizer._to_unicode_chared(data, lang, verbose=verbose)
Example 12: getlinks
# Required module import: from bs4 import UnicodeDammit [as alias]
# Or: from bs4.UnicodeDammit import detwingle [as alias]
def getlinks(cfurl):
    r = scraper.get(cfurl, stream=True, verify=False, proxies=proxystring, allow_redirects=True)
    rt = UnicodeDammit.detwingle(r.text)
    html = BeautifulSoup(rt.decode('utf-8'), "html.parser")
    if debug == 1:
        orenc = str(html.original_encoding)
        print('\n\033[40m\033[35;1mORIGINAL ENCODING: %s \033[0m\n' % orenc)
    bs = html.prettify(formatter=None)
    linkresult = html.findAll('a')
    if len(linkresult) > 0:
        foundlinks = len(linkresult)
        print('\nFOUND %s LINKS AT %s:\n' % (str(foundlinks), cfurl))
        for link in linkresult:
            b = link.get('href')
            b = str(b)
            if b not in cfurl and not re.match(r'^(\.\.)?\/$', b):
                print(b)
        print('')
    else:
        print('\nNO LINKS FOUND.\n')
        foundlinks = 0
    time.sleep(4)
    return foundlinks
Example 13: slim_html
# Required module import: from bs4 import UnicodeDammit [as alias]
# Or: from bs4.UnicodeDammit import detwingle [as alias]
def slim_html(self, raw_html):
    doc = UnicodeDammit.detwingle(raw_html)
    soup = BeautifulSoup(doc, "html5lib", from_encoding="utf-8")
    return soup.prettify().encode("utf-8")
Example 14: followlinks
# Required module import: from bs4 import UnicodeDammit [as alias]
# Or: from bs4.UnicodeDammit import detwingle [as alias]
def followlinks(bx):
    p = urlparse(bx)
    if '/' not in p.path[-1:]:
        part = p.path.split('/')[-1]
        path = p.path.rstrip(part)
    else:
        path = p.path
    if '/' not in path[:1]:
        path = '/' + path
    urlfqdn = p.scheme + '://' + p.netloc
    parent = urlfqdn + path + '/'
    s = scraper.get(bx, stream=True, verify=False, proxies=proxystring, allow_redirects=True)
    print('\n----------------------------------------------------------- \n')
    print(s)
    print('\n')
    scr = UnicodeDammit.detwingle(s.text)
    shtml = BeautifulSoup(scr, "html.parser")
    if debug == 1:
        orenc = str(shtml.original_encoding)
        print('\n\033[40m\033[35;1mORIGINAL ENCODING: %s \033[0m\n' % orenc)
    print('\n----------------------------------------------------------- \n')
    sfindlinks = shtml.findAll('a')
    slen = len(sfindlinks)
    sdirs = []
    si = 0
    while si < slen:
        for slink in sfindlinks:
            if debug == 1:
                print('\n\033[34;1mSLINK LOOP\r\n\033[32;21m* si = %d, si < %d\033[0m\n' % (si, slen))
            sl = slink.get('href')
            si += 1
            if sl:
                if not re.search(r'^((\.\.)?\/)$', str(sl)):
                    if '/' in bx[-1:]:
                        if 'http' not in sl[:4]:
                            sl = sl.lstrip('/')
                            sx = bx + sl
                        else:
                            sx = sl
                        print(sx)
                        getCF(sx, 0)
                        ss = scraper.get(sx, stream=True, verify=False, proxies=proxystring, allow_redirects=True)
                        bs = BeautifulSoup(ss.text, "html.parser")
                        if bs is not None:
                            if debug == 1:
                                orenc = str(bs.original_encoding)
                                print('\n\033[40m\033[35;1mORIGINAL ENCODING: %s \033[0m\n' % orenc)
                            pagehead = bs.html.head.contents
                            pagehead = str(pagehead)
                            if pagehead:
                                pagetitle = re.search(r'<title>(.*)<\/title>', pagehead)
                                pagetitle = str(pagetitle.group(1))
                                bigtitle = pagetitle.upper()
                                titlestars = lambda a: '*' * (len(str(a)) + 4)
                                pagestars = titlestars(pagetitle)
                                print('\n\033[40m\033[33m%s\n\033[34;1m* %s * \n\033[40m\033[33;21m%s\n\033[0m' % (pagestars, bigtitle, pagestars))
                            sb = bs.find_all('a', href=re.compile(r'.+$'))
                            #sb = bs.findAll('a')
                            sblen = len(sb)
                            if sblen > 0:
                                n = 0
                                while n < sblen:
                                    for sbl in sb:
                                        if debug == 1:
                                            print('\n\033[35;1mSBL LOOP\r\n\033[37;21m* n = %d, n < %d \033[0m\n' % (n, sblen))
                                        if sbl is not None:
                                            sr = sbl.get('href').strip()
                                            sr = str(sr)
                                            print('\n* %s \n') % sr
                                            if not re.search('http', sr[:4]):
                                                parent = getparent(sx)
                                                srs = sr.lstrip('/')
                                                sr = parent + srs
                                            if re.match(r'([^.]+\/)$', str(sr)):
                                                followlinks(sr)
                                                sdirs.append(sr)
                                            else:
                                                if '/' not in sr[-1:]:
                                                    getCF(sr, 0)
                                                    sdirs.append(sr)
                                            n += 1
                                        else:
                                            n += 1
                                            continue
                    elif 'Error-222' in bx:
                        print('\nuh-oh. might have triggered a flag with cloudflare.\n')
                        for i in xrange(10, 0, -1):
                            time.sleep(1)
                            print('delaying request for %d seconds.. \r' % i)
                            sys.stdout.flush()
                        break
                    else:
                        if not re.search('http', str(sl[:4])):
                            parent = getparent(bx)
                            sl = sl.lstrip('/')
                            sx = parent + sl
                        else:
                            sx = str(sl)
#.........remainder of this code omitted.........
Example 15: UnicodeDammit
# Required module import: from bs4 import UnicodeDammit [as alias]
# Or: from bs4.UnicodeDammit import detwingle [as alias]
#.........beginning of this code (the opening of the html_doc string) omitted.........
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<b><!-- i am a comment --></b>
<p class="story">...</p>
"""
# Encoding detection; a list of candidate encodings can be supplied
dammit = UnicodeDammit(html_doc, ["utf8", "gbk"])
if 0:
    print dammit.original_encoding
# Convert any embedded Windows-1252 characters in the document
html_doc = UnicodeDammit.detwingle(html_doc)
# Document parsing
# diagnose(html_doc)
# Parse only part of the document, for efficiency
only_a_tag = SoupStrainer("a")
# html_doc can be a file object or a string
soup = BeautifulSoup(html_doc, features=["lxml"], from_encoding='utf8')
'''
:param: features=[]  list of parsers
:param: from_encoding='utf8'  encoding
:param: parse_only  a SoupStrainer instance
'''
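The parameter notes above can be tied together with a short sketch. This is an illustration rather than part of the original listing: it assumes lxml is installed and reuses the html_doc byte string, the detwingle() call, and the only_a_tag strainer from this example to show parse_only and from_encoding passed to the constructor.

from bs4 import BeautifulSoup, SoupStrainer, UnicodeDammit

clean = UnicodeDammit.detwingle(html_doc)   # remove embedded Windows-1252 bytes first
only_a_tag = SoupStrainer("a")              # restrict parsing to <a> tags
soup = BeautifulSoup(clean, "lxml", parse_only=only_a_tag, from_encoding="utf8")
print(soup.find_all("a"))                   # only the <a> elements were built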