本文整理汇总了Python中bs4.UnicodeDammit方法的典型用法代码示例。如果您正苦于以下问题:Python bs4.UnicodeDammit方法的具体用法?Python bs4.UnicodeDammit怎么用?Python bs4.UnicodeDammit使用的例子?那么,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块bs4的用法示例。
在下文中一共展示了bs4.UnicodeDammit方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: parse_rsc_html
# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import UnicodeDammit [as 别名]
def parse_rsc_html(htmlstring):
    """Messy RSC HTML needs this special parser to fix problems before creating selector.

    :param htmlstring: Raw HTML, as bytes or str.
    :returns: lxml root element with orphan text wrapped in ``p.otherpara`` tags.
    :raises UnicodeDecodeError: If the encoding of *htmlstring* cannot be detected.
    """
    converted = UnicodeDammit(htmlstring)
    if not converted.unicode_markup:
        # BUG FIX: UnicodeDecodeError requires exactly five arguments
        # (encoding, object, start, end, reason); the original single-string
        # call raised TypeError instead of the intended exception, and its
        # '%s' placeholder was never filled in.
        data = htmlstring if isinstance(htmlstring, bytes) else htmlstring.encode('utf-8', 'replace')
        raise UnicodeDecodeError('unknown', data, 0, len(data), 'Failed to detect encoding')
    root = fromstring(htmlstring, parser=HTMLParser(recover=True, encoding=converted.original_encoding))
    # Add p.otherpara tags around orphan text (tail text dangling after block
    # elements inside the #wrapper div).
    newp = None
    for child in root.get_element_by_id('wrapper'):
        if newp is not None:
            # Close the synthetic paragraph when we hit a block element, a new
            # section, or the end of the wrapper; otherwise keep absorbing.
            if child.tag in BLOCK_ELEMENTS or child.get('id', '').startswith('sect') or child.getnext() is None:
                child.addprevious(newp)
                newp = None
            else:
                newp.append(child)
        if newp is None and child.tag in BLOCK_ELEMENTS and child.tail and child.tail.strip():
            # Orphan tail text after a block element: start a wrapper paragraph.
            newp = Element('p', **{'class': 'otherpara'})
            newp.text = child.tail
            child.tail = ''
    return root
示例2: search_txt
# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import UnicodeDammit [as 别名]
def search_txt(filename, word):
    '''
    Search the word in a text file, case-insensitively.

    :param filename: Path of the text file to search.
    :param word: Word to look for.
    :returns: True if *word* occurs in the file, False otherwise.
    '''
    # Detect the encoding by sniffing the first 1 KB of raw bytes
    with open(filename, 'rb') as file:
        content = file.read(1024)
    suggestion = UnicodeDammit(content)
    encoding = suggestion.original_encoding
    # BUG FIX: each line is lowercased below, but the original compared against
    # the raw *word*, so any search term containing uppercase letters could
    # never match. Lowercase the needle once, outside the loop.
    needle = word.lower()
    # Open and read with the detected encoding
    with open(filename, encoding=encoding) as file:
        for line in file:
            if needle in line.lower():
                return True
    return False
示例3: force_unicode
# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import UnicodeDammit [as 别名]
def force_unicode(s):
    """
    Ensure a string is unicode, not encoded; used for enforcing file paths to be unicode upon saving a subtitle,
    to prevent encoding issues when saving a subtitle to a non-ascii path.
    :param s: string (str or bytes)
    :return: unicode string
    """
    if not isinstance(s, str):
        try:
            s = s.decode("utf-8")
        except UnicodeDecodeError:
            t = chardet.detect(s)
            try:
                # BUG FIX: chardet may report encoding=None (→ TypeError) or an
                # unknown codec name (→ LookupError); the original only caught
                # UnicodeDecodeError, so the UnicodeDammit fallback below was
                # unreachable in those cases and the call crashed instead.
                s = s.decode(t["encoding"])
            except (UnicodeDecodeError, TypeError, LookupError):
                s = UnicodeDammit(s).unicode_markup
    return s
示例4: parse
# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import UnicodeDammit [as 别名]
def parse(self, response):
    """
    default parse method, rule is not useful now

    Scrapy callback: classifies the page by index level. Levels 1-4 are saved
    and their follow links re-queued; level 5 is treated as a person profile
    page and yielded as an item keyed by the LinkedIn id from the URL.
    Uses the legacy Python 2 Scrapy/urllib APIs (HtmlXPathSelector, log.msg,
    urllib.unquote_plus).
    """
    # Normalize the URL (strip query parameters) before further processing
    response = response.replace(url=HtmlParser.remove_url_parameter(response.url))
    hxs = HtmlXPathSelector(response)
    index_level = self.determine_level(response)
    log.msg("Parse: index level:" + str(index_level))
    if index_level in [1, 2, 3, 4]:
        # Directory/index pages: persist them, then follow their links
        self.save_to_file_system(index_level, response)
        relative_urls = self.get_follow_links(index_level, hxs)
        if relative_urls is not None:
            for url in relative_urls:
                log.msg('yield process, url:' + url)
                yield Request(url, callback=self.parse)
    elif index_level == 5:
        # Profile page: extract the person item and use the URL slug as its id
        personProfile = HtmlParser.extract_person_profile(hxs)
        linkedin_id = self.get_linkedin_id(response.url)
        # NOTE(review): `.markup` is UnicodeDammit's input markup, not the
        # decoded `.unicode_markup` accessor used elsewhere in this file —
        # confirm this is intentional and not a typo.
        linkedin_id = UnicodeDammit(urllib.unquote_plus(linkedin_id)).markup
        if linkedin_id:
            personProfile['_id'] = linkedin_id
            personProfile['url'] = UnicodeDammit(response.url).markup
            yield personProfile
示例5: generate_rows
# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import UnicodeDammit [as 别名]
def generate_rows(self, dataset_schema=None, dataset_partitioning=None,
                  partition_id=None, records_limit = -1):
    """
    The main reading method.

    Downloads a Project Gutenberg plain-text book from the configured mirror,
    extracts the author and title from the preamble, and yields one row per
    paragraph: ``{'id', 'author', 'title', 'text'}``.
    """
    # Build the mirror URL: the path is made of each leading digit of the
    # book id, then the root id directory, then "<full id>.txt".
    url_book = self.mirror
    fullbid = str(self.book_id)
    rootbid = fullbid  # sometimes the id to access a file has a variation, ex fullbid=14285-8 for the book 14285
    stopit = 0
    for i in range(len(fullbid) - 1):
        if (fullbid[i + 1] != "-") and (stopit == 0):
            url_book += '/' + fullbid[i]
        else:
            stopit = 1
            rootbid = fullbid[0:i]
    url_book += '/' + rootbid + '/' + fullbid + '.txt'
    response = urlopen(url_book)
    raw = response.read()
    # Let UnicodeDammit detect the encoding instead of assuming UTF-8.
    converted = UnicodeDammit(raw)
    raw = converted.unicode_markup
    start_book = raw.find("START OF")
    end_book = raw.rfind('END OF')
    preamb = raw[:start_book]
    # Header fields live in "Key: Value" paragraphs of the preamble.
    header_paragraphs = preamb.split("\r\n\r\n")
    author = [i.split(':')[1].strip() for i in header_paragraphs if i.find('Author') != -1][0]
    title = [i.split(':')[1].strip() for i in header_paragraphs if i.find('Title') != -1][0]
    # NOTE: the original also parsed 'Release Date' into an unused local,
    # which crashed on books lacking that header; dropped as dead code.
    book_paraph = raw[start_book:end_book].split("\r\n\r\n")
    logger.info("Book length %s" % len(raw))
    # BUG FIX: the original called logger.info("N paragraphs:", len(...)),
    # passing an extra argument with no format placeholder, which triggers a
    # "not all arguments converted" logging error.
    logger.info("N paragraphs: %s", len(book_paraph))
    for id_p, p in enumerate(book_paraph):
        yield {'id': id_p, 'author': author, 'title': title, 'text': p}
示例6: getItem
# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import UnicodeDammit [as 别名]
def getItem(self, itemUrl, addlHeaders=None):
    """Download a single item and return its content, filename and mime type.

    :param itemUrl: URL of the item to fetch.
    :param addlHeaders: unused; kept for interface compatibility.
    :returns: (content, filename, mimeType) tuple.
    :raises ValueError: if the page could not be retrieved.
    """
    content, handle = self.wg.getpage(itemUrl, returnMultiple=True, addlHeaders={'Referer': self.refererUrl})
    if not (content and handle):
        raise ValueError("Failed to retreive file from page '%s'!" % itemUrl)

    headerInfo = handle.info()
    if 'Content-Disposition' not in headerInfo:
        headerInfo['Content-Disposition'] = ''

    # Derive the filename from the Content-Disposition header and force it
    # to unicode.
    fileName = jsLiteralParse.parseContentDispositon(headerInfo['Content-Disposition'], itemUrl)
    fileName = bs4.UnicodeDammit(fileName).unicode_markup

    mimeType = handle.info()['Content-Type']
    # If there is an encoding in the content-type (or any other info), strip
    # it out. We don't care about the encoding, since WebRequest will already
    # have handled that, and returned a decoded unicode object.
    if mimeType and ";" in mimeType:
        mimeType = mimeType.split(";")[0].strip()

    self.log.info("Retreived file of type '%s', name of '%s' with a size of %0.3f K", mimeType, fileName, len(content)/1000.0)
    return content, fileName, mimeType
示例7: beautify
# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import UnicodeDammit [as 别名]
def beautify(self, data, charset):
    """Decode *data* to unicode, preferring *charset*, then common CJK encodings.

    Smart quotes are converted to HTML entities.
    """
    candidate_encodings = [charset, "utf-8", "gb2312", "gbk", "big5", "gb18030"]
    return UnicodeDammit(data, candidate_encodings, smart_quotes_to="html").unicode_markup
示例8: pycurl_get_resp
# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import UnicodeDammit [as 别名]
def pycurl_get_resp(data_buf, headers, payload, resp):
    """Decode a pycurl response body and populate *resp* with parsed fields.

    :param data_buf: BytesIO-like buffer holding the raw response body.
    :param headers: dict of response headers; mutated in place (the 'content'
        list is filtered of empty entries) and echoed back into *resp*.
    :param payload: request payload dict; only 'url' is read here.
    :param resp: dict updated in place with the decoded body, soups-derived
        fields (title, links, metas, images, scripts, text) and metadata.
    """
    charset = None
    # Prefer the charset explicitly declared in the Content-Type header.
    if 'content-type' in headers:
        content_type = headers['content-type'].lower()
        match = re.search('charset=(\S+)', content_type)
        if match:
            charset = match.group(1)
            print('Decoding using %s' % charset)
    body = data_buf.getvalue()
    if len(body) == 0:
        # Empty body: nothing to decode, default the charset.
        data = ''
        charset = 'utf-8'
    else:
        if charset is None:
            # No declared charset: let UnicodeDammit sniff it, trying utf-8
            # and common CJK encodings; smart quotes become HTML entities.
            dammit = UnicodeDammit(body, ["utf-8", "gb2312", "gbk", "big5", "gb18030"], smart_quotes_to="html")
            data = dammit.unicode_markup
            charset = dammit.original_encoding
        else:
            # Declared charset wins; undecodable bytes are dropped.
            data = body.decode(charset, 'ignore')
    # Drop empty entries accumulated in headers['content'].
    headers['content'] = [h for h in headers['content'] if len(h) > 0]
    # Parse twice: lxml for structural queries, html.parser for text extraction.
    soup_lxml = BeautifulSoup(data, 'lxml')
    soup_html = BeautifulSoup(data, 'html.parser')
    resp.update({
        'url': payload.get('url'),
        'title': get_title(soup_lxml),
        'links': get_links(soup_lxml),
        'links2': get_links2(soup_lxml),
        'metas': get_metas(soup_lxml),
        'images': get_images(soup_lxml),
        'scripts': get_scripts(soup_lxml),
        'text': get_text(soup_html),
        'data': data,
        'headers': headers,
        'charset': charset,
        'spider': 'pycurl',
        'payload': payload,
    })
示例9: _get_encoding
# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import UnicodeDammit [as 别名]
def _get_encoding(cls, input_string, encoding):
    """Detect the encoding of *input_string*, preferring *encoding* when given.

    A failed detection is not raised here: lxml will raise later if the
    subsequent parse fails.
    """
    overrides = [encoding] if encoding else []
    return UnicodeDammit(input_string, overrides).original_encoding
示例10: get_encoding
# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import UnicodeDammit [as 别名]
def get_encoding(input_string, guesses=None, is_html=False):
    """Return the encoding of a byte string. Uses bs4 UnicodeDammit.
    :param string input_string: Encoded byte string.
    :param guesses: (Optional) Encoding guess, or list of encoding guesses, to prioritize.
    :param bool is_html: Whether the input is HTML.
    :return: Name of the detected encoding, or None if detection failed.
    """
    # BUG FIX: the original passed override_encodings=[guesses], wrapping the
    # caller's *list* in another list, so bs4 received a list where an encoding
    # name was expected and the guesses were effectively ignored. Accept either
    # a single encoding name or a list of them.
    if guesses is None:
        overrides = []
    elif isinstance(guesses, str):
        overrides = [guesses]
    else:
        overrides = list(guesses)
    converted = UnicodeDammit(input_string, override_encodings=overrides, is_html=is_html)
    return converted.original_encoding
示例11: scan
# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import UnicodeDammit [as 别名]
def scan(self, payload: Payload, request: Request) -> WorkerResponse:
    """Scan payload content against the compiled IOC regexes.

    Matches are deduplicated per IOC type; ipv6 candidates are additionally
    validated, and results are optionally normalized before being returned.
    """
    normalize: bool = True
    ioctype: str = 'all'
    results: Dict = {}
    # PERF FIX: decode the payload once — the original re-ran
    # UnicodeDammit(payload.content).unicode_markup on every loop iteration
    # even though it is loop-invariant.
    content = UnicodeDammit(payload.content).unicode_markup
    if ioctype == 'all':
        for ioc in self.compiled_re:
            if self.compiled_re[ioc]:
                matches = self.compiled_re[ioc].findall(content)
                if matches:
                    results[ioc] = list(set(matches))
    elif self.compiled_re[ioctype]:
        # NOTE: unreachable while ioctype is hard-coded to 'all' above;
        # kept for when ioctype becomes configurable.
        matches = self.compiled_re[ioctype].findall(content)
        if matches:
            results[ioctype] = list(set(matches))
    if 'ipv6' in results:
        # Drop candidates that merely look like IPv6 addresses.
        results['ipv6'] = [
            address for address in results['ipv6'] if self._validate_ipv6(address)
        ]
        if not results['ipv6']:
            results.pop('ipv6')
    if normalize:
        results = self._normalize(results)
    return WorkerResponse(results)
示例12: scan
# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import UnicodeDammit [as 别名]
def scan(self, payload: Payload, request: Request) -> WorkerResponse:
    """Extract attachments from a TNEF (winmail.dat) payload.

    Each attachment is re-emitted as an ExtractedPayload whose metadata
    carries the decoded attachment filename (or the string "None" when the
    name cannot be decoded).
    """
    extracted: List[ExtractedPayload] = []
    tnef_results = TNEF(payload.content)
    if tnef_results.attachments:
        for tnef_attachment in tnef_results.attachments:
            try:
                filename = UnicodeDammit(tnef_attachment.name).unicode_markup
            except Exception:
                # BUG FIX: was a bare `except:`, which also swallowed
                # SystemExit/KeyboardInterrupt; narrow to Exception while
                # keeping the best-effort fallback name.
                filename = "None"
            tnef_meta = PayloadMeta(extra_data={'filename': filename})
            extracted.append(ExtractedPayload(tnef_attachment.data, tnef_meta))
    return WorkerResponse(extracted=extracted)
示例13: discover_mirror_selection
# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import UnicodeDammit [as 别名]
def discover_mirror_selection():
    """Discover "geographically suitable" Ubuntu mirrors."""
    timer = Timer()
    logger.info("Identifying fast Ubuntu mirrors using %s ..", MIRROR_SELECTION_URL)
    # shorter timeout with more retries is good for unstable connections to MIRROR_SELECTION_URL
    data = fetch_url(MIRROR_SELECTION_URL, timeout=3, retry=True, max_attempts=5)
    text = UnicodeDammit(data).unicode_markup
    mirrors = set()
    for line in text.splitlines():
        # Only lines that start with a URL scheme are mirror entries; such
        # lines are necessarily non-empty and non-whitespace.
        if line.startswith(('http://', 'https://')):
            mirrors.add(CandidateMirror(mirror_url=line.strip()))
    logger.debug("Found %s in %s.", pluralize(len(mirrors), "fast Ubuntu mirror"), timer)
    return mirrors
示例14: guess_encoding
# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import UnicodeDammit [as 别名]
def guess_encoding(self):
    # override default subtitle guess_encoding method to not include language-specific encodings guessing
    # chardet encoding detection seem to yield better results
    """Guess encoding using chardet, falling back to bs4 detection.

    The result is cached on self._guessed_encoding.
    :return: the guessed encoding.
    :rtype: str
    :raises ValueError: if neither chardet nor bs4 can determine an encoding.
    """
    if self._guessed_encoding:
        return self._guessed_encoding
    logger.info('Guessing encoding for language %s', self.language)
    # guess/detect encoding using chardet
    encoding = chardet.detect(self.content)['encoding']
    logger.info('Chardet found encoding %s', encoding)
    if not encoding:
        # fallback on bs4
        logger.info('Falling back to bs4 detection')
        a = UnicodeDammit(self.content)
        logger.info("bs4 detected encoding: %s", a.original_encoding)
        if a.original_encoding:
            self._guessed_encoding = a.original_encoding
            return a.original_encoding
        # BUG FIX: the original passed `self` as a second argument to
        # ValueError, so the '%s' in the message was never interpolated.
        raise ValueError(u"Couldn't guess the proper encoding for %s" % self)
    self._guessed_encoding = encoding
    return encoding
示例15: extract
# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import UnicodeDammit [as 别名]
def extract(self):
    """Fetch the document archive at self.url and split it into files.

    :returns: ``(baseFile, resources)`` where baseFile is a
        ``(filename, unicode_contents)`` tuple for the root HTML file and
        resources is a list of ``(filename, mimetype, bytes)`` tuples, or
        ``(None, [])`` when the page could not be retrieved.
    :raises TypeError: when the document appears to be access-protected.
    :raises ValueError: when the response is not a valid archive, contains
        multiple base files, or contains no base file.
    """
    try:
        arch, fName = self.wg.getFileAndName(self.url, addlHeaders={'Referer': self.refererUrl})
    except IndexError:
        print("ERROR: Failure retrieving page!")
        return None, []

    baseName = fName.split(".")[0]

    if not isinstance(arch, bytes):
        # BUG FIX: the original condition was
        # `'You need permission' in arch or 'Sign in to continue to Docs'` —
        # the second operand is a non-empty string literal and therefore
        # always truthy, so this branch was taken unconditionally and the
        # "Doc not valid?" branch below was dead code. The intent is clearly
        # to check for either access-denied marker in the returned page.
        if 'You need permission' in arch or 'Sign in to continue to Docs' in arch:
            self.log.critical("Retrieving zip archive failed?")
            self.log.critical("Retreived content type: '%s'", type(arch))
            raise TypeError("Cannot access document? Is it protected?")
        else:
            # Dump the unexpected page for post-mortem debugging.
            with open("tmp_page.html", "w") as fp:
                fp.write(arch)
            raise ValueError("Doc not valid?")

    zp = io.BytesIO(arch)
    zfp = zipfile.ZipFile(zp)

    resources = []
    baseFile = None
    for item in zfp.infolist():
        if not "/" in item.filename and not baseFile:
            # First top-level entry is treated as the base document.
            contents = zfp.open(item).read()
            contents = bs4.UnicodeDammit(contents).unicode_markup
            baseFile = (item.filename, contents)
        elif baseName in item.filename and baseName:
            raise ValueError("Multiple base file items?")
        else:
            resources.append((item.filename, mimetypes.guess_type(item.filename)[0], zfp.open(item).read()))

    if not baseFile:
        raise ValueError("No base file found!")
    return baseFile, resources