This article collects typical usage examples of the Python class calibre.ebooks.BeautifulSoup.BeautifulSoup. If you are wondering what exactly BeautifulSoup does, how to use it, or what real-world usage looks like, the curated examples below should help.
The following 15 code examples are drawn from open-source projects and ordered roughly by popularity.
Example 1: save_soup
def save_soup(soup, target):
    ns = BeautifulSoup('<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />')
    nm = ns.find('meta')
    metas = soup.findAll('meta', content=True)
    added = False
    for meta in metas:
        if 'charset' in meta.get('content', '').lower():
            meta.replaceWith(nm)
            added = True
    if not added:
        head = soup.find('head')
        if head is not None:
            head.insert(0, nm)
    selfdir = os.path.dirname(target)
    for tag in soup.findAll(['img', 'link', 'a']):
        for key in ('src', 'href'):
            path = tag.get(key, None)
            if path and os.path.isfile(path) and os.path.exists(path) and os.path.isabs(path):
                tag[key] = unicode_path(relpath(path, selfdir).replace(os.sep, '/'))
    html = unicode(soup)
    with open(target, 'wb') as f:
        f.write(html.encode('utf-8'))
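For context, a minimal usage sketch (the file names are hypothetical; save_soup expects a soup built with calibre's BeautifulSoup wrapper):

from calibre.ebooks.BeautifulSoup import BeautifulSoup

# Hypothetical input: a previously downloaded page with absolute local links.
raw = open('downloaded.html', 'rb').read()
soup = BeautifulSoup(raw)
# Writes UTF-8 encoded HTML to the target, with a charset <meta> tag ensured
# and absolute local src/href paths rewritten relative to the target's dir.
save_soup(soup, '/tmp/out/index.html')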
Example 2: search_for_asin_on_amazon
def search_for_asin_on_amazon(self, query):
    '''Search for book's asin on amazon using given query'''
    query = urlencode({'keywords': query})
    url = '/s/ref=sr_qz_back?sf=qz&rh=i%3Adigital-text%2Cn%3A154606011%2Ck%3A' + query[9:] + '&' + query
    try:
        response = open_url(self._connections['amazon'], url)
    except PageDoesNotExist:
        return None
    # check to make sure there are results
    if ('did not match any products' in response and 'Did you mean:' not in response and
            'so we searched in All Departments' not in response):
        return None
    soup = BeautifulSoup(response)
    results = soup.findAll('div', {'id': 'resultsCol'})
    if not results:
        return None
    for result in results:
        if 'Buy now with 1-Click' in str(result):
            asin_search = AMAZON_ASIN_PAT.search(str(result))
            if asin_search:
                return asin_search.group(1)
    return None
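AMAZON_ASIN_PAT is defined elsewhere in the plugin and is not shown in this excerpt. As a rough illustration only, a pattern of the following shape would match the ASIN embedded in a result block; this is an assumption, not the plugin's actual definition:

import re

# Hypothetical stand-in for AMAZON_ASIN_PAT: Kindle ASINs are ten-character
# identifiers starting with 'B', typically found in /dp/<ASIN> product links
# inside each search result block. The real plugin's pattern may differ.
AMAZON_ASIN_PAT = re.compile(r'/dp/(B[A-Z0-9]{9})')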
Example 3: existing_annotations
def existing_annotations(parent, field, return_all=False):
    """
    Return count of existing annotations, or existence of any
    """
    # import calibre_plugins.marvin_manager.config as cfg
    _log_location(field)
    annotation_map = []
    if field:
        db = parent.opts.gui.current_db
        id = db.FIELD_MAP["id"]
        for i, record in enumerate(db.data.iterall()):
            mi = db.get_metadata(record[id], index_is_id=True)
            if field == "Comments":
                if mi.comments:
                    soup = BeautifulSoup(mi.comments)
                else:
                    continue
            else:
                soup = BeautifulSoup(mi.get_user_metadata(field, False)["#value#"])
            if soup.find("div", "user_annotations") is not None:
                annotation_map.append(mi.id)
                if not return_all:
                    break
        if return_all:
            _log("Identified %d annotated books of %d total books" % (len(annotation_map), len(db.data)))
            _log("annotation_map: %s" % repr(annotation_map))
    else:
        _log("no active field")
    return annotation_map
Example 4: existing_annotations
def existing_annotations(parent, field, return_all=False):
    '''
    Return count of existing annotations, or existence of any
    '''
    import calibre_plugins.annotations.config as cfg
    annotation_map = []
    if field:
        db = parent.opts.gui.current_db
        id = db.FIELD_MAP['id']
        for i, record in enumerate(db.data.iterall()):
            mi = db.get_metadata(record[id], index_is_id=True)
            if field == 'Comments':
                if mi.comments:
                    soup = BeautifulSoup(mi.comments)
                else:
                    continue
            else:
                soup = BeautifulSoup(mi.get_user_metadata(field, False)['#value#'])
            if soup.find('div', 'user_annotations') is not None:
                annotation_map.append(mi.id)
                if not return_all:
                    break
        if return_all:
            _log_location("Identified %d annotated books of %d total books" %
                          (len(annotation_map), len(db.data)))
    return annotation_map
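Both variants above follow the same calling convention; a hedged usage sketch (parent is assumed to be a plugin object exposing opts.gui.current_db, and 'Comments' is one of the supported field names):

# Stop at the first annotated book found (early exit) ...
annotated = existing_annotations(parent, 'Comments')
# ... or scan the whole library and collect every annotated book id.
annotated = existing_annotations(parent, 'Comments', return_all=True)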
Example 5: get_annotations_date_range
def get_annotations_date_range(self):
    '''
    Find the oldest and newest annotations among the annotated books.
    The initial values of self.oldest_annotation and self.newest_annotation
    are reversed to allow update comparisons; if no annotations are found,
    they are swapped back to their correct values.
    '''
    annotations_found = False
    for cid in self.annotation_map:
        mi = self.cdb.get_metadata(cid, index_is_id=True)
        if self.field == 'Comments':
            soup = BeautifulSoup(mi.comments)
        else:
            soup = BeautifulSoup(mi.get_user_metadata(self.field, False)['#value#'])
        uas = soup.findAll('div', 'annotation')
        for ua in uas:
            annotations_found = True
            timestamp = float(ua.find('td', 'timestamp')['uts'])
            if timestamp < self.oldest_annotation:
                self.oldest_annotation = timestamp
            if timestamp > self.newest_annotation:
                self.newest_annotation = timestamp
    if not annotations_found:
        temp = self.newest_annotation
        self.newest_annotation = self.oldest_annotation
        self.oldest_annotation = temp
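The code above relies on the annotation markup carrying a Unix timestamp in the uts attribute of a td.timestamp cell. A minimal, hypothetical fragment in that shape:

from calibre.ebooks.BeautifulSoup import BeautifulSoup

# Hypothetical annotation fragment of the form this method expects.
sample = '''
<div class="user_annotations">
  <div class="annotation">
    <table><tr><td class="timestamp" uts="1356998400">01 Jan 2013</td></tr></table>
  </div>
</div>
'''
soup = BeautifulSoup(sample)
ua = soup.find('div', 'annotation')
print(float(ua.find('td', 'timestamp')['uts']))  # 1356998400.0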
Example 6: extract_calibre_cover
def extract_calibre_cover(raw, base, log):
    from calibre.ebooks.BeautifulSoup import BeautifulSoup
    soup = BeautifulSoup(raw)
    matches = soup.find(name=["h1", "h2", "h3", "h4", "h5", "h6", "p", "span", "font", "br"])
    images = soup.findAll("img")
    if matches is None and len(images) == 1 and images[0].get("alt", "") == "cover":
        img = images[0]
        img = os.path.join(base, *img["src"].split("/"))
        if os.path.exists(img):
            return open(img, "rb").read()
    # Look for a simple cover, i.e. a body with no text and only one <img> tag
    if matches is None:
        body = soup.find("body")
        if body is not None:
            text = u"".join(map(unicode, body.findAll(text=True)))
            if text.strip():
                # Body has text, abort
                return
            images = body.findAll("img", src=True)
            if 0 < len(images) < 2:
                img = os.path.join(base, *images[0]["src"].split("/"))
                if os.path.exists(img):
                    return open(img, "rb").read()
Example 7: extract_calibre_cover
def extract_calibre_cover(raw, base, log):
    from calibre.ebooks.BeautifulSoup import BeautifulSoup
    soup = BeautifulSoup(raw)
    matches = soup.find(name=['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'span',
                              'font', 'br'])
    images = soup.findAll('img')
    if matches is None and len(images) == 1 and \
            images[0].get('alt', '') == 'cover':
        img = images[0]
        img = os.path.join(base, *img['src'].split('/'))
        if os.path.exists(img):
            return open(img, 'rb').read()
    # Look for a simple cover, i.e. a body with no text and only one <img> tag
    if matches is None:
        body = soup.find('body')
        if body is not None:
            text = u''.join(map(unicode, body.findAll(text=True)))
            if text.strip():
                # Body has text, abort
                return
            images = body.findAll('img', src=True)
            if 0 < len(images) < 2:
                img = os.path.join(base, *images[0]['src'].split('/'))
                if os.path.exists(img):
                    return open(img, 'rb').read()
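A hedged usage sketch for either variant (raw is the HTML of a candidate cover page, base is the directory its relative image paths resolve against; the paths here are hypothetical):

import os

# Hypothetical titlepage produced by calibre: an empty body holding one image.
base = '/tmp/book'
raw = open(os.path.join(base, 'titlepage.xhtml'), 'rb').read()
cover_data = extract_calibre_cover(raw, base, log=None)
if cover_data is not None:
    open('/tmp/cover.jpg', 'wb').write(cover_data)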
Example 8: get_series
def get_series(title, authors, timeout=60):
    mi = Metadata(title, authors)
    if title and title[0] in _ignore_starts:
        title = title[1:]
    title = re.sub(r'^(A|The|An)\s+', '', title).strip()
    if not title:
        return mi
    if isinstance(title, unicode):
        title = title.encode('utf-8')
    title = urllib.quote_plus(title)
    author = authors[0].strip()
    if not author:
        return mi
    if ',' in author:
        author = author.split(',')[0]
    else:
        author = author.split()[-1]
    url = URL.format(author, title)
    br = browser()
    try:
        raw = br.open_novisit(url, timeout=timeout).read()
    except URLError as e:
        if isinstance(e.reason, socket.timeout):
            raise Exception('KDL Server busy, try again later')
        raise
    if 'see the full results' not in raw:
        return mi
    raw = xml_to_unicode(raw)[0]
    soup = BeautifulSoup(raw)
    searcharea = soup.find('div', attrs={'class':'searcharea'})
    if searcharea is None:
        return mi
    ss = searcharea.find('div', attrs={'class':'seriessearch'})
    if ss is None:
        return mi
    a = ss.find('a', href=True)
    if a is None:
        return mi
    href = a['href'].partition('?')[-1]
    data = urlparse.parse_qs(href)
    series = data.get('SeriesName', [])
    if not series:
        return mi
    series = series[0]
    series = re.sub(r' series$', '', series).strip()
    if series:
        mi.series = series
    ns = ss.nextSibling
    if ns.contents:
        raw = unicode(ns.contents[0])
        raw = raw.partition('.')[0].strip()
        try:
            mi.series_index = int(raw)
        except:
            pass
    return mi
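A hedged usage sketch (URL, _ignore_starts, and the KDL endpoint are module-level details not shown in the excerpt; the title and author are illustrative only):

# Queries the KDL series lookup and returns a Metadata object whose
# series/series_index fields are filled in when a match is found.
mi = get_series('The Fellowship of the Ring', ['J. R. R. Tolkien'])
print(mi.series, mi.series_index)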
Example 9: find_all_annotated_books
def find_all_annotated_books(self):
    '''
    Find all annotated books in library
    '''
    self._log_location("field: {0}".format(self.field))
    cids = self.cdb.search_getting_ids('formats:EPUB', '')
    for cid in cids:
        mi = self.cdb.get_metadata(cid, index_is_id=True)
        raw = mi.get_user_metadata(self.field, False)
        if raw['#value#'] is not None:
            soup = BeautifulSoup(raw['#value#'])
            if soup.find('div', 'user_annotations') is not None:
                self.annotation_map.append(mi.id)
Example 10: get_soup
def get_soup(self, src, url=None):
    nmassage = []
    nmassage.extend(self.preprocess_regexps)
    # Remove comments as they can leave detritus when extracting tags leaves
    # multiple nested comments
    nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
    usrc = xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0]
    usrc = self.preprocess_raw_html(usrc, url)
    for pat, repl in nmassage:
        usrc = pat.sub(repl, usrc)
    soup = BeautifulSoup(usrc)
    replace = self.prepreprocess_html_ext(soup)
    if replace is not None:
        replace = xml_to_unicode(replace, self.verbose, strip_encoding_pats=True)[0]
        for pat, repl in nmassage:
            replace = pat.sub(repl, replace)
        soup = BeautifulSoup(replace)

    if self.keep_only_tags:
        body = soup.new_tag('body')
        try:
            if isinstance(self.keep_only_tags, dict):
                self.keep_only_tags = [self.keep_only_tags]
            for spec in self.keep_only_tags:
                for tag in soup.find('body').findAll(**spec):
                    body.insert(len(body.contents), tag)
            soup.find('body').replaceWith(body)
        except AttributeError:  # soup has no body element
            pass

    # Walk up from tag to <body>, extracting every sibling in the given
    # direction ('nextSibling' or 'previousSibling') at each level.
    def remove_beyond(tag, next):
        while tag is not None and getattr(tag, 'name', None) != 'body':
            after = getattr(tag, next)
            while after is not None:
                ns = getattr(tag, next)
                after.extract()
                after = ns
            tag = tag.parent

    if self.remove_tags_after is not None:
        rt = [self.remove_tags_after] if isinstance(self.remove_tags_after, dict) else self.remove_tags_after
        for spec in rt:
            tag = soup.find(**spec)
            remove_beyond(tag, 'nextSibling')

    if self.remove_tags_before is not None:
        rt = [self.remove_tags_before] if isinstance(self.remove_tags_before, dict) else self.remove_tags_before
        for spec in rt:
            tag = soup.find(**spec)
            remove_beyond(tag, 'previousSibling')

    for kwds in self.remove_tags:
        for tag in soup.findAll(**kwds):
            tag.extract()

    return self.preprocess_html_ext(soup)
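This method is driven by recipe-level attributes; a minimal sketch of how a calibre news recipe might configure them (the tag names and classes are hypothetical):

from calibre.web.feeds.news import BasicNewsRecipe

class ExampleRecipe(BasicNewsRecipe):
    title = 'Example'
    # Keep only the article body ...
    keep_only_tags = [dict(name='div', attrs={'id': 'article'})]
    # ... drop scripts and ads outright ...
    remove_tags = [dict(name='script'), dict(name='div', attrs={'class': 'ad'})]
    # ... and cut everything after the end-of-article marker.
    remove_tags_after = dict(name='div', attrs={'class': 'article-end'})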
Example 11: get_asin
def get_asin(self, connection):
    query = urlencode({'keywords': '%s - %s' % (self._title, self._author)})
    try:
        connection.request('GET', '/s/ref=sr_qz_back?sf=qz&rh=i%3Adigital-text%2Cn%3A154606011%2Ck%3A' + query[9:] + '&' + query, headers=self.HEADERS)
        response = connection.getresponse().read()
    except:
        try:
            connection.close()
            if self._proxy:
                connection = HTTPConnection(self._http_address, self._http_port)
                connection.set_tunnel('www.amazon.com', 80)
            else:
                connection = HTTPConnection('www.amazon.com')
            connection.request('GET', '/s/ref=sr_qz_back?sf=qz&rh=i%3Adigital-text%2Cn%3A154606011%2Ck%3A' + query[9:] + '&' + query, headers=self.HEADERS)
            response = connection.getresponse().read()
        except:
            self._status = self.FAIL
            self._status_message = self.FAILED_COULD_NOT_CONNECT_TO_AMAZON
            raise Exception(self._status_message)
    # check to make sure there are results
    if ('did not match any products' in response and 'Did you mean:' not in response and
            'so we searched in All Departments' not in response):
        self._status = self.FAIL
        self._status_message = self.FAILED_COULD_NOT_FIND_AMAZON_PAGE
        raise Exception(self._status_message)
    soup = BeautifulSoup(response)
    results = soup.findAll('div', {'id': 'resultsCol'})
    if not results:
        self._status = self.FAIL
        self._status_message = self.FAILED_COULD_NOT_FIND_AMAZON_PAGE
        raise Exception(self._status_message)
    for r in results:
        if 'Buy now with 1-Click' in str(r):
            asinSearch = self.AMAZON_ASIN_PAT.search(str(r))
            if asinSearch:
                self._asin = asinSearch.group(1)
                mi = self._db.get_metadata(self._book_id)
                identifiers = mi.get_identifiers()
                identifiers['mobi-asin'] = self._asin
                mi.set_identifiers(identifiers)
                self._db.set_metadata(self._book_id, mi)
                self._book_settings.prefs['asin'] = self._asin
                return connection
    self._status = self.FAIL
    self._status_message = self.FAILED_COULD_NOT_FIND_AMAZON_ASIN
    raise Exception(self._status_message)
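A hedged sketch of the calling convention (the method both takes and returns the connection so that a rebuilt connection can replace a dropped one; 'book' stands in for an instance of the containing class, and httplib is the Python 2 module name):

from httplib import HTTPConnection

# Hypothetical driver code.
conn = HTTPConnection('www.amazon.com')
conn = book.get_asin(conn)  # returns a (possibly re-created) connection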
Example 12: _remove_old_style
def _remove_old_style(self, html):
    '''
    Remove the old style tag, finalize soup in preparation for styling
    '''
    unstyled_soup = BeautifulSoup(html)
    head = unstyled_soup.find("head")
    voc = unstyled_soup.body.find('div', {'class': 'vocabulary'})
    tds = voc.findAll(lambda tag: tag.name == 'td' and tag.a)
    dart = random.randrange(len(tds))
    self.td = tds[dart]
    self.oh = self.td.a['href']
    self.td.a['href'] = self._finalize()
    old_style = head.find('style')
    if old_style:
        old_style.extract()
    return unstyled_soup
Example 13: _inject_css
def _inject_css(self, html):
    '''
    stick a <style> element into html
    '''
    css = self.prefs.get('injected_css', None)
    if css:
        try:
            styled_soup = BeautifulSoup(html)
            head = styled_soup.find("head")
            style_tag = Tag(styled_soup, 'style')
            style_tag['type'] = "text/css"
            style_tag.insert(0, css)
            head.insert(0, style_tag)
            html = styled_soup.renderContents()
        except:
            return html
    return html
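A hedged sketch of the effect (prefs is assumed to map 'injected_css' to a CSS string; 'viewer' stands in for an instance of the containing class):

# Hypothetical input document with an empty <head>.
html = '<html><head></head><body><p>hi</p></body></html>'
styled = viewer._inject_css(html)
# With prefs = {'injected_css': 'body { margin: 0.5em }'}, styled now begins
# its <head> with: <style type="text/css">body { margin: 0.5em }</style>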
Example 14: get_soup
def get_soup(self, src, url=None):
    nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
    nmassage.extend(self.preprocess_regexps)
    # Some websites have buggy doctype declarations that mess up beautifulsoup
    nmassage += [(re.compile(r'<!DOCTYPE .+?>', re.DOTALL|re.IGNORECASE), lambda m: '')]
    # Remove comments as they can leave detritus when extracting tags leaves
    # multiple nested comments
    nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
    usrc = xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0]
    usrc = self.preprocess_raw_html(usrc, url)
    soup = BeautifulSoup(usrc, markupMassage=nmassage)
    replace = self.prepreprocess_html_ext(soup)
    if replace is not None:
        soup = BeautifulSoup(xml_to_unicode(replace, self.verbose, strip_encoding_pats=True)[0], markupMassage=nmassage)

    if self.keep_only_tags:
        body = Tag(soup, 'body')
        try:
            if isinstance(self.keep_only_tags, dict):
                self.keep_only_tags = [self.keep_only_tags]
            for spec in self.keep_only_tags:
                for tag in soup.find('body').findAll(**spec):
                    body.insert(len(body.contents), tag)
            soup.find('body').replaceWith(body)
        except AttributeError:  # soup has no body element
            pass

    # Walk up from tag to <body>, extracting every sibling in the given
    # direction ('nextSibling' or 'previousSibling') at each level.
    def remove_beyond(tag, next):
        while tag is not None and getattr(tag, 'name', None) != 'body':
            after = getattr(tag, next)
            while after is not None:
                ns = getattr(tag, next)
                after.extract()
                after = ns
            tag = tag.parent

    if self.remove_tags_after is not None:
        rt = [self.remove_tags_after] if isinstance(self.remove_tags_after, dict) else self.remove_tags_after
        for spec in rt:
            tag = soup.find(**spec)
            remove_beyond(tag, 'nextSibling')

    if self.remove_tags_before is not None:
        rt = [self.remove_tags_before] if isinstance(self.remove_tags_before, dict) else self.remove_tags_before
        for spec in rt:
            tag = soup.find(**spec)
            remove_beyond(tag, 'previousSibling')

    for kwds in self.remove_tags:
        for tag in soup.findAll(**kwds):
            tag.extract()

    return self.preprocess_html_ext(soup)
Example 15: generate_html
def generate_html(comments):
    args = dict(xmlns=XHTML_NS,
                title_str=title_str,
                css=css,
                title=title,
                author=author,
                publisher=publisher,
                pubdate_label=_('Published'), pubdate=pubdate,
                series_label=_('Series'), series=series,
                rating_label=_('Rating'), rating=rating,
                tags_label=_('Tags'), tags=tags,
                comments=comments,
                footer='')
    for key in mi.custom_field_keys():
        try:
            display_name, val = mi.format_field_extended(key)[:2]
            key = key.replace('#', '_')
            args[key] = escape(val)
            args[key+'_label'] = escape(display_name)
        except:
            pass
    # Used in the comment describing use of custom columns in templates
    args['_genre_label'] = args.get('_genre_label', '{_genre_label}')
    args['_genre'] = args.get('_genre', '{_genre}')

    generated_html = P('jacket/template.xhtml',
                       data=True).decode('utf-8').format(**args)

    # Post-process the generated html to strip out empty header items
    soup = BeautifulSoup(generated_html)
    if not series:
        series_tag = soup.find(attrs={'class':'cbj_series'})
        if series_tag is not None:
            series_tag.extract()
    if not rating:
        rating_tag = soup.find(attrs={'class':'cbj_rating'})
        if rating_tag is not None:
            rating_tag.extract()
    if not tags:
        tags_tag = soup.find(attrs={'class':'cbj_tags'})
        if tags_tag is not None:
            tags_tag.extract()
    if not pubdate:
        pubdate_tag = soup.find(attrs={'class':'cbj_pubdata'})
        if pubdate_tag is not None:
            pubdate_tag.extract()
    if output_profile.short_name != 'kindle':
        hr_tag = soup.find('hr', attrs={'class':'cbj_kindle_banner_hr'})
        if hr_tag is not None:
            hr_tag.extract()

    return strip_encoding_declarations(
        soup.renderContents('utf-8').decode('utf-8'))