本文整理汇总了Python中calibre.ebooks.BeautifulSoup.Tag类的典型用法代码示例。如果您正苦于以下问题:Python Tag类的具体用法?Python Tag怎么用?Python Tag使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了Tag类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: postprocess_html
def postprocess_html(self, soup, first_fetch):
    '''
    Reduce a fetched article page to just its main content inside <body>.

    soup: the parsed page; first_fetch: passed in by the recipe framework
    (unused here).  Returns the modified soup.

    NOTE(review): relies on the module-level helper find_by_class(), which is
    defined elsewhere in this file.
    '''
    # Drop the <em> inside the author byline span.
    author_general = soup.find('span', { 'class': 'author_general' })
    author_general.em.extract()
    # the complete content
    full_div = None
    transcript_div = soup.find('div', { 'id': 'transcript' })
    if transcript_div:  # that's an interview
        # get all <div class="qa" />
        qa_div_list = list(find_by_class(transcript_div, 'div', 'qa'))
        for qa_div in qa_div_list:
            # Detach the Q&A block now; it is re-appended further below.
            qa_div.extract()
            # replace all <a class="question_link">...</a> with <strong>...</strong>
            question_link = qa_div.find('a', { 'class': 'question_link' })
            question_strong = Tag(soup, 'strong')
            question_strong.append(question_link.string)
            question_link.replaceWith(question_strong)
        full_div = find_by_class(soup.find('div', { 'id': 'content' }), 'div', 'presentation_full').next()
        # clean the <h1 />
        full_div.h1.span.extract()
        title_div = full_div.h1.div
        title_div.replaceWith(title_div.string)
        # clear the presentation area
        for div in full_div.findAll('div'):
            div.extract()
        # add qa list back to presentation area
        for qa_div in qa_div_list:
            full_div.append(qa_div)
    else:
        # text only without title
        text_div = find_by_class(soup, 'div', 'text_info').next()
        text_div.extract()
        for other in text_div.findAll('div'):
            other.extract()
        # full_div contains title
        full_div = soup.find('div', { 'id': 'content' })
        for other in full_div.findAll('div'):
            other.extract()
        full_div.append(text_div)
    # keep full_div in <body /> only
    full_div.extract()
    for other in soup.body:
        other.extract()
    soup.body.append(full_div)
    return soup
示例2: get_soup
def get_soup(self, src, url=None):
    '''
    Parse raw article HTML into a soup and apply the recipe's cleanup rules
    (keep_only_tags, remove_tags_before/after, remove_tags).

    src: raw HTML (bytes or unicode); url: source URL forwarded to
    preprocess_raw_html (may be None).
    Returns whatever preprocess_html_ext() yields for the cleaned soup.
    '''
    # Collect (pattern, replacement) massage rules to run over the raw text.
    nmassage = []
    nmassage.extend(self.preprocess_regexps)
    # Remove comments as they can leave detritus when extracting tags leaves
    # multiple nested comments
    nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
    usrc = xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0]
    usrc = self.preprocess_raw_html(usrc, url)
    for pat, repl in nmassage:
        usrc = pat.sub(repl, usrc)
    # Use this module's BeautifulSoup implementation for the html5 parser,
    # then parse the massaged markup.
    set_soup_module(sys.modules[BeautifulSoup.__module__])
    soup = parse(usrc, return_root=False)
    # Subclass hook: if prepreprocess_html_ext returns replacement markup,
    # massage and re-parse it from scratch.
    replace = self.prepreprocess_html_ext(soup)
    if replace is not None:
        replace = xml_to_unicode(replace, self.verbose, strip_encoding_pats=True)[0]
        for pat, repl in nmassage:
            replace = pat.sub(repl, replace)
        soup = parse(replace, return_root=False)
    if self.keep_only_tags:
        # Build a fresh <body> containing only the matching tags, then swap
        # it in for the original body.
        body = Tag(soup, 'body')
        try:
            if isinstance(self.keep_only_tags, dict):
                self.keep_only_tags = [self.keep_only_tags]
            for spec in self.keep_only_tags:
                for tag in soup.find('body').findAll(**spec):
                    body.insert(len(body.contents), tag)
            soup.find('body').replaceWith(body)
        except AttributeError:  # soup has no body element
            pass

    def remove_beyond(tag, next):
        # Walk from `tag` up to <body>, extracting every sibling on the
        # `next` side ('nextSibling' or 'previousSibling') at each level
        # until none remain.
        while tag is not None and getattr(tag, 'name', None) != 'body':
            after = getattr(tag, next)
            while after is not None:
                # NOTE(review): ns is re-read from `tag` before the extract;
                # the freshly exposed sibling is picked up on the following
                # iteration — confirm against calibre's BasicNewsRecipe.
                ns = getattr(tag, next)
                after.extract()
                after = ns
            tag = tag.parent

    if self.remove_tags_after is not None:
        # Accept a single spec dict or a list of them.
        rt = [self.remove_tags_after] if isinstance(self.remove_tags_after, dict) else self.remove_tags_after
        for spec in rt:
            tag = soup.find(**spec)
            remove_beyond(tag, 'nextSibling')

    if self.remove_tags_before is not None:
        rt = [self.remove_tags_before] if isinstance(self.remove_tags_before, dict) else self.remove_tags_before
        for spec in rt:
            tag = soup.find(**spec)
            remove_beyond(tag, 'previousSibling')

    # Finally, strip every tag matching the remove_tags specs.
    for kwds in self.remove_tags:
        for tag in soup.findAll(**kwds):
            tag.extract()

    return self.preprocess_html_ext(soup)
示例3: get_soup
def get_soup(self, src, url=None):
    '''
    Parse raw article HTML with BeautifulSoup (markupMassage variant) and
    apply the recipe's cleanup rules (keep_only_tags,
    remove_tags_before/after, remove_tags).

    src: raw HTML (bytes or unicode); url: source URL forwarded to
    preprocess_raw_html (may be None).
    Returns whatever preprocess_html_ext() yields for the cleaned soup.
    '''
    # Start from BeautifulSoup's default massage rules plus the recipe's own.
    nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
    nmassage.extend(self.preprocess_regexps)
    # Some websites have buggy doctype declarations that mess up beautifulsoup
    nmassage += [(re.compile(r'<!DOCTYPE .+?>', re.DOTALL|re.IGNORECASE), lambda m: '')]
    # Remove comments as they can leave detritus when extracting tags leaves
    # multiple nested comments
    nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
    usrc = xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0]
    usrc = self.preprocess_raw_html(usrc, url)
    soup = BeautifulSoup(usrc, markupMassage=nmassage)
    # Subclass hook: if prepreprocess_html_ext returns replacement markup,
    # re-parse it from scratch with the same massage rules.
    replace = self.prepreprocess_html_ext(soup)
    if replace is not None:
        soup = BeautifulSoup(xml_to_unicode(replace, self.verbose, strip_encoding_pats=True)[0], markupMassage=nmassage)
    if self.keep_only_tags:
        # Build a fresh <body> containing only the matching tags, then swap
        # it in for the original body.
        body = Tag(soup, 'body')
        try:
            if isinstance(self.keep_only_tags, dict):
                self.keep_only_tags = [self.keep_only_tags]
            for spec in self.keep_only_tags:
                for tag in soup.find('body').findAll(**spec):
                    body.insert(len(body.contents), tag)
            soup.find('body').replaceWith(body)
        except AttributeError:  # soup has no body element
            pass

    def remove_beyond(tag, next):
        # Walk from `tag` up to <body>, extracting every sibling on the
        # `next` side ('nextSibling' or 'previousSibling') at each level
        # until none remain.
        while tag is not None and getattr(tag, 'name', None) != 'body':
            after = getattr(tag, next)
            while after is not None:
                # NOTE(review): ns is re-read from `tag` before the extract;
                # the freshly exposed sibling is picked up on the following
                # iteration — confirm against calibre's BasicNewsRecipe.
                ns = getattr(tag, next)
                after.extract()
                after = ns
            tag = tag.parent

    if self.remove_tags_after is not None:
        # Accept a single spec dict or a list of them.
        rt = [self.remove_tags_after] if isinstance(self.remove_tags_after, dict) else self.remove_tags_after
        for spec in rt:
            tag = soup.find(**spec)
            remove_beyond(tag, 'nextSibling')

    if self.remove_tags_before is not None:
        rt = [self.remove_tags_before] if isinstance(self.remove_tags_before, dict) else self.remove_tags_before
        for spec in rt:
            tag = soup.find(**spec)
            remove_beyond(tag, 'previousSibling')

    # Finally, strip every tag matching the remove_tags specs.
    for kwds in self.remove_tags:
        for tag in soup.findAll(**kwds):
            tag.extract()

    return self.preprocess_html_ext(soup)
示例4: _inject_css
def _inject_css(self, html):
    '''
    Stick a <style> element into html.

    Reads the CSS from the 'injected_css' pref; when present, inserts a
    <style type="text/css"> element at the top of <head> and returns the
    re-rendered markup.  On any parsing/injection failure, or when no CSS
    is configured, returns the input html unchanged (best-effort).
    '''
    css = self.prefs.get('injected_css', None)
    if css:
        try:
            styled_soup = BeautifulSoup(html)
            head = styled_soup.find("head")
            style_tag = Tag(styled_soup, 'style')
            style_tag['type'] = "text/css"
            style_tag.insert(0, css)
            head.insert(0, style_tag)
            html = styled_soup.renderContents()
        except Exception:
            # Was a bare `except:`, which also swallowed KeyboardInterrupt
            # and SystemExit; keep the best-effort fallback but narrow it.
            return html
    return html
示例5: inject_css
def inject_css(self, html):
    '''
    Insert a <style> element carrying the dialog's CSS into html.
    Deep View content structured differently
    <html style=""><body style="">
    Also caches the modified soup on self.styled_soup.
    '''
    css = str(self.css_pte.toPlainText())
    if not css:
        return html
    # Strip any previously injected style, then prepend the new one to <head>.
    soup = self._remove_old_style(html)
    style = Tag(soup, 'style')
    style['type'] = "text/css"
    style.insert(0, css)
    soup.find("head").insert(0, style)
    self.styled_soup = soup
    return self.styled_soup.renderContents()
示例6: preview_css
def preview_css(self):
    '''
    Construct a dummy set of notes and annotation for preview purposes
    Modeled after book_status:_get_formatted_annotations()

    Builds a sample annotations page from the plugin's HTML template plus the
    stored sample notes/annotations, then renders it in the parent's web view.
    '''
    from calibre_plugins.marvin_manager.annotations import (
        ANNOTATIONS_HTML_TEMPLATE, Annotation, Annotations, BookNotes, BookmarkNotes)
    # Assemble the preview soup
    soup = BeautifulSoup(ANNOTATIONS_HTML_TEMPLATE)
    # Load the CSS from MXD resources
    path = os.path.join(self.parent.opts.resources_path, 'css', 'annotations.css')
    with open(path, 'rb') as f:
        css = f.read().decode('utf-8')
    # Swap the template's <head><style> for one carrying the loaded CSS.
    style_tag = Tag(soup, 'style')
    style_tag.insert(0, css)
    soup.head.style.replaceWith(style_tag)
    # Assemble the sample Book notes
    book_notes_soup = BookNotes().construct(self.sample_book_notes)
    soup.body.append(book_notes_soup)
    cd_tag = Tag(soup, 'div', [('class', "divider")])
    soup.body.append(cd_tag)
    # Assemble the sample Bookmark notes
    bookmark_notes_soup = BookmarkNotes().construct(self.sample_bookmark_notes)
    soup.body.append(bookmark_notes_soup)
    cd_tag = Tag(soup, 'div', [('class', "divider")])
    soup.body.append(cd_tag)
    # Assemble the sample annotations
    pas = Annotations(None, title="Preview")
    pas.annotations.append(Annotation(self.sample_ann_1))
    pas.annotations.append(Annotation(self.sample_ann_2))
    pas.annotations.append(Annotation(self.sample_ann_3))
    annotations_soup = pas.to_HTML(pas.create_soup())
    soup.body.append(annotations_soup)
    # Push the rendered preview into the parent's web view widget.
    self.parent.wv.setHtml(unicode(soup.renderContents()))
示例7: construct
def construct(self, book_notes):
    '''
    Render a list of note strings as HTML: one styled <p class="book_note">
    inside a <div class="book_note"> per note, all under a
    <div class="book_notes"> container.  Returns None when there are no notes.
    '''
    if not book_notes:
        return None
    soup = BeautifulSoup('<div class="{0}"></div>'.format('book_notes'))
    for note in book_notes:
        wrapper = Tag(soup, 'div', [('class', "book_note")])
        paragraph = Tag(soup, 'p', [('class', "book_note"),
                                    ('style', "{0}".format(self._get_note_style()))])
        paragraph.append(note)
        wrapper.append(paragraph)
        soup.div.append(wrapper)
    return soup
示例8: comments_to_html
def comments_to_html(comments):
'''
Convert random comment text to normalized, xml-legal block of <p>s
'plain text' returns as
<p>plain text</p>
'plain text with <i>minimal</i> <b>markup</b>' returns as
<p>plain text with <i>minimal</i> <b>markup</b></p>
'<p>pre-formatted text</p> returns untouched
'A line of text\n\nFollowed by a line of text' returns as
<p>A line of text</p>
<p>Followed by a line of text</p>
'A line of text.\nA second line of text.\rA third line of text' returns as
<p>A line of text.<br />A second line of text.<br />A third line of text.</p>
'...end of a paragraph.Somehow the break was lost...' returns as
<p>...end of a paragraph.</p>
<p>Somehow the break was lost...</p>
Deprecated HTML returns as HTML via BeautifulSoup()
'''
if not comments:
return u'<p></p>'
if not isinstance(comments, unicode):
comments = comments.decode(preferred_encoding, 'replace')
if comments.lstrip().startswith('<'):
# Comment is already HTML do not mess with it
return comments
if '<' not in comments:
comments = prepare_string_for_xml(comments)
parts = [u'<p class="description">%s</p>'%x.replace(u'\n', u'<br />')
for x in comments.split('\n\n')]
return '\n'.join(parts)
if sanitize_pat.search(comments) is not None:
try:
return sanitize_comments_html(comments)
except:
import traceback
traceback.print_exc()
return u'<p></p>'
# Explode lost CRs to \n\n
comments = lost_cr_exception_pat.sub(lambda m: m.group().replace('.',
'.\r'), comments)
for lost_cr in lost_cr_pat.finditer(comments):
comments = comments.replace(lost_cr.group(),
'%s%s\n\n%s' % (lost_cr.group(1),
lost_cr.group(2),
lost_cr.group(3)))
comments = comments.replace(u'\r', u'')
# Convert \n\n to <p>s
comments = comments.replace(u'\n\n', u'<p>')
# Convert solo returns to <br />
comments = comments.replace(u'\n', '<br />')
# Convert two hyphens to emdash
comments = comments.replace('--', '—')
soup = BeautifulSoup(comments)
result = BeautifulSoup()
rtc = 0
open_pTag = False
all_tokens = list(soup.contents)
for token in all_tokens:
if type(token) is NavigableString:
if not open_pTag:
pTag = Tag(result,'p')
open_pTag = True
ptc = 0
pTag.insert(ptc,prepare_string_for_xml(token))
ptc += 1
elif type(token) in (CData, Comment, Declaration,
ProcessingInstruction):
continue
elif token.name in ['br', 'b', 'i', 'em', 'strong', 'span', 'font', 'a',
'hr']:
if not open_pTag:
pTag = Tag(result,'p')
open_pTag = True
ptc = 0
pTag.insert(ptc, token)
ptc += 1
else:
if open_pTag:
result.insert(rtc, pTag)
rtc += 1
open_pTag = False
ptc = 0
result.insert(rtc, token)
rtc += 1
if open_pTag:
#.........这里部分代码省略.........
示例9: generate_annotation_html
def generate_annotation_html(self, bookmark):
    '''
    Render a device bookmark's last-read position and user notes as HTML.

    bookmark: a device bookmark object exposing last_read_location,
    timestamp (unix seconds), percent_read, book_format and user_notes.
    '''
    from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString
    # Returns <div class="user_annotations"> ... </div>
    last_read_location = bookmark.last_read_location
    timestamp = datetime.datetime.utcfromtimestamp(bookmark.timestamp)
    percent_read = bookmark.percent_read
    ka_soup = BeautifulSoup()
    dtc = 0  # insertion index into divTag
    divTag = Tag(ka_soup,'div')
    divTag['class'] = 'user_annotations'
    # Add the last-read location
    spanTag = Tag(ka_soup, 'span')
    spanTag['style'] = 'font-weight:bold'
    # PDFs report a page number; other formats report a Location.
    if bookmark.book_format == 'pdf':
        spanTag.insert(0,NavigableString(
            _("%(time)s<br />Last Page Read: %(loc)d (%(pr)d%%)") % dict(
                time=strftime(u'%x', timestamp.timetuple()),
                loc=last_read_location,
                pr=percent_read)))
    else:
        spanTag.insert(0,NavigableString(
            _("%(time)s<br />Last Page Read: Location %(loc)d (%(pr)d%%)") % dict(
                time=strftime(u'%x', timestamp.timetuple()),
                loc=last_read_location,
                pr=percent_read)))
    divTag.insert(dtc, spanTag)
    dtc += 1
    divTag.insert(dtc, Tag(ka_soup,'br'))
    dtc += 1
    if bookmark.user_notes:
        user_notes = bookmark.user_notes
        annotations = []
        # Add the annotations sorted by location
        # Italicize highlighted text
        for location in sorted(user_notes):
            if user_notes[location]['text']:
                annotations.append(
                    _('<b>Location %(dl)d &bull; %(typ)s</b><br />%(text)s<br />') % dict(
                        dl=user_notes[location]['displayed_location'],
                        typ=user_notes[location]['type'],
                        text=(user_notes[location]['text'] if
                              user_notes[location]['type'] == 'Note' else
                              '<i>%s</i>' % user_notes[location]['text'])))
            else:
                # Highlight with no text: emit just the location/type header.
                if bookmark.book_format == 'pdf':
                    annotations.append(
                        _('<b>Page %(dl)d &bull; %(typ)s</b><br />') % dict(
                            dl=user_notes[location]['displayed_location'],
                            typ=user_notes[location]['type']))
                else:
                    annotations.append(
                        _('<b>Location %(dl)d &bull; %(typ)s</b><br />') % dict(
                            dl=user_notes[location]['displayed_location'],
                            typ=user_notes[location]['type']))
        for annotation in annotations:
            divTag.insert(dtc, annotation)
            dtc += 1
    ka_soup.insert(0,divTag)
    return ka_soup
示例10: generate_annotation_html
def generate_annotation_html(self, bookmark):
    """
    Render a device bookmark's last-read position and user notes as HTML.

    bookmark: a device bookmark object exposing last_read_location,
    timestamp (unix seconds), percent_read, book_format and user_notes.
    """
    from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString

    # Returns <div class="user_annotations"> ... </div>
    last_read_location = bookmark.last_read_location
    timestamp = datetime.datetime.utcfromtimestamp(bookmark.timestamp)
    percent_read = bookmark.percent_read
    ka_soup = BeautifulSoup()
    dtc = 0  # insertion index into divTag
    divTag = Tag(ka_soup, "div")
    divTag["class"] = "user_annotations"
    # Add the last-read location
    spanTag = Tag(ka_soup, "span")
    spanTag["style"] = "font-weight:bold"
    # PDFs report a page number; other formats report a Location.
    if bookmark.book_format == "pdf":
        spanTag.insert(
            0,
            NavigableString(
                _("%(time)s<br />Last Page Read: %(loc)d (%(pr)d%%)")
                % dict(time=strftime(u"%x", timestamp.timetuple()), loc=last_read_location, pr=percent_read)
            ),
        )
    else:
        spanTag.insert(
            0,
            NavigableString(
                _("%(time)s<br />Last Page Read: Location %(loc)d (%(pr)d%%)")
                % dict(time=strftime(u"%x", timestamp.timetuple()), loc=last_read_location, pr=percent_read)
            ),
        )
    divTag.insert(dtc, spanTag)
    dtc += 1
    divTag.insert(dtc, Tag(ka_soup, "br"))
    dtc += 1
    if bookmark.user_notes:
        user_notes = bookmark.user_notes
        annotations = []
        # Add the annotations sorted by location
        # Italicize highlighted text
        for location in sorted(user_notes):
            if user_notes[location]["text"]:
                annotations.append(
                    _("<b>Location %(dl)d &bull; %(typ)s</b><br />%(text)s<br />")
                    % dict(
                        dl=user_notes[location]["displayed_location"],
                        typ=user_notes[location]["type"],
                        text=(
                            user_notes[location]["text"]
                            if user_notes[location]["type"] == "Note"
                            else "<i>%s</i>" % user_notes[location]["text"]
                        ),
                    )
                )
            else:
                # Highlight with no text: emit just the location/type header.
                if bookmark.book_format == "pdf":
                    annotations.append(
                        _("<b>Page %(dl)d &bull; %(typ)s</b><br />")
                        % dict(dl=user_notes[location]["displayed_location"], typ=user_notes[location]["type"])
                    )
                else:
                    annotations.append(
                        _("<b>Location %(dl)d &bull; %(typ)s</b><br />")
                        % dict(dl=user_notes[location]["displayed_location"], typ=user_notes[location]["type"])
                    )
        for annotation in annotations:
            divTag.insert(dtc, annotation)
            dtc += 1
    ka_soup.insert(0, divTag)
    return ka_soup
示例11: rebuild_collections
def rebuild_collections(self, booklist, oncard):
    '''
    For each book in the booklist for the card oncard, remove it from all
    its current collections, then add it to the collections specified in
    device_collections.
    oncard is None for the main memory, carda for card A, cardb for card B,
    etc.
    booklist is the object created by the :method:`books` call above.
    This is called after the user edits the 'Collections' field in the Device view
    when Metadata management is set to 'Manual'.
    '''
    self._log_location()
    command_name = "rebuild_collections"
    command_element = "rebuildcollections"
    # Command file skeleton; timestamped so the device can order commands.
    command_soup = BeautifulStoneSoup(self.parent.COMMAND_XML.format(
        command_element, time.mktime(time.localtime())))
    LOCAL_DEBUG = False
    if booklist:
        changed = 0
        for book in booklist:
            if LOCAL_DEBUG:
                self._log("{0:7} {1}".format(book.in_library, book.title))
            filename = self.parent.path_template.format(book.uuid)
            if filename not in self.parent.cached_books:
                # Fall back to matching the cache entry by uuid, then by
                # title + authors.
                for fn in self.parent.cached_books:
                    if book.uuid and book.uuid == self.parent.cached_books[fn]['uuid']:
                        if LOCAL_DEBUG:
                            self._log("'%s' matched on uuid %s" % (book.title, book.uuid))
                        filename = fn
                        break
                    elif (book.title == self.parent.cached_books[fn]['title'] and
                          book.authors == self.parent.cached_books[fn]['authors']):
                        if LOCAL_DEBUG:
                            self._log("'%s' matched on title/author" % book.title)
                        filename = fn
                        break
                else:
                    # for-else: no cache entry matched at all — skip this book.
                    # NOTE(review): reconstructed as a for-else from the
                    # mangled source — confirm against the original plugin.
                    self._log("ERROR: file %s not found in cached_books" % repr(filename))
                    continue
            cached_collections = self.parent.cached_books[filename]['device_collections']
            if cached_collections != book.device_collections:
                # Append the changed book info to the command file
                book_tag = Tag(command_soup, 'book')
                book_tag['filename'] = filename
                book_tag['title'] = book.title
                book_tag['author'] = ', '.join(book.authors)
                book_tag['uuid'] = book.uuid
                collections_tag = Tag(command_soup, 'collections')
                for tag in book.device_collections:
                    c_tag = Tag(command_soup, 'collection')
                    c_tag.insert(0, tag)
                    collections_tag.insert(0, c_tag)
                book_tag.insert(0, collections_tag)
                command_soup.manifest.insert(0, book_tag)
                # Update cache
                self.parent.cached_books[filename]['device_collections'] = book.device_collections
                changed += 1
        if changed:
            # Stage the command file
            self.parent._stage_command_file(command_name, command_soup,
                show_command=self.parent.prefs.get('development_mode', False))
            # Wait for completion
            self.parent._wait_for_command_completion(command_name)
        else:
            self._log("no collection changes detected cached_books <=> device books")
示例12: postprocess_html
def postprocess_html(self,soup, True):
try:
if self.one_picture_per_article:
# Remove all images after first
largeImg = soup.find(True, {'class':'articleSpanImage'})
inlineImgs = soup.findAll(True, {'class':'inlineImage module'})
if largeImg:
for inlineImg in inlineImgs:
inlineImg.extract()
else:
if inlineImgs:
firstImg = inlineImgs[0]
for inlineImg in inlineImgs[1:]:
inlineImg.extract()
# Move firstImg before article body
cgFirst = soup.find(True, {'class':re.compile('columnGroup *first')})
if cgFirst:
# Strip all sibling NavigableStrings: noise
navstrings = cgFirst.findAll(text=True, recursive=False)
[ns.extract() for ns in navstrings]
headline_found = False
tag = cgFirst.find(True)
insertLoc = 0
while True:
insertLoc += 1
if hasattr(tag,'class') and tag['class'] == 'articleHeadline':
headline_found = True
break
tag = tag.nextSibling
if not tag:
headline_found = False
break
if headline_found:
cgFirst.insert(insertLoc,firstImg)
else:
self.log(">>> No class:'columnGroup first' found <<<")
except:
self.log("ERROR: One picture per article in postprocess_html")
try:
# Change captions to italic
for caption in soup.findAll(True, {'class':'caption'}) :
if caption and len(caption) > 0:
cTag = Tag(soup, "p", [("class", "caption")])
c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip()
mp_off = c.find("More Photos")
if mp_off >= 0:
c = c[:mp_off]
cTag.insert(0, c)
caption.replaceWith(cTag)
except:
self.log("ERROR: Problem in change captions to italic")
try:
# Change <nyt_headline> to <h2>
h1 = soup.find('h1')
blogheadline = str(h1) #added for dealbook
if h1:
headline = h1.find("nyt_headline")
if headline:
tag = Tag(soup, "h2")
tag['class'] = "headline"
tag.insert(0, self.fixChars(headline.contents[0]))
h1.replaceWith(tag)
elif blogheadline.find('entry-title'):#added for dealbook
tag = Tag(soup, "h2")#added for dealbook
tag['class'] = "headline"#added for dealbook
tag.insert(0, self.fixChars(h1.contents[0]))#added for dealbook
h1.replaceWith(tag)#added for dealbook
else:
# Blog entry - replace headline, remove <hr> tags - BCC I think this is no longer functional 1-18-2011
headline = soup.find('title')
if headline:
tag = Tag(soup, "h2")
tag['class'] = "headline"
tag.insert(0, self.fixChars(headline.renderContents()))
soup.insert(0, tag)
hrs = soup.findAll('hr')
for hr in hrs:
hr.extract()
except:
self.log("ERROR: Problem in Change <nyt_headline> to <h2>")
try:
#if this is from a blog (dealbook, fix the byline format
bylineauthor = soup.find('address',attrs={'class':'byline author vcard'})
if bylineauthor:
tag = Tag(soup, "h6")
tag['class'] = "byline"
tag.insert(0, self.fixChars(bylineauthor.renderContents()))
bylineauthor.replaceWith(tag)
except:
self.log("ERROR: fixing byline author format")
try:
#if this is a blog (dealbook) fix the credit style for the pictures
blogcredit = soup.find('div',attrs={'class':'credit'})
if blogcredit:
tag = Tag(soup, "h6")
#.........这里部分代码省略.........
示例13: postprocess_html
def postprocess_html(self,soup, True):
if self.one_picture_per_article:
# Remove all images after first
largeImg = soup.find(True, {'class':'articleSpanImage'})
inlineImgs = soup.findAll(True, {'class':'inlineImage module'})
if largeImg:
for inlineImg in inlineImgs:
inlineImg.extract()
else:
if inlineImgs:
firstImg = inlineImgs[0]
for inlineImg in inlineImgs[1:]:
inlineImg.extract()
# Move firstImg before article body
article_body = soup.find(True, {'id':'articleBody'})
cgFirst = soup.find(True, {'class':re.compile('columnGroup *first')})
if cgFirst:
# Strip all sibling NavigableStrings: noise
navstrings = cgFirst.findAll(text=True, recursive=False)
[ns.extract() for ns in navstrings]
headline_found = False
tag = cgFirst.find(True)
insertLoc = 0
while True:
insertLoc += 1
if hasattr(tag,'class') and tag['class'] == 'articleHeadline':
headline_found = True
break
tag = tag.nextSibling
if not tag:
headline_found = False
break
if headline_found:
cgFirst.insert(insertLoc,firstImg)
else:
self.log(">>> No class:'columnGroup first' found <<<")
# Change captions to italic
for caption in soup.findAll(True, {'class':'caption'}) :
if caption and caption.contents[0]:
cTag = Tag(soup, "p", [("class", "caption")])
c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip()
mp_off = c.find("More Photos")
if mp_off >= 0:
c = c[:mp_off]
cTag.insert(0, c)
caption.replaceWith(cTag)
# Change <nyt_headline> to <h2>
h1 = soup.find('h1')
if h1:
headline = h1.find("nyt_headline")
if headline:
tag = Tag(soup, "h2")
tag['class'] = "headline"
tag.insert(0, self.fixChars(headline.contents[0]))
h1.replaceWith(tag)
else:
# Blog entry - replace headline, remove <hr> tags
headline = soup.find('title')
if headline:
tag = Tag(soup, "h2")
tag['class'] = "headline"
tag.insert(0, self.fixChars(headline.contents[0]))
soup.insert(0, tag)
hrs = soup.findAll('hr')
for hr in hrs:
hr.extract()
# Change <h1> to <h3> - used in editorial blogs
masthead = soup.find("h1")
if masthead:
# Nuke the href
if masthead.a:
del(masthead.a['href'])
tag = Tag(soup, "h3")
tag.insert(0, self.fixChars(masthead.contents[0]))
masthead.replaceWith(tag)
# Change <span class="bold"> to <b>
for subhead in soup.findAll(True, {'class':'bold'}) :
if subhead.contents:
bTag = Tag(soup, "b")
bTag.insert(0, subhead.contents[0])
subhead.replaceWith(bTag)
divTag = soup.find('div',attrs={'id':'articleBody'})
if divTag:
divTag['class'] = divTag['id']
# Add class="authorId" to <div> so we can format with CSS
divTag = soup.find('div',attrs={'id':'authorId'})
if divTag and divTag.contents[0]:
tag = Tag(soup, "p")
tag['class'] = "authorId"
tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0],
use_alt=False)))
divTag.replaceWith(tag)
#.........这里部分代码省略.........
示例14: to_HTML
def to_HTML(self, header=''):
'''
Generate HTML with user-specified CSS, element order
'''
# Retrieve CSS prefs
from calibre_plugins.annotations.appearance import default_elements
stored_css = plugin_prefs.get('appearance_css', default_elements)
elements = []
for element in stored_css:
elements.append(element['name'])
if element['name'] == 'Note':
note_style = re.sub('\n', '', element['css'])
elif element['name'] == 'Text':
text_style = re.sub('\n', '', element['css'])
elif element['name'] == 'Timestamp':
ts_style = re.sub('\n', '', element['css'])
# Additional CSS for timestamp color and bg to be formatted
datetime_style = ("background-color:{0};color:{1};" + ts_style)
# Order the elements according to stored preferences
comments_body = ''
for element in elements:
if element == 'Text':
comments_body += '{text}'
elif element == 'Note':
comments_body += '{note}'
elif element == 'Timestamp':
ts_css = '''<table cellpadding="0" width="100%" style="{ts_style}" color="{color}">
<tr>
<td class="location" style="text-align:left">{location}</td>
<td class="timestamp" uts="{unix_timestamp}" style="text-align:right">{friendly_timestamp}</td>
</tr>
</table>'''
comments_body += re.sub(r'>\s+<', r'><', ts_css)
if self.annotations:
soup = BeautifulSoup(ANNOTATIONS_HEADER)
dtc = 0
# Add the annotations
for i, agroup in enumerate(sorted(self.annotations, key=self._annotation_sorter)):
location = agroup.location
if location is None:
location = ''
friendly_timestamp = self._timestamp_to_datestr(agroup.timestamp)
text = ''
if agroup.text:
for agt in agroup.text:
text += '<p class="highlight" style="{0}">{1}</p>'.format(text_style, agt)
note = ''
if agroup.note:
for agn in agroup.note:
note += '<p class="note" style="{0}">{1}</p>'.format(note_style, agn)
try:
dt_bgcolor = COLOR_MAP[agroup.highlightcolor]['bg']
dt_fgcolor = COLOR_MAP[agroup.highlightcolor]['fg']
except:
if agroup.highlightcolor is None:
msg = "No highlight color specified, using Default"
else:
msg = "Unknown color '%s' specified" % agroup.highlightcolor
self._log_location(msg)
dt_bgcolor = COLOR_MAP['Default']['bg']
dt_fgcolor = COLOR_MAP['Default']['fg']
if agroup.hash is not None:
# Use existing hash when re-rendering
hash = agroup.hash
else:
m = hashlib.md5()
m.update(text)
m.update(note)
hash = m.hexdigest()
divTag = Tag(BeautifulSoup(), 'div')
content_args = {
'color': agroup.highlightcolor,
'friendly_timestamp': friendly_timestamp,
'location': location,
'note': note,
'text': text,
'ts_style': datetime_style.format(dt_bgcolor, dt_fgcolor),
'unix_timestamp': agroup.timestamp,
}
divTag.insert(0, comments_body.format(**content_args))
divTag['class'] = "annotation"
divTag['genre'] = ''
if agroup.genre:
divTag['genre'] = escape(agroup.genre)
divTag['hash'] = hash
divTag['location_sort'] = agroup.location_sort
divTag['reader'] = agroup.reader_app
divTag['style'] = ANNOTATION_DIV_STYLE
soup.div.insert(dtc, divTag)
#.........这里部分代码省略.........
示例15: comments_to_html
def comments_to_html(comments):
"""
Convert random comment text to normalized, xml-legal block of <p>s
'plain text' returns as
<p>plain text</p>
'plain text with <i>minimal</i> <b>markup</b>' returns as
<p>plain text with <i>minimal</i> <b>markup</b></p>
'<p>pre-formatted text</p> returns untouched
'A line of text\n\nFollowed by a line of text' returns as
<p>A line of text</p>
<p>Followed by a line of text</p>
'A line of text.\nA second line of text.\rA third line of text' returns as
<p>A line of text.<br />A second line of text.<br />A third line of text.</p>
'...end of a paragraph.Somehow the break was lost...' returns as
<p>...end of a paragraph.</p>
<p>Somehow the break was lost...</p>
Deprecated HTML returns as HTML via BeautifulSoup()
"""
if not comments:
return u"<p></p>"
if not isinstance(comments, unicode):
comments = comments.decode(preferred_encoding, "replace")
if comments.lstrip().startswith("<"):
# Comment is already HTML do not mess with it
return comments
if "<" not in comments:
comments = prepare_string_for_xml(comments)
parts = [u'<p class="description">%s</p>' % x.replace(u"\n", u"<br />") for x in comments.split("\n\n")]
return "\n".join(parts)
if sanitize_pat.search(comments) is not None:
try:
return sanitize_comments_html(comments)
except:
import traceback
traceback.print_exc()
return u"<p></p>"
# Explode lost CRs to \n\n
comments = lost_cr_exception_pat.sub(lambda m: m.group().replace(".", ".\r"), comments)
for lost_cr in lost_cr_pat.finditer(comments):
comments = comments.replace(
lost_cr.group(), "%s%s\n\n%s" % (lost_cr.group(1), lost_cr.group(2), lost_cr.group(3))
)
comments = comments.replace(u"\r", u"")
# Convert \n\n to <p>s
comments = comments.replace(u"\n\n", u"<p>")
# Convert solo returns to <br />
comments = comments.replace(u"\n", "<br />")
# Convert two hyphens to emdash
comments = comments.replace("--", "—")
soup = BeautifulSoup(comments)
result = BeautifulSoup()
rtc = 0
open_pTag = False
all_tokens = list(soup.contents)
for token in all_tokens:
if type(token) is NavigableString:
if not open_pTag:
pTag = Tag(result, "p")
open_pTag = True
ptc = 0
pTag.insert(ptc, prepare_string_for_xml(token))
ptc += 1
elif type(token) in (CData, Comment, Declaration, ProcessingInstruction):
continue
elif token.name in ["br", "b", "i", "em", "strong", "span", "font", "a", "hr"]:
if not open_pTag:
pTag = Tag(result, "p")
open_pTag = True
ptc = 0
pTag.insert(ptc, token)
ptc += 1
else:
if open_pTag:
result.insert(rtc, pTag)
rtc += 1
open_pTag = False
ptc = 0
result.insert(rtc, token)
rtc += 1
if open_pTag:
result.insert(rtc, pTag)
for p in result.findAll("p"):
p["class"] = "description"
#.........这里部分代码省略.........