本文整理汇总了Python中calibre.ebooks.BeautifulSoup.BeautifulSoup.findAll方法的典型用法代码示例。如果您正苦于以下问题:Python BeautifulSoup.findAll方法的具体用法?Python BeautifulSoup.findAll怎么用?Python BeautifulSoup.findAll使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类calibre.ebooks.BeautifulSoup.BeautifulSoup
的用法示例。
在下文中一共展示了BeautifulSoup.findAll方法的13个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: get_annotations_date_range
# 需要导入模块: from calibre.ebooks.BeautifulSoup import BeautifulSoup [as 别名]
# 或者: from calibre.ebooks.BeautifulSoup.BeautifulSoup import findAll [as 别名]
def get_annotations_date_range(self):
    '''
    Scan all annotated books and record the oldest and newest annotation
    timestamps in self.oldest_annotation / self.newest_annotation.

    The two attributes start out reversed (oldest very large, newest very
    small) so the first comparison always updates them; if no annotations
    are found at all, swap them back so the values are sane.
    '''
    found_any = False
    for cid in self.annotation_map:
        mi = self.cdb.get_metadata(cid, index_is_id=True)
        # Annotations live either in the standard Comments field or in a
        # custom column, depending on self.field
        if self.field == 'Comments':
            raw = mi.comments
        else:
            raw = mi.get_user_metadata(self.field, False)['#value#']
        soup = BeautifulSoup(raw)
        for ua in soup.findAll('div', 'annotation'):
            found_any = True
            ts = float(ua.find('td', 'timestamp')['uts'])
            self.oldest_annotation = min(self.oldest_annotation, ts)
            self.newest_annotation = max(self.newest_annotation, ts)
    if not found_any:
        # Restore the deliberately reversed initial values
        self.oldest_annotation, self.newest_annotation = (
            self.newest_annotation, self.oldest_annotation)
示例2: extract_calibre_cover
# 需要导入模块: from calibre.ebooks.BeautifulSoup import BeautifulSoup [as 别名]
# 或者: from calibre.ebooks.BeautifulSoup.BeautifulSoup import findAll [as 别名]
def extract_calibre_cover(raw, base, log):
    '''
    Try to extract the cover image bytes from calibre-generated cover HTML.

    raw:  the HTML markup of the candidate cover page
    base: directory against which relative image paths are resolved
    log:  logger (unused here, kept for interface compatibility)

    Returns the raw image bytes, or None when the page does not look like
    a simple cover (i.e. it contains text or more than one image).
    '''
    from calibre.ebooks.BeautifulSoup import BeautifulSoup
    soup = BeautifulSoup(raw)
    # Any of these tags means the page is real content, not a bare cover
    matches = soup.find(name=['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'span',
        'font', 'br'])
    images = soup.findAll('img')
    if matches is None and len(images) == 1 and \
            images[0].get('alt', '') == 'cover':
        img = os.path.join(base, *images[0]['src'].split('/'))
        if os.path.exists(img):
            # fix: use a context manager so the file handle is closed
            with open(img, 'rb') as f:
                return f.read()
    # Look for a simple cover, i.e. a body with no text and only one <img> tag
    if matches is None:
        body = soup.find('body')
        if body is not None:
            text = u''.join(map(unicode, body.findAll(text=True)))
            if text.strip():
                # Body has text, abort
                return
            images = body.findAll('img', src=True)
            if len(images) == 1:  # fix: clearer than `0 < len(images) < 2`
                img = os.path.join(base, *images[0]['src'].split('/'))
                if os.path.exists(img):
                    with open(img, 'rb') as f:
                        return f.read()
示例3: extract_calibre_cover
# 需要导入模块: from calibre.ebooks.BeautifulSoup import BeautifulSoup [as 别名]
# 或者: from calibre.ebooks.BeautifulSoup.BeautifulSoup import findAll [as 别名]
def extract_calibre_cover(raw, base, log):
    """
    Try to extract the cover image bytes from calibre-generated cover HTML.

    raw:  the HTML markup of the candidate cover page
    base: directory against which relative image paths are resolved
    log:  logger (unused here, kept for interface compatibility)

    Returns the raw image bytes, or None when the page does not look like
    a simple cover (i.e. it contains text or more than one image).
    """
    from calibre.ebooks.BeautifulSoup import BeautifulSoup
    soup = BeautifulSoup(raw)
    # Any of these tags means the page is real content, not a bare cover
    matches = soup.find(name=["h1", "h2", "h3", "h4", "h5", "h6", "p", "span", "font", "br"])
    images = soup.findAll("img")
    if matches is None and len(images) == 1 and images[0].get("alt", "") == "cover":
        img = os.path.join(base, *images[0]["src"].split("/"))
        if os.path.exists(img):
            # fix: use a context manager so the file handle is closed
            with open(img, "rb") as f:
                return f.read()
    # Look for a simple cover, i.e. a body with no text and only one <img> tag
    if matches is None:
        body = soup.find("body")
        if body is not None:
            text = u"".join(map(unicode, body.findAll(text=True)))
            if text.strip():
                # Body has text, abort
                return
            images = body.findAll("img", src=True)
            if len(images) == 1:  # fix: clearer than `0 < len(images) < 2`
                img = os.path.join(base, *images[0]["src"].split("/"))
                if os.path.exists(img):
                    with open(img, "rb") as f:
                        return f.read()
示例4: search_for_asin_on_amazon
# 需要导入模块: from calibre.ebooks.BeautifulSoup import BeautifulSoup [as 别名]
# 或者: from calibre.ebooks.BeautifulSoup.BeautifulSoup import findAll [as 别名]
def search_for_asin_on_amazon(self, query):
    '''Search for book's asin on amazon using given query'''
    query = urlencode({'keywords': query})
    url = ('/s/ref=sr_qz_back?sf=qz&rh=i%3Adigital-text%2Cn%3A154606011%2Ck%3A'
           + query[9:] + '&' + query)
    try:
        response = open_url(self._connections['amazon'], url)
    except PageDoesNotExist:
        return None
    # Bail out when amazon reports that nothing matched the query
    no_results = ('did not match any products' in response
                  and 'Did you mean:' not in response
                  and 'so we searched in All Departments' not in response)
    if no_results:
        return None
    result_divs = BeautifulSoup(response).findAll('div', {'id': 'resultsCol'})
    if not result_divs:
        return None
    # The ASIN is only reliable on entries that offer 1-Click purchase
    for div in result_divs:
        markup = str(div)
        if 'Buy now with 1-Click' in markup:
            match = AMAZON_ASIN_PAT.search(markup)
            if match:
                return match.group(1)
    return None
示例5: get_soup
# 需要导入模块: from calibre.ebooks.BeautifulSoup import BeautifulSoup [as 别名]
# 或者: from calibre.ebooks.BeautifulSoup.BeautifulSoup import findAll [as 别名]
def get_soup(self, src, url=None):
    '''
    Parse raw page markup into a cleaned-up BeautifulSoup.

    Applies, in order: the recipe's preprocess_regexps, comment stripping,
    preprocess_raw_html, the prepreprocess_html_ext hook, keep_only_tags,
    remove_tags_before/after, remove_tags, and finally preprocess_html_ext.

    src: raw (possibly bytes) page source; url: page URL passed through to
    preprocess_raw_html. Returns the result of preprocess_html_ext(soup).
    '''
    nmassage = []
    nmassage.extend(self.preprocess_regexps)
    # Remove comments as they can leave detritus when extracting tags leaves
    # multiple nested comments
    nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
    usrc = xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0]
    usrc = self.preprocess_raw_html(usrc, url)
    for pat, repl in nmassage:
        usrc = pat.sub(repl, usrc)
    soup = BeautifulSoup(usrc)
    replace = self.prepreprocess_html_ext(soup)
    if replace is not None:
        # The hook returned replacement markup: re-run the same massage
        # pipeline on it and re-parse from scratch
        replace = xml_to_unicode(replace, self.verbose, strip_encoding_pats=True)[0]
        for pat, repl in nmassage:
            replace = pat.sub(repl, replace)
        soup = BeautifulSoup(replace)

    if self.keep_only_tags:
        # Build a new <body> containing only the tags matching the specs,
        # then swap it in for the original body
        body = soup.new_tag('body')
        try:
            if isinstance(self.keep_only_tags, dict):
                self.keep_only_tags = [self.keep_only_tags]
            for spec in self.keep_only_tags:
                for tag in soup.find('body').findAll(**spec):
                    body.insert(len(body.contents), tag)
            soup.find('body').replaceWith(body)
        except AttributeError:  # soup has no body element
            pass

    def remove_beyond(tag, next):
        # Extract every sibling of `tag` in direction `next` ('nextSibling'
        # or 'previousSibling'), then walk up through the parents doing the
        # same, stopping at the body element
        while tag is not None and getattr(tag, 'name', None) != 'body':
            after = getattr(tag, next)
            while after is not None:
                ns = getattr(tag, next)
                after.extract()
                after = ns
            tag = tag.parent

    if self.remove_tags_after is not None:
        # Accept a single spec dict or a list of them
        rt = [self.remove_tags_after] if isinstance(self.remove_tags_after, dict) else self.remove_tags_after
        for spec in rt:
            tag = soup.find(**spec)
            remove_beyond(tag, 'nextSibling')

    if self.remove_tags_before is not None:
        rt = [self.remove_tags_before] if isinstance(self.remove_tags_before, dict) else self.remove_tags_before
        for spec in rt:
            tag = soup.find(**spec)
            remove_beyond(tag, 'previousSibling')

    for kwds in self.remove_tags:
        for tag in soup.findAll(**kwds):
            tag.extract()

    return self.preprocess_html_ext(soup)
示例6: get_soup
# 需要导入模块: from calibre.ebooks.BeautifulSoup import BeautifulSoup [as 别名]
# 或者: from calibre.ebooks.BeautifulSoup.BeautifulSoup import findAll [as 别名]
def get_soup(self, src, url=None):
    '''
    Parse raw page markup into a cleaned-up BeautifulSoup (BS3 variant
    using markupMassage).

    Applies, in order: BeautifulSoup's default markup massage plus the
    recipe's preprocess_regexps, doctype and comment stripping,
    preprocess_raw_html, the prepreprocess_html_ext hook, keep_only_tags,
    remove_tags_before/after, remove_tags, and finally preprocess_html_ext.
    '''
    nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
    nmassage.extend(self.preprocess_regexps)
    # Some websites have buggy doctype declarations that mess up beautifulsoup
    nmassage += [(re.compile(r'<!DOCTYPE .+?>', re.DOTALL|re.IGNORECASE), lambda m: '')]
    # Remove comments as they can leave detritus when extracting tags leaves
    # multiple nested comments
    nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
    usrc = xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0]
    usrc = self.preprocess_raw_html(usrc, url)
    soup = BeautifulSoup(usrc, markupMassage=nmassage)
    replace = self.prepreprocess_html_ext(soup)
    if replace is not None:
        # The hook returned replacement markup: re-parse it with the same
        # massage rules
        soup = BeautifulSoup(xml_to_unicode(replace, self.verbose, strip_encoding_pats=True)[0], markupMassage=nmassage)

    if self.keep_only_tags:
        # Build a new <body> containing only the tags matching the specs,
        # then swap it in for the original body
        body = Tag(soup, 'body')
        try:
            if isinstance(self.keep_only_tags, dict):
                self.keep_only_tags = [self.keep_only_tags]
            for spec in self.keep_only_tags:
                for tag in soup.find('body').findAll(**spec):
                    body.insert(len(body.contents), tag)
            soup.find('body').replaceWith(body)
        except AttributeError:  # soup has no body element
            pass

    def remove_beyond(tag, next):
        # Extract every sibling of `tag` in direction `next` ('nextSibling'
        # or 'previousSibling'), then walk up through the parents doing the
        # same, stopping at the body element
        while tag is not None and getattr(tag, 'name', None) != 'body':
            after = getattr(tag, next)
            while after is not None:
                ns = getattr(tag, next)
                after.extract()
                after = ns
            tag = tag.parent

    if self.remove_tags_after is not None:
        # Accept a single spec dict or a list of them
        rt = [self.remove_tags_after] if isinstance(self.remove_tags_after, dict) else self.remove_tags_after
        for spec in rt:
            tag = soup.find(**spec)
            remove_beyond(tag, 'nextSibling')

    if self.remove_tags_before is not None:
        rt = [self.remove_tags_before] if isinstance(self.remove_tags_before, dict) else self.remove_tags_before
        for spec in rt:
            tag = soup.find(**spec)
            remove_beyond(tag, 'previousSibling')

    for kwds in self.remove_tags:
        for tag in soup.findAll(**kwds):
            tag.extract()

    return self.preprocess_html_ext(soup)
示例7: get_asin
# 需要导入模块: from calibre.ebooks.BeautifulSoup import BeautifulSoup [as 别名]
# 或者: from calibre.ebooks.BeautifulSoup.BeautifulSoup import findAll [as 别名]
def get_asin(self, connection):
    '''
    Look up this book's ASIN via an amazon Kindle-store search and store it
    in the calibre metadata (mobi-asin identifier) and the book settings.

    connection: an HTTPConnection to www.amazon.com; a fresh connection is
    created (honouring self._proxy) if the first request fails.

    Returns the (possibly new) connection on success. Raises Exception with
    a status message on connect/search/ASIN failure, setting self._status
    and self._status_message first.
    '''
    query = urlencode({'keywords': '%s - %s' % (self._title, self._author)})
    # Hoisted: the same search path was previously duplicated in both requests
    search_path = ('/s/ref=sr_qz_back?sf=qz&rh=i%3Adigital-text%2Cn%3A154606011%2Ck%3A'
                   + query[9:] + '&' + query)
    try:
        connection.request('GET', search_path, headers=self.HEADERS)
        response = connection.getresponse().read()
    except Exception:  # fix: bare `except:` would also swallow KeyboardInterrupt/SystemExit
        # Retry once on a fresh connection
        try:
            connection.close()
            if self._proxy:
                connection = HTTPConnection(self._http_address, self._http_port)
                connection.set_tunnel('www.amazon.com', 80)
            else:
                connection = HTTPConnection('www.amazon.com')
            connection.request('GET', search_path, headers=self.HEADERS)
            response = connection.getresponse().read()
        except Exception:
            self._status = self.FAIL
            self._status_message = self.FAILED_COULD_NOT_CONNECT_TO_AMAZON
            raise Exception(self._status_message)
    # check to make sure there are results
    if ('did not match any products' in response
            and 'Did you mean:' not in response
            and 'so we searched in All Departments' not in response):
        self._status = self.FAIL
        self._status_message = self.FAILED_COULD_NOT_FIND_AMAZON_PAGE
        raise Exception(self._status_message)
    soup = BeautifulSoup(response)
    results = soup.findAll('div', {'id': 'resultsCol'})
    if not results:  # fix: `len(results) == 0` check was redundant
        self._status = self.FAIL
        self._status_message = self.FAILED_COULD_NOT_FIND_AMAZON_PAGE
        raise Exception(self._status_message)
    for r in results:
        markup = str(r)
        # The ASIN is only reliable on entries that offer 1-Click purchase
        if 'Buy now with 1-Click' in markup:
            asinSearch = self.AMAZON_ASIN_PAT.search(markup)
            if asinSearch:
                self._asin = asinSearch.group(1)
                mi = self._db.get_metadata(self._book_id)
                identifiers = mi.get_identifiers()
                identifiers['mobi-asin'] = self._asin
                mi.set_identifiers(identifiers)
                self._db.set_metadata(self._book_id, mi)
                self._book_settings.prefs['asin'] = self._asin
                return connection
    self._status = self.FAIL
    self._status_message = self.FAILED_COULD_NOT_FIND_AMAZON_ASIN
    raise Exception(self._status_message)
示例8: get_asin
# 需要导入模块: from calibre.ebooks.BeautifulSoup import BeautifulSoup [as 别名]
# 或者: from calibre.ebooks.BeautifulSoup.BeautifulSoup import findAll [as 别名]
def get_asin(self):
    '''
    Look up this book's ASIN via an amazon Kindle-store search and store it
    in the calibre metadata as the mobi-asin identifier.

    Uses self._aConnection; a fresh connection is created (honouring
    self._proxy) if the first request fails.

    Returns the ASIN string on success, None on any failure.
    '''
    query = urlencode({'keywords': '%s' % self.title_and_author})
    # Hoisted: the same search path was previously duplicated in both requests
    search_path = ('/s/ref=sr_qz_back?sf=qz&rh=i%3Adigital-text%2Cn%3A154606011%2Ck%3A'
                   + query[9:] + '&' + query)
    try:
        self._aConnection.request('GET', search_path, headers=self.HEADERS)
        response = self._aConnection.getresponse().read()
    except Exception:  # fix: bare `except:` would also swallow KeyboardInterrupt/SystemExit
        # Retry once on a fresh connection
        try:
            self._aConnection.close()
            if self._proxy:
                self._aConnection = HTTPConnection(self._http_address, self._http_port)
                self._aConnection.set_tunnel('www.amazon.com', 80)
            else:
                self._aConnection = HTTPConnection('www.amazon.com')
            self._aConnection.request('GET', search_path, headers=self.HEADERS)
            response = self._aConnection.getresponse().read()
        except Exception:
            return None
    # check to make sure there are results
    if ('did not match any products' in response
            and 'Did you mean:' not in response
            and 'so we searched in All Departments' not in response):
        return None
    soup = BeautifulSoup(response)
    results = soup.findAll('div', {'id': 'resultsCol'})
    if not results:  # fix: `len(results) == 0` check was redundant
        return None
    for r in results:
        markup = str(r)
        # The ASIN is only reliable on entries that offer 1-Click purchase
        if 'Buy now with 1-Click' in markup:
            asinSearch = self.AMAZON_ASIN_PAT.search(markup)
            if asinSearch:
                asin = asinSearch.group(1)
                mi = self._db.get_metadata(self._book_id)
                identifiers = mi.get_identifiers()
                identifiers['mobi-asin'] = asin
                mi.set_identifiers(identifiers)
                self._db.set_metadata(self._book_id, mi)
                return asin
    return None
示例9: merge_annotations_with_comments
# 需要导入模块: from calibre.ebooks.BeautifulSoup import BeautifulSoup [as 别名]
# 或者: from calibre.ebooks.BeautifulSoup.BeautifulSoup import findAll [as 别名]
def merge_annotations_with_comments(parent, cid, comments_soup, new_soup):
    '''
    Merge annotations in new_soup into comments_soup, separated by a
    freshly built COMMENTS_DIVIDER.

    comments_soup: comments potentially with user_annotations
    '''
    # Prepare a new COMMENTS_DIVIDER
    divider_html = '<div class="comments_divider"><p style="text-align:center;margin:1em 0 1em 0">{0}</p></div>'.format(
        plugin_prefs.get('COMMENTS_DIVIDER', '· · • · ✦ · • · ·'))

    # Drop any stale divider from the existing comments
    stale_divider = comments_soup.find('div', 'comments_divider')
    if stale_divider:
        stale_divider.extract()

    existing_uas = comments_soup.find('div', 'user_annotations')
    if not existing_uas:
        # No existing annotations: just concatenate comments, divider and
        # the already-sorted new annotations
        return (unicode(comments_soup) +
                unicode(divider_html) +
                unicode(new_soup))

    # Copy the existing annotations into their own soup, stripped of <hr>s
    old_soup = BeautifulSoup(unicode(existing_uas))
    for hr in old_soup.findAll('hr'):
        hr.extract()
    # Remove them from the comments, then merge old with new
    existing_uas.extract()
    merged = merge_annotations(parent, cid, old_soup, new_soup)
    return (unicode(comments_soup) +
            unicode(divider_html) +
            unicode(merged))
示例10: read_html_toc
# 需要导入模块: from calibre.ebooks.BeautifulSoup import BeautifulSoup [as 别名]
# 或者: from calibre.ebooks.BeautifulSoup.BeautifulSoup import findAll [as 别名]
def read_html_toc(self, toc):
    '''
    Populate this TOC from the <a> entries of an HTML table-of-contents file.

    toc: path to the HTML TOC file; its directory becomes self.base_path.
    Each anchor's href is split into path and fragment, and an item is added
    unless an identical (href, fragment) pair is already in the flat TOC.
    '''
    self.base_path = os.path.dirname(toc)
    # fix: use a context manager so the file handle is closed after reading
    with open(toc, "rb") as f:
        raw = f.read()
    soup = BeautifulSoup(raw, convertEntities=BeautifulSoup.HTML_ENTITIES)
    for a in soup.findAll("a"):
        # NOTE: BS3 Tag.has_key checks attributes; `"href" in a` would
        # search the tag's *contents*, so has_key is kept deliberately
        if not a.has_key("href"):
            continue
        purl = urlparse(unquote(a["href"]))
        href, fragment = purl[2], purl[5]
        # Normalize: empty fragment becomes None to match stored items
        fragment = fragment.strip() if fragment else None
        href = href.strip()
        txt = "".join([unicode(s).strip() for s in a.findAll(text=True)])
        # Skip entries already present in the flattened TOC
        for i in self.flat():
            if i.href == href and i.fragment == fragment:
                break
        else:
            self.add_item(href, fragment, txt)
示例11: update_results
# 需要导入模块: from calibre.ebooks.BeautifulSoup import BeautifulSoup [as 别名]
# 或者: from calibre.ebooks.BeautifulSoup.BeautifulSoup import findAll [as 别名]
def update_results(self, trigger):
    '''
    Re-run the Find Annotations search against the current dialog criteria
    and refresh the results label.

    Reads reader, color, highlight text, note text and date range from the
    dialog widgets, scans every annotated book's stored annotations, fills
    self.matched_ids with the ids of matching books, and shows a summary
    (first matching title + count) in self.result_label.

    trigger: the widget/event that caused the update (only used for logging,
    currently commented out).
    '''
    # self._log_location(trigger)
    reader_to_match = str(self.find_annotations_reader_comboBox.currentText())
    color_to_match = str(self.find_annotations_color_comboBox.currentText())
    text_to_match = str(self.find_annotations_text_lineEdit.text())
    note_to_match = str(self.find_annotations_note_lineEdit.text())
    from_date = self.find_annotations_date_from_dateEdit.dateTime().toTime_t()
    to_date = self.find_annotations_date_to_dateEdit.dateTime().toTime_t()

    annotation_map = self.annotated_books_scanner.annotation_map
    # field = self.prefs.get("cfg_annotations_destination_field", None)
    field = get_cc_mapping('annotations', 'field', None)
    db = self.opts.gui.current_db

    matched_titles = []
    self.matched_ids = set()

    for cid in annotation_map:
        mi = db.get_metadata(cid, index_is_id=True)
        # Annotations live either in the Comments field or a custom column
        soup = None
        if field == 'Comments':
            if mi.comments:
                soup = BeautifulSoup(mi.comments)
        else:
            if mi.get_user_metadata(field, False)['#value#'] is not None:
                soup = BeautifulSoup(mi.get_user_metadata(field, False)['#value#'])

        if soup:
            uas = soup.findAll('div', 'annotation')
            for ua in uas:
                # Are we already logged?
                if cid in self.matched_ids:
                    continue

                # Check reader
                if reader_to_match != self.GENERIC_READER:
                    this_reader = ua['reader']
                    if this_reader != reader_to_match:
                        continue

                # Check color
                if color_to_match != self.GENERIC_STYLE:
                    this_color = ua.find('table')['color']
                    if this_color != color_to_match:
                        continue

                # Check date range, allow for mangled timestamp
                try:
                    timestamp = float(ua.find('td', 'timestamp')['uts'])
                    if timestamp < from_date or timestamp > to_date:
                        continue
                except:
                    continue

                # Collect the highlight text; missing/odd markup is ignored
                highlight_text = ''
                try:
                    pels = ua.findAll('p', 'highlight')
                    for pel in pels:
                        highlight_text += pel.string + '\n'
                except:
                    pass
                if text_to_match > '':
                    # Case-insensitive regex match against the highlight text
                    if not re.search(text_to_match, highlight_text, flags=re.IGNORECASE):
                        continue

                # Collect the note text; missing/odd markup is ignored
                note_text = ''
                try:
                    nels = ua.findAll('p', 'note')
                    for nel in nels:
                        note_text += nel.string + '\n'
                except:
                    pass
                if note_to_match > '':
                    if not re.search(note_to_match, note_text, flags=re.IGNORECASE):
                        continue

                # If we made it this far, add the id to matched_ids
                self.matched_ids.add(cid)
                matched_titles.append(mi.title)

    # Update the results box
    matched_titles.sort()
    if len(annotation_map):
        if len(matched_titles):
            first_match = ("<i>%s</i>" % matched_titles[0])
            if len(matched_titles) == 1:
                results = first_match
            else:
                results = first_match + (" and %d more." % (len(matched_titles) - 1))
            self.result_label.setText('<p style="color:blue">{0}</p>'.format(results))
        else:
            self.result_label.setText('<p style="color:red">no matches</p>')
    else:
        self.result_label.setText('<p style="color:red">no annotated books in library</p>')
    self.resize_dialog()
示例12: comments_to_html
# 需要导入模块: from calibre.ebooks.BeautifulSoup import BeautifulSoup [as 别名]
# 或者: from calibre.ebooks.BeautifulSoup.BeautifulSoup import findAll [as 别名]
#.........这里部分代码省略.........
'<p>pre-formatted text</p> returns untouched
'A line of text\n\nFollowed by a line of text' returns as
<p>A line of text</p>
<p>Followed by a line of text</p>
'A line of text.\nA second line of text.\rA third line of text' returns as
<p>A line of text.<br />A second line of text.<br />A third line of text.</p>
'...end of a paragraph.Somehow the break was lost...' returns as
<p>...end of a paragraph.</p>
<p>Somehow the break was lost...</p>
Deprecated HTML returns as HTML via BeautifulSoup()
'''
if not comments:
return u'<p></p>'
if not isinstance(comments, unicode):
comments = comments.decode(preferred_encoding, 'replace')
if comments.lstrip().startswith('<'):
# Comment is already HTML do not mess with it
return comments
if '<' not in comments:
comments = prepare_string_for_xml(comments)
parts = [u'<p class="description">%s</p>'%x.replace(u'\n', u'<br />')
for x in comments.split('\n\n')]
return '\n'.join(parts)
if sanitize_pat.search(comments) is not None:
try:
return sanitize_comments_html(comments)
except:
import traceback
traceback.print_exc()
return u'<p></p>'
# Explode lost CRs to \n\n
comments = lost_cr_exception_pat.sub(lambda m: m.group().replace('.',
'.\r'), comments)
for lost_cr in lost_cr_pat.finditer(comments):
comments = comments.replace(lost_cr.group(),
'%s%s\n\n%s' % (lost_cr.group(1),
lost_cr.group(2),
lost_cr.group(3)))
comments = comments.replace(u'\r', u'')
# Convert \n\n to <p>s
comments = comments.replace(u'\n\n', u'<p>')
# Convert solo returns to <br />
comments = comments.replace(u'\n', '<br />')
# Convert two hyphens to emdash
comments = comments.replace('--', '—')
soup = BeautifulSoup(comments)
result = BeautifulSoup()
rtc = 0
open_pTag = False
all_tokens = list(soup.contents)
for token in all_tokens:
if type(token) is NavigableString:
if not open_pTag:
pTag = Tag(result,'p')
open_pTag = True
ptc = 0
pTag.insert(ptc,prepare_string_for_xml(token))
ptc += 1
elif type(token) in (CData, Comment, Declaration,
ProcessingInstruction):
continue
elif token.name in ['br', 'b', 'i', 'em', 'strong', 'span', 'font', 'a',
'hr']:
if not open_pTag:
pTag = Tag(result,'p')
open_pTag = True
ptc = 0
pTag.insert(ptc, token)
ptc += 1
else:
if open_pTag:
result.insert(rtc, pTag)
rtc += 1
open_pTag = False
ptc = 0
result.insert(rtc, token)
rtc += 1
if open_pTag:
result.insert(rtc, pTag)
for p in result.findAll('p'):
p['class'] = 'description'
for t in result.findAll(text=True):
t.replaceWith(prepare_string_for_xml(unicode(t)))
return result.renderContents(encoding=None)
示例13: comments_to_html
# 需要导入模块: from calibre.ebooks.BeautifulSoup import BeautifulSoup [as 别名]
# 或者: from calibre.ebooks.BeautifulSoup.BeautifulSoup import findAll [as 别名]
def comments_to_html(comments):
"""
Convert random comment text to normalized, xml-legal block of <p>s
'plain text' returns as
<p>plain text</p>
'plain text with <i>minimal</i> <b>markup</b>' returns as
<p>plain text with <i>minimal</i> <b>markup</b></p>
'<p>pre-formatted text</p> returns untouched
'A line of text\n\nFollowed by a line of text' returns as
<p>A line of text</p>
<p>Followed by a line of text</p>
'A line of text.\nA second line of text.\rA third line of text' returns as
<p>A line of text.<br />A second line of text.<br />A third line of text.</p>
'...end of a paragraph.Somehow the break was lost...' returns as
<p>...end of a paragraph.</p>
<p>Somehow the break was lost...</p>
Deprecated HTML returns as HTML via BeautifulSoup()
"""
if not comments:
return u"<p></p>"
if not isinstance(comments, unicode):
comments = comments.decode(preferred_encoding, "replace")
if comments.lstrip().startswith("<"):
# Comment is already HTML do not mess with it
return comments
if "<" not in comments:
comments = prepare_string_for_xml(comments)
parts = [u'<p class="description">%s</p>' % x.replace(u"\n", u"<br />") for x in comments.split("\n\n")]
return "\n".join(parts)
if sanitize_pat.search(comments) is not None:
try:
return sanitize_comments_html(comments)
except:
import traceback
traceback.print_exc()
return u"<p></p>"
# Explode lost CRs to \n\n
comments = lost_cr_exception_pat.sub(lambda m: m.group().replace(".", ".\r"), comments)
for lost_cr in lost_cr_pat.finditer(comments):
comments = comments.replace(
lost_cr.group(), "%s%s\n\n%s" % (lost_cr.group(1), lost_cr.group(2), lost_cr.group(3))
)
comments = comments.replace(u"\r", u"")
# Convert \n\n to <p>s
comments = comments.replace(u"\n\n", u"<p>")
# Convert solo returns to <br />
comments = comments.replace(u"\n", "<br />")
# Convert two hyphens to emdash
comments = comments.replace("--", "—")
soup = BeautifulSoup(comments)
result = BeautifulSoup()
rtc = 0
open_pTag = False
all_tokens = list(soup.contents)
for token in all_tokens:
if type(token) is NavigableString:
if not open_pTag:
pTag = Tag(result, "p")
open_pTag = True
ptc = 0
pTag.insert(ptc, prepare_string_for_xml(token))
ptc += 1
elif type(token) in (CData, Comment, Declaration, ProcessingInstruction):
continue
elif token.name in ["br", "b", "i", "em", "strong", "span", "font", "a", "hr"]:
if not open_pTag:
pTag = Tag(result, "p")
open_pTag = True
ptc = 0
pTag.insert(ptc, token)
ptc += 1
else:
if open_pTag:
result.insert(rtc, pTag)
rtc += 1
open_pTag = False
ptc = 0
result.insert(rtc, token)
rtc += 1
if open_pTag:
result.insert(rtc, pTag)
for p in result.findAll("p"):
p["class"] = "description"
#.........这里部分代码省略.........