This article collects typical usage examples of the Python method lib.BeautifulSoup.BeautifulSoup.prettify. If you have been wondering what BeautifulSoup.prettify does and how to use it, the curated code examples below may help. You can also explore further usage of the class it belongs to, lib.BeautifulSoup.BeautifulSoup.
The following shows 7 code examples of BeautifulSoup.prettify, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
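Before the examples, here is a minimal sketch of the method itself (assuming the vendored BeautifulSoup 3 import path used throughout this page): prettify() re-serializes a parsed document with one tag per line and indentation, and in BeautifulSoup 3 it returns a UTF-8 encoded byte string.

from lib.BeautifulSoup import BeautifulSoup

soup = BeautifulSoup("<html><body><p>Hello</p></body></html>")
print soup.prettify()   # indented markup, one tag per line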
Example 1: seturl
# Required import: from lib.BeautifulSoup import BeautifulSoup [as alias]
# Or: from lib.BeautifulSoup.BeautifulSoup import prettify [as alias]
def seturl(self):
    '''Fetch the configured URL and store a prettified copy of the page.'''
    user = common.currentuser()
    if not user:
        common.error(self, 404, "User not found.")
        return
    ct = models.CustomTest.all().ancestor(user).get()
    if not ct:
        ct = models.CustomTest(parent=user)
    ct.setbypost(self.request.POST)
    if not ct.rss_link:
        # No feed URL configured: fall back to the built-in test HTML.
        soup = Soup(defines.defaulttesthtml)
    else:
        result = urlfetch.fetch(ct.rss_link)
        if result.status_code != 200:
            common.error(self, 200, "Url Fetch Error")
            return
        soup = Soup(result.content)
    try:
        # prettify() returns a UTF-8 byte string in BeautifulSoup 3; store it as unicode.
        ct.data = soup.prettify().decode('UTF-8')
    except ValueError, message:
        common.error(self, 200, message)
        return
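A note on the .decode('UTF-8') above: BeautifulSoup 3's prettify() encodes its output as UTF-8 by default, so decoding yields the unicode string the datastore property expects. A minimal sketch (the markup is made up):

from lib.BeautifulSoup import BeautifulSoup as Soup

html = Soup("<p>caf\xc3\xa9</p>").prettify()   # UTF-8 encoded bytes
text = html.decode('UTF-8')                    # unicode, as stored in ct.data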
Example 2: get
# Required import: from lib.BeautifulSoup import BeautifulSoup [as alias]
# Or: from lib.BeautifulSoup.BeautifulSoup import prettify [as alias]
def get(self):
    self.response.headers['Content-Type'] = 'text/plain'
    day = date.today() - relativedelta(days=1)
    response = fetch_url("http://podcastrss.play.it/the-sports-junkies_mp3_128.xml")
    if response and response.status_code == 200:
        feed_soup = BeautifulSoup(response.content)
        # Detach every <copyright> element from the tree before snapshotting.
        for copyright_el in feed_soup.findAll("copyright"):
            copyright_el.extract()
        self.response.out.write("%s\n\n\n" % feed_soup.prettify())
        DailyFeedSnapshot.create(day, feed_soup.prettify())
        msg = "Created a DailyFeedSnapshot for %s." % (day)
        self.response.out.write(msg)
        logging.info(msg)
    else:
        msg = "Could not create a DailyFeedSnapshot for %s." % (day)
        self.response.out.write(msg)
        logging.error(msg)
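The snapshot above relies on Tag.extract(), which detaches an element from the parse tree so that a later prettify() no longer emits it. A small sketch of the same idea (the feed snippet is made up):

from lib.BeautifulSoup import BeautifulSoup

soup = BeautifulSoup("<rss><copyright>(c) X</copyright><title>Y</title></rss>")
for el in soup.findAll("copyright"):
    el.extract()            # detach the tag from the tree
print soup.prettify()       # no <copyright> element remains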
Example 3: create_book
# Required import: from lib.BeautifulSoup import BeautifulSoup [as alias]
# Or: from lib.BeautifulSoup.BeautifulSoup import prettify [as alias]
def create_book(self):
    if self.fail:
        return False
    paragraphs = []
    temp_file = self.file.decode("utf-8")
    # temp_file = self.__erase_xml_illegal_chars(temp_file)
    if not hasattr(self, "temp_dir"):
        self.temp_dir = tempfile.mkdtemp()
    if not self.names:
        # No chapter names: emit the whole document as a single chapter.
        file = open(self.temp_dir + "/0.html", 'w')
        file.write(self.file)
        file.close()
        os.system(EPUBER_DIR + '/remove_illegal.py <' + self.temp_dir + "/0.html >" + self.temp_dir + "/tmp")
        shutil.move(self.temp_dir + "/tmp", self.temp_dir + "/0.html")
        self.book.add_file(self.temp_dir + "/0.html", 'c0', "")
    else:
        # Split the document at each chapter name and prettify every fragment.
        for i, name in enumerate(self.names):
            split_index = temp_file.find(name)
            if i == 0:
                paragraph = ""
            else:
                paragraph = self.HTML_HEADER
            paragraph += temp_file[:split_index]
            soup = BeautifulSoup(paragraph)
            paragraph = soup.prettify()
            paragraphs.append(paragraph)
            temp_file = temp_file[split_index:]
        #soup = BeautifulSoup(temp_file)
        #temp_file = soup.prettify()
        paragraphs.append(BeautifulSoup(self.HTML_HEADER + temp_file).prettify())
        for i, paragraph in enumerate(paragraphs):
            file = open(self.temp_dir + "/%d.html" % i, 'w')
            file.write(paragraph)
            file.close()
            os.system(EPUBER_DIR + '/remove_illegal.py <' + self.temp_dir + "/%d.html >" % i + self.temp_dir + "/tmp")
            shutil.move(self.temp_dir + "/tmp", self.temp_dir + "/%d.html" % i)
            self.book.add_file(self.temp_dir + "/%d.html" % i, 'c%d' % i, self.titles[i])
    for i, image in enumerate(self.images):
        self.book.add_file(image, self.temp_dir + '/im%d' % i, title="", in_spine=False)
    self.book.pack()
    return True
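The per-chapter BeautifulSoup(...).prettify() round-trip above doubles as a cleanup pass: splitting on raw string offsets can leave tags open, and the parser closes them on re-serialization. A sketch with a deliberately unbalanced fragment:

from lib.BeautifulSoup import BeautifulSoup

fragment = "<h1>Chapter 1</h1><p>text with no closing tag"
print BeautifulSoup(fragment).prettify()   # the missing </p> is emitted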
Example 4: parse_matchup_info
# Required import: from lib.BeautifulSoup import BeautifulSoup [as alias]
# Or: from lib.BeautifulSoup.BeautifulSoup import prettify [as alias]
def parse_matchup_info(league, team_id):
    logging.info("team_id: %d" % team_id)
    generic_matchup_url = build_url(league_id=league.id, page='matchup', params={'mid1': team_id}, access_code=league.access_code)
    try:
        matchup_soup = BeautifulSoup(urlfetch.fetch(generic_matchup_url).content).find('div', attrs={'class': 'scoreboard'}).find('li')
    except:
        # Any fetch or parse failure leaves matchup_soup empty.
        matchup_soup = None
    if matchup_soup:
        # Log only on success; calling prettify() on None would raise.
        logging.info("\n\n\n%s\n\n\n" % matchup_soup.prettify())
        team_names = [str(row.find('a').contents[0]).strip() for row in matchup_soup.findAll('tr')]
        score = [float(pts.contents[0]) for pts in matchup_soup.findAll('td', attrs={'class': 'pts'})]
    else:
        team_names = None
        score = None
    if team_names and score:
        return {'score': score, 'team_names': team_names}
    else:
        return None
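For reference, the attrs= filters used above match on tag attributes, and a cell's text is reachable through .contents. A minimal sketch against a made-up scoreboard snippet:

from lib.BeautifulSoup import BeautifulSoup

html = '<div class="scoreboard"><li><table><tr><td class="pts">98.5</td></tr></table></li></div>'
cell = BeautifulSoup(html).find('td', attrs={'class': 'pts'})
print float(cell.contents[0])   # -> 98.5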
Example 5: __init__
# Required import: from lib.BeautifulSoup import BeautifulSoup [as alias]
# Or: from lib.BeautifulSoup.BeautifulSoup import prettify [as alias]
class Item:
    """
    A wrapper around a GoogleReader item
    """
    def __init__(self, feed_item = None, tag_name = '(unknown)', raw_data = None):
        self.had_errors = False
        if feed_item is not None:
            try: self.feed_name = feed_item['feed_name']
            except (KeyError, TypeError):
                self.feed_name = tag_name
            self.tag_name = tag_name
            self.title = strip_html_tags(feed_item['title'])
            # Decode HTML entities in the title (e.g. &amp; -> &).
            self.title = unicode(BeautifulSoup(self.title, convertEntities = BeautifulSoup.HTML_ENTITIES))
            self.google_id = feed_item['google_id']
            self.date = time.strftime('%Y%m%d%H%M%S', time.localtime(float(feed_item['updated'])))
            self.is_read = 'read' in feed_item['categories']
            self.is_starred = 'starred' in feed_item['categories']
            self.is_shared = 'broadcast' in feed_item['categories']
            self.url = feed_item['link']
            self.content = feed_item['content']
            self.original_id = feed_item['original_id']
            self.media = try_lookup(feed_item, 'media')
            self.is_dirty = False
            self.is_stale = False
        else:
            # just copy the dict's keys to my instance vars
            for key, value in raw_data.items():
                setattr(self, key, value)
        # calculated attributes that aren't stored in the DB
        self.safe_google_id = Item.escape_google_id(self.google_id)
        self.resources_path = "%s/%s/%s" % (app_globals.OPTIONS['output_path'], app_globals.CONFIG['resources_path'], self.safe_google_id)
        self.basename = self.get_basename()

    @staticmethod
    def unescape_google_id(safe_google_id):
        return urllib.unquote(safe_google_id)

    @staticmethod
    def escape_google_id(unsafe_google_id):
        return urllib.quote(unsafe_google_id, safe='')

    def get_basename(self):
        return utf8(
            self.date + ' ' +
            filter(lambda x: x not in '"\':#!+/$\\?*', ascii(self.title))[:120] + ' .||' +
            self.safe_google_id + '||' )

    def soup_setup(self):
        self.soup = BeautifulSoup(self.content)
        try:
            self.base = url_dirname(self.original_id)
        except TypeError:
            self.base = None

    def soup_teardown(self):
        # Serialize the (possibly mutated) tree back into self.content.
        self.content = self.soup.prettify()

    def process(self):
        debug("item %s -> process()" % self.title)
        self.soup_setup()
        # process
        process.insert_alt_text(self.soup)
        self.download_images(need_soup = False)
        # save changes back as content
        self.soup_teardown()

    def download_images(self, need_soup=True):
        self.had_errors = False
        if need_soup:
            self.soup_setup()
        try: media = self.media
        except AttributeError: media = None
        if media is not None:
            success = process.insert_enclosure_images(self.soup, url_list = self.media)
            if not success:
                self.had_errors = True
        success = process.download_images(self.soup,
            dest_folder = self.resources_path,
            href_prefix = app_globals.CONFIG['resources_path'] + '/' + self.safe_google_id + '/',
            base_href = self.base)
        if not success:
            self.had_errors = True
        if need_soup:
            self.soup_teardown()

    def save(self):
        app_globals.DATABASE.add_item(self)

    def delete(self):
        app_globals.DATABASE.remove_item(self)
        for f in glob.glob(app_globals.OPTIONS['output_path'] + '/*.' + self.safe_google_id + '.*'):
#......... the rest of this code is omitted .........
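The soup_setup()/soup_teardown() pair above is a parse-mutate-serialize round-trip. A minimal sketch of the same pattern (the markup is made up):

from lib.BeautifulSoup import BeautifulSoup

content = '<p><img src="a.png"></p>'
soup = BeautifulSoup(content)        # setup: parse the current content
soup.find('img')['alt'] = 'a.png'    # process: mutate the tree in place
content = soup.prettify()            # teardown: serialize changes back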
Example 6: __init__
# Required import: from lib.BeautifulSoup import BeautifulSoup [as alias]
# Or: from lib.BeautifulSoup.BeautifulSoup import prettify [as alias]
class Item:
    """
    A wrapper around a GoogleReader item
    """
    def __init__(self, feed_item = None, tag_name = '(unknown)', raw_data = None):
        self.had_errors = False
        if feed_item is not None:
            try: self.feed_name = feed_item['feed_name']
            except (KeyError, TypeError):
                self.feed_name = tag_name
            self.tag_name = tag_name
            self.title = strip_html_tags(utf8(feed_item['title']))
            # Decode HTML entities in the title (e.g. &amp; -> &).
            self.title = unicode(BeautifulSoup(self.title, convertEntities = BeautifulSoup.HTML_ENTITIES))
            self.google_id = feed_item['google_id']
            self.date = time.strftime('%Y%m%d%H%M%S', time.localtime(float(feed_item['updated'])))
            self.is_read = 'read' in feed_item['categories']
            self.is_starred = 'starred' in feed_item['categories']
            self.is_shared = 'broadcast' in feed_item['categories']
            self.url = utf8(feed_item['link'])
            self.content = utf8(feed_item['content'])
            self.original_id = utf8(feed_item['original_id'])
            self.media = try_lookup(feed_item, 'media')
            self.is_pagefeed = self.any_source_is_pagefeed(map(utf8, feed_item['sources']))
            self.instapaper_url = ""
            self.is_dirty = False
            self.is_stale = False
        else:
            # just copy the dict's keys to my instance vars
            for key, value in raw_data.items():
                setattr(self, key, value)
        # calculated attributes that aren't stored in the DB
        self.safe_google_id = Item.escape_google_id(self.google_id)
        self.resources_path = "%s/%s/%s" % (app_globals.OPTIONS['output_path'], app_globals.CONFIG['resources_path'], self.safe_google_id)
        self.basename = self.get_basename()

    @staticmethod
    def unescape_google_id(safe_google_id):
        return urllib.unquote(safe_google_id)

    @staticmethod
    def escape_google_id(unsafe_google_id):
        return urllib.quote(unsafe_google_id, safe='')

    def get_basename(self):
        """A filesystem-safe key, unique to this item"""
        return utf8(
            self.date + ' ' +
            filter(lambda x: x not in '"\':#!+/$\\?*', ascii(self.title))[:120] + ' .||' +
            self.safe_google_id + '||' )

    def soup_setup(self):
        self.soup = BeautifulSoup(self.content)
        try:
            self.base = url_dirname(self.original_id)
        except TypeError:
            self.base = None

    def soup_teardown(self):
        # Serialize the (possibly mutated) tree back into self.content.
        self.content = self.soup.prettify()

    def process(self):
        debug("item %s -> process()" % self.title)
        self.soup_setup()
        thread_pool.ping()
        # process
        debug("item %s -> insert_alt_text()" % self.title)
        process.insert_alt_text(self.soup)
        thread_pool.ping()
        self.download_images(need_soup = False)
        thread_pool.ping()
        # save changes back as content
        self.soup_teardown()

    def redownload_images(self):
        self.had_errors = False
        self.download_images()
        self.update()

    def download_images(self, need_soup=True):
        self.had_errors = False
        if need_soup:
            self.soup_setup()
        try: media = self.media
        except AttributeError: media = None
        if media is not None:
            success = process.insert_enclosure_images(self.soup, url_list = self.media)
            if not success:
                self.had_errors = True
        debug("item %s -> download_images()" % (self.title,))
        success = process.download_images(self.soup,
            dest_folder = self.resources_path,
#......... the rest of this code is omitted .........
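Both Item variants also use BeautifulSoup as an HTML entity decoder when cleaning titles, via convertEntities. A sketch of that trick in isolation:

from lib.BeautifulSoup import BeautifulSoup

title = "Tom &amp; Jerry"
clean = unicode(BeautifulSoup(title, convertEntities=BeautifulSoup.HTML_ENTITIES))
print clean   # -> Tom & Jerry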
Example 7: parse
# Required import: from lib.BeautifulSoup import BeautifulSoup [as alias]
# Or: from lib.BeautifulSoup.BeautifulSoup import prettify [as alias]
def parse(self, output=""):
    self.temp_dir = tempfile.mkdtemp()
    if output == "":
        output = self.temp_dir + "/tmp.html"
    # Convert the .doc to HTML with wvWare before parsing.
    os.system(
        PATH_TO_WV_WARE
        + "wvWare -x "
        + PATH_TO_WV_HTML
        + "/wvHtml.xml --charset=cp1251 %s > %s" % (self.name, output)
    )
    paragraphs = []
    # temp_file = self.file.decode("utf-8")
    file = open(output, "r")  # read the HTML that wvWare just produced
    # "prettify" here is a local helper module, unrelated to BeautifulSoup.prettify().
    temp_file = prettify.remove_spaces(file.read())
    temp_file = prettify.remove_unnecessary_tags(temp_file)
    soup = BeautifulSoup(temp_file)
    temp_names = soup.findAll(align="center")
    names = []
    titles = []
    for temp_name in temp_names:
        # Skip "names" that are only markup or whitespace.
        if not re.match(r"^(<.*?>|\s+)*$", str(temp_name)):
            names.append(re.sub(r"\s+", " ", str(temp_name)))
            temp = re.sub(r"(<.*>|\s+)", " ", temp_name.prettify())
            titles.append(re.sub(r"\s+", " ", temp))
    temp_file = re.sub(r"\s+", " ", temp_file.decode("cp1251").encode("utf-8"))
    out = open(self.temp_dir + "/tmp", "w")
    out.write(temp_file)
    out.write(" \n\n\n")
    for name in names:
        out.write(name + "\n\n\n")
    out.close()
    if not names:
        print "no chapter names found"
        file = open(self.temp_dir + "/0.html", "w")
        file.write(temp_file)
        file.close()
        self.book.add_file(self.temp_dir + "/0.html", "c0", "")
    for i, name in enumerate(names):
        split_index = temp_file.find(name)
        if i == 0:
            paragraph = ""
        else:
            paragraph = self.HTML_HEADER
        paragraph += temp_file[:split_index]
        soup = BeautifulSoup(paragraph)
        paragraph = soup.prettify()
        paragraphs.append(paragraph)
        temp_file = temp_file[split_index:]
    # soup = BeautifulSoup(temp_file)
    # temp_file = soup.prettify()
    for i, paragraph in enumerate(paragraphs):
        file = open(self.temp_dir + "/%d.html" % i, "w")
        file.write(paragraph)
        file.close()
        self.book.add_file(self.temp_dir + "/%d.html" % i, "c%d" % i, titles[i])
    # for i, image in enumerate(self.images):
    #     self.book.add_file(image, 'im%d' % i, title="", in_spine=False)
    self.book.pack()
    return True
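Finally, note that temp_name.prettify() above is called on an individual Tag rather than on the whole soup; prettify() works at any node of the tree. A minimal sketch:

from lib.BeautifulSoup import BeautifulSoup

soup = BeautifulSoup('<div align="center"><b>Title</b></div>')
tag = soup.find(align="center")   # keyword arguments filter on attributes
print tag.prettify()              # serializes just this subtree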