

Python BeautifulSoup.prettify Method Code Examples

This article collects typical usage examples of the Python method lib.BeautifulSoup.BeautifulSoup.prettify. If you are wondering what BeautifulSoup.prettify does, how to call it, or what it looks like in real code, the curated examples below should help. You can also explore other usage examples of the containing class, lib.BeautifulSoup.BeautifulSoup.


The following presents 7 code examples of BeautifulSoup.prettify, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
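
Before the examples, here is a minimal sketch of what prettify does. It assumes the same vendored BeautifulSoup 3 module on Python 2 that the examples below use (lib.BeautifulSoup); with the standalone BeautifulSoup 3 package, the import would be from BeautifulSoup import BeautifulSoup.

# Minimal sketch (assumption: vendored BeautifulSoup 3 on Python 2,
# imported via the same path the examples below use).
from lib.BeautifulSoup import BeautifulSoup

html = "<html><body><p>Hello, <b>world</b></p></body></html>"
soup = BeautifulSoup(html)

# prettify() re-serializes the parse tree with one tag per line, indented
# to show nesting. In BeautifulSoup 3 it returns a UTF-8 encoded byte
# string, which is why Example 1 below calls .decode('UTF-8') on it.
print soup.prettify()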

Example 1: seturl

# Required import: from lib.BeautifulSoup import BeautifulSoup [as alias]
# Or: from lib.BeautifulSoup.BeautifulSoup import prettify [as alias]
    def seturl(self):
        '''Save the URL and the data fetched from it'''
        user = common.currentuser()
        if not user:
            common.error(self, 404, "User not found.")
            return

        ct = models.CustomTest.all().ancestor(user).get()
        if not ct:
            ct = models.CustomTest(parent=user)

        ct.setbypost(self.request.POST)

        if not ct.rss_link:
            soup = Soup(defines.defaulttesthtml)
        else:
            result = urlfetch.fetch(ct.rss_link)
            if result.status_code != 200:
                common.error(self, 200, "Url Fetch Error")
                return
            soup = Soup(result.content)

        try: 
            ct.data = soup.prettify().decode('UTF-8')
        except ValueError, message:
            common.error(self, 200, message)
            return
Developer: poochin, Project: feedbyselectors, Lines: 29, Source: customtest.py

Example 2: get

# Required import: from lib.BeautifulSoup import BeautifulSoup [as alias]
# Or: from lib.BeautifulSoup.BeautifulSoup import prettify [as alias]
    def get(self):
        self.response.headers['Content-Type'] = 'text/plain'

        day = date.today() - relativedelta(days=1)
        response = fetch_url("http://podcastrss.play.it/the-sports-junkies_mp3_128.xml")
        if response and response.status_code == 200:
            feed_soup = BeautifulSoup(response.content)
            # strip <copyright> elements from the feed before re-serializing
            for copyright_el in feed_soup.findAll("copyright"):
                copyright_el.extract()

            self.response.out.write("%s\n\n\n" % feed_soup.prettify())
            DailyFeedSnapshot.create(day, feed_soup.prettify())
            msg = "Created a DailyFeedSnapshot for %s." % (day)
            self.response.out.write(msg)
            logging.info(msg)
        else:
            msg = "Could not create a DailyFeedSnapshot for %s." % (day)
            self.response.out.write(msg)
            logging.error(msg)
Developer: mshafrir, Project: Junkscast, Lines: 20, Source: cron.py

Example 3: create_book

# Required import: from lib.BeautifulSoup import BeautifulSoup [as alias]
# Or: from lib.BeautifulSoup.BeautifulSoup import prettify [as alias]
    def create_book(self):
        if self.fail:
            return False
        paragraphs = []
        temp_file = self.file.decode("utf-8")

#        temp_file = self.__erase_xml_illegal_chars(temp_file)

        if not "temp_dir" in dir(self):
            self.temp_dir = tempfile.mkdtemp()
        if not self.names:
            file = open(self.temp_dir + "/0.html", 'w')
            file.write(self.file)
            file.close()
            os.system(EPUBER_DIR + '/remove_illegal.py <' + self.temp_dir + "/0.html >" + self.temp_dir + "/tmp")
            shutil.move(self.temp_dir + "/tmp", self.temp_dir + "/0.html")
            self.book.add_file(self.temp_dir + "/0.html", 'c0', "")
        else:
        
            for i, name in enumerate(self.names):
                split_index = temp_file.find(name)
                if i == 0:
                    paragraph = ""
                else:
                    paragraph = self.HTML_HEADER
                
                paragraph += temp_file[:split_index]
                soup = BeautifulSoup(paragraph)
                paragraph = soup.prettify()
                paragraphs.append(paragraph)
                temp_file = temp_file[split_index:]
                #soup = BeautifulSoup(temp_file)
                #temp_file = soup.prettify()
            paragraphs.append(BeautifulSoup(self.HTML_HEADER + temp_file).prettify())
        for i, paragraph in enumerate(paragraphs):
            file = open(self.temp_dir + "/%d.html" % i, 'w')
            file.write(paragraph)
            file.close()
            os.system(EPUBER_DIR + '/remove_illegal.py <' + self.temp_dir + "/%d.html >" % i + self.temp_dir + "/tmp")
            shutil.move(self.temp_dir + "/tmp", self.temp_dir + "/%d.html" % i)
            self.book.add_file(self.temp_dir + "/%d.html" % i, 'c%d' % i, self.titles[i])
        for i, image in enumerate(self.images):
            self.book.add_file(image, self.temp_dir + '/im%d' % i, title="", in_spine=False)
        self.book.pack()
        return True
Developer: ktisha, Project: ebook-service, Lines: 47, Source: LibRuParser.py

Example 4: parse_matchup_info

# Required import: from lib.BeautifulSoup import BeautifulSoup [as alias]
# Or: from lib.BeautifulSoup.BeautifulSoup import prettify [as alias]
def parse_matchup_info(league, team_id):
	logging.info("team_id: %d" % team_id)
	
	generic_matchup_url = build_url(league_id=league.id, page='matchup', params={'mid1': team_id}, access_code=league.access_code)
	try:
		matchup_soup = BeautifulSoup(urlfetch.fetch(generic_matchup_url).content).find('div', attrs={'class': 'scoreboard'}).find('li')
	except:
		matchup_soup = None

	if matchup_soup:
		# guard the debug dump: prettify() on None would raise AttributeError
		logging.info("\n\n\n%s\n\n\n" % matchup_soup.prettify())
		team_names = [str(row.find('a').contents[0]).strip() for row in matchup_soup.findAll('tr')]
		score = [float(pts.contents[0]) for pts in matchup_soup.findAll('td', attrs={'class': 'pts'})]
	else:
		team_names = None
		score = None
	
	if team_names and score:
		return {'score': score, 'team_names': team_names}
	else:
		return None
Developer: mshafrir, Project: Rotoist, Lines: 24, Source: yahoo.py

Example 5: __init__

# Required import: from lib.BeautifulSoup import BeautifulSoup [as alias]
# Or: from lib.BeautifulSoup.BeautifulSoup import prettify [as alias]
class Item:
	"""
	A wrapper around a GoogleReader item
	"""
	def __init__(self, feed_item = None, tag_name = '(unknown)', raw_data = None):
		self.had_errors = False
		if feed_item is not None:
			try: self.feed_name = feed_item['feed_name']
			except (KeyError, TypeError):
				self.feed_name = tag_name
			self.tag_name = tag_name
			self.title = strip_html_tags(feed_item['title'])
			self.title = unicode(BeautifulSoup(self.title, convertEntities = BeautifulSoup.HTML_ENTITIES))
			self.google_id = feed_item['google_id']
			self.date = time.strftime('%Y%m%d%H%M%S', time.localtime(float(feed_item['updated'])))
			self.is_read = 'read' in feed_item['categories']
			self.is_starred = 'starred' in feed_item['categories']
			self.is_shared = 'broadcast' in feed_item['categories']
			self.url = feed_item['link']
			self.content = feed_item['content']
			self.original_id = feed_item['original_id']
			self.media = try_lookup(feed_item, 'media')
			self.is_dirty = False
			self.is_stale = False
		else:
			# just copy the dict's keys to my instance vars
			for key,value in raw_data.items():
				setattr(self, key, value)
		
		# calculated attributes that aren't stored in the DB
		self.safe_google_id = Item.escape_google_id(self.google_id)
		self.resources_path = "%s/%s/%s" % (app_globals.OPTIONS['output_path'], app_globals.CONFIG['resources_path'], self.safe_google_id)
		self.basename = self.get_basename()
	
	@staticmethod
	def unescape_google_id(safe_google_id):
		return urllib.unquote(safe_google_id)

	@staticmethod
	def escape_google_id(unsafe_google_id):
		return urllib.quote(unsafe_google_id, safe='')

	def get_basename(self):
		return utf8(
			self.date + ' ' +
			filter(lambda x: x not in '"\':#!+/$\\?*', ascii(self.title))[:120] + ' .||' +
			self.safe_google_id + '||' )

	def soup_setup(self):
		self.soup = BeautifulSoup(self.content)
		try:
			self.base = url_dirname(self.original_id)
		except TypeError:
			self.base = None
	
	def soup_teardown(self):
		# serialize the (possibly modified) tree back into self.content
		self.content = self.soup.prettify()
		
	def process(self):
		debug("item %s -> process()" % self.title)
		self.soup_setup()

		# process
		process.insert_alt_text(self.soup)
		self.download_images(need_soup = False)
		
		# save changes back as content
		self.soup_teardown()
	
	def download_images(self, need_soup=True):
		self.had_errors = False

		if need_soup:
			self.soup_setup()
		
		try: media = self.media
		except AttributeError: media = None

		if media is not None:
			success = process.insert_enclosure_images(self.soup, url_list = self.media)
			if not success:
				self.had_errors = True
		
		success = process.download_images(self.soup,
			dest_folder = self.resources_path,
			href_prefix = app_globals.CONFIG['resources_path'] + '/' + self.safe_google_id + '/',
			base_href = self.base)
		if not success:
			self.had_errors = True

		if need_soup:
			self.soup_teardown()
	
	def save(self):
		app_globals.DATABASE.add_item(self)

	def delete(self):
		app_globals.DATABASE.remove_item(self)
		for f in glob.glob(app_globals.OPTIONS['output_path'] + '/*.' + self.safe_google_id + '.*'):
#.........some code omitted here.........
Developer: nrolland, Project: google-reader-iphone-sync, Lines: 103, Source: item.py

Example 6: __init__

# Required import: from lib.BeautifulSoup import BeautifulSoup [as alias]
# Or: from lib.BeautifulSoup.BeautifulSoup import prettify [as alias]
class Item:
	"""
	A wrapper around a GoogleReader item
	"""
	def __init__(self, feed_item = None, tag_name = '(unknown)', raw_data = None):
		self.had_errors = False
		if feed_item is not None:
			try: self.feed_name = feed_item['feed_name']
			except (KeyError, TypeError):
				self.feed_name = tag_name
			self.tag_name = tag_name
			self.title = strip_html_tags(utf8(feed_item['title']))
			self.title = unicode(BeautifulSoup(self.title, convertEntities = BeautifulSoup.HTML_ENTITIES))
			self.google_id = feed_item['google_id']
			self.date = time.strftime('%Y%m%d%H%M%S', time.localtime(float(feed_item['updated'])))
			self.is_read = 'read' in feed_item['categories']
			self.is_starred = 'starred' in feed_item['categories']
			self.is_shared = 'broadcast' in feed_item['categories']
			self.url = utf8(feed_item['link'])
			self.content = utf8(feed_item['content'])
			self.original_id = utf8(feed_item['original_id'])
			self.media = try_lookup(feed_item, 'media')
			self.is_pagefeed = self.any_source_is_pagefeed(map(utf8, feed_item['sources']))
			self.instapaper_url = ""
			self.is_dirty = False
			self.is_stale = False
		else:
			# just copy the dict's keys to my instance vars
			for key,value in raw_data.items():
				setattr(self, key, value)
		
		# calculated attributes that aren't stored in the DB
		self.safe_google_id = Item.escape_google_id(self.google_id)
		self.resources_path = "%s/%s/%s" % (app_globals.OPTIONS['output_path'], app_globals.CONFIG['resources_path'], self.safe_google_id)
		self.basename = self.get_basename()
	
	@staticmethod
	def unescape_google_id(safe_google_id):
		return urllib.unquote(safe_google_id)

	@staticmethod
	def escape_google_id(unsafe_google_id):
		return urllib.quote(unsafe_google_id, safe='')

	def get_basename(self):
		"""A filesystem-safe key, unique to this item"""
		return utf8(
			self.date + ' ' +
			filter(lambda x: x not in '"\':#!+/$\\?*', ascii(self.title))[:120] + ' .||' +
			self.safe_google_id + '||' )

	def soup_setup(self):
		self.soup = BeautifulSoup(self.content)
		try:
			self.base = url_dirname(self.original_id)
		except TypeError:
			self.base = None
	
	def soup_teardown(self):
		# serialize the (possibly modified) tree back into self.content
		self.content = self.soup.prettify()
		
	def process(self):
		debug("item %s -> process()" % self.title)
		self.soup_setup()
		thread_pool.ping()
		
		# process
		debug("item %s -> insert_alt_text()" % self.title)
		process.insert_alt_text(self.soup)
		thread_pool.ping()
		
		self.download_images(need_soup = False)
		thread_pool.ping()
		
		# save changes back as content
		self.soup_teardown()
	
	def redownload_images(self):
		self.had_errors = False
		self.download_images()
		self.update()
	
	def download_images(self, need_soup=True):
		self.had_errors = False

		if need_soup:
			self.soup_setup()
		
		try: media = self.media
		except AttributeError: media = None

		if media is not None:
			success = process.insert_enclosure_images(self.soup, url_list = self.media)
			if not success:
				self.had_errors = True
		
		debug("item %s -> download_images()" % (self.title,))
		success = process.download_images(self.soup,
			dest_folder = self.resources_path,
#.........some code omitted here.........
Developer: grvgr, Project: google-reader-iphone-sync, Lines: 103, Source: item.py

Example 7: parse

# Required import: from lib.BeautifulSoup import BeautifulSoup [as alias]
# Or: from lib.BeautifulSoup.BeautifulSoup import prettify [as alias]
    def parse(self, output=""):
        self.temp_dir = tempfile.mkdtemp()
        if output == "":
            output = self.temp_dir + "/tmp.html"

        os.system(
            PATH_TO_WV_WARE
            + "wvWare -x"
            + PATH_TO_WV_HTML
            + "/wvHtml.xml --charset=cp1251 %s > %s" % (self.name, output)
        )

        paragraphs = []
        # temp_file = self.file.decode("utf-8")
        file = open(output, "r")  # read the file wvWare actually wrote (not necessarily tmp.html)
        temp_file = prettify.remove_spaces(file.read())
        temp_file = prettify.remove_unnecessary_tags(temp_file)
        soup = BeautifulSoup(temp_file)
        temp_names = soup.findAll(align="center")
        names = []
        titles = []
        for temp_name in temp_names:
            if not re.match(r"^(<.*?>|\s+)*$", str(temp_name)):
                names.append(re.sub(r"\s+", " ", str(temp_name)))
                temp = re.sub(r"(<.*>|\s+)", " ", temp_name.prettify())
                titles.append(re.sub(r"\s+", " ", temp))

        temp_file = re.sub(r"\s+", " ", temp_file.decode("cp1251").encode("utf-8"))
        out = open(self.temp_dir + "/tmp", "w")
        out.write(temp_file)
        out.write("   \n\n\n")
        for name in names:
            out.write(name + "\n\n\n")
        out.close()

        if not names:
            print "not names"
            file = open(self.temp_dir + "/0.html", "w")
            file.write(temp_file)
            file.close()
            self.book.add_file(self.temp_dir + "/0.html", "c0", "")
        for i, name in enumerate(names):
            split_index = temp_file.find(name)
            if i == 0:
                paragraph = ""
            else:
                paragraph = self.HTML_HEADER

            paragraph += temp_file[:split_index]
            soup = BeautifulSoup(paragraph)
            paragraph = soup.prettify()
            paragraphs.append(paragraph)
            temp_file = temp_file[split_index:]
            # soup = BeautifulSoup(temp_file)
            # temp_file = soup.prettify()
        if names:
            # keep the trailing chunk after the last heading, as create_book
            # does in Example 3; otherwise the final chapter is dropped and
            # titles[i] pairs each chapter with the wrong heading
            paragraphs.append(BeautifulSoup(self.HTML_HEADER + temp_file).prettify())
            titles.insert(0, "")  # the chunk before the first heading has no title
        for i, paragraph in enumerate(paragraphs):
            file = open(self.temp_dir + "/%d.html" % i, "w")
            file.write(paragraph)
            file.close()
            self.book.add_file(self.temp_dir + "/%d.html" % i, "c%d" % i, titles[i])
        # for i, image in enumerate(self.images):
        #    self.book.add_file(image, 'im%d' % i, title="", in_spine=False)

        self.book.pack()
        return True
Developer: ktisha, Project: ebook-service, Lines: 67, Source: doc.py


Note: The lib.BeautifulSoup.BeautifulSoup.prettify method examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs; the snippets were selected from open-source projects contributed by many developers. Copyright in the source code remains with the original authors; consult each project's license before distributing or reusing it. Do not republish without permission.