当前位置: 首页>>代码示例>>Python>>正文


Python html.parser.HTMLParser类代码示例

本文整理汇总了Python中html.parser.HTMLParser的典型用法代码示例。如果您正苦于以下问题:Python HTMLParser类的具体用法?Python HTMLParser怎么用?Python HTMLParser使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。


在下文中一共展示了HTMLParser类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: __init__

	def __init__(self):
		"""Create the parser's own fields, then initialise the HTMLParser base."""
		# URL list, running index and next URL all start out empty/zero.
		self.urlList, self.index, self.nextUrl = [], 0, ''
		# Tag names and class names kept for later use
		# (presumably matched by the tag handlers -- confirm against them).
		self.tagList = ['li', 'a']
		self.classList = ['photo-list-padding', 'pic']
		# The base-class initialiser deliberately runs last.
		HTMLParser.__init__(self)
开发者ID:Hearen,项目名称:Scrawlers,代码行数:7,代码来源:imagesMutliThreadCrawler.py

示例2: __init__

 def __init__(self):
     """Initialise the HTMLParser base, then reset this parser's own fields."""
     HTMLParser.__init__(self)
     # A tag name plus one (attribute, value) pair
     # (presumably the element the handlers look for -- confirm).
     self.des_tag, self.des_attr = "div", ("id", "content")
     self.bound = 20
     # Collection is switched off and no station info is stored yet.
     self.collect_data = False
     self.stations_info = None
开发者ID:ThomasTheBuilder,项目名称:flood-watch,代码行数:7,代码来源:MultiStationTools.py

示例3: __init__

 def __init__(self, builder=None, encoding=None):
     """Set up the element stack, tree builder and encoding.

     builder  -- tree builder to use; a fresh ElementTree.TreeBuilder is
                 created when None is given.
     encoding -- encoding name; any falsy value selects "iso-8859-1".
     """
     self.__stack = []
     self.__builder = ElementTree.TreeBuilder() if builder is None else builder
     self.encoding = encoding if encoding else "iso-8859-1"
     # The base-class initialiser runs after the fields above exist.
     HTMLParser.__init__(self)
开发者ID:AlexStef,项目名称:stef-sublime-conf,代码行数:7,代码来源:HTMLTreeBuilder.py

示例4: __init__

 def __init__(self):
     """Initialise the HTMLParser base and the scraper's empty state."""
     HTMLParser.__init__(self)
     # Scraped values live in one dict; the body text starts out empty.
     self.data = {"body": ""}
     # Neither recording flag is set initially.
     self.recordingAuthor = self.recordingBody = False
     # Tag names kept in save_tags: p, blockquote and h1 through h5.
     self.save_tags = ['p', 'blockquote'] + ['h%d' % level for level in range(1, 6)]
开发者ID:Bolanle,项目名称:G54MIP,代码行数:7,代码来源:WSJScraper.py

示例5: __init__

 def __init__(self):
     """Initialise the HTMLParser base and the form-tracking fields."""
     HTMLParser.__init__(self)
     # Both form flags start out cleared.
     self.in_form = self.form_parsed = False
     # No URL and no parameters yet; the method defaults to GET.
     self.url, self.params, self.method = None, {}, "GET"
开发者ID:arkichek,项目名称:vk-mymusic,代码行数:7,代码来源:vk_auth.py

示例6: handle_task

    def handle_task(self, job):
        """Build a mail from *job* and hand it to ``self.sendmail``.

        Keys read from *job* (with their defaults): ``user`` ('root'),
        ``group`` ('root'), ``sender`` (None), ``recipients`` (None),
        ``subject`` (''), ``body`` (''), ``attachments`` (None),
        ``smtp_host`` ('localhost'), ``smtp_port`` (25), ``html`` (False)
        and ``jobctx`` ({}).

        Subject and body are rendered through ``Template`` with ``jobctx``
        as data. When the mail is not HTML, HTML entities in both are
        decoded to plain characters.

        Returns whatever ``self.sendmail`` returns.
        """
        user = job.get('user', 'root')
        group = job.get('group', 'root')
        mail = job.get('sender', None)

        account = Account(user=user, group=group, mail=mail)

        recipients = job.get('recipients', None)
        subject = ensure_unicode(job.get('subject', ''))
        body = ensure_unicode(job.get('body', ''))
        attachments = job.get('attachments', None)
        smtp_host = job.get('smtp_host', 'localhost')
        smtp_port = job.get('smtp_port', 25)
        html = job.get('html', False)

        template_data = job.get('jobctx', {})
        body = Template(body)(template_data)
        subject = Template(subject)(template_data)

        if not html:
            # HTMLParser.unescape() was removed in Python 3.9; html.unescape
            # is the supported replacement. Fall back to the parser method
            # only where the html module does not exist (Python 2).
            try:
                from html import unescape
            except ImportError:
                unescape = HTMLParser().unescape
            body = unescape(body)
            subject = unescape(subject)

        # Execute the task
        return self.sendmail(
            account, recipients, subject, body, attachments, smtp_host,
            smtp_port, html)
开发者ID:Anhmike,项目名称:canopsis,代码行数:28,代码来源:task_mail.py

示例7: __init__

	def __init__(self,strict=False):
		"""Initialise the HTMLParser base and reset every extraction field.

		strict -- forwarded to HTMLParser where that argument exists;
		          ignored on interpreters whose HTMLParser no longer
		          accepts it.
		"""
		# HTMLParser lost its ``strict`` argument in Python 3.5, so passing
		# it positionally raises TypeError there. Try the historical call
		# first and fall back to the argument-less form.
		try:
			HTMLParser.__init__(self,strict)
		except TypeError:
			HTMLParser.__init__(self)

		# The original author describes three kinds of variables:
		#   data variables       -- the information to extract
		#   data-check variables -- booleans recording whether the matching
		#                           data has already been extracted
		#   tag-check variables  -- used for matching the proper format
		# NOTE(review): the grouping below is inferred from the names.

		# Data variables.
		self.description=' '
		self.solution=' '
		self.references=' '
		self.cvss_score=0.0
		self.cve_id=' '
		self.links=[]
		self.products=[]
		self.attack_from=' '
		self.impact=' '
		self.last_h6=' '

		# Data-check variables.
		self.desc=False
		self.sol=False
		self.ref=False
		self.cvss=False
		self.cve=False
		self.prod=False
		self.attk=False
		self.impt=False

		# Tag-check variables.
		self.h1=False
		self.p=False
		self.li=False
		self.h6=False
		self.h7=False
		self.clas=False
开发者ID:deathholes,项目名称:Vulnerability-Database,代码行数:35,代码来源:osvdb_threads.py

示例8: __init__

 def __init__(self):
   """Run the base-class initialiser and start with an empty title buffer."""
   if not py3:
       HTMLParser.__init__(self)
   else:
       super().__init__()
   # A list is used so that literal bytes and escaped Unicode can both be stored.
   self.title = []
开发者ID:coldnight,项目名称:fetchtitle,代码行数:7,代码来源:__init__.py

示例9: __init__

 def __init__(self, base_href):
     """Initialise the parser for a page located at *base_href*.

     base_href -- stored unchanged on the instance as ``base_href``.
     """
     HTMLParser.__init__(self)
     self.base_href = base_href
     self.results = {}
     # No group name or description has been seen yet.
     self.group_name = None
     self.group_desc = None
     # The three "in_*" fields all start at zero.
     for field in ("in_group_name", "in_group_desc", "in_activity"):
         setattr(self, field, 0)
     self._clear_info()
开发者ID:curiousguy13,项目名称:sugar,代码行数:7,代码来源:microformat.py

示例10: __init__

    def __init__(self):
        """Initialise the parser with every piece of state cleared.

        Overrides the HTMLParser constructor so that a new instance starts
        with all recording flags off, the day/meal/station fields set to
        EMPTY_STRING, empty text buffers and no menus collected.
        """
        HTMLParser.__init__(self)

        # Recording flags: all off.
        self._record_name = self._record_meal = False
        self._record_station = self._record_attributes = False

        # Day, meal and station fields.
        self._day = self._meal = self._station = EMPTY_STRING

        # Independent buffers, one list each.
        self._name_text, self._station_text, self._attributes = [], [], []

        # Hold all the dining hall menus.
        self.menu = []
开发者ID:JelloRanger,项目名称:menu-scraper,代码行数:29,代码来源:sodexo_parser.py

示例11: __init__

    def __init__(self, zip_file):
        """Initialise the processor around *zip_file*.

        zip_file -- stored unchanged on the instance as ``_zip_file``.
        """
        HTMLParser.__init__(self)
        self._zip_file = zip_file

        # Processed HTML is accumulated in this in-memory buffer.
        self._html = StringIO()

        # Counter used to exclude the contents of script and object tags.
        self._excl_nested_level = 0
开发者ID:CenterForOpenScience,项目名称:modular-file-renderer,代码行数:7,代码来源:html_processor.py

示例12: __init__

	def __init__(self, strict = False, reps = None, outs = None, sc = True):
		"""Initialise the parser and link it to its collaborators.

		strict -- forwarded to HTMLParser where that argument exists;
		          ignored on interpreters whose HTMLParser no longer
		          accepts it.
		reps   -- object stored as ``rep``; when given, its ``parser``
		          attribute is set to this parser.
		outs   -- object stored as ``outStream``.
		sc     -- value stored as ``stripComment``.
		"""
		self.rep = reps
		self.outStream = outs
		self.stripComment = sc
		# Give the replacer a back-reference to this parser. Skipped when no
		# replacer was supplied, so the default ``reps=None`` no longer
		# raises AttributeError.
		if reps is not None:
			reps.parser = self

		# HTMLParser lost its ``strict`` argument in Python 3.5, so passing
		# it positionally raises TypeError there. Try the historical call
		# first and fall back to the argument-less form.
		try:
			HTMLParser.__init__(self, strict)
		except TypeError:
			HTMLParser.__init__(self)
开发者ID:MattDiesel,项目名称:staticpress,代码行数:7,代码来源:TransParser.py

示例13: linksh

    def linksh(self, cli, ev):
        """Announce the <title> of the first URL found in a message.

        cli -- client object; ``cli.msg(target, text)`` is used to reply.
        ev  -- event; ``ev.target`` and ``ev.arguments[0]`` are read.

        Returns 1 when nothing was announced (target not in
        ``self.chancache``, no URL in the message, or no title on the page),
        0 when the message was a YouTube link handed to ``self.ytlinks``,
        and None after a title has been sent.
        """
        from html import unescape

        # Only act on targets present in the cache. Any lookup failure means
        # "skip", but KeyboardInterrupt/SystemExit are no longer swallowed.
        try:
            self.chancache[ev.target.lower()]
        except Exception:
            return 1

        if self.yt is True:
            yr = re.compile(r".*(youtube\.com\/watch\?.*v=|youtu\.be\/)"
                            r"([A-Za-z0-9._%-]*)[&\w;=\+_\-]*.*")
            res = yr.search(ev.arguments[0])
            if res is not None:
                self.ytlinks(cli, ev, res)
                return 0

        # Raw strings keep the patterns identical while avoiding the
        # invalid-escape warnings of the non-raw originals.
        url = re.compile(r"((https?):((\/\/)|(\\))+[\w\d:#@%\/;$()~_?\+-=\\.&]*)")
        res = url.search(ev.arguments[0])
        if res is None:
            return 1
        uri = res.group(1)

        # Close the HTTP response deterministically.
        with urllib.request.urlopen(uri) as response:
            page = response.read().decode('utf-8', 'replace')

        title = re.search(r".*<title[^>]*>([^<]+)</title>.*", page)
        if title is None:
            return 1

        # Decode entities only in the extracted title: unescaping the whole
        # page first would turn escaped text such as "&lt;title&gt;" into
        # markup and could break or spoof the match. html.unescape replaces
        # HTMLParser.unescape(), which was removed in Python 3.9.
        cli.msg(ev.target, unescape(title.group(1)))
开发者ID:IsmaeRLGV,项目名称:pyCoBot,代码行数:27,代码来源:links.py

示例14: get_game_list

def get_game_list (system):
	"""List all the games on Guardiana for a given system.

	system -- string appended to the "Complete-List" URL of the site.

	Returns a list of dicts, one per game, each with:
	  'url'   -- absolute URL of the game page, with any "?PHPSESSID=..."
	             suffix removed;
	  'title' -- one-element list holding the title with its HTML entities
	             decoded.
	"""
	# html.unescape replaces HTMLParser.unescape(), removed in Python 3.9.
	from html import unescape

	list_url = "http://www.guardiana.net/MDG-Database/Complete-List/" + system + "/"
	# Close the HTTP response deterministically.
	with urllib.request.urlopen (list_url) as response:
		doc = response.read ()

	soup = BeautifulSoup(doc)
	html_game_list = soup.find("div", {"id": "MDGD_FullList_Box"})

	game_list = re.findall ("""» <a href="(.+?)">(.+?)</a><br/>(?:<em>)?(.*?)(?:</em>)?<br/>""", str (html_game_list))

	# Compiled once; the raw string avoids the invalid "\?" escape warning
	# while keeping the pattern unchanged.
	session_suffix = re.compile (r"(.*?)\?PHPSESSID=.*?")

	game_dict_list = []
	for game in game_list:
		# Clean up the URL: drop the session id when there is one.
		href = game[0]
		result = session_suffix.search (href)
		if result:
			href = result.group(1)

		game_dict_list.append ({
			'url': "http://www.guardiana.net" + href,
			# Unescape the HTML entities from the title.
			'title': [unescape (game[1])],
		})

	return game_dict_list
开发者ID:Kekun,项目名称:badnik-tools,代码行数:30,代码来源:guardiana.py

示例15: get_images

def get_images(current_title, title, titles_length):
    """Fetch the page for *title* and return the image links found on it.

    current_title -- zero-based position of this title, used only for the
                     progress message.
    title         -- page title, substituted into SOURCE_LOCATION as-is
                     (it is not URL-escaped here).
    titles_length -- total number of titles, used only for the progress
                     message.

    Returns the list that was handed to ImageLocater, or an empty list when
    the page is empty, is a redirect, or cannot be parsed.
    """
    # HTMLParser.unescape() was removed in Python 3.9; html.unescape is the
    # supported replacement. Fall back to the parser method only where the
    # html module does not exist (Python 2).
    try:
        from html import unescape
    except ImportError:
        unescape = HTMLParser().unescape

    print("Fetching images from %s... (%s/%s)" % (title, current_title + 1, titles_length))
    # Repetition is success: keep retrying until the download works.
    # NOTE(review): this loop never gives up on a persistent IOError.
    while True:
        try:
            page = urlopen(SOURCE_LOCATION % title).read().decode(ENCODING)
            break
        except IOError:
            print("\tServer's being lazy, retrying...")

    if not page:
        print("\tFailed to get %s's images!" % title)
        return []
    # Ignore redirects
    if search("#DOORVERWIJZING", page, I | M) is not None or search("#REDIRECT.*", page, I | M) is not None:
        print("\tSkipping redirecting page %s" % title)
        return []
    imagelinks = []
    parser = ImageLocater(imagelinks)

    page = unescape(page)

    # A page that cannot be parsed is skipped, but KeyboardInterrupt and
    # SystemExit are no longer swallowed.
    try:
        parser.feed(page)
    except Exception:
        print("%s is a malformatted page" % title)
        return []

    return imagelinks
开发者ID:ZeusWPI,项目名称:WikiTools,代码行数:32,代码来源:scraper.py


注:本文中的html.parser.HTMLParser类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。