本文整理汇总了 Python 中 html.parser.HTMLParser 类的典型用法代码示例。如果您正苦于以下问题:Python HTMLParser 类的具体用法?Python HTMLParser 怎么用?Python HTMLParser 使用的例子?那么恭喜您,这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了HTMLParser类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: __init__
def __init__(self):
    """Assign the instance's default attributes, then set up HTMLParser.

    The attributes are assigned before the base initialiser is called,
    matching the original ordering (``HTMLParser.__init__`` invokes
    ``self.reset()``).
    """
    self.urlList, self.index, self.nextUrl = [], 0, ''
    # Tag names and class names kept as two parallel two-item lists.
    self.tagList = ['li', 'a']
    self.classList = ['photo-list-padding', 'pic']
    HTMLParser.__init__(self)
示例2: __init__
def __init__(self):
    """Initialise the base HTMLParser and set the default attributes."""
    HTMLParser.__init__(self)
    # Tag name and (attribute, value) pair stored for later matching.
    self.des_tag, self.des_attr = "div", ("id", "content")
    self.bound = 20
    self.collect_data = False
    self.stations_info = None
示例3: __init__
def __init__(self, builder=None, encoding=None):
    """Store the tree builder and encoding, then set up HTMLParser.

    Args:
        builder: Object stored as the private builder; a fresh
            ``ElementTree.TreeBuilder()`` is created when it is None.
        encoding: Stored on ``self.encoding``; any falsy value is
            replaced by "iso-8859-1".
    """
    self.__stack = []
    self.__builder = (
        ElementTree.TreeBuilder() if builder is None else builder
    )
    self.encoding = encoding if encoding else "iso-8859-1"
    HTMLParser.__init__(self)
示例4: __init__
def __init__(self):
    """Initialise the base HTMLParser and the recording state."""
    HTMLParser.__init__(self)
    self.save_tags = ['p', 'blockquote', 'h1', 'h2', 'h3', 'h4', 'h5']
    # Both recording flags start switched off.
    self.recordingAuthor = self.recordingBody = False
    # The data dict starts with an empty "body" entry.
    self.data = {"body": ""}
示例5: __init__
def __init__(self):
    """Initialise the base HTMLParser and the form-related attributes."""
    HTMLParser.__init__(self)
    self.in_form = self.form_parsed = False
    self.method = "GET"
    self.url, self.params = None, {}
示例6: handle_task
def handle_task(self, job):
    """Build a mail from ``job`` and hand it to ``self.sendmail``.

    ``job`` is read with ``.get`` for these keys (default in brackets):
    user ['root'], group ['root'], sender [None], recipients [None],
    subject [''], body [''], attachments [None], smtp_host ['localhost'],
    smtp_port [25], html [False] and jobctx [{}].

    Subject and body are passed through ``ensure_unicode`` and rendered
    with ``Template(text)(jobctx)``. When ``html`` is falsy, HTML entities
    in both are converted back to plain characters.

    Returns:
        Whatever ``self.sendmail`` returns.
    """
    user = job.get('user', 'root')
    group = job.get('group', 'root')
    mail = job.get('sender', None)
    account = Account(user=user, group=group, mail=mail)

    recipients = job.get('recipients', None)
    subject = ensure_unicode(job.get('subject', ''))
    body = ensure_unicode(job.get('body', ''))
    attachments = job.get('attachments', None)
    smtp_host = job.get('smtp_host', 'localhost')
    smtp_port = job.get('smtp_port', 25)
    html = job.get('html', False)

    template_data = job.get('jobctx', {})
    body = Template(body)(template_data)
    subject = Template(subject)(template_data)

    if not html:
        # HTMLParser.unescape() was deprecated in Python 3.4 and removed in
        # 3.9; html.unescape() is the supported replacement. The fallback
        # keeps interpreters without html.unescape working.
        try:
            from html import unescape
        except ImportError:
            unescape = HTMLParser().unescape
        body = unescape(body)
        subject = unescape(subject)

    # Execute the task
    return self.sendmail(
        account, recipients, subject, body, attachments, smtp_host,
        smtp_port, html)
示例7: __init__
def __init__(self, strict=False):
    """Initialise the base parser and reset every extraction field.

    Args:
        strict: Forwarded to ``HTMLParser.__init__`` on interpreters that
            accept it. The standard library removed the argument in
            Python 3.5; on such interpreters it is ignored so that
            construction no longer fails with ``TypeError``.
    """
    # Constructor call of parent class.
    try:
        HTMLParser.__init__(self, strict=strict)
    except TypeError:
        # This HTMLParser has no ``strict`` parameter.
        HTMLParser.__init__(self)

    # Defining variables of this class.
    # There are 3 types of variables.
    # data_variables: these are the required information
    # data_check_variables: boolean values corresponding to each of the
    #     data_variables, to keep a check on whether the data has already
    #     been extracted or not.
    # tag_check_variables: used for matching the proper format.
    self.h1 = False
    self.desc = False
    self.description = ' '
    self.sol = False
    self.solution = ' '
    self.p = False
    self.li = False
    self.ref = False
    self.references = ' '
    self.cvss = False
    self.cvss_score = 0.0
    self.cve = False
    self.cve_id = ' '
    self.links = []
    self.prod = False
    self.products = []
    self.last_h6 = ' '
    self.h6 = False
    self.h7 = False
    self.clas = False
    self.attack_from = ' '
    self.attk = False
    self.impact = ' '
    self.impt = False
示例8: __init__
def __init__(self):
    """Run the base initialiser for the active Python major version.

    The module-level ``py3`` flag selects between the zero-argument
    ``super()`` call and the explicit ``HTMLParser.__init__`` call.
    """
    if not py3:
        HTMLParser.__init__(self)
    else:
        super().__init__()
    # A list is used to store literal bytes and escaped Unicode.
    self.title = []
示例9: __init__
def __init__(self, base_href):
    """Initialise the base parser and the group/activity bookkeeping.

    Args:
        base_href: Stored unchanged on ``self.base_href``.
    """
    HTMLParser.__init__(self)
    self.base_href = base_href
    self.results = {}
    # Group fields start unset.
    self.group_name = None
    self.group_desc = None
    # The three "in_*" markers start at zero.
    self.in_group_name = 0
    self.in_group_desc = 0
    self.in_activity = 0
    self._clear_info()
示例10: __init__
def __init__(self):
    """Overload of the HTMLParser constructor.

    Runs the base initialiser and then resets every attribute of the
    instance so that no value from earlier use is left behind.

    Arguments:
    self -- the instance being initialised; it is supplied implicitly
    and is not passed by the caller.
    """
    # Initialize the HTML Parser.
    HTMLParser.__init__(self)

    # Recording flags all start switched off.
    self._record_name = self._record_meal = False
    self._record_station = self._record_attributes = False

    # Current day / meal / station start as the empty-string constant.
    self._day = self._meal = self._station = EMPTY_STRING

    # Three independent, initially empty lists.
    self._name_text, self._station_text, self._attributes = [], [], []

    # Hold all the dining hall menus.
    self.menu = []
示例11: __init__
def __init__(self, zip_file):
    """Initialise the base parser and the output buffer.

    Args:
        zip_file: Stored unchanged on ``self._zip_file``.
    """
    HTMLParser.__init__(self)
    self._zip_file = zip_file
    # Nesting counter used to exclude the contents of script and object
    # tags.
    self._excl_nested_level = 0
    # Buffer for the processed HTML.
    self._html = StringIO()
示例12: __init__
def __init__(self, strict = False, reps = None, outs = None, sc = True):
    """Store the collaborators and initialise the base HTMLParser.

    Args:
        strict: Forwarded to ``HTMLParser.__init__`` on interpreters that
            accept it. The standard library removed the argument in
            Python 3.5; on such interpreters it is ignored.
        reps: Stored on ``self.rep``. When it is not None its ``parser``
            attribute is set to this instance.
        outs: Stored on ``self.outStream``.
        sc: Stored on ``self.stripComment``.
    """
    self.rep = reps
    self.outStream = outs
    self.stripComment = sc
    # The default ``reps=None`` used to raise AttributeError here; only
    # register the back-reference when there is an object to hold it.
    if self.rep is not None:
        self.rep.parser = self
    try:
        HTMLParser.__init__(self, strict=strict)
    except TypeError:
        # This HTMLParser has no ``strict`` parameter.
        HTMLParser.__init__(self)
示例13: linksh
def linksh(self, cli, ev):
    """Post the <title> of the first http(s) link found in a message.

    The message text is taken from ``ev.arguments[0]``.

    Returns:
        1 when nothing is posted: the lower-cased ``ev.target`` is not in
        ``self.chancache``, the message holds no link, or the fetched page
        has no matching ``<title>`` element.
        0 when ``self.yt`` is True and the message matched the YouTube
        pattern, in which case ``self.ytlinks`` handled it.
        None after the title has been sent with ``cli.msg``.
    """
    from html import unescape

    try:
        self.chancache[ev.target.lower()]
    except (LookupError, AttributeError, TypeError):
        # Unknown or unusable target: nothing to do. Narrowed from a bare
        # ``except`` so KeyboardInterrupt/SystemExit are not swallowed.
        return 1

    message = ev.arguments[0]

    if self.yt is True:
        # Raw strings: same pattern text as before, without relying on
        # invalid string escapes such as "\." and "\/".
        yr = re.compile(r".*(youtube\.com\/watch\?.*v=|youtu\.be\/)([A-Za-z"
                        r"0-9._%-]*)[&\w;=\+_\-]*.*")
        res = yr.search(message)
        if res is not None:
            self.ytlinks(cli, ev, res)
            return 0

    url = re.compile(r"((https?):((\/\/)|(\\))+[\w\d:#@%\/;$()~_?\+-=\\."
                     r"&]*)")
    res = url.search(message)
    if res is None:
        return 1
    uri = res.group(1)

    # Close the HTTP response as soon as the body has been read.
    with urllib.request.urlopen(uri) as response:
        page = response.read().decode('utf-8', 'replace')

    title = re.search(".*<title[^>]*>([^<]+)</title>.*", page)
    if title is None:
        return 1

    # Unescape only the extracted title. Unescaping the whole page first
    # turned "&lt;" inside a title into "<", which made the pattern above
    # fail to match, and relied on HTMLParser.unescape (removed in 3.9).
    cli.msg(ev.target, unescape(title.group(1)))
示例14: get_game_list
def get_game_list(system):
    """List all the games on Guardiana for a given system.

    Downloads the site's complete-list page for ``system``, takes the
    ``div`` whose id is "MDGD_FullList_Box" and extracts every game link
    from its markup.

    Args:
        system: Path segment appended to the complete-list URL.

    Returns:
        A list of dicts, one per game, each with:
        'url'   -- absolute URL with any "?PHPSESSID=..." suffix removed
        'title' -- one-element list holding the HTML-unescaped title
    """
    # html.unescape replaces HTMLParser().unescape, removed in Python 3.9.
    from html import unescape

    site = "http://www.guardiana.net"
    list_url = site + "/MDG-Database/Complete-List/" + system + "/"
    with urllib.request.urlopen(list_url) as response:
        doc = response.read()

    soup = BeautifulSoup(doc)
    html_game_list = soup.find("div", {"id": "MDGD_FullList_Box"})
    game_list = re.findall(
        """» <a href="(.+?)">(.+?)</a><br/>(?:<em>)?(.*?)(?:</em>)?<br/>""",
        str(html_game_list))

    game_dict_list = []
    # The third capture (text of the optional <em> line) is not used.
    for href, title, _extra in game_list:
        # Clean up the URL: drop the session id, if there is one.
        path = href.partition("?PHPSESSID=")[0]
        game_dict_list.append({
            'url': site + path,
            # Unescape the HTML entities from the title.
            'title': [unescape(title)],
        })
    return game_dict_list
示例15: get_images
def get_images(current_title, title, titles_length):
    """Download the page for ``title`` and collect its image links.

    Args:
        current_title: Zero-based position of this title; printed as
            ``current_title + 1`` in the progress line.
        title: Substituted into ``SOURCE_LOCATION`` to form the URL and
            used in the printed messages.
        titles_length: Total number of titles, for the progress line.

    Returns:
        The list that was handed to ``ImageLocater`` and fed the unescaped
        page, or an empty list when the page is empty, is a redirect, or
        cannot be parsed.
    """
    # html.unescape replaces HTMLParser().unescape, removed in Python 3.9.
    from html import unescape

    print("Fetching images from %s... (%s/%s)" % (title, current_title + 1, titles_length))

    # NOTE(review): the title is substituted into the URL unescaped; the
    # original carried a disabled replace() of "'" and " " -- confirm
    # whether titles need URL-quoting.

    # Retry until the download succeeds (deliberately unbounded, as before).
    while True:
        try:
            page = urlopen(SOURCE_LOCATION % title).read().decode(ENCODING)
            break
        except IOError:
            print("\tServer's being lazy, retrying...")

    if not page:
        print("\tFailed to get %s's images!" % title)
        return []

    # Ignore redirects
    if search("#DOORVERWIJZING", page, I | M) is not None or search("#REDIRECT.*", page, I | M) is not None:
        print("\tSkipping redirecting page %s" % title)
        return []

    imagelinks = []
    parser = ImageLocater(imagelinks)
    page = unescape(page)
    try:
        parser.feed(page)
    except Exception:
        # Narrowed from a bare ``except`` so that KeyboardInterrupt and
        # SystemExit still propagate.
        print("%s is a malformatted page" % title)
        return []
    return imagelinks