This article collects typical usage examples of the html5lib.HTMLParser class in Python. If you have been wondering what the HTMLParser class does, how to call it, or what real-world usage looks like, the selected examples below may help.
A total of 15 code examples of the HTMLParser class are shown, sorted by popularity by default.
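Before the individual examples, here is a minimal sketch of typical HTMLParser usage. It assumes only that html5lib is installed; the input string is made up for illustration.

from html5lib import HTMLParser, treebuilders

# Build a parser backed by a minidom tree; "dom", "etree", "lxml" and (in older
# releases) "beautifulsoup" are the tree builders used in the examples below.
parser = HTMLParser(tree=treebuilders.getTreeBuilder("dom"))

# parse() returns a whole document; parseFragment() returns only the fragment's nodes.
document = parser.parse("<p>Hello <b>world</b></p>")
fragment = parser.parseFragment("<p>Hello <b>world</b></p>")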
Example 1: clean_html
def clean_html(input, sanitize=False):
    """
    Takes an HTML fragment and processes it using html5lib to ensure that the HTML is well-formed.

    :param sanitize: Remove unwanted HTML tags and attributes.

    >>> clean_html("<p>Foo<b>bar</b></p>")
    u'<p>Foo<b>bar</b></p>'
    >>> clean_html("<p>Foo<b>bar</b><i>Ooops!</p>")
    u'<p>Foo<b>bar</b><i>Ooops!</i></p>'
    >>> clean_html('<p>Foo<b>bar</b>& oops<a href="#foo&bar">This is a <>link</a></p>')
    u'<p>Foo<b>bar</b>& oops<a href=#foo&bar>This is a <>link</a></p>'
    """
    parser_kwargs = {}
    serializer_kwargs = {}
    if sanitize:
        if HTMLSanitizer is None:
            # new syntax as of 0.99999999/1.0b9 (released on July 14, 2016)
            serializer_kwargs["sanitize"] = True
        else:
            parser_kwargs["tokenizer"] = HTMLSanitizer

    p = HTMLParser(tree=treebuilders.getTreeBuilder("dom"), **parser_kwargs)
    dom_tree = p.parseFragment(input)
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)

    s = HTMLSerializer(omit_optional_tags=False, **serializer_kwargs)
    return "".join(s.serialize(stream))
Example 2: login
def login(self, username, password):
    """
    Login to o2online.ie

    Returns True if successful, or False if the login fails.
    """
    if self.resumable():
        self.logger.info("Resuming from login.")
        return True
    else:
        self.logger.info("Unable to resume, running connect from login.")
        self.connect()

    post = [
        ('IDButton', 'Go'),
        ('org', 'o2ext'),
        ('CONNECTFORMGET', 'TRUE'),
        ('IDToken1', username),
        ('IDToken2', password)
    ]
    handle = self.post('https://www.o2online.ie/amserver/UI/Login', post)

    from html5lib import HTMLParser, treebuilders
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("beautifulsoup"))
    soup = parser.parse(handle)

    if unicode(soup.html.head.title.string).strip() == u"LoginCheck":
        self.logger.info("login has correct HTML title.")
        return True
    return False
Example 3: news
def news():
    global url
    global ns
    global headers
    opener = urllib2.build_opener()
    opener.addheaders = headers
    pagetext = opener.open(url)
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("lxml"))
    page = parser.parse(pagetext)
    main = page.find("//%sdiv[@class='centre-wide-main-content-column']" % ns)
    for entry in main.findall("%sdiv" % ns):
        title = entry.find("%sdiv[@class='news-item news-title']" % ns).text.strip()
        number = int(filter(lambda c: c in string.digits, entry.attrib.get("onclick", "0")))
        url = "http://www.guernseyfc.com/news.details.php?id=%d&random=%s" % (number, ourhash(number))
        head_tag = entry.find("%sdiv[@class='news-item news-brief-descript']/%stable/%stbody/%str/%std/%sh1" % (ns, ns, ns, ns, ns, ns))
        if head_tag is None:
            head = ""
        else:
            head = head_tag.text
        scraperwiki.sqlite.save(unique_keys=["number"], data={"title": title, "number": number, "url": url, "head": head})
Example 4: wiki_string_to_tiddlers
def wiki_string_to_tiddlers(content):
    """
    Turn a string that is a wiki into tiddlers.
    """
    parser = HTMLParser(tree=treebuilders.getTreeBuilder('dom'))
    doc = parser.parse(content)

    # minidom will not provide a working getElementById without
    # first having a valid document, which means some very specific
    # doctype hooey. So we traverse instead.
    body = doc.getElementsByTagName('body')[0]
    body_divs = body.getElementsByTagName('div')
    is_wiki = False
    for div in body_divs:
        if div.hasAttribute('id') and div.getAttribute('id') == 'storeArea':
            divs = div.getElementsByTagName('div')
            is_wiki = True
            break

    if is_wiki:
        tiddlers = []
        for tiddler_div in divs:
            tiddlers.append(_get_tiddler_from_div(tiddler_div))
        return tiddlers
    else:
        raise ValueError('content not a tiddlywiki 2.x')
Example 5: get_spaces_available
def get_spaces_available(dept_abbr, course_num):
    # define
    post_data = {
        'classyear': '2008',  # don't know WHY!?!
        'subj': dept_abbr,
        'crsenum': course_num,
    }
    url = 'http://oracle-www.dartmouth.edu/dart/groucho/timetable.course_quicksearch'

    # get the html
    cj = cookielib.LWPCookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    urllib2.install_opener(opener)
    headers = {'User-agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    request = urllib2.Request(url, urllib.urlencode(post_data), headers)
    handle = urllib2.urlopen(request)
    html = handle.read()

    # parse the html
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("beautifulsoup"))
    soup = parser.parse(html)
    tbody = soup.find('th', text='Term').parent.parent.parent
    cells = tbody.findAll('tr')[2]('td')
    enrolled = int(cells[-2].contents[0])
    capacity = int(cells[-3].contents[0])
    print "%i spaces left (capacity of %i with %i enrolled)" % (capacity - enrolled, capacity, enrolled)
Example 6: test_debug_log
def test_debug_log():
    parser = HTMLParser(debug=True)
    parser.parse("<!doctype html><title>a</title><p>b<script>c</script>d</p>e")

    expected = [('dataState', 'InitialPhase', 'InitialPhase', 'processDoctype', {'type': 'Doctype'}),
                ('dataState', 'BeforeHtmlPhase', 'BeforeHtmlPhase', 'processStartTag', {'name': 'title', 'type': 'StartTag'}),
                ('dataState', 'BeforeHeadPhase', 'BeforeHeadPhase', 'processStartTag', {'name': 'title', 'type': 'StartTag'}),
                ('dataState', 'InHeadPhase', 'InHeadPhase', 'processStartTag', {'name': 'title', 'type': 'StartTag'}),
                ('rcdataState', 'TextPhase', 'TextPhase', 'processCharacters', {'type': 'Characters'}),
                ('dataState', 'TextPhase', 'TextPhase', 'processEndTag', {'name': 'title', 'type': 'EndTag'}),
                ('dataState', 'InHeadPhase', 'InHeadPhase', 'processStartTag', {'name': 'p', 'type': 'StartTag'}),
                ('dataState', 'AfterHeadPhase', 'AfterHeadPhase', 'processStartTag', {'name': 'p', 'type': 'StartTag'}),
                ('dataState', 'InBodyPhase', 'InBodyPhase', 'processStartTag', {'name': 'p', 'type': 'StartTag'}),
                ('dataState', 'InBodyPhase', 'InBodyPhase', 'processCharacters', {'type': 'Characters'}),
                ('dataState', 'InBodyPhase', 'InBodyPhase', 'processStartTag', {'name': 'script', 'type': 'StartTag'}),
                ('dataState', 'InBodyPhase', 'InHeadPhase', 'processStartTag', {'name': 'script', 'type': 'StartTag'}),
                ('scriptDataState', 'TextPhase', 'TextPhase', 'processCharacters', {'type': 'Characters'}),
                ('dataState', 'TextPhase', 'TextPhase', 'processEndTag', {'name': 'script', 'type': 'EndTag'}),
                ('dataState', 'InBodyPhase', 'InBodyPhase', 'processCharacters', {'type': 'Characters'}),
                ('dataState', 'InBodyPhase', 'InBodyPhase', 'processEndTag', {'name': 'p', 'type': 'EndTag'}),
                ('dataState', 'InBodyPhase', 'InBodyPhase', 'processCharacters', {'type': 'Characters'})]

    if PY2:
        for i, log in enumerate(expected):
            log = [x.encode("ascii") if isinstance(x, text_type) else x for x in log]
            expected[i] = tuple(log)

    assert parser.log == expected
Example 7: shallow_scrape
def shallow_scrape():
    urns = set([])
    br = mechanize.Browser()
    resultspage = br.open("http://www.education.gov.uk/edubase/quickSearchResult.xhtml")

    moreWorkToDo = True
    c = 1
    while moreWorkToDo and (c < 3):
        print "Handling page %d..." % c

        ### extract data from page
        parser = HTMLParser(tree=treebuilders.getTreeBuilder("lxml"))
        page = parser.parse(resultspage)
        for u in page.getroot().findall(path(["body", "div", "div", "div", "div", "table", "tbody", "tr", "td", "table", "tbody", "tr", "td", "a"], "")):
            #href = u.attrib.get("href","")
            href = u.get("href")
            print "href: %s" % href
            urn = re.search("urn=([0-9]{6})", href).group(1)
            urns.add(urn)
            print "%s, " % urn
        print

        ### get new page
        try:
            resultspage = br.follow_link(text="Next")
            c += 1
        except mechanize.LinkNotFoundError:
            moreWorkToDo = False

    return urns
Example 8: extract_html_urls
def extract_html_urls(self, html):
    """
    Take all ``<img src="..">``, ``srcset`` and ``<a href="..">`` URLs from the HTML.
    """
    p = HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
    dom = p.parse(html)
    urls = []

    for img in dom.getElementsByTagName('img'):
        src = img.getAttribute('src')
        if src:
            urls.append(unquote_utf8(src))

        srcset = img.getAttribute('srcset')
        if srcset:
            urls += self.extract_srcset(srcset)

    for source in dom.getElementsByTagName('source'):
        srcset = source.getAttribute('srcset')
        if srcset:
            urls += self.extract_srcset(srcset)

    for source in dom.getElementsByTagName('a'):
        href = source.getAttribute('href')
        if href:
            urls.append(unquote_utf8(href))

    return urls
Example 9: get_dom
def get_dom(self, buf):
    buf = buf.strip()
    if not buf:
        return None
    p = HTMLParser(tree=treebuilders.getTreeBuilder("dom"),
                   tokenizer=self.token_sanitizer())
    return p.parseFragment(buf)
Example 10: scraper
def scraper(request):
    post_data = {
        'classyear': '2008',  # why??
        'subj': 'COSC',
        'crsenum': '50'
    }
    url = 'http://oracle-www.dartmouth.edu/dart/groucho/timetable.course_quicksearch'

    # scrape the html
    cj = cookielib.LWPCookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    urllib2.install_opener(opener)
    headers = {'User-agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    request = urllib2.Request(url, urllib.urlencode(post_data), headers)
    handle = urllib2.urlopen(request)
    html = handle.read()

    # parse for the dept and course number
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("beautifulsoup"))
    soup = parser.parse(html)
    #tbody = soup.find('th', text='Term').parent.parent.parent
    #soup = tbody.findAll('tr')[2]('td')
    return render_to_response("scraper.html", {'soup': soup})
Example 11: runParserEncodingTest
def runParserEncodingTest(data, encoding):
    p = HTMLParser()
    assert p.documentEncoding is None
    p.parse(data, useChardet=False)
    encoding = encoding.lower().decode("ascii")

    assert encoding == p.documentEncoding, errorMessage(data, encoding, p.documentEncoding)
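Outside this test helper, the same attribute can be read directly after parsing raw bytes. A small sketch (the byte string is made up for illustration):

from html5lib import HTMLParser

parser = HTMLParser()
parser.parse(b'<!doctype html><meta charset="utf-8"><title>x</title>', useChardet=False)
print(parser.documentEncoding)  # expected: 'utf-8'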
Example 12: html_parser
def html_parser(html):
    try:
        soup = BeautifulSoup(html)
    except:
        parser = HTMLParser(tree=treebuilders.getTreeBuilder("beautifulsoup"))
        soup = parser.parse(html)
    return soup
Example 13: get_first_result_index_from_quick_search_results
def get_first_result_index_from_quick_search_results(html):
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("beautifulsoup"))
    soup = parser.parse(html)
    block = soup.find('', {'id': 'photoresult'})  # isolate the table of data on the first result
    block = block.findAll('', {'class': 'photobox'})[0]
    id = block.find('p').find('a').contents[0]
    id = int(id)
    return id
Example 14: parse
def parse(f):
    p = HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
    doc = p.parse(f)
    walker = treewalkers.getTreeWalker("dom")

    tokens = []
    bintokens = []

    # Walk the tree, recording a 1 for every tag/comment token and a 0 for every
    # word of text, while skipping the contents of link/script/style elements.
    waitfor = None
    for tok in walker(doc):
        if waitfor:
            if tok["type"] == waitfor[0] and tok["name"] == waitfor[1]:
                waitfor = None
            continue

        if tok["type"] == "StartTag" and tok["name"] in ("link", "script", "style"):
            waitfor = ("EndTag", tok["name"])

        if tok["type"] in ("EndTag", "StartTag", "EmptyTag", "Comment"):
            bintokens.append(1)
            tokens.append(tok)
        elif tok["type"] in ("Characters",):
            for tok1 in tok["data"].split():
                bintokens.append(0)
                tokens.append({"type": "Characters", "data": tok1})
        elif tok["type"] in ("SpaceCharacters", "Doctype"):
            pass
        else:
            raise ValueError("unrecognizable token type: %r" % tok)

    # Cumulative count of tag tokens seen so far.
    cumbintokens = [bintokens[0]]
    for tok in bintokens[1:]:
        cumbintokens.append(cumbintokens[-1] + tok)

    length = len(cumbintokens)

    # Pick the span [i, j] that maximises tag tokens outside the span plus text
    # tokens inside it, i.e. the main text block of the page.
    midx = None
    m = None
    for i in range(length):
        for j in range(i + 1, length):
            end_tag = cumbintokens[-1] - cumbintokens[j]
            start_tag = cumbintokens[i]
            text_between = (j - i) - (cumbintokens[j] - cumbintokens[i])

            nm = end_tag + start_tag + text_between
            if not midx or nm > m:
                midx = i, j
                m = nm

    i, j = midx
    return serialize_tokens(tokens[i:j + 1])
Example 15: scrape_others
def scrape_others(pct_name, url):
    types = ["doctor", "dentist", "pharmacy", "optician"]
    for facility_type, i in zip(types, range(2, 6)):
        parser = HTMLParser(tree=treebuilders.getTreeBuilder("lxml"))
        page = parser.parse(scrape(url + "&v=%d" % i))
        root = page.getroot()
        s = root.find("body/div/form/div/div/div/div/div/dl")
        extract_table_data(pct_name, s, facility_type)