

Python html5lib.HTMLParser Class Code Examples

This article collects typical usage examples of the html5lib.HTMLParser class in Python. If you have been wondering what the HTMLParser class does, how to use it, or what real-world code using it looks like, the hand-picked examples below should help.


Fifteen code examples of the HTMLParser class are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the site recommend better Python code examples.
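Before the examples, here is a minimal sketch of the typical html5lib workflow that most of them follow: parse (possibly malformed) markup into a tree, walk the tree, and serialize it back out. The input string is illustrative only:

from html5lib import HTMLParser, serializer, treebuilders, treewalkers

# Parse malformed HTML into a DOM tree; html5lib repairs the markup the
# same way a browser would.
parser = HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
dom = parser.parse("<p>Hello <b>world")  # unclosed tags get closed

# Walk the tree and serialize it back out as well-formed HTML.
walker = treewalkers.getTreeWalker("dom")
s = serializer.HTMLSerializer(omit_optional_tags=False)
print("".join(s.serialize(walker(dom))))
# -> <html><head></head><body><p>Hello <b>world</b></p></body></html>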

Example 1: clean_html

# Imports assumed by this excerpt; html5lib.sanitizer was removed in 1.0,
# where sanitizing moved to the serializer.
from html5lib import HTMLParser, treebuilders, treewalkers
from html5lib.serializer import HTMLSerializer
try:
    from html5lib.sanitizer import HTMLSanitizer
except ImportError:
    HTMLSanitizer = None


def clean_html(input, sanitize=False):
    """
    Takes an HTML fragment and processes it using html5lib to ensure that the HTML is well-formed.

    :param sanitize: Remove unwanted HTML tags and attributes.

    >>> clean_html("<p>Foo<b>bar</b></p>")
    u'<p>Foo<b>bar</b></p>'
    >>> clean_html("<p>Foo<b>bar</b><i>Ooops!</p>")
    u'<p>Foo<b>bar</b><i>Ooops!</i></p>'
    >>> clean_html('<p>Foo<b>bar</b>& oops<a href="#foo&bar">This is a <>link</a></p>')
    u'<p>Foo<b>bar</b>&amp; oops<a href=#foo&amp;bar>This is a &lt;&gt;link</a></p>'
    """
    parser_kwargs = {}
    serializer_kwargs = {}
    if sanitize:
        if HTMLSanitizer is None:
            # new syntax as of 0.99999999/1.0b9 (Released on July 14, 2016)
            serializer_kwargs["sanitize"] = True
        else:
            parser_kwargs["tokenizer"] = HTMLSanitizer

    p = HTMLParser(tree=treebuilders.getTreeBuilder("dom"), **parser_kwargs)
    dom_tree = p.parseFragment(input)
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)

    s = HTMLSerializer(omit_optional_tags=False, **serializer_kwargs)
    return "".join(s.serialize(stream))
Developer: django-fluent, Project: django-fluent-contents, Lines: 29, Source: html.py

Example 2: login

	def login(self, username, password):
		"""
		Login to o2online.ie

		Returns True if successful, or False if it fails.
		"""
		if self.resumable():
			self.logger.info("Resuming from login.")
			return True
		else:
			self.logger.info("Unable to resume, running connect from login.")
			self.connect()

		post = [
			('IDButton', 'Go'),
			('org', 'o2ext'),
			('CONNECTFORMGET', 'TRUE'),
			('IDToken1', username),
			('IDToken2', password)
		]

		handle = self.post('https://www.o2online.ie/amserver/UI/Login', post)
		from html5lib import HTMLParser, treebuilders
		parser = HTMLParser(tree=treebuilders.getTreeBuilder("beautifulsoup"))
		soup = parser.parse(handle)

		if unicode(soup.html.head.title.string).strip() == u"LoginCheck":
			self.logger.info("login has correct HTML title.")
			return True
		return False
Developer: d-fens, Project: smspie, Lines: 30, Source: o2.py

Example 3: news

def news():
    global url
    global ns
    global headers

    opener = urllib2.build_opener()
    opener.addheaders = headers

    pagetext = opener.open(url)
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("lxml"))
    page = parser.parse(pagetext)
    main = page.find("//%sdiv[@class='centre-wide-main-content-column']"%ns)
    for entry in main.findall("%sdiv"%ns):
        title = entry.find("%sdiv[@class='news-item news-title']"%ns).text.strip()

        # Python 2: filter() on a str returns a str of the digit characters
        number = int(filter(lambda c: c in string.digits, (entry.attrib.get("onclick","0"))))
        url = "http://www.guernseyfc.com/news.details.php?id=%d&random=%s"%(number,ourhash(number))

        head_tag = entry.find("%sdiv[@class='news-item news-brief-descript']/%stable/%stbody/%str/%std/%sh1"%(ns,ns,ns,ns,ns,ns))
        if head_tag is None:
            head = ""
        else:
            head = head_tag.text
        
        scraperwiki.sqlite.save(unique_keys=["number"],data={"title":title, "number":number, "url":url, "head":head})
Developer: flyeven, Project: scraperwiki-scraper-vault, Lines: 25, Source: guernsey_fc_news.py

Example 4: wiki_string_to_tiddlers

def wiki_string_to_tiddlers(content):
    """
    Turn a string containing a TiddlyWiki document into a list of tiddlers.
    """
    parser = HTMLParser(tree=treebuilders.getTreeBuilder('dom'))
    doc = parser.parse(content)
    # minidom will not provide a working getElementById without first
    # having a valid document, which requires some very specific doctype
    # hooey. So we traverse the tree manually instead.
    body = doc.getElementsByTagName('body')[0]
    body_divs = body.getElementsByTagName('div')
    is_wiki = False
    for div in body_divs:
        if div.hasAttribute('id') and div.getAttribute('id') == 'storeArea':
            divs = div.getElementsByTagName('div')
            is_wiki = True
            break

    if is_wiki:
        tiddlers = []
        for tiddler_div in divs:
            tiddlers.append(_get_tiddler_from_div(tiddler_div))
        return tiddlers
    else:
        raise ValueError('content not a tiddlywiki 2.x')
Developer: bengillies, Project: tiddlywebplugins.twimport, Lines: 25, Source: twimport.py

Example 5: get_spaces_available

def get_spaces_available(dept_abbr, course_num):
	# define
	post_data = {
		'classyear' : '2008', #don't know WHY!?!
		'subj': dept_abbr,
		'crsenum': course_num,
	}
	url = 'http://oracle-www.dartmouth.edu/dart/groucho/timetable.course_quicksearch'

	# get the html
	cj = cookielib.LWPCookieJar()
	opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
	urllib2.install_opener(opener)
	headers =  {'User-agent' : 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
	request = urllib2.Request(url, urllib.urlencode(post_data), headers)
	handle = urllib2.urlopen(request)
	html = handle.read()

	# parse the html
	parser = HTMLParser(tree=treebuilders.getTreeBuilder("beautifulsoup"))
	soup = parser.parse(html)
	tbody = soup.find('th', text='Term').parent.parent.parent
	cells = tbody.findAll('tr')[2]('td')
	enrolled = int(cells[-2].contents[0])
	capacity = int(cells[-3].contents[0])

	print "%i spaces left (capacity of %i with %i enrolled)" % (capacity-enrolled, capacity, enrolled)
Developer: DartmouthHackerClub, Project: coursewatch, Lines: 27, Source: coursewatch.py

Example 6: test_debug_log

def test_debug_log():
    parser = HTMLParser(debug=True)
    parser.parse("<!doctype html><title>a</title><p>b<script>c</script>d</p>e")

    expected = [('dataState', 'InitialPhase', 'InitialPhase', 'processDoctype', {'type': 'Doctype'}),
                ('dataState', 'BeforeHtmlPhase', 'BeforeHtmlPhase', 'processStartTag', {'name': 'title', 'type': 'StartTag'}),
                ('dataState', 'BeforeHeadPhase', 'BeforeHeadPhase', 'processStartTag', {'name': 'title', 'type': 'StartTag'}),
                ('dataState', 'InHeadPhase', 'InHeadPhase', 'processStartTag', {'name': 'title', 'type': 'StartTag'}),
                ('rcdataState', 'TextPhase', 'TextPhase', 'processCharacters', {'type': 'Characters'}),
                ('dataState', 'TextPhase', 'TextPhase', 'processEndTag', {'name': 'title', 'type': 'EndTag'}),
                ('dataState', 'InHeadPhase', 'InHeadPhase', 'processStartTag', {'name': 'p', 'type': 'StartTag'}),
                ('dataState', 'AfterHeadPhase', 'AfterHeadPhase', 'processStartTag', {'name': 'p', 'type': 'StartTag'}),
                ('dataState', 'InBodyPhase', 'InBodyPhase', 'processStartTag', {'name': 'p', 'type': 'StartTag'}),
                ('dataState', 'InBodyPhase', 'InBodyPhase', 'processCharacters', {'type': 'Characters'}),
                ('dataState', 'InBodyPhase', 'InBodyPhase', 'processStartTag', {'name': 'script', 'type': 'StartTag'}),
                ('dataState', 'InBodyPhase', 'InHeadPhase', 'processStartTag', {'name': 'script', 'type': 'StartTag'}),
                ('scriptDataState', 'TextPhase', 'TextPhase', 'processCharacters', {'type': 'Characters'}),
                ('dataState', 'TextPhase', 'TextPhase', 'processEndTag', {'name': 'script', 'type': 'EndTag'}),
                ('dataState', 'InBodyPhase', 'InBodyPhase', 'processCharacters', {'type': 'Characters'}),
                ('dataState', 'InBodyPhase', 'InBodyPhase', 'processEndTag', {'name': 'p', 'type': 'EndTag'}),
                ('dataState', 'InBodyPhase', 'InBodyPhase', 'processCharacters', {'type': 'Characters'})]

    if PY2:
        for i, log in enumerate(expected):
            log = [x.encode("ascii") if isinstance(x, text_type) else x for x in log]
            expected[i] = tuple(log)

    assert parser.log == expected
Developer: adamchainz, Project: html5lib-python, Lines: 28, Source: test_parser2.py

Example 7: shallow_scrape

def shallow_scrape():
    urns = set([])

    br = mechanize.Browser()
    resultspage = br.open("http://www.education.gov.uk/edubase/quickSearchResult.xhtml")

    moreWorkToDo = True
    c = 1

    while moreWorkToDo and (c < 3):  # c < 3 limits the run to the first two result pages
        print "Handling page %d..."%c
    
        ### extract data from page
        parser = HTMLParser(tree=treebuilders.getTreeBuilder("lxml"))
        page = parser.parse(resultspage)

        for u in page.getroot().findall(path(["body","div","div","div","div","table","tbody","tr","td","table","tbody","tr","td","a"],"")):
            #href = u.attrib.get("href","")
            href = u.get("href")
            print "href: %s"%href
            urn = re.search("urn=([0-9]{6})",href).group(1)
            urns.add(urn)
            print "%s, "%urn
        print

        ### get new page
        try:
            resultspage = br.follow_link(text="Next")
            c += 1
        except mechanize.LinkNotFoundError:
            moreWorkToDo = False

    return urns
Developer: psd, Project: edubase-schools-data, Lines: 33, Source: scraper.py

Example 8: extract_html_urls

    def extract_html_urls(self, html):
        """
        Take all ``<img src="..">`` from the HTML
        """
        p = HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
        dom = p.parse(html)
        urls = []

        for img in dom.getElementsByTagName('img'):
            src = img.getAttribute('src')
            if src:
                urls.append(unquote_utf8(src))

            srcset = img.getAttribute('srcset')
            if srcset:
                urls += self.extract_srcset(srcset)

        for source in dom.getElementsByTagName('source'):
            srcset = source.getAttribute('srcset')
            if srcset:
                urls += self.extract_srcset(srcset)

        for source in dom.getElementsByTagName('a'):
            href = source.getAttribute('href')
            if href:
                urls.append(unquote_utf8(href))

        return urls
Developer: django-fluent, Project: django-fluent-contents, Lines: 28, Source: find_contentitem_urls.py

Example 9: get_dom

    def get_dom(self, buf):
        buf = buf.strip()
        if not buf:
            return None
        p = HTMLParser(tree=treebuilders.getTreeBuilder("dom"),
                       tokenizer=self.token_sanitizer())
        return p.parseFragment(buf)
Developer: riffm, Project: iktomi, Lines: 7, Source: html.py

Example 10: scraper

def scraper(request):
    post_data = {
            'classyear' : '2008', # why??
            'subj': 'COSC',
            'crsenum': '50'
        }
    url = 'http://oracle-www.dartmouth.edu/dart/groucho/timetable.course_quicksearch'

    
    # scrape the html
    cj = cookielib.LWPCookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    urllib2.install_opener(opener)
    headers =  {'User-agent' : 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    request = urllib2.Request(url, urllib.urlencode(post_data), headers)
    handle = urllib2.urlopen(request)
    html = handle.read()

    # parse for the dept and course number
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("beautifulsoup"))
    soup = parser.parse(html)
    #tbody = soup.find('th', text='Term').parent.parent.parent
    #soup = tbody.findAll('tr')[2]('td')
    

    return render_to_response("scraper.html", {'soup': soup})
Developer: deloschang, Project: auto_class_scheduler, Lines: 26, Source: views.py

Example 11: runParserEncodingTest

def runParserEncodingTest(data, encoding):
    p = HTMLParser()
    assert p.documentEncoding is None
    p.parse(data, useChardet=False)
    encoding = encoding.lower().decode("ascii")

    assert encoding == p.documentEncoding, errorMessage(data, encoding, p.documentEncoding)
Developer: adamchainz, Project: html5lib-python, Lines: 7, Source: test_encoding.py

Example 12: html_parser

def html_parser(html):
    try:
        soup = BeautifulSoup(html)
    except:  # BeautifulSoup failed; fall back to html5lib's more forgiving parser
        parser = HTMLParser(tree=treebuilders.getTreeBuilder("beautifulsoup"))
        soup = parser.parse(html)
    return soup
Developer: mzmttks, Project: jsm, Lines: 7, Source: util.py

Example 13: get_first_result_index_from_quick_search_results

def get_first_result_index_from_quick_search_results(html):
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("beautifulsoup"))
    soup = parser.parse(html)
    block = soup.find('', {'id' : 'photoresult'}) # isolate the table of data on the first result
    block = block.findAll('', {'class' : 'photobox'})[0]
    id = block.find('p').find('a').contents[0]
    id = int(id)
    return id
Developer: gameguy43, Project: usable_image_scraper, Lines: 8, Source: parser.py

Example 14: parse

def parse(f):
    p = HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
    doc = p.parse(f)
    walker = treewalkers.getTreeWalker("dom")

    tokens = []     # one entry per tag token or word
    bintokens = []  # parallel list: 1 for a tag token, 0 for a text token

    waitfor = None  # set while skipping the contents of <link>/<script>/<style>

    for tok in walker(doc):

        if waitfor:
            if tok["type"] == waitfor[0] and tok["name"] == waitfor[1]:
                waitfor = None
            continue

        if tok["type"] == "StartTag" and tok["name"] in ("link", "script", "style"):
            waitfor = ("EndTag", tok["name"])

        if tok["type"] in ("EndTag", "StartTag", "EmptyTag", "Comment"):
            bintokens.append(1)
            tokens.append(tok)

        elif tok["type"] in ("Characters",):
            for tok1 in tok["data"].split():
                bintokens.append(0)
                tokens.append({"type": "Characters", "data": tok1})

        elif tok["type"] in ("SpaceCharacters", "Doctype"):
            pass

        else:
            raise ValueError("unrecognizable token type: %r" % tok)

    # Prefix sums: cumbintokens[k] == number of tag tokens among tokens[0..k].
    cumbintokens = [bintokens[0]]

    for tok in bintokens[1:]:
        cumbintokens.append(cumbintokens[-1] + tok)

    length = len(cumbintokens)

    # Search for the span [i, j] that maximizes tag tokens outside the span
    # plus text tokens inside it, i.e. the most text-dense region of the page
    # (a text-to-tag-ratio style body-text extraction).
    midx = None
    m = None

    for i in range(length):
        for j in range(i + 1, length):
            end_tag = cumbintokens[-1] - cumbintokens[j]  # tag tokens after j
            start_tag = cumbintokens[i]  # tag tokens up to and including i
            # text (word) tokens strictly inside the span
            text_between = (j - i) - (cumbintokens[j] - cumbintokens[i])
            nm = end_tag + start_tag + text_between

            if not midx or nm > m:
                midx = i, j
                m = nm

    i, j = midx
    return serialize_tokens(tokens[i:j + 1])
Developer: balajig17, Project: MetadataExtraction, Lines: 58, Source: tag_ratio.py

Example 15: scrape_others

def scrape_others(pct_name,url):
    types = ["doctor","dentist","pharmacy","optician"]
    for facility_type,i in zip(types,range(2,6)):
        parser = HTMLParser(tree=treebuilders.getTreeBuilder("lxml"))
        page = parser.parse(scrape(url+"&v=%d"%i))
        root = page.getroot()

        s = root.find("body/div/form/div/div/div/div/div/dl")
        extract_table_data(pct_name,s,facility_type)
Developer: flyeven, Project: scraperwiki-scraper-vault, Lines: 9, Source: nhs-primary-care-trusts.py


Note: The html5lib.HTMLParser class examples on this page were collected by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by many developers; copyright in the source code belongs to the original authors, and distribution and use must follow the corresponding project's license. Do not reproduce without permission.