Python etree.HTML类代码示例

本文整理汇总了Python中lxml.etree.HTML类的典型用法代码示例。如果您正苦于以下问题：Python HTML类的具体用法？Python HTML怎么用？Python HTML使用的例子？那么，这里精选的类代码示例或许可以为您提供帮助。

在下文中一共展示了HTML类的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: edit_message

def edit_message(base_url, username, password, message_id, new_body):
    """Replace the body of an existing message via the site's ``misc.php`` endpoint.

    Logs in through ``_utils.login_and_go_to_faq``, reads the ``securitytoken``
    hidden input from ``faq.php`` and then POSTs a ``vsacb_editmessage``
    request carrying the token, ``message_id`` and the encoded ``new_body``.
    Progress is reported on stdout; nothing is returned.
    """
    opener = _utils.login_and_go_to_faq(base_url, username, password)

    # endpoints derived from the base URL
    faq_url = urljoin(base_url, "faq.php")
    edit_url = urljoin(base_url, "misc.php")

    # the original author picked the FAQ page because it is cheap for the
    # backend to render; it still carries the security token we need
    print("fetching security token")
    faq_page = HTML(opener.open(faq_url).read())
    security_token = faq_page.find(".//input[@name='securitytoken']").attrib["value"]

    # build the form-encoded request body and convert it to bytes
    template = "do=vsacb_editmessage&s=&securitytoken={0}&id={1}&vsacb_editmessage={2}"
    payload = template.format(
        security_token, message_id, encode_outgoing_message(new_body)
    ).encode(server_encoding)

    print("updating message")
    # the response is drained but its content is not inspected
    opener.open(edit_url, data=payload).read()

    print("done")

开发者ID:RavuAlHemio，项目名称:vbcbbot，代码行数:26，代码来源:edit_message.py

示例2: decode_first

def decode_first(d):
    """Recover the hex-encoded script hidden in the ``<div>`` text of ``d``.

    All text nodes directly under ``<div>`` elements are concatenated, the
    filler tokens ``_``, ``&``, ``%``, ``=`` and ``undefined`` are removed
    (in that order), and the remainder is hex-decoded.

    NOTE: ``str.decode('hex')`` exists only on Python 2.
    """
    text = ''.join(HTML(d).xpath('//div/text()'))
    # order matters only in that it mirrors the original replacement sequence
    for filler in ('_', '&', '%', '=', 'undefined'):
        text = text.replace(filler, '')
    return text.decode('hex')

开发者ID:binjo，项目名称:ekdeco，代码行数:7，代码来源:landing.py

示例3: parse_xpath_content

 def parse_xpath_content(self, url):
     """Download ``url`` and extract the fields configured in ``self.config.Xpath``.

     Returns a dict with ``url``, ``md5``, ``creat_at`` and one entry per
     configured field. An empty dict is returned when ``self.get_content``
     yields nothing.

     Each rule in ``self.config.Xpath`` is either

     * a falsy value   -- the field is skipped,
     * a dict with ``op == 'cut'`` -- the text between the ``start`` and
       ``end`` markers of the raw content is taken, or
     * anything else   -- treated as an XPath expression whose results are
       joined into one string.

     Dict rules with an ``op`` other than ``'cut'`` produce no entry.

     Raises ``KeyError`` if no ``publish_time`` entry was produced, because
     the final normalisation step indexes it unconditionally.
     """
     result = dict()
     content = self.get_content(url)
     if not content:
         return result
     result["url"] = url
     result["md5"] = self.md5(url)
     result["creat_at"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
     tree = HTML(content)
     for key in self.config.Xpath.keys():
         rule = self.config.Xpath.get(key)
         if not rule:
             continue
         elif isinstance(rule, dict):
             # substring extraction between two literal markers
             if rule['op'] == 'cut':
                 start_marker = rule['start']
                 begin = content.find(start_marker)
                 if begin == -1:
                     result[key] = ""
                     continue
                 begin += len(start_marker)
                 # Look for the end marker only *after* the start marker so
                 # that an end marker occurring inside the start marker
                 # cannot produce an empty or inverted slice.
                 stop = content.find(rule['end'], begin)
                 # find() returns -1 when the end marker is missing; treat
                 # that like a missing start marker instead of slicing with
                 # a bogus index.
                 result[key] = content[begin:stop] if stop != -1 else ""
         else:
             # 'tbody/' steps are stripped from the expression before it is
             # evaluated against the parsed tree
             list_content = tree.xpath(rule.replace('tbody/', ''))
             if list_content:
                 result[key] = "".join(list_content)
             else:
                 result[key] = ""
     result['publish_time'] = self.parse_time(result['publish_time'])
     return result

开发者ID:fdrong，项目名称:spider_news，代码行数:29，代码来源:base.py

示例4: testRemoteInfoCore

    def testRemoteInfoCore(self):
        """The core info page for 'main' lists the expected fields and drilldown fields.

        The first ``<ul>`` on the page must contain exactly the 19 field names
        below, in order; the second ``<ul>`` must contain the three drilldown
        field names, in any order.
        """
        header, body = getRequest(port=self.httpPort, path='/remote/info/core', arguments=dict(name='main'), parse=False)
        self.assertNotIn('Traceback', body, body)  # only tested for MultiLucene situation for now!
        bodyLxml = HTML(body)
        lists = bodyLxml.xpath('//ul')
        fieldList = lists[0]
        fields = fieldList.xpath('li/a/text()')
        # assertEqual everywhere: assertEquals is a deprecated alias
        self.assertEqual(19, len(fields))
        self.assertEqual([
                '$facets',
                '__id__',
                '__key__.field',
                'copy',
                'field1',
                'field2',
                'field3',
                'field4',
                'field5',
                'field_missing',
                'intfield1',
                'intfield2',
                'intfield3',
                'intfield_missing',
                'sorted.field2',
                'sorted.field4',
                'sorted.intfield1',
                'sorted.intfield_missing',
                'untokenized.field3',
            ], fields)

        drilldownFieldList = lists[1]
        drilldownFields = drilldownFieldList.xpath('li/a/text()')
        # compared as sets: the order of drilldown fields is not asserted
        self.assertEqual({'untokenized.field2', 'untokenized.fieldHier', 'untokenized.field2.copy'}, set(drilldownFields))

开发者ID:seecr，项目名称:meresco-lucene，代码行数:33，代码来源:luceneremoteservicetest.py

示例5: hijack

def hijack(content):
    """Return ``content`` re-serialised with an alert ``<script>`` appended to ``<body>``.

    The markup is parsed with ``HTML``, a ``<script>`` element containing
    ``alert(/hijacked/);`` is appended as the last child of the first
    ``<body>`` element, and the whole tree is serialised with ``tostring``.
    """
    document = HTML(content)
    first_body = document.xpath('//body')[0]

    injected = Element('script')
    injected.text = 'alert(/hijacked/);'
    first_body.append(injected)

    return tostring(document)

开发者ID:xujun10110，项目名称:death-star，代码行数:8，代码来源:web_proxy.py

示例6: save_download

 def save_download(self, url, data, index):
     page = HTML(data)
     body = page.xpath('//body')[0]
     bundles = elquery.get_elements_by_class(body, 'olpc-bundle')
     bundle = bundles[index]
     links = bundle.xpath('descendant-or-self::a[@href]')
     for link in links:
         href = urlparse.urljoin(url, link.attrib['href'])
         print 'got one page:', href
         self.store.save_page_set(href)

开发者ID:iitwebdev，项目名称:lectures，代码行数:10，代码来源:proxy.py

示例7: decode_first_js

def decode_first_js(data):
    """Yield each non-empty decoded payload from the id-bearing elements of ``data``.

    The offsets returned by ``get_off`` for the parsed document are extended
    with a trailing ``0`` and passed, together with the text of every element
    that has an ``id`` attribute, to ``decode_payload``. Elements without
    text and payloads that decode to a falsy value are skipped.
    """
    tree = HTML(data)
    offsets = get_off(tree)
    offsets.append(0)
    for node in tree.xpath("//*[@id]"):
        if not node.text:
            continue
        decoded = decode_payload(offsets, node.text)
        if decoded:
            yield decoded

开发者ID:MeteorAdminz，项目名称:ekdeco，代码行数:10，代码来源:landing.py

示例8: testRemoteInfoCore

    def testRemoteInfoCore(self):
        """The core info page for 'main' lists 12 fields and two drilldown fields.

        The first ``<ul>`` must hold 12 entries; the second must hold exactly
        the two drilldown field names, in this order.
        """
        header, body = getRequest(port=self.httpPort, path='/remote/info/core', arguments=dict(name='main'), parse=False)
        bodyLxml = HTML(body)
        lists = bodyLxml.xpath('//ul')
        fieldList = lists[0]
        fields = fieldList.xpath('li/a/text()')
        # assertEqual instead of the deprecated assertEquals alias
        self.assertEqual(12, len(fields))

        drilldownFieldList = lists[1]
        drilldownFields = drilldownFieldList.xpath('li/a/text()')
        self.assertEqual(['untokenized.field2', 'untokenized.fieldHier'], drilldownFields)

开发者ID:jerryba，项目名称:meresco-lucene，代码行数:11，代码来源:luceneremoteservicetest.py

示例9: main

def main(wf):
    """Fetch the page given as first workflow argument and open its download link.

    The page is searched for a ``<ul id="dl-btn">`` block; the first ``<a>``
    inside that block is logged and its ``href`` is passed to the ``open``
    command. Nothing happens when the block, the link, or the link's
    ``href`` is missing.

    Raises whatever ``r.raise_for_status()`` raises for a failed request.
    """
    kw = wf.args[0]
    r = web.get(kw)
    r.raise_for_status()
    # flags are bit masks, so combine them with | rather than +
    reg = re.compile(r'<ul id="dl-btn">.*</ul>', flags=re.DOTALL | re.MULTILINE)
    match = reg.search(r.text)
    if not match:
        return

    node = HTML(match.group(0)).find('.//a')
    # find() returns None when the block contains no <a>; bail out instead
    # of failing with AttributeError on node.text
    if node is None:
        log.info('no link found inside the dl-btn list')
        return

    log.info(node.text)
    href = node.get('href')
    # an <a> without href would otherwise put None into the argv list
    if not href:
        log.info('download link has no href')
        return
    call(["open", href])

开发者ID:mtxs007，项目名称:Mac-App-Downloader-Alfred-Workflow，代码行数:11，代码来源:tr_download.py

示例10: parse_urls

 def parse_urls(self):
     """Return the de-duplicated absolute links of the root page that match the configured regex.

     The page at ``self.config.Root`` is fetched via ``self.get_content``;
     every ``<a href>`` is resolved against the root URL, duplicates are
     dropped, and only URLs for which ``self.config.Regex`` matches at the
     start are kept. An empty list is returned when no content is available.

     The result is always a ``list`` (order unspecified because of the
     set-based de-duplication).
     """
     content = self.get_content(self.config.Root)
     if not content:
         return []

     tree = HTML(content)
     url_list = tree.xpath(u"//a/@href")
     pattern = re.compile(self.config.Regex)
     # resolve relative links and drop duplicates in one pass
     unique_urls = set(urlparse.urljoin(self.config.Root, url) for url in url_list)
     # a list comprehension instead of filter(): both branches now return a
     # list on every Python version
     return [url for url in unique_urls if pattern.match(url)]

开发者ID:fdrong，项目名称:spider_news，代码行数:11，代码来源:base.py

示例11: get

 def get(self, url, depth=1):
     """Fetch ``url`` and queue every matching link at ``depth + 1``.

     The response is decoded as GB2312, all ``<a href>`` values matching
     ``self.url_loc`` are de-duplicated and put on ``self.data`` as
     ``(link, depth + 1)`` tuples. ``counter_processed`` is bumped for the
     current depth and ``counter`` once per queued link for the next depth.
     """
     counter_processed.update((depth, ))
     logging.info('[{}] Processing {} ({}).'.format(threading.current_thread().name, url, depth))

     response = self.session.get(url)
     # the encoding is forced rather than taken from the response headers
     response.encoding = 'GB2312'

     hrefs = HTML(response.text).xpath('//a/@href')
     matched = list({href for href in hrefs if re.search(self.url_loc, href)})

     next_depth = depth + 1
     for href in matched:
         self.data.put((href, next_depth))
     counter.update([next_depth] * len(matched))

开发者ID:Kxrr，项目名称:ThreadingSpider，代码行数:11，代码来源:spiders.py

示例12: doit

def doit(d):
    if '<div' in d:
        d = decode_first(d)
        
    for p in decode_payloads(d):
        urls = []
        if 'application/x-shockwave-flash' in p:
            t = 'flash'
            x=p.strip().splitlines()[-2].replace("'",'"').split('"')
            url_b=x[1].split('/')[1]
            sh =x[-2].decode('hex').strip("\x00")
            urls = re.findall('"(/'+url_b+'.*?)"',p)
            payload_url = re.findall('(http.*)',sh)[0]
            
        elif 'data:application/x-silverlight' in p:
            t = 'silverlight'
            x = HTML(re.findall('"(.*?)"',p)[0])
            for i in x.xpath('//param'):
                if i.attrib['name'] == 'source':
                    urls = [i.attrib['value']]
                elif i.attrib['name'] == 'initParams':
                    vals = dict(map(lambda x: tuple(x.split('=')),i.attrib['value'].split('&')))
                    sh   = vals['shell32'].decode('hex').strip("\x00")
                    payload_url = re.findall('(http.*)',sh)[0]
                    
        elif 'CollectGarbage' in p:
            t = 'ie'
            x= p.strip().splitlines()[-1].replace("'",'"').split('"')
            payload_url = x[1] + ' rc4 key: %s' % x[-2]
            sh = re.findall('"([0-9a-f]+)"\+',p,re.I)[0].decode('hex')            
        else:
            t = 'unknown'

        sh_hash = hashlib.sha256(sh).hexdigest()
        print '[+] found %s exploit' % t
        if urls:
            print '[+] additional exploits:', ', '.join(urls)
        print '[+] payload url:', payload_url
        print '[+] shellcode hash:',sh_hash

        if args.save:
            n = args.dir + '/exp.%s.%s.txt' % (t,hashlib.sha256(p).hexdigest())
            with open(n,'w') as f:
                f.write(p)
            print '[+] js saved to', n
            if sh:
                n = args.dir + '/exp.%s.%s.sh.bin' % (t,sh_hash)
                with open(n,'w') as f:
                    f.write(sh)
                print '[+] shellcode saved to', n

开发者ID:binjo，项目名称:ekdeco，代码行数:50，代码来源:landing.py

示例13: link_tag_url

def link_tag_url(html):
    '''
    Return the href of the document's "shortcut icon" link tag, e.g. for

        <link rel="shortcut icon" href="images-template/favicon.ico" type="image/x-icon" />

    the result is "images-template/favicon.ico". The value is returned as
    written in the document (it is not resolved against any base URL).
    None is returned when there is no such tag or its href is missing/empty.
    '''
    from lxml.etree import HTML

    icon_link = HTML(html).find('.//link[@rel="shortcut icon"]')
    if icon_link is None:
        return None
    # an absent or empty href is reported as None rather than ''
    return icon_link.get('href', '') or None

开发者ID:AlexUlrich，项目名称:digsby，代码行数:14，代码来源:favicons.py

示例14: html_to_table

def html_to_table(input_filename, encoding='utf-8'):
    """Convert an HTML statement export into a ``rows.Table``.

    The text of every ``<b>`` element directly under ``<body>`` is
    collected, starting after the header element whose text begins with
    ``Valores`` and ends with ``R$``. Elements starting with ``FATURA DE ``
    or matching ``REGEXP_PAGE`` are dropped, and the remaining values are
    grouped four at a time into (category, description, value, date) rows.
    Rows that fail conversion are reported on stdout and skipped.

    When the statement contains both ``DEZ`` and ``JAN`` entries, rows from
    ``NOV``/``DEZ`` are dated in the previous year.

    :param input_filename: path of the HTML file to read
    :param encoding: encoding used to decode the file (default ``utf-8``)
    :return: the populated ``rows.Table``
    """
    # Read raw bytes: the explicit decode below is then valid on both
    # Python 2 and Python 3 and no newline translation is applied first.
    with open(input_filename, 'rb') as fobj:
        html = fobj.read().decode(encoding).replace(u'\xa0', u' ')
    tree = HTML(html)

    data = tree.xpath('//body/b')
    # Default to "nothing after the header" so that an empty document or a
    # missing header yields an empty table instead of a NameError.
    first_row = len(data)
    for index, element in enumerate(data):
        text = element.text
        if text.startswith('Valores') and text.endswith('R$'):
            first_row = index + 1
            break
    new = []
    for element in data[first_row:]:
        text = element.text
        if text.startswith('FATURA DE '):
            continue
        elif REGEXP_PAGE.findall(text):
            continue
        else:
            new.append(element.text)
    data = new

    # incomplete trailing groups (fewer than 4 values) are discarded
    chunks = [[value.strip() for value in row]
              for row in partition(data, 4) if len(row) == 4]
    table = rows.Table(fields=FIELDS)
    current_year = datetime.datetime.now().year
    months = set(extract_month(row) for row in chunks)
    subtract_year = 'DEZ' in months and 'JAN' in months
    for row in chunks:
        try:
            category = convert_text(row[0])
            description = convert_text(row[1])
            value = convert_value(row[2])
        except Exception:
            # Best-effort parsing: skip rows that cannot be converted, but
            # do not swallow KeyboardInterrupt/SystemExit as a bare
            # ``except:`` would.
            print('WARNING: Ignoring row: {}'.format(row))
            continue
        year = current_year
        month = extract_month(row)
        if subtract_year and month in ('NOV', 'DEZ'):
            year = current_year - 1
        date = convert_date(row[3], year)
        table.append({'category': category,
                      'description': description,
                      'value': value,
                      'date': date, })

    return table

开发者ID:turicas，项目名称:nubank-to-csv，代码行数:46，代码来源:nubank.py

示例15: fake

def fake(base_url, username, password, game_id, time, score, game_name=None):
    """Load an arcade game page, wait ``time`` seconds, then submit ``score``.

    Logs in through ``_utils.login_and_enter_arcade`` and requests the play
    page for ``game_id``. When ``game_name`` is not supplied it is read from
    the ``gamename=`` entry of the Flash embed's ``flashvars``; the function
    prints a message and returns early if the embed or the name cannot be
    found. After sleeping, ``gscore`` and ``gname`` are POSTed to the
    new-score URL. Progress is reported on stdout; nothing is returned.
    """
    opener = _utils.login_and_enter_arcade(base_url, username, password)

    # endpoints derived from the base URL
    play_url = urljoin(base_url, "arcade.php?do=play&gameid={0}".format(game_id))
    submit_url = urljoin(base_url, "index.php?act=Arcade&do=newscore")

    # request the game page as a regular player would
    print("playing the game")
    game_page = HTML(opener.open(play_url).read())

    if game_name is None:
        # look the name up in the Flash embed on the game page
        embed = game_page.find(".//embed[@type='application/x-shockwave-flash']")
        if embed is None:
            print("didn't find the flash plugin on the game page :'-(")
            return

        prefix = "gamename="
        # no break: if the key occurs more than once, the last one wins
        for pair in embed.attrib['flashvars'].split("&"):
            if pair.startswith(prefix):
                game_name = pair[len(prefix):]

        if game_name is None:
            print("game name not found :'-(")
            return

    # wait the given time
    print("waiting")
    sleep(time)

    post_data = _utils.encode_post_data({
        "gscore": score,
        "gname": game_name
    })
    print("submitting fake score")
    # the response is drained but its content is not inspected
    opener.open(submit_url, data=post_data).read()

    print("done")

开发者ID:RavuAlHemio，项目名称:vbcbbot，代码行数:42，代码来源:fakescore.py

注：本文中的lxml.etree.HTML类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。