本文整理汇总了 Python 中 lxml.etree.HTML 类的典型用法代码示例。如果您正苦于以下问题:Python HTML 类的具体用法?Python HTML 怎么用?Python HTML 使用的例子?那么恭喜您,这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了 HTML 类的 15 个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的 Python 代码示例。
示例1: edit_message
def edit_message(base_url, username, password, message_id, new_body):
    """Overwrite the body of an existing message via the site's web forms.

    An opener is obtained from ``_utils.login_and_go_to_faq``; the
    ``securitytoken`` hidden input is read from ``faq.php`` and then sent,
    together with the message id and the encoded new body, to ``misc.php``.

    :param base_url: URL against which ``faq.php`` and ``misc.php`` are resolved.
    :param username: forwarded to the login helper.
    :param password: forwarded to the login helper.
    :param message_id: value sent as the ``id`` form field.
    :param new_body: replacement text, passed through ``encode_outgoing_message``.
    """
    opener = _utils.login_and_go_to_faq(base_url, username, password)

    # The FAQ page (page with low backend complexity) is fetched only to
    # read the security token out of its form.
    print("fetching security token")
    faq_page = HTML(opener.open(urljoin(base_url, "faq.php")).read())
    token = faq_page.find(".//input[@name='securitytoken']").attrib["value"]

    # Build the form-encoded request body.
    template = "do=vsacb_editmessage&s=&securitytoken={0}&id={1}&vsacb_editmessage={2}"
    form_text = template.format(token, message_id, encode_outgoing_message(new_body))
    form_bytes = form_text.encode(server_encoding)

    print("updating message")
    # The response is read to completion and discarded.
    opener.open(urljoin(base_url, "misc.php"), data=form_bytes).read()
    print("done")
示例2: decode_first
def decode_first(d):
    """Return the hex-decoded payload hidden in the ``<div>`` text of *d*.

    The text nodes directly under every ``<div>`` are concatenated, filler
    tokens are removed, and the remainder is decoded with the ``hex`` codec.
    """
    document = HTML(d)
    hex_text = ''.join(document.xpath('//div/text()'))
    # Order matters: 'undefined' is removed last, after the single-character
    # fillers have been stripped.
    for filler in ('_', '&', '%', '=', 'undefined'):
        hex_text = hex_text.replace(filler, '')
    # str.decode('hex') is the Python 2 hex codec.
    return hex_text.decode('hex')
示例3: parse_xpath_content
def parse_xpath_content(self, url):
    """Fetch *url* and extract the fields configured in ``self.config.Xpath``.

    Each entry of ``self.config.Xpath`` is one of:

    * a falsy value -- the key is skipped;
    * a dict with ``op == 'cut'`` plus ``start`` / ``end`` markers -- the
      raw text between the first ``start`` marker and the next ``end``
      marker after it is taken ("" when either marker is missing);
    * anything else -- treated as an XPath string whose results are joined.

    :param url: page to fetch through ``self.get_content``.
    :returns: dict with ``url``, ``md5``, ``creat_at`` and one entry per
        extracted key; an empty dict when no content could be fetched.
    :raises KeyError: if no ``publish_time`` value was extracted, because
        that entry is post-processed unconditionally.
    """
    result = dict()
    content = self.get_content(url)
    if not content:
        return result

    result["url"] = url
    result["md5"] = self.md5(url)
    result["creat_at"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    tree = HTML(content)
    for key in self.config.Xpath.keys():
        rule = self.config.Xpath.get(key)
        if not rule:
            continue

        if isinstance(rule, dict):
            # Plain substring extraction between two markers.
            if rule['op'] == 'cut':
                start_marker = rule['start']
                marker_pos = content.find(start_marker)
                if marker_pos == -1:
                    result[key] = ""
                else:
                    begin = marker_pos + len(start_marker)
                    # Search for the end marker only *after* the start
                    # marker, and treat a missing end marker as "no value"
                    # rather than slicing with a bogus -1 offset.
                    stop = content.find(rule['end'], begin)
                    result[key] = content[begin:stop] if stop != -1 else ""
            continue

        # 'tbody/' steps are dropped from the expression before evaluation;
        # presumably the configured XPaths contain <tbody> steps that the
        # parsed tree does not have -- TODO confirm.
        nodes = tree.xpath(rule.replace('tbody/', ''))
        result[key] = "".join(nodes) if nodes else ""

    result['publish_time'] = self.parse_time(result['publish_time'])
    return result
示例4: testRemoteInfoCore
def testRemoteInfoCore(self):
    # Requests the core-info page for core 'main' and checks the two <ul>
    # lists it renders: the first holds the index field names, the second
    # the drilldown field names.
    _header, body = getRequest(port=self.httpPort, path='/remote/info/core', arguments=dict(name='main'), parse=False)
    self.assertFalse('Traceback' in body, body)  # only tested for MultiLucene situation for now!
    bodyLxml = HTML(body)
    lists = bodyLxml.xpath('//ul')

    # First list: field names, compared in the order they appear.
    fieldList = lists[0]
    fields = fieldList.xpath('li/a/text()')
    self.assertEqual(19, len(fields))
    self.assertEqual([
        '$facets',
        '__id__',
        '__key__.field',
        'copy',
        'field1',
        'field2',
        'field3',
        'field4',
        'field5',
        'field_missing',
        'intfield1',
        'intfield2',
        'intfield3',
        'intfield_missing',
        'sorted.field2',
        'sorted.field4',
        'sorted.intfield1',
        'sorted.intfield_missing',
        'untokenized.field3',
    ], fields)

    # Second list: drilldown fields, compared as a set so order is ignored.
    drilldownFieldList = lists[1]
    drilldownFields = drilldownFieldList.xpath('li/a/text()')
    self.assertEqual(
        {'untokenized.field2', 'untokenized.fieldHier', 'untokenized.field2.copy'},
        set(drilldownFields))
示例5: hijack
def hijack(content):
    """Return *content* re-serialized with an alert ``<script>`` appended to its body.

    The markup is parsed, a ``<script>`` element is added as the last child
    of the first ``<body>`` element, and the whole tree is serialized with
    ``tostring``.
    """
    document = HTML(content)
    injected = Element('script')
    injected.text = 'alert(/hijacked/);'
    # Only the first <body> found is modified.
    document.xpath('//body')[0].append(injected)
    return tostring(document)
示例6: save_download
def save_download(self, url, data, index):
page = HTML(data)
body = page.xpath('//body')[0]
bundles = elquery.get_elements_by_class(body, 'olpc-bundle')
bundle = bundles[index]
links = bundle.xpath('descendant-or-self::a[@href]')
for link in links:
href = urlparse.urljoin(url, link.attrib['href'])
print 'got one page:', href
self.store.save_page_set(href)
示例7: decode_first_js
def decode_first_js(data):
    """Yield each non-empty decoded payload from the id-bearing elements of *data*.

    The offsets returned by ``get_off`` (with a trailing 0 appended) are
    passed to ``decode_payload`` together with the text of every element
    that has an ``id`` attribute; empty results are skipped.
    """
    document = HTML(data)
    offsets = get_off(document)
    offsets.append(0)
    for node in document.xpath("//*[@id]"):
        # Elements without leading text carry nothing to decode.
        if not node.text:
            continue
        decoded = decode_payload(offsets, node.text)
        if decoded:
            yield decoded
示例8: testRemoteInfoCore
def testRemoteInfoCore(self):
    # Requests the core-info page for core 'main' and checks the two <ul>
    # lists it renders: the number of entries in the first, and the exact
    # (ordered) drilldown field names in the second.
    _header, body = getRequest(port=self.httpPort, path='/remote/info/core', arguments=dict(name='main'), parse=False)
    bodyLxml = HTML(body)
    lists = bodyLxml.xpath('//ul')

    fieldList = lists[0]
    fields = fieldList.xpath('li/a/text()')
    self.assertEqual(12, len(fields))

    drilldownFieldList = lists[1]
    drilldownFields = drilldownFieldList.xpath('li/a/text()')
    self.assertEqual(['untokenized.field2', 'untokenized.fieldHier'], drilldownFields)
示例9: main
def main(wf):
    """Fetch the URL given as first argument and open its download link.

    The page is searched for the ``<ul id="dl-btn">`` block; when present,
    the text of the first ``<a>`` inside it is logged and its ``href`` is
    handed to the ``open`` command. Nothing happens when the block is absent.
    """
    response = web.get(wf.args[0])
    response.raise_for_status()

    button_block = re.compile('<ul id="dl-btn">.*</ul>', flags=re.DOTALL | re.MULTILINE)
    found = button_block.search(response.text)
    if not found:
        return

    anchor = HTML(found.group(0)).find('.//a')
    log.info(anchor.text)
    call(["open", anchor.get('href')])
示例10: parse_urls
def parse_urls(self):
    """Return the distinct absolute links on the root page that match the configured regex.

    The page at ``self.config.Root`` is fetched, every ``<a href>`` value is
    resolved against that root, duplicates are removed, and only URLs for
    which ``self.config.Regex`` matches at the start are kept.

    :returns: a list (always a real list, never a lazy iterator) of URLs;
        empty when the root page yields no content. Order is unspecified
        because de-duplication goes through a set.
    """
    root = self.config.Root
    content = self.get_content(root)
    if not content:
        return []

    tree = HTML(content)
    pattern = re.compile(self.config.Regex)
    # De-duplicate after resolving, so different relative spellings of the
    # same target collapse into one entry.
    unique_urls = {urlparse.urljoin(root, href) for href in tree.xpath(u"//a/@href")}
    return [candidate for candidate in unique_urls if pattern.match(candidate)]
示例11: get
def get(self, url, depth=1):
    """Download *url*, queue every matching link one level deeper, and update counters.

    :param url: page to fetch with ``self.session``.
    :param depth: crawl depth of *url*; discovered links are queued with
        ``depth + 1``.
    """
    counter_processed.update((depth, ))
    logging.info('[{}] Processing {} ({}).'.format(threading.current_thread().name, url, depth))

    response = self.session.get(url)
    # The response text is decoded as GB2312 regardless of what was detected.
    response.encoding = 'GB2312'
    hrefs = HTML(response.text).xpath('//a/@href')

    # Keep only links matching self.url_loc, without duplicates.
    matching = list({href for href in hrefs if re.search(self.url_loc, href)})

    next_depth = depth + 1
    for link in matching:
        self.data.put((link, next_depth))
    counter.update([next_depth] * len(matching))
示例12: doit
def doit(d):
    """Classify every payload decoded from *d* and report what was found.

    For each payload yielded by ``decode_payloads`` the exploit type
    (flash / silverlight / ie / unknown), any additional exploit URLs, the
    payload URL and the SHA-256 of the shellcode are printed. When
    ``args.save`` is set, the payload and the shellcode are also written
    to files under ``args.dir``.

    Python 2 code: relies on ``print`` statements and ``str.decode('hex')``.
    """
    # Input that still contains a <div> is run through decode_first first.
    if '<div' in d:
        d = decode_first(d)
    for p in decode_payloads(d):
        urls = []
        if 'application/x-shockwave-flash' in p:
            t = 'flash'
            # Second-to-last line of the payload, quotes normalised to '"'
            # and then split on them.
            x=p.strip().splitlines()[-2].replace("'",'"').split('"')
            url_b=x[1].split('/')[1]
            # Hex-encoded shellcode; NUL bytes are stripped from both ends.
            sh =x[-2].decode('hex').strip("\x00")
            # Every quoted string in the payload that starts with /<url_b>.
            urls = re.findall('"(/'+url_b+'.*?)"',p)
            payload_url = re.findall('(http.*)',sh)[0]
        elif 'data:application/x-silverlight' in p:
            t = 'silverlight'
            # The first double-quoted string of the payload is parsed as HTML.
            x = HTML(re.findall('"(.*?)"',p)[0])
            for i in x.xpath('//param'):
                if i.attrib['name'] == 'source':
                    urls = [i.attrib['value']]
                elif i.attrib['name'] == 'initParams':
                    # initParams is a '&'-separated list of key=value pairs.
                    vals = dict(map(lambda x: tuple(x.split('=')),i.attrib['value'].split('&')))
                    sh = vals['shell32'].decode('hex').strip("\x00")
                    payload_url = re.findall('(http.*)',sh)[0]
            # NOTE(review): if no 'initParams' <param> is present, sh and
            # payload_url are not assigned in this branch -- confirm intent.
        elif 'CollectGarbage' in p:
            t = 'ie'
            # Last line of the payload, quotes normalised and split on '"'.
            x= p.strip().splitlines()[-1].replace("'",'"').split('"')
            payload_url = x[1] + ' rc4 key: %s' % x[-2]
            # First quoted hex string that is followed by a '+'.
            sh = re.findall('"([0-9a-f]+)"\+',p,re.I)[0].decode('hex')
        else:
            t = 'unknown'
            # NOTE(review): sh and payload_url are not assigned on this
            # branch; the lines below would use values left over from an
            # earlier payload or raise NameError -- confirm intent.
        sh_hash = hashlib.sha256(sh).hexdigest()
        print '[+] found %s exploit' % t
        if urls:
            print '[+] additional exploits:', ', '.join(urls)
        print '[+] payload url:', payload_url
        print '[+] shellcode hash:',sh_hash
        if args.save:
            # Payload file name: exploit type plus SHA-256 of the payload.
            n = args.dir + '/exp.%s.%s.txt' % (t,hashlib.sha256(p).hexdigest())
            with open(n,'w') as f:
                f.write(p)
            print '[+] js saved to', n
            if sh:
                # Shellcode file name: exploit type plus the shellcode hash.
                n = args.dir + '/exp.%s.%s.sh.bin' % (t,sh_hash)
                with open(n,'w') as f:
                    f.write(sh)
                print '[+] shellcode saved to', n
示例13: link_tag_url
def link_tag_url(html):
    """Return the ``href`` of the document's ``shortcut icon`` link tag.

    Looks for a tag such as
    ``<link rel="shortcut icon" href="images-template/favicon.ico" type="image/x-icon" />``
    and returns its ``href`` value exactly as written (it may be relative).
    Returns ``None`` when there is no such tag or its ``href`` is empty.
    """
    from lxml.etree import HTML

    icon_link = HTML(html).find('.//link[@rel="shortcut icon"]')
    if icon_link is None:
        return None
    href = icon_link.get('href', '')
    return href if href else None
def html_to_table(input_filename, encoding='utf-8'):
    """Convert an HTML statement export into a ``rows.Table``.

    The text of every ``<b>`` element directly under ``<body>`` is read.
    Everything up to and including the header cell (text starting with
    ``'Valores'`` and ending with ``'R$'``) is discarded, as are cells that
    start with ``'FATURA DE '`` or match ``REGEXP_PAGE``. The remaining
    cells are grouped four at a time into
    (category, description, value, date) rows.

    :param input_filename: path of the HTML file to read.
    :param encoding: encoding used to decode the file's bytes.
    :returns: a ``rows.Table`` built with ``FIELDS``; empty when the header
        cell (or any ``<b>`` element at all) is missing.
    """
    # Read bytes and decode explicitly so the same code works whether
    # ``read()`` on a text-mode file would return bytes or text.
    with open(input_filename, 'rb') as fobj:
        html = fobj.read().decode(encoding).replace('\xa0', ' ')
    tree = HTML(html)

    # An element whose first child is another tag has ``text is None``;
    # treat that as an empty cell instead of failing on ``.startswith``.
    texts = [element.text or '' for element in tree.xpath('//body/b')]

    # Locate the first cell after the header. If the header is absent (or
    # there are no cells at all) nothing is kept.
    first_data_cell = len(texts)
    for index, text in enumerate(texts):
        if text.startswith('Valores') and text.endswith('R$'):
            first_data_cell = index + 1
            break

    data = [text for text in texts[first_data_cell:]
            if not text.startswith('FATURA DE ')
            and not REGEXP_PAGE.findall(text)]

    # Incomplete trailing groups (fewer than 4 cells) are dropped.
    chunks = [[value.strip() for value in row]
              for row in partition(data, 4) if len(row) == 4]

    table = rows.Table(fields=FIELDS)
    current_year = datetime.datetime.now().year
    months = {extract_month(row) for row in chunks}
    # When both DEZ and JAN appear, the NOV/DEZ rows are dated in the
    # previous year.
    subtract_year = 'DEZ' in months and 'JAN' in months

    for row in chunks:
        try:
            category = convert_text(row[0])
            description = convert_text(row[1])
            value = convert_value(row[2])
        except Exception:
            # Best effort: a row that cannot be converted is reported and
            # skipped. KeyboardInterrupt / SystemExit still propagate.
            print('WARNING: Ignoring row: {}'.format(row))
            continue

        year = current_year
        month = extract_month(row)
        if subtract_year and month in ('NOV', 'DEZ'):
            year = current_year - 1
        date = convert_date(row[3], year)

        table.append({'category': category,
                      'description': description,
                      'value': value,
                      'date': date, })
    return table
示例15: fake
def fake(base_url, username, password, game_id, time, score, game_name=None):
    """Open a game page, wait, then submit *score* for that game.

    :param base_url: URL against which the arcade pages are resolved.
    :param username: forwarded to ``_utils.login_and_enter_arcade``.
    :param password: forwarded to ``_utils.login_and_enter_arcade``.
    :param game_id: value of the ``gameid`` query parameter of the play page.
    :param time: value passed to ``sleep`` before the score is submitted.
    :param score: value sent as ``gscore``.
    :param game_name: value sent as ``gname``; when ``None`` it is taken
        from the ``gamename=`` entry of the Flash embed's ``flashvars``.
    """
    opener = _utils.login_and_enter_arcade(base_url, username, password)

    play_url = urljoin(base_url, "arcade.php?do=play&gameid={0}".format(game_id))
    submit_url = urljoin(base_url, "index.php?act=Arcade&do=newscore")

    # Load the game page; it is also the source of the game's name.
    print("playing the game")
    game_page = HTML(opener.open(play_url).read())

    if game_name is None:
        embed = game_page.find(".//embed[@type='application/x-shockwave-flash']")
        if embed is None:
            print("didn't find the flash plugin on the game page :'-(")
            return

        prefix = "gamename="
        found_names = [
            pair[len(prefix):]
            for pair in embed.attrib['flashvars'].split("&")
            if pair.startswith(prefix)
        ]
        if not found_names:
            print("game name not found :'-(")
            return
        # If the variable occurs more than once, the last occurrence wins.
        game_name = found_names[-1]

    print("waiting")
    sleep(time)

    form = _utils.encode_post_data({
        "gscore": score,
        "gname": game_name
    })
    print("submitting fake score")
    # The response is read to completion and discarded.
    opener.open(submit_url, data=form).read()
    print("done")