本文整理汇总了Python中lxml.etree.HTMLParser方法的典型用法代码示例。如果您正苦于以下问题:Python etree.HTMLParser方法的具体用法?Python etree.HTMLParser怎么用?Python etree.HTMLParser使用的例子?那么,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块lxml.etree的用法示例。
在下文中一共展示了etree.HTMLParser方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: resolve_url
# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import HTMLParser [as 别名]
def resolve_url(self):
    """Resolve today's photo URL and store it in ``self._url``.

    Two-step scrape: fetch the ``/potd`` index, follow the parent
    anchor of the first linked image, then pull the ``data-xlarge``
    attribute from the ``photo`` div on the linked page.

    Returns:
        bool: True when a URL was found; False otherwise.  All network
        and parsing errors are swallowed and reported as False.
    """
    page_url = URL.format('/potd')
    try:
        response = requests.get(page_url)
        if response.status_code != 200:
            return False
        lenient_parser = etree.HTMLParser(recover=True)
        tree = etree.HTML(response.content, lenient_parser)
        # Follow the first <img> wrapped in an anchor carrying an href.
        for img in tree.iter('img'):
            anchor = img.getparent()
            if 'href' in anchor.attrib:
                page_url = URL.format(anchor.attrib['href'])
                break
        response = requests.get(page_url)
        if response.status_code == 200:
            tree = etree.HTML(response.content, lenient_parser)
            for div in tree.iter('div'):
                if div.attrib.get('class') == 'photo':
                    if 'data-xlarge' in div.attrib:
                        self._url = div.attrib['data-xlarge']
                        return True
    except Exception:
        # Best-effort scrape: any failure simply means "no URL today".
        pass
    return False
示例2: resolve_url
# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import HTMLParser [as 别名]
def resolve_url(self):
    """Resolve the current APOD image URL into ``self._url``.

    Fetches the APOD front page and follows the anchor wrapping the
    first ``<img>`` element; the relative link is resolved against
    ``https://apod.nasa.gov/``.

    Returns:
        bool: True on success; False on any HTTP or parsing failure
        (errors are swallowed).
    """
    try:
        response = requests.get(URL)
        if response.status_code == 200:
            lenient_parser = etree.HTMLParser(recover=True)
            tree = etree.HTML(response.content, lenient_parser)
            # Only the very first image on the page matters.
            first_img = next(iter(tree.iter('img')), None)
            if first_img is not None:
                relative_link = first_img.getparent().attrib['href']
                self._url = 'https://apod.nasa.gov/' + relative_link
                return True
    except Exception:
        # Best-effort: treat any error as "no image available".
        pass
    return False
示例3: country_population
# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import HTMLParser [as 别名]
def country_population():
    """Yield ``{'name': ..., 'population': ...}`` dicts scraped from
    Wikipedia's list of countries and dependencies by population.

    Walks every ``wikitable`` on the live article, skipping rows that
    do not have at least four data cells.  Performs a network request
    on every call.
    """
    from lxml import etree
    from urllib.request import urlopen
    page = urlopen('https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population').read()
    tree = etree.fromstring(page, parser=etree.HTMLParser())
    for table in tree.findall('.//table'):
        if 'wikitable' not in table.attrib.get('class', ''):
            continue
        for row in table.findall('.//tr'):
            cells = row.findall('td')
            if len(cells) <= 3:
                # Header or malformed row — nothing to extract.
                continue
            yield dict(
                name=cells[1].find('.//a').attrib.get('title'),
                population=cells[2].text,
            )
示例4: __init__
# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import HTMLParser [as 别名]
def __init__(self, twitter_request_url, twitter_request_header,
             twitter_request_params=None, twitter_request_proxies=None, scrape_pages=2,
             twitter_file_path=None, twitter_file_format='csv'):
    """Configure request, paging and persistence settings for the scraper.

    Args:
        twitter_request_url: Endpoint the scraper polls.
        twitter_request_header: Header dict; stored only when not None
            (otherwise any existing default is kept).
        twitter_request_params: Optional query parameters.
        twitter_request_proxies: Optional proxy mapping.
        scrape_pages: Number of result pages to walk.
        twitter_file_path: Persistence target path, or None to disable.
        twitter_file_format: Persistence format (default ``'csv'``).
    """
    # Request configuration.
    self.__twitter_request_url__ = twitter_request_url
    if twitter_request_header is not None:
        self.__twitter_request_header__ = twitter_request_header
    self.__twitter_request_params__ = twitter_request_params
    self.__twitter_request_proxies__ = twitter_request_proxies
    # Paging / persistence configuration.
    self.scrape_pages = scrape_pages
    self.__twitter_tweet_persist_file_path__ = twitter_file_path
    self.__twitter_tweet_persist_file_format__ = twitter_file_format
    # Pre-compiled hashtag matcher (pattern declared on the class).
    self.hashtag_capture = re.compile(self._tweet_hastag_pattern_)
    # Strip blank text and comments up front so later XPath stays simple.
    self.html_parser = etree.HTMLParser(remove_blank_text=True,
                                        remove_comments=True)
    self.proxy_json = None
示例5: parse_page
# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import HTMLParser [as 别名]
def parse_page(url):
    """Fetch detail pages for a list of Dianping shop URLs.

    ``url`` is the listing page; it is currently only used to build the
    per-request headers via ``build_headers(url)``.

    NOTE(review): the live listing scrape is commented out below and
    replaced by a hard-coded ``hrefs`` list, and the loop ``break``s
    after writing the first response to ``test.html`` — this looks like
    debugging scaffolding left in place; confirm before relying on it.
    """
    # Original listing scrape (disabled):
    # headers = build_headers()
    #
    # result = requests.get(url,headers=headers).text
    #
    # parse = etree.HTMLParser(encoding='utf-8')
    # html = etree.HTML(result,parser=parse)
    #
    # hrefs = html.xpath(r'//div[@id="shop-all-list"]//div[@class="tit"]/a/@href')
    hrefs = ['http://www.dianping.com/shop/23093707', 'http://www.dianping.com/brands/b23093707', 'http://www.dianping.com/shop/2461336', 'http://www.dianping.com/shop/90085699', 'http://www.dianping.com/shop/13810171', 'http://www.dianping.com/brands/b13810171', 'http://www.dianping.com/shop/58322041', 'http://www.dianping.com/shop/80620237', 'http://www.dianping.com/shop/130946881', 'http://www.dianping.com/brands/b130946881', 'http://www.dianping.com/shop/32704021', 'http://www.dianping.com/brands/b18005322', 'http://www.dianping.com/shop/75141698', 'http://www.dianping.com/brands/b10008473', 'http://www.dianping.com/shop/92384680', 'http://www.dianping.com/shop/47008792', 'http://www.dianping.com/brands/b47008792', 'http://www.dianping.com/shop/67997136', 'http://www.dianping.com/brands/b4087801', 'http://www.dianping.com/shop/111533101', 'http://www.dianping.com/shop/98779037', 'http://www.dianping.com/shop/102025765', 'http://www.dianping.com/brands/b23093707']
    every_page_headers = build_headers(url)
    print(every_page_headers)
    for href in hrefs:
        # Persist the first shop page for offline inspection, then stop.
        result = requests.get(href,headers=every_page_headers).text
        with open('test.html','w',encoding='utf-8') as fp:
            fp.write(result)
        break
示例6: se
# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import HTMLParser [as 别名]
async def se(self, ctx, em:str):
    """Return a Steam emoticon as an uploaded image.

    Fix: the body uses ``await``, so the command must be declared with
    ``async def`` — a plain ``def`` containing ``await`` is a
    SyntaxError on Python 3 (the coroutine marker was likely lost with
    the stripped decorator).

    Args:
        ctx: Command invocation context (unused directly).
        em: Emoticon name, with or without the surrounding colons.
    """
    em = em.lower()
    desc = None
    if em == ':b1:' or em == 'b1':
        # Local fallback: :b1: ships with the bot's asset files.
        b = self.files_path('b1.png')
    else:
        url = "https://steamcommunity-a.akamaihd.net/economy/emoticonhover/{0}".format(em)
        txt = await self.get_text(url)
        if not txt:
            await self.bot.say(":warning: `Emoticon Not Found/Invalid`\nRemember to do :steam_emoticon: (optional ':').")
            return
        root = etree.fromstring(txt, etree.HTMLParser())
        base = root.find('.//img[@class="emoticon_large"]')
        # src is a data URI; skip the "data:image/png;base64," prefix (22 chars).
        b = BytesIO(base64.b64decode(base.attrib['src'][22:]))
        desc = '**{0}**'.format(root.find('.//div[@class="emoticon_hover_desc"]').text)
    await self.bot.upload(b, filename='steam.png', content=desc)
示例7: create_pdf_from_bookmark
# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import HTMLParser [as 别名]
def create_pdf_from_bookmark(bookmark):
    """Render a bookmark's article text to a PDF via wkhtmltopdf.

    The bookmark's HTML is prefixed with a small stylesheet, written to
    a temporary ``.html`` file, converted with the external
    ``wkhtmltopdf`` binary, and filed under
    ``PDF_DEST_FOLDER/<year>/<month>/<title>.pdf``.

    Args:
        bookmark: object exposing ``title``, ``time`` (unix timestamp)
            and ``get_text()`` returning ``{'data': <utf-8 bytes>}``.

    Returns:
        str: path of the generated PDF file.
    """
    logging.info('Processing %s', bookmark.title)
    # add some introductory HTML to the page (title, etc.)
    stylesheet_html = ('<head><style>body {font-family: Verdana;'
                       'font-size: 11pt;}</style></head>')
    txt = bookmark.get_text()['data']
    txt = txt.decode('utf-8')
    parser = etree.HTMLParser()
    tree = etree.fromstring(txt, parser)
    tree.insert(0, etree.XML(stylesheet_html))
    new_html = etree.tostring(tree)
    # create/manage the directory structure for the article
    date = datetime.datetime.fromtimestamp(bookmark.time)
    year_dir = str(date.year)
    month_dir = str(date.month)
    dest_dir = os.path.join(PDF_DEST_FOLDER, year_dir, month_dir)
    if not os.path.exists(dest_dir):
        os.makedirs(dest_dir)
    pdf_filename = os.path.join(dest_dir, '%s.pdf' % bookmark.title)
    # NamedTemporaryFile opens in binary mode by default, matching the
    # bytes returned by etree.tostring().
    tmp_file = tempfile.NamedTemporaryFile(delete=False)
    tmp_file.write(new_html)
    tmp_file.close()
    html_filename = '%s.html' % tmp_file.name
    os.rename(tmp_file.name, html_filename)
    # generate the PDF and cleanup
    pdf_cmd = ['wkhtmltopdf', html_filename, pdf_filename]
    proc = subprocess.Popen(
        pdf_cmd, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    # Bug fix: communicate() returns (stdout, stderr) — the old code
    # unpacked stderr into a variable named ``return_code``.  The real
    # exit status lives on proc.returncode after communicate().
    cmd_output, cmd_errors = proc.communicate()
    if proc.returncode != 0:
        logging.error('wkhtmltopdf exited with %s: %s',
                      proc.returncode, cmd_errors)
    os.unlink(html_filename)
    return pdf_filename
示例8: __init__
# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import HTMLParser [as 别名]
def __init__(self, html_data, is_file):
    """Parse the given HTML payload into an lxml tree and set up helpers.

    Args:
        html_data: A path/file object (when ``is_file`` is true) or a
            raw HTML string/bytes payload.
        is_file: Selects ``etree.parse`` vs. ``etree.fromstring``.
    """
    self.es_ops = ElasticSearchOperate()
    self.html_data = html_data
    self.new_line_non_break_pattern = re.compile(self.new_line_non_break_regex)
    # Drop whitespace-only text nodes and comments while parsing.
    cleaner = etree.HTMLParser(remove_blank_text=True, remove_comments=True)
    loader = etree.parse if is_file else etree.fromstring
    self.html_tree = loader(self.html_data, cleaner)
示例9: _extract_urls
# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import HTMLParser [as 别名]
def _extract_urls(self, content, base_url):
    '''
    Get the URLs out of a WAF index page

    Fix: ``except Exception, inst`` is Python 2-only syntax and is a
    SyntaxError on Python 3; the rest of this file uses Python 3
    idioms, so the handler is converted to ``except ... as ...``.
    NOTE(review): only the parsing prologue is visible here; the body
    that actually extracts URLs appears truncated by the extraction.
    '''
    try:
        parser = etree.HTMLParser()
        tree = etree.fromstring(content, parser=parser)
    except Exception as inst:
        msg = 'Couldn\'t parse content into a tree: %s: %s' \
              % (inst, content)
        raise Exception(msg)
示例10: __init__
# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import HTMLParser [as 别名]
def __init__(self, url, pageEncoding=ENCODING):
    """Fetch ``url`` and parse the response into an lxml tree.

    Args:
        url: Page to download via ``restClient``.
        pageEncoding: Character encoding used for both the fetch and
            the HTML parser (defaults to module-level ``ENCODING``).

    Raises:
        ValueError: when the HTTP fetch returned no body.
    """
    self.url = url
    self.encoding = pageEncoding
    self.html = restClient.get(self.url, self.encoding)
    if self.html is None:
        # Fail fast: nothing was fetched, so there is nothing to parse.
        self.valid = False
        raise ValueError('could not fetch data from: ""' + self.url + '""')
    self.valid = True
    self.encode = pageEncoding
    self.parser = etree.HTMLParser(encoding=self.encode)
    self.tree = etree.HTML(self.html, parser=self.parser)
示例11: _login
# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import HTMLParser [as 别名]
def _login(self):
    """Log in as admin, echoing back the page's CSRF ``user_token``.

    Asserts that the server redirects to ``index.php`` on success.
    """
    path = "/login.php"
    login_page = self._get(path).text
    doc = etree.fromstring(login_page, etree.HTMLParser())
    # The login form embeds a one-time CSRF token we must send back.
    token = doc.xpath("//input[@name='user_token']/@value")[0]
    response = self._post(path, {
        "username": "admin",
        "password": "password",
        "Login": "Login",
        "user_token": token,
    })
    assert response.headers["Location"] == "index.php"
示例12: _get_forecast_data
# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import HTMLParser [as 别名]
def _get_forecast_data(year, quarter, pageNo, dataArr):
    """Scrape one page of earnings-forecast rows and recurse onward.

    Appends the parsed table of ``pageNo`` to ``dataArr`` and follows
    the pager's last link recursively until no further page exists.

    Returns:
        The accumulated DataFrame, or None when an exception was
        swallowed (only its message is printed).
    """
    ct._write_console()
    try:
        gparser = etree.HTMLParser(encoding='GBK')
        html = lxml.html.parse(ct.FORECAST_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                                                ct.PAGES['fd'], year, quarter, pageNo,
                                                ct.PAGE_NUM[1]),
                               parser=gparser)
        rows = html.xpath("//table[@class=\"list_table\"]/tr")
        # tostring() yields bytes on Python 3; decode before joining.
        if ct.PY3:
            fragments = [etree.tostring(node).decode('utf-8') for node in rows]
        else:
            fragments = [etree.tostring(node) for node in rows]
        table_html = ''.join(fragments).replace('--', '0')
        table_html = '<table>%s</table>' % table_html
        df = pd.read_html(table_html)[0]
        df = df.drop([4, 5, 8], axis=1)
        df.columns = ct.FORECAST_COLS
        dataArr = dataArr.append(df, ignore_index=True)
        # Pager: the last anchor's onclick embeds the next page number.
        nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
        if len(nextPage) > 0:
            nextNo = re.findall(r'\d+', nextPage[0])[0]
            return _get_forecast_data(year, quarter, nextNo, dataArr)
        return dataArr
    except Exception as e:
        print(e)
示例13: parse_html
# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import HTMLParser [as 别名]
def parse_html(html):
    """Parse unicode ``html`` with the hardened defusedxml/lxml combo.

    :param html: Unicode content
    :returns: lxml element tree root
    """
    from defusedxml import lxml as dlxml
    from lxml import etree
    # lxml requires argument to be bytes
    # see https://github.com/kibitzr/kibitzr/issues/47
    return dlxml.fromstring(html.encode('utf-8'), parser=etree.HTMLParser())
示例14: test_file_path_correct
# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import HTMLParser [as 别名]
def test_file_path_correct(self):
    """
    Test that output is placed in the correct directory.

    Fix: the output file is now opened with ``with`` so the handle is
    closed even when parsing raises.
    """
    ocr = self.ocropus.ocr_ocropus.run((('test', 'segmentation.xml'),
                                        ('test', 'image_png.png')),
                                       model='ocropus')
    try:
        parser = etree.HTMLParser()
        with open(os.path.join(self.storage_path, *ocr)) as fp:
            etree.parse(fp, parser)
    except etree.XMLSyntaxError:
        self.fail(msg='The output was not valid html/xml!')
示例15: test_file_outpath_png
# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import HTMLParser [as 别名]
def test_file_outpath_png(self):
    """
    Test that ocropus creates hocr output for pngs.

    Fix: the output file is now opened with ``with`` so the handle is
    closed even when parsing raises.
    """
    ocr = self.ocropus.ocr_ocropus.run((('test', 'segmentation.xml'),
                                        ('test', 'image_png.png')),
                                       model='ocropus')
    try:
        parser = etree.HTMLParser()
        with open(os.path.join(self.storage_path, *ocr)) as fp:
            etree.parse(fp, parser)
    except etree.XMLSyntaxError:
        self.fail(msg='The output was not valid html/xml!')