本文整理汇总了Python中lxml.etree.HTML属性的典型用法代码示例。如果您正苦于以下问题:Python etree.HTML属性的具体用法?Python etree.HTML怎么用?Python etree.HTML使用的例子?那么恭喜您, 这里精选的属性代码示例或许可以为您提供帮助。您也可以进一步了解该属性所在模块lxml.etree
的用法示例。
在下文中一共展示了etree.HTML属性的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: fetch_data
# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import HTML [as 别名]
def fetch_data(url):
    """Download *url* through a proxy and parse its HTML table rows.

    Rows containing exactly four text nodes are treated as section-type
    headers; every other row becomes a data tuple (via ``make_row_tuple``)
    tagged with the most recently seen type.
    """
    response = requests.get(url, proxies=get_proxies())
    tree = etree.HTML(response.text)
    rows = []
    current_type = ''
    for row in tree.xpath('//tr')[1:]:
        cells = [node.text for node in row.iter()]
        if len(cells) == 4:
            # A 4-cell row announces the type for the data rows that follow.
            current_type = cells[2].strip(' ')
        else:
            rows.append(make_row_tuple(current_type, cells))
    return rows
示例2: get_title
# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import HTML [as 别名]
def get_title(url):
    """Fetch *url* and return '<url> | <title>' or '<url> | Status_code: N'.

    Only the first 10 kB of the body is decoded, using the encoding that
    chardet detects from the raw bytes (HTTP headers often lie).  Any
    network or parsing failure falls back to reporting the last status
    code seen (0 if the request itself failed).
    """
    code = 0
    try:
        r = req.get(url)
        code = r.status_code
        coding = chardet.detect(r.content).get('encoding')
        text = r.content[:10000].decode(coding)
        html = etree.HTML(text)
        title = html.xpath('//title/text()')
        if title:
            return url + ' | ' + title[0]
    except Exception:
        # Was a bare ``except:`` which also swallowed SystemExit and
        # KeyboardInterrupt; narrowed to Exception.
        pass
    return url + ' | Status_code: ' + str(code)
示例3: jsparse
# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import HTML [as 别名]
def jsparse(self, r):
    """Collect external-script URLs from response *r* into ``self.js``.

    Scripts whose src matches well-known libraries/analytics (jquery,
    bootstrap, adsbygoogle, ...) are skipped.  Relative paths are
    normalized and resolved against ``self.host``.
    """
    try:
        html = etree.HTML(r.text)
        result = html.xpath('//script/@src')
        for i in result:
            if not re.search(
                    r'jquery|bootstrap|adsbygoogle|angular|javascript|#|vue|react|51.la/=|map\.baidu\.com|canvas|cnzz\.com|slick\.js|autofill-event\.js|tld\.js|clipboard|Chart\.js',
                    i):
                if '://' not in i:
                    # Strip a leading "/" or "../" and anchor under our host.
                    i = re.sub(r'^/|^\.\./', '', i)
                    i = self.host + '/' + i
                self.js.append(i)
    except (AttributeError, ValueError):
        # De-duplicated: AttributeError was listed twice in the original tuple.
        pass
    except Exception as e:
        logging.exception(e)
示例4: resolve_url
# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import HTML [as 别名]
def resolve_url(self):
    """Resolve the picture-of-the-day image URL into ``self._url``.

    Scrapes the /potd page for the anchor wrapping an <img>, follows that
    link, then reads the 'data-xlarge' attribute of the 'photo' <div>.
    Returns True on success, False otherwise (best effort: every
    exception is swallowed).
    """
    page_url = URL.format('/potd')
    try:
        response = requests.get(page_url)
        if response.status_code == 200:
            parser = etree.HTMLParser(recover=True)
            tree = etree.HTML(response.content, parser)
            # The preview image links to the detail page via its parent anchor.
            for img in tree.iter('img'):
                parent = img.getparent()
                if 'href' in parent.attrib:
                    page_url = URL.format(parent.attrib['href'])
                    break
            if page_url is not None:
                response = requests.get(page_url)
                if response.status_code == 200:
                    tree = etree.HTML(response.content, parser)
                    for div in tree.iter('div'):
                        attrib = div.attrib
                        if attrib.get('class') == 'photo' and 'data-xlarge' in attrib:
                            self._url = attrib['data-xlarge']
                            return True
    except Exception:
        pass
    return False
示例5: resolve_url
# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import HTML [as 别名]
def resolve_url(self):
    """Find today's APOD image and store its absolute URL in ``self._url``.

    The first <img> on the page is wrapped in a link to the full-size
    image; that href is joined onto the APOD site root.  Returns True on
    success, False on any failure (best effort).
    """
    try:
        response = requests.get(URL)
        if response.status_code == 200:
            parser = etree.HTMLParser(recover=True)
            tree = etree.HTML(response.content, parser)
            images = tree.iter('img')
            if images is not None:
                images = list(images)
                if images:
                    href = images[0].getparent().attrib['href']
                    self._url = 'https://apod.nasa.gov/' + href
                    return True
    except Exception:
        pass
    return False
示例6: get_districts
# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import HTML [as 别名]
def get_districts(city):
    """Return the English district names for *city* (a ke.com subdomain).

    Scrapes https://<city>.ke.com/xiaoqu/ for the district filter links,
    records every English->Chinese name pair in the module-level
    ``chinese_city_district_dict``, and prints each mapping.
    """
    url = 'https://{0}.ke.com/xiaoqu/'.format(city)
    response = requests.get(url, timeout=10, headers=create_headers())
    tree = etree.HTML(response.content)
    anchors = tree.xpath("/html/body/div[3]/div[1]/dl[2]/dd/div/div/a")
    en_names = []
    ch_names = []
    for anchor in anchors:
        # The English name is the second-to-last path segment of the link URL.
        en_names.append(anchor.attrib['href'].split('/')[-2])
        ch_names.append(anchor.text)
    for en_name, ch_name in zip(en_names, ch_names):
        chinese_city_district_dict[en_name] = ch_name
        print(en_name + ' -> ' + ch_name)
    return en_names
示例7: rdoc
# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import HTML [as 别名]
def rdoc(num_elements=1000):
    """Randomly generate an invalid HTML document of *num_elements* pieces."""
    tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
    pieces = []
    for _ in range(num_elements):
        roll = random.randint(0, 3)
        if roll == 0:
            # Open a random tag (not necessarily ever closed -> invalid HTML).
            pieces.append("<%s>" % random.choice(tag_names))
        elif roll == 1:
            pieces.append(rsentence(random.randint(1, 4)))
        elif roll == 2:
            # Close a random tag (may not match anything currently open).
            pieces.append("</%s>" % random.choice(tag_names))
        # roll == 3 deliberately contributes nothing.
    return "<html>" + "\n".join(pieces) + "</html>"
示例8: benchmark_parsers
# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import HTML [as 别名]
def benchmark_parsers(num_elements=100000):
    """Very basic head-to-head performance benchmark.

    Builds one large invalid document with ``rdoc()`` and times how long
    Beautiful Soup takes to parse it with each candidate parser,
    printing a line per parser.
    """
    # Converted from Python-2 syntax ("print x", "except Exception, e"),
    # which is a SyntaxError under Python 3.
    print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
    data = rdoc(num_elements)
    print("Generated a large invalid HTML document (%d bytes)." % len(data))
    for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
        success = False
        try:
            a = time.time()
            soup = BeautifulSoup(data, parser)
            b = time.time()
            success = True
        except Exception:
            # The bound exception was unused; the traceback is printed instead.
            print("%s could not parse the markup." % parser)
            traceback.print_exc()
        if success:
            print("BS4+%s parsed the markup in %.2fs." % (parser, b - a))
示例9: basicinfo_spider
# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import HTML [as 别名]
def basicinfo_spider(self, url):
    """Parse a disease basic-information page into a dict.

    Returns a dict with the breadcrumb 'category' list, the disease
    'name' (page title with its '的简介' suffix stripped), the 'desc'
    text nodes, and the whitespace-stripped 'attributes' paragraphs.
    """
    selector = etree.HTML(self.get_html(url))
    title = selector.xpath('//title/text()')[0]
    category = selector.xpath('//div[@class="wrap mt10 nav-bar"]/a/text()')
    desc = selector.xpath('//div[@class="jib-articl-con jib-lh-articl"]/p/text()')
    infobox = []
    for p in selector.xpath('//div[@class="mt20 articl-know"]/p'):
        # Flatten the paragraph to plain text and strip whitespace noise.
        info = p.xpath('string(.)') \
            .replace('\r', '') \
            .replace('\n', '') \
            .replace('\xa0', '') \
            .replace(' ', '') \
            .replace('\t', '')
        infobox.append(info)
    return {
        'category': category,
        'name': title.split('的简介')[0],
        'desc': desc,
        'attributes': infobox,
    }
示例10: symptom_spider
# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import HTML [as 别名]
def symptom_spider(self, url):
    """Parse a symptom page.

    Returns a tuple ``(symptoms, detail)``: the list of linked symptom
    names and the whitespace-stripped text of every paragraph on the page.
    """
    selector = etree.HTML(self.get_html(url))
    symptoms = selector.xpath('//a[@class="gre" ]/text()')
    detail = []
    for p in selector.xpath('//p'):
        info = p.xpath('string(.)') \
            .replace('\r', '') \
            .replace('\n', '') \
            .replace('\xa0', '') \
            .replace(' ', '') \
            .replace('\t', '')
        detail.append(info)
    # NOTE: the original also built a ``symptoms_data`` dict that was never
    # used or returned (callers receive the tuple below); that dead code
    # has been removed.
    return symptoms, detail
示例11: common_spider
# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import HTML [as 别名]
def common_spider(self, url):
    """Generic page parser: join the stripped text of every paragraph."""
    selector = etree.HTML(self.get_html(url))
    paragraphs = []
    for p in selector.xpath('//p'):
        # Flatten to plain text and strip whitespace noise; skip empties.
        info = p.xpath('string(.)') \
            .replace('\r', '') \
            .replace('\n', '') \
            .replace('\xa0', '') \
            .replace(' ', '') \
            .replace('\t', '')
        if info:
            paragraphs.append(info)
    return '\n'.join(paragraphs)
示例12: showbrowser
# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import HTML [as 别名]
def showbrowser(self):
    """
    Show this response in a browser window (for debugging purposes,
    when it's hard to read the HTML).
    """
    import os
    import webbrowser
    import tempfile
    # mkstemp creates a persistent file.  The original used
    # NamedTemporaryFile, closed it (which deletes the file) and then
    # re-opened the same name -- a small race window in which another
    # process could claim the path.
    fd, name = tempfile.mkstemp(prefix='webtest-page', suffix='.html')
    f = os.fdopen(fd, 'w')
    if PY3:
        f.write(self.body.decode(self.charset or 'ascii', 'replace'))
    else:
        f.write(self.body)
    f.close()
    if name[0] != '/':  # pragma: no cover
        # windows ...
        url = 'file:///' + name
    else:
        url = 'file://' + name
    webbrowser.open_new(url)
示例13: getNeedInfo
# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import HTML [as 别名]
def getNeedInfo(sourceHtml):
    """Extract the SS/SSR request URLs from the given page HTML.

    Scans table rows 5-8 of the page body, printing and collecting every
    anchor href found in their cells.
    """
    selector = etree.HTML(sourceHtml)
    urls = []
    for row in range(5, 9):
        row_xpath = (
            '/html/body/section/div[3]/div/div[1]/table/tbody/tr['
            + str(row) +
            ']/td/a/@href')
        for href in selector.xpath(row_xpath):
            print(href)
            urls.append(href)
    return urls
示例14: _parse_company_detail
# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import HTML [as 别名]
def _parse_company_detail(self, detail_url):
    """Fetch and parse a company detail page.

    If the expected company name is missing (an anti-crawler error page
    was served), waits 30 s and retries.  Returns a dict with the
    cleaned 'details' text and the company 'website' URL.
    """
    while True:
        resp = self._request('get', detail_url)
        resp.encoding = resp.apparent_encoding
        html = etree.HTML(resp.text)
        name = html.xpath('//div[@class="company_main"]/h1/a/text()')
        if name:
            break
        # The original recursed here on every error page, which could
        # eventually exhaust the recursion limit; loop instead (same
        # retry-forever semantics).
        self.logger.debug('请求到错误页面')
        time.sleep(30)
    # The writer requires exactly these keys; keep them unchanged.
    supply = {
        'details': unescape(str(etree.tostring(html.xpath(
            '//span[@class="company_content"]')[0]), encoding='utf8')).replace(
            '<span class="company_content">', '').replace('\n', '').replace('\xa0', ''),
        'website': html.xpath('//div[@class="company_main"]/a[1]/@href')[0].split('?')[0],
    }
    return supply
示例15: get_content
# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import HTML [as 别名]
def get_content(session, urls):
    """Collect changelog text from *urls* into one string.

    Each successfully fetched page contributes a "# <version>" heading
    (the trailing dash-separated segment of its URL) followed by the
    article text with its first two lines and final line removed.  Pages
    without the expected article element are skipped.
    """
    log = ""
    for url in urls:
        response = session.get(url)
        if response.status_code != 200:
            continue
        tree = etree.HTML(response.content)
        try:
            article = tree.xpath("//article/div[@class='content']")[0]
        except IndexError:
            continue
        text = etree.tostring(article, method="text", encoding='utf-8')
        if sys.version_info > (3, 0):
            text = text.decode("utf-8")
        # remove first two lines (and the trailing one)
        text = '\n'.join(text.split('\n')[2:-1])
        log += "# {version}\n{content}\n\n".format(
            version=url.split("-")[-1],
            content=text,
        )
    return log