本文整理汇总了Python中lxml.html.fromstring方法的典型用法代码示例。如果您正苦于以下问题:Python html.fromstring方法的具体用法?Python html.fromstring怎么用?Python html.fromstring使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块lxml.html
的用法示例。
在下文中一共展示了html.fromstring方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: resolve_url
# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import fromstring [as 别名]
def resolve_url(self):
    """Locate the photo-of-the-day image and store its URL.

    Fetches the photo_of_the_day listing, follows the first article
    link and scans its <img> tags for a JPEG under /img/bx/iblock/.
    On success stores the absolute URL in ``self._url`` and returns
    True; returns False on any failure.
    """
    url = URL.format('photo_of_the_day/')
    try:
        r = requests.get(url)
        if r.status_code == 200:
            doc = fromstring(r.text)
            results = doc.cssselect('a.article__pic')
            url = URL.format(results[0].get('href'))
            r = requests.get(url, stream=True)
            if r.status_code == 200:
                doc = fromstring(r.text)
                for result in doc.cssselect('img'):
                    candidate = result.get('src')
                    if re.match(r'/img/bx/iblock/.*\.jpg$',
                                candidate.lower()):
                        # drop the leading '/' so URL.format() builds a
                        # well-formed absolute URL
                        self._url = URL.format(candidate[1:])
                        return True
    except Exception as e:
        # best-effort scraper: report the error and fall through
        print(e)
    return False
示例2: resolve_url
# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import fromstring [as 别名]
def resolve_url(self):
    """Resolve a wallhaven wallpaper URL into ``self._url``.

    Scrapes the wallpaper id from the first <figure> element of the
    page at URL and builds the full-size image URL from it.  Returns
    True on success, False on any failure (non-200 response, missing
    <figure>, or any raised error).
    """
    try:
        response = requests.get(URL)
        if response.status_code != 200:
            return False
        figures = fromstring(response.text).cssselect('figure')
        if not figures:
            return False
        wallpaper_id = figures[0].get('data-wallpaper-id')
        wallpaper_image = 'wallhaven-{0}.jpg'.format(wallpaper_id)
        # full-size images are sharded by the first two id characters
        self._url = 'https://w.wallhaven.cc/full/{}/{}'.format(
            wallpaper_id[0:2], wallpaper_image)
        return True
    except Exception:
        # scraping is best-effort; any error means "no wallpaper"
        return False
示例3: get_text_from_markdown
# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import fromstring [as 别名]
def get_text_from_markdown(markdown_text):
    """Render *markdown_text* to HTML and return its plain-text content.

    Blockquotes and autolinks whose visible text equals their href are
    stripped before extracting the text.

    :param markdown_text: markdown source string
    :returns: stripped plain text of the rendered document
    """
    renderer = HtmlRenderer()
    markdown = Markdown(renderer, extensions=('tables', 'autolink', 'strikethrough', 'quote', 'superscript', 'fenced-code'))
    html = markdown(markdown_text)
    parsed_html = fromstring(html)
    # remove quoted text (plain loop: the original comprehension was
    # used only for its side effects)
    for quote in parsed_html.xpath('//blockquote'):
        quote.getparent().remove(quote)
    # remove automatically added links
    for link in parsed_html.xpath('//a'):
        if link.text_content() == link.get('href'):
            link.getparent().remove(link)
    # text_content() already returns a str; joining it was a no-op
    return parsed_html.text_content().strip()
# https://stackoverflow.com/a/3155023
示例4: get_weekly_horoscope
# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import fromstring [as 别名]
def get_weekly_horoscope(sunsign):
    """Scrape ganeshaspeaks.com and return the weekly horoscope.

    :param sunsign: zodiac sign slug used in the URL (e.g. 'aries')
    :returns: dict with 'week', 'horoscope' and 'sunsign' keys
    """
    url = "http://www.ganeshaspeaks.com/horoscopes/weekly-horoscope/" + sunsign
    response = requests.get(url)
    tree = html.fromstring(response.content)
    week = str(tree.xpath(
        "//*[@id=\"daily\"]/div/div[1]/div[1]/div[2]/div/p/text()"))
    # strip the list-literal artifacts left by str() on the xpath result
    week = week.replace("']", "").replace("['", "")
    horoscope = str(tree.xpath(
        "//*[@id=\"daily\"]/div/div[1]/div[2]/p[1]/text()"))
    horoscope = horoscope.replace("\\n", "").replace(" ", "").replace("']", "").replace("['", "")
    # return a literal instead of binding it to a name that shadowed
    # the builtin `dict`
    return {
        'week': week,
        'horoscope': horoscope,
        'sunsign': sunsign
    }
示例5: get_monthly_horoscope
# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import fromstring [as 别名]
def get_monthly_horoscope(sunsign):
    """Scrape ganeshaspeaks.com and return the monthly horoscope.

    :param sunsign: zodiac sign slug used in the URL (e.g. 'aries')
    :returns: dict with 'month', 'horoscope' and 'sunsign' keys
    """
    url = "http://www.ganeshaspeaks.com/horoscopes/monthly-horoscope/" + sunsign
    response = requests.get(url)
    tree = html.fromstring(response.content)
    month = str(tree.xpath(
        "//*[@id=\"daily\"]/div/div[1]/div[1]/div[2]/div/p/text()"))
    # strip the list-literal artifacts left by str() on the xpath result
    month = month.replace("']", "").replace("['", "")
    horoscope = str(tree.xpath(
        "//*[@id=\"daily\"]/div/div[1]/div[2]/p[1]/text()[1]"))
    horoscope = horoscope.replace("\\n", "").replace(" ", "").replace("']", "").replace("['", "")
    # return a literal instead of binding it to a name that shadowed
    # the builtin `dict`
    return {
        'month': month,
        'horoscope': horoscope,
        'sunsign': sunsign
    }
示例6: get_yearly_horoscope
# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import fromstring [as 别名]
def get_yearly_horoscope(sunsign):
    """Scrape ganeshaspeaks.com and return the yearly horoscope.

    :param sunsign: zodiac sign slug used in the URL (e.g. 'aries')
    :returns: dict with 'year', 'horoscope' and 'sunsign' keys
    """
    url = "http://www.ganeshaspeaks.com/horoscopes/yearly-horoscope/" + sunsign
    response = requests.get(url)
    tree = html.fromstring(response.content)
    year = str(tree.xpath(
        "//*[@id=\"daily\"]/div/div[1]/div[1]/div[2]/div/p/text()"))
    # strip the list-literal artifacts left by str() on the xpath result
    year = year.replace("']", "").replace("['", "")
    horoscope = str(tree.xpath(
        "//*[@id=\"daily\"]/div/div[1]/div[2]/p[1]/text()"))
    horoscope = horoscope.replace("\\n", "").replace(" ", "").replace("']", "").replace("['", "")
    # return a literal instead of binding it to a name that shadowed
    # the builtin `dict`
    return {
        'year': year,
        'horoscope': horoscope,
        'sunsign': sunsign
    }
示例7: get_page_urls
# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import fromstring [as 别名]
def get_page_urls():
    """Crawl girl-atlas.com pagination and return all page URLs.

    Follows the 'next' link until no further page exists.

    :returns: list of page URLs, starting with the front page
    """
    start_url = 'http://girl-atlas.com/'
    response = get_response(start_url)
    page_urls = [start_url]
    while True:
        parsed_body = html.fromstring(response.text)
        # XPath: extract the relative URL of the next page
        next_url = parsed_body.xpath('//a[@class="btn-form next"]/@href')
        if not next_url:
            break
        next_url = start_url + next_url[0]
        page_urls.append(next_url)
        response = get_response(next_url)
    # parenthesized print works under both Python 2 and 3
    print("get_page_urls done!!!")
    return page_urls
# 获取每个girl专辑的Url
示例8: get_image_urls
# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import fromstring [as 别名]
def get_image_urls(girl_urls):
    """For each album URL, collect the album title and its image URLs.

    :param girl_urls: iterable of album page URLs
    :returns: list of one-entry dicts mapping album title -> image URL list
    """
    girl_list = []
    for url in girl_urls:
        response = get_response(url)
        parsed_body = html.fromstring(response.text)
        # album title
        girl_title = parsed_body.xpath('//title/text()')
        # lazy-loaded images carry the real URL in @delay, others in @src
        image_urls = parsed_body.xpath('//li[@class="slide "]/img/@src | //li[@class="slide "]/img/@delay')
        girl_list.append({girl_title[0]: image_urls})
    # parenthesized print works under both Python 2 and 3
    print("get_girl_urls done!!!")
    return girl_list
# 开始下载图片
示例9: get_page_urls
# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import fromstring [as 别名]
def get_page_urls():
    """Crawl girl-atlas.com pagination and return all page URLs.

    Follows the 'next' link until no further page exists.

    :returns: list of page URLs, starting with the front page
    """
    start_url = 'http://girl-atlas.com/'
    response = get_response(start_url)
    page_urls = [start_url]
    while True:
        parsed_body = html.fromstring(response.text)
        # relative URL of the next page, if any
        next_url = parsed_body.xpath('//a[@class="btn-form next"]/@href')
        if not next_url:
            break
        next_url = start_url + next_url[0]
        page_urls.append(next_url)
        response = get_response(next_url)
    # parenthesized print works under both Python 2 and 3
    print("get_page_urls done!!!")
    return page_urls
# 获取每个girl专辑的url
示例10: main
# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import fromstring [as 别名]
def main():
    """Fetch each listing URL, extract its embedded JSON and crawl links.

    For every URL in the module-level ``url_list``, pulls the JSON blob
    out of the page's javascript, builds item links from each entry's
    'nodeRef' and passes them to ``extract``.
    """
    for url in url_list:
        try:
            r = requests.get(url)
        except requests.RequestException:
            # skip unreachable pages instead of aborting the whole run
            # (was a bare except, which also swallowed KeyboardInterrupt)
            continue
        tree = html.fromstring(r.text)
        script = tree.xpath('//script[@language="javascript"]/text()')[0]
        json_string = regex.findall(script)[0]
        json_data = json.loads(json_string)
        links = [domain + x['nodeRef'] for x in json_data]
        for link in links:
            extract(link)
示例11: html_doc
# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import fromstring [as 别名]
def html_doc(self):
    """
    Lazily fetch and parse the report document for this game.

    The parsed document is cached on the instance; subsequent calls
    return the cached value. On a request error the error is stored in
    ``self.req_err`` and None is returned.

    :returns: the lxml processed html document
    :rtype: ``lxml.html.document_fromstring`` output
    :raises ValueError: if ``self.report_type`` is not a known report
    """
    if self.__lx_doc is None:
        cn = NHLCn()
        if hasattr(cn, self.report_type):
            # renamed from `html` to avoid shadowing the lxml.html
            # module name used elsewhere in this file
            report_html = getattr(cn, self.report_type)(self.game_key)
        else:
            raise ValueError('Invalid report type: %s' % self.report_type)
        if cn.req_err is None:
            self.__lx_doc = fromstring(report_html)
        else:
            self.req_err = cn.req_err
    return self.__lx_doc
示例12: addpositionstodict
# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import fromstring [as 别名]
def addpositionstodict(gendict):
    """Fill in Start/Stop positions for entries of *gendict* by scraping
    the CRISPR database at crispr.i2bc.paris-saclay.fr.

    Entries that already have a 'Start' key are skipped.

    :param gendict: dict keyed by '<accession>_<locus>' strings
    :returns: the same dict, with 'Start'/'Stop' added where found
    """
    print("Downloading position information from web...")
    for accidwithloc in tqdm(gendict):
        if 'Start' in gendict[accidwithloc]:
            continue
        # accession id = everything before the final '_<locus>' suffix
        accid = '_'.join(accidwithloc.split('_')[:-1])
        url = ('http://crispr.i2bc.paris-saclay.fr/crispr/crispr_db.php?'
               'checked%5B%5D={}'.format(accid))
        page = requests.get(url)
        htmltable = html.fromstring(page.content).xpath(
            "//table[normalize-space(@class)='primary_table']")[1]
        strtable = etree.tostring(htmltable)
        # converts to pandas df and then to numpy array then drop titles;
        # .values replaces .as_matrix(), which was removed in pandas 1.0
        arrtable = pandas.read_html(strtable)[0].values[2:]
        for row in arrtable:
            if row[0] in gendict:
                gendict[row[0]]['Start'] = row[2]
                gendict[row[0]]['Stop'] = row[3]
            else:
                if row[1] != 'questionable':
                    print("Can't find %s in local files" % row[0])
    return gendict
示例13: sns_notification
# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import fromstring [as 别名]
def sns_notification(body):
    """Parse an AWS SNS "Notification" payload carrying an SES email.

    :param body: raw request body (bytes) containing the SNS JSON
    :returns: (subject, from_mail, to_mail, hash_code, mail_content)
        for Notification messages; None for any other message type
    """
    js = json.loads(body.decode('utf8').replace('\n', ''))
    if js["Type"] != "Notification":
        return None
    arg_info = json.loads(js["Message"])
    content = arg_info['content']
    subject = arg_info['mail']['commonHeaders']['subject']
    # keep only the HTML part of the multipart mail body
    html_content = content.partition('Content-Type: text/html; charset=UTF-8')[2]
    if 'Content-Transfer-Encoding' in html_content:
        html_content = html_content.partition('Content-Transfer-Encoding: quoted-printable')[2]
    table = html.fromstring(html_content.replace('\r\n', ''))
    # concatenate the stripped text of every top-level element
    pieces = []
    for item in table:
        if item.text:
            pieces.append(item.text.strip())
    mail_content = str(''.join(pieces))
    from_mail = arg_info['mail']['source']
    to_mail = arg_info['mail']['destination'][0]
    hash_code = to_mail.split('@')[0]
    return subject, from_mail, to_mail, hash_code, mail_content
示例14: __token_info_fallback
# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import fromstring [as 别名]
def __token_info_fallback(self, token_address):
    """
    Get token info using ArthurStandardToken interface by scraping
    etherscan's readContract page.

    :param token_address: hex address of the token contract
    :return: dict with 'address', 'name', 'symbol' and 'decimals' keys
    """
    page = requests.get(
        'https://etherscan.io/readContract?v=0xb9469430eabcbfa77005cd3ad4276ce96bd221e3&a=' + token_address)
    tree = html.fromstring(page.content)

    def read_field(field):
        # every field sits in the same form-group structure; the xpath
        # differs only in the anchor text (was triplicated inline)
        return tree.xpath(
            '//a[contains(text(), "{}")]/../../following-sibling::div'
            '//div[@class="form-group"]/text()'.format(field))[0].strip()

    return {
        "address": token_address,
        "name": read_field("name"),
        "symbol": read_field("symbol"),
        "decimals": int(read_field("decimals"))
    }
示例15: get_cat_image_url
# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import fromstring [as 别名]
async def get_cat_image_url(timeout: float) -> str:
    """Fetch a random cat image URL from thecatapi.com.

    Retries on server disconnects, then verifies the image URL answers
    HTTP 200 within *timeout* seconds before returning it.

    The function uses ``await``/``async with`` and therefore must be a
    coroutine; the missing ``async`` keyword on the def was a
    SyntaxError.

    :param timeout: per-image-download timeout in seconds
    :returns: URL of a reachable jpg/png cat image
    :raises APIServerError: if the API responds with a non-200 status
    """
    api_url = 'http://thecatapi.com/api/images/get'
    async with aiohttp.ClientSession() as session:
        while True:
            try:
                async with session.get(
                    api_url, params={'format': 'xml', 'type': 'jpg,png'}
                ) as res:
                    if res.status != 200:
                        raise APIServerError
                    xml_result = await res.read()
                    tree = etree.fromstring(xml_result)
                    url = tree.find('data/images/image/url').text
            except aiohttp.client_exceptions.ServerDisconnectedError:
                # transient disconnect: back off briefly and retry
                await asyncio.sleep(0.1)
                continue
            try:
                async with async_timeout.timeout(timeout=timeout):
                    async with session.get(url) as res:
                        async with res:
                            if res.status == 200:
                                return url
            except (aiohttp.ClientConnectorError, asyncio.TimeoutError):
                # unreachable/slow image: pick another one
                continue