

Python html.fromstring Method Code Examples

This article collects typical usage examples of the html.fromstring method from Python's lxml.html module. If you have been wondering how html.fromstring is used in practice, or are looking for concrete examples of it, the curated code samples below should help. You can also explore further usage examples from the lxml.html module.


The following presents 15 code examples of the html.fromstring method, sorted by popularity by default.
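Before working through the examples, here is a minimal, self-contained sketch of the pattern most of them share: fetch a page, parse the response with html.fromstring, and query the resulting element tree with XPath or CSS selectors. The URL below is only a placeholder, and the CSS-selector call assumes the optional cssselect package is installed.

import requests
from lxml import html

# Placeholder URL for illustration only; substitute the page you actually want to parse.
url = 'https://example.com/'
response = requests.get(url)

# fromstring() accepts str or bytes and returns the root HtmlElement of the parsed document.
doc = html.fromstring(response.content)

# Query the tree with XPath...
titles = doc.xpath('//title/text()')

# ...or with CSS selectors (requires the optional cssselect package).
links = doc.cssselect('a')

print(titles)
print([link.get('href') for link in links])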

Example 1: resolve_url

# Required import: from lxml import html [as alias]
# Alternatively: from lxml.html import fromstring [as alias]
def resolve_url(self):
        url = URL.format('photo_of_the_day/')
        try:
            r = requests.get(url)
            if r.status_code == 200:
                doc = fromstring(r.text)
                results = doc.cssselect('a.article__pic')
                url = URL.format(results[0].get('href'))
                r = requests.get(url, stream=True)
                if r.status_code == 200:
                    doc = fromstring(r.text)
                    results = doc.cssselect('img')
                    for index, result in enumerate(results):
                        posible = result.get('src')
                        if re.match(r'/img/bx/iblock/.*\.jpg$',
                                    posible.lower()):
                            self._url = URL.format(posible[1:])
                            return True
        except Exception as e:
            print(e)
        return False 
Developer: atareao, Project: daily-wallpaper, Lines of code: 24, Source file: vokrugsveta.py

Example 2: resolve_url

# Required import: from lxml import html [as alias]
# Alternatively: from lxml.html import fromstring [as alias]
def resolve_url(self):
        try:
            r = requests.get(URL)
            if r.status_code == 200:
                doc = fromstring(r.text)
                results = doc.cssselect('figure')
                if results:
                    wallpaper_id = results[0].get('data-wallpaper-id')
                    wallpaper_image = 'wallhaven-{0}.jpg'.format(
                        wallpaper_id)
                    self._url = 'https://w.wallhaven.cc/full/{}/{}'.format(
                        wallpaper_id[0:2], wallpaper_image)
                    return True
        except Exception:
            pass
        return False 
Developer: atareao, Project: daily-wallpaper, Lines of code: 18, Source file: wallhaven.py

Example 3: get_text_from_markdown

# Required import: from lxml import html [as alias]
# Alternatively: from lxml.html import fromstring [as alias]
def get_text_from_markdown(markdown_text):
	renderer = HtmlRenderer()
	markdown = Markdown(renderer, extensions=('tables', 'autolink', 'strikethrough', 'quote', 'superscript', 'fenced-code'))
	html = markdown(markdown_text)
	parsed_html = fromstring(html)
	
	# remove quoted text
	[x.getparent().remove(x) for x in parsed_html.xpath('//blockquote')]
	
	# remove automatically added links 
	for link in parsed_html.xpath('//a'):
		if link.text_content() == link.get('href'):			 
			link.getparent().remove(link)
	
	text = ''.join(parsed_html.text_content()).strip()
	return text

# https://stackoverflow.com/a/3155023 
Developer: crisbal, Project: PlayStoreLinks_Bot, Lines of code: 20, Source file: utils.py

Example 4: get_weekly_horoscope

# Required import: from lxml import html [as alias]
# Alternatively: from lxml.html import fromstring [as alias]
def get_weekly_horoscope(sunsign):
        url = "http://www.ganeshaspeaks.com/horoscopes/weekly-horoscope/" + sunsign
        response = requests.get(url)
        tree = html.fromstring(response.content)
        week = str(tree.xpath(
            "//*[@id=\"daily\"]/div/div[1]/div[1]/div[2]/div/p/text()"))
        week = week.replace("']", "").replace("['", "")
        horoscope = str(tree.xpath(
            "//*[@id=\"daily\"]/div/div[1]/div[2]/p[1]/text()"))
        horoscope = horoscope.replace("\\n", "").replace("  ", "").replace("']", "").replace("['", "")
        dict = {
            'week': week,
            'horoscope': horoscope,
            'sunsign': sunsign
        }

        return dict 
Developer: tapaswenipathak, Project: Horoscope-API, Lines of code: 19, Source file: pyhoroscope.py

Example 5: get_monthly_horoscope

# Required import: from lxml import html [as alias]
# Alternatively: from lxml.html import fromstring [as alias]
def get_monthly_horoscope(sunsign):
        url = "http://www.ganeshaspeaks.com/horoscopes/monthly-horoscope/" + sunsign
        response = requests.get(url)
        tree = html.fromstring(response.content)
        month = str(tree.xpath(
            "//*[@id=\"daily\"]/div/div[1]/div[1]/div[2]/div/p/text()"))
        month = month.replace("']", "").replace("['", "")
        horoscope = str(tree.xpath(
            "//*[@id=\"daily\"]/div/div[1]/div[2]/p[1]/text()[1]"))
        horoscope = horoscope.replace("\\n", "").replace("  ", "").replace("']", "").replace("['", "")
        dict = {
            'month': month,
            'horoscope': horoscope,
            'sunsign': sunsign
        }

        return dict 
Developer: tapaswenipathak, Project: Horoscope-API, Lines of code: 19, Source file: pyhoroscope.py

Example 6: get_yearly_horoscope

# Required import: from lxml import html [as alias]
# Alternatively: from lxml.html import fromstring [as alias]
def get_yearly_horoscope(sunsign):
        url = "http://www.ganeshaspeaks.com/horoscopes/yearly-horoscope/" + sunsign
        response = requests.get(url)
        tree = html.fromstring(response.content)
        year = str(tree.xpath(
            "//*[@id=\"daily\"]/div/div[1]/div[1]/div[2]/div/p/text()"))
        year = year.replace("']", "").replace("['", "")
        horoscope = str(tree.xpath(
            "//*[@id=\"daily\"]/div/div[1]/div[2]/p[1]/text()"))
        horoscope = horoscope.replace("\\n", "").replace("  ", "").replace("']", "").replace("['", "")
        dict = {
            'year': year,
            'horoscope': horoscope,
            'sunsign': sunsign
        }

        return dict 
Developer: tapaswenipathak, Project: Horoscope-API, Lines of code: 19, Source file: pyhoroscope.py

Example 7: get_page_urls

# Required import: from lxml import html [as alias]
# Alternatively: from lxml.html import fromstring [as alias]
def get_page_urls():

    start_url = 'http://girl-atlas.com/'
    response = get_response(start_url)
    page_urls = []

    page_urls.append(start_url)
    while True:
        parsed_body = html.fromstring(response.text)
        # Use XPath to extract the URL of the next page
        next_url = parsed_body.xpath('//a[@class="btn-form next"]/@href')

        if not next_url:
            break

        next_url = start_url + next_url[0]
        page_urls.append(next_url)
        response = get_response(next_url)

    print "get_page_urls done!!!"

    return page_urls

# Get the URL of each girl album
Developer: pein0119, Project: girl-atlas-crawler, Lines of code: 26, Source file: get_image.py

Example 8: get_image_urls

# Required import: from lxml import html [as alias]
# Alternatively: from lxml.html import fromstring [as alias]
def get_image_urls(girl_urls):

    girl_list = []
    
    for url in girl_urls:
        # print "in get_image_urls" + url[0]
        response = get_response(url)
        parsed_body = html.fromstring(response.text)

        # Album title
        girl_title  = parsed_body.xpath('//title/text()')
        image_urls = parsed_body.xpath('//li[@class="slide "]/img/@src | //li[@class="slide "]/img/@delay')

        girl_dict = {girl_title[0] : image_urls}
        girl_list.append(girl_dict)
        
    print "get_girl_urls done!!!"
    return girl_list

# Start downloading images
Developer: pein0119, Project: girl-atlas-crawler, Lines of code: 22, Source file: get_image.py

Example 9: get_page_urls

# Required import: from lxml import html [as alias]
# Alternatively: from lxml.html import fromstring [as alias]
def get_page_urls():

    start_url = 'http://girl-atlas.com/'
    response = get_response(start_url)
    page_urls = []

    page_urls.append(start_url)
    while True:
        parsed_body = html.fromstring(response.text)
        next_url = parsed_body.xpath('//a[@class="btn-form next"]/@href')

        if not next_url:
            break

        next_url = start_url + next_url[0]
        page_urls.append(next_url)
        response = get_response(next_url)

    print "get_page_urls done!!!"

    return page_urls

# Get the URL of each girl album
Developer: pein0119, Project: girl-atlas-crawler, Lines of code: 25, Source file: get_image_gevent.py

Example 10: main

# Required import: from lxml import html [as alias]
# Alternatively: from lxml.html import fromstring [as alias]
def main():
    for url in url_list :
        try:
            r = requests.get(url)
        except : continue
        tree = html.fromstring(r.text)

        script = tree.xpath('//script[@language="javascript"]/text()')[0]

        json_string = regex.findall(script)[0]
        json_data = json.loads(json_string)

        next_page_url = tree.xpath('//footer/a/@href')

        links = [domain + x['nodeRef'] for x in json_data]
        for link in links:
            extract(link) 
Developer: bendidi, Project: X-ray-classification, Lines of code: 19, Source file: get_data.py

Example 11: html_doc

# Required import: from lxml import html [as alias]
# Alternatively: from lxml.html import fromstring [as alias]
def html_doc(self):
        """
        :returns: the lxml processed html document
        :rtype: ``lxml.html.document_fromstring`` output
        """
        
        if self.__lx_doc is None:
            cn = NHLCn()
          
            if hasattr(cn, self.report_type):
                html = getattr(cn, self.report_type)(self.game_key)
            else:
                raise ValueError('Invalid report type: %s' % self.report_type)
          
            if cn.req_err is None:
                self.__lx_doc = fromstring(html)
            else:
                self.req_err = cn.req_err
            
        return self.__lx_doc 
Developer: robhowley, Project: nhlscrapi, Lines of code: 22, Source file: reportloader.py

Example 12: addpositionstodict

# Required import: from lxml import html [as alias]
# Alternatively: from lxml.html import fromstring [as alias]
def addpositionstodict(gendict):
    print("Downloading position information from web...")
    for accidwithloc in tqdm(gendict):
        if 'Start' in gendict[accidwithloc]:
            continue
        accid = '_'.join(accidwithloc.split('_')[:-1])
        url = ('http://crispr.i2bc.paris-saclay.fr/crispr/crispr_db.php?'
               'checked%5B%5D={}'.format(accid))
        page = requests.get(url)
        htmltable = html.fromstring(page.content).xpath(
            "//table[normalize-space(@class)='primary_table']")[1]
        strtable = etree.tostring(htmltable)
        # converts to pandas df and then to numpy array then drop titles
        arrtable = pandas.read_html(strtable)[0].as_matrix()[2:]
        for row in arrtable:
            if row[0] in gendict:
                gendict[row[0]]['Start'] = row[2]
                gendict[row[0]]['Stop'] = row[3]
            else:
                if row[1] != 'questionable':
                    print("Can't find %s in local files" % row[0])
    return gendict 
Developer: phageParser, Project: phageParser, Lines of code: 24, Source file: populate.py

Example 13: sns_notification

# Required import: from lxml import html [as alias]
# Alternatively: from lxml.html import fromstring [as alias]
def sns_notification(body):
    json_body = body.decode('utf8')
    js = json.loads(json_body.replace('\n', ''))
    if js["Type"] == "Notification":
        arg_info = js["Message"]
        arg_info = json.loads(arg_info)
        content = arg_info['content']
        subject = arg_info['mail']['commonHeaders']['subject']
        html_content = content.partition('Content-Type: text/html; charset=UTF-8')[2]
        if 'Content-Transfer-Encoding' in html_content:
            html_content = html_content.partition('Content-Transfer-Encoding: quoted-printable')[2]
        text = html_content.replace('\r\n', '')
        table = html.fromstring(text)
        content = ''
        for item in table:
            if item.text:
                content += item.text.strip()
        mail_content = str(content)
        from_mail = arg_info['mail']['source']
        to_mail = arg_info['mail']['destination'][0]
        hash_code = arg_info['mail']['destination'][0].split('@')[0]
        return subject, from_mail, to_mail, hash_code, mail_content 
Developer: MicroPyramid, Project: django-email-gateway, Lines of code: 24, Source file: receiving_mail.py

Example 14: __token_info_fallback

# Required import: from lxml import html [as alias]
# Alternatively: from lxml.html import fromstring [as alias]
def __token_info_fallback(self, token_address):
        """
        Get token info using ArthurStandardToken interface
        :param token_address:
        :return:
        """
        page = requests.get(
            'https://etherscan.io/readContract?v=0xb9469430eabcbfa77005cd3ad4276ce96bd221e3&a=' + token_address)
        tree = html.fromstring(page.content)
        return {
            "address": token_address,
            "name": tree.xpath(
                '//a[contains(text(), "name")]/../../following-sibling::div//div[@class="form-group"]/text()')[
                0].strip(),
            "symbol": tree.xpath(
                '//a[contains(text(), "symbol")]/../../following-sibling::div//div[@class="form-group"]/text()')[
                0].strip(),
            "decimals": int(tree.xpath(
                '//a[contains(text(), "decimals")]/../../following-sibling::div//div[@class="form-group"]/text()')[
                0].strip())
        } 
Developer: gnosis, Project: safe-relay-service, Lines of code: 23, Source file: token_repository.py

Example 15: get_cat_image_url

# Required import: from lxml import html [as alias]
# Alternatively: from lxml.html import fromstring [as alias]
def get_cat_image_url(timeout: float) -> str:
    api_url = 'http://thecatapi.com/api/images/get'
    async with aiohttp.ClientSession() as session:
        while True:
            try:
                async with session.get(
                    api_url, params={'format': 'xml', 'type': 'jpg,png'}
                ) as res:
                    if res.status != 200:
                        raise APIServerError
                    xml_result = await res.read()
                    tree = etree.fromstring(xml_result)
                    url = tree.find('data/images/image/url').text
            except aiohttp.client_exceptions.ServerDisconnectedError:
                await asyncio.sleep(0.1)
                continue
            try:
                async with async_timeout.timeout(timeout=timeout):
                    async with session.get(url) as res:
                        async with res:
                            if res.status == 200:
                                return url
            except (aiohttp.ClientConnectorError, asyncio.TimeoutError):
                continue 
Developer: item4, Project: yui, Lines of code: 26, Source file: animal.py
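
Note that Example 15 parses an XML API response with etree.fromstring rather than html.fromstring. For contrast, here is a small sketch of the practical difference between the two parsers: etree.fromstring expects well-formed XML, while html.fromstring tolerates the kind of broken markup commonly found on real web pages. The input string below is made up for illustration.

from lxml import etree, html

broken = '<p>unclosed paragraph <b>bold'

# html.fromstring() repairs the markup and builds a usable element tree.
doc = html.fromstring(broken)
print(doc.text_content())  # -> 'unclosed paragraph bold'

# etree.fromstring() requires well-formed XML and raises on the same input.
try:
    etree.fromstring(broken)
except etree.XMLSyntaxError as exc:
    print('XML parse failed:', exc)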


Note: The lxml.html.fromstring examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets were selected from open-source projects contributed by various developers, and copyright in the source code remains with the original authors. Please consult the corresponding project's license before distributing or using the code; do not reproduce this article without permission.