当前位置: 首页>>代码示例>>Python>>正文


Python PyQuery.html方法代码示例

本文整理汇总了Python中pyquery.PyQuery.html方法的典型用法代码示例。如果您正苦于以下问题:Python PyQuery.html方法的具体用法?Python PyQuery.html怎么用?Python PyQuery.html使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在pyquery.PyQuery的用法示例。


在下文中一共展示了PyQuery.html方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: no_fonts

# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import html [as 别名]
def no_fonts (pq):
    """Strip <font> wrappers from *pq*, replacing each with its inner HTML.

    NOTE: deliberately disabled.  Mixing lxml etree nodes and PyQuery
    objects left nested <font> tags behind (only two levels were handled),
    so font stripping was delegated to HTML Tidy's ``drop-font-tags``
    option instead.

    Raises:
        Exception: always — use tidy's font stripping instead.
    """
    # Python 3 fix: `raise Exception, "msg"` is Python 2-only syntax.
    raise Exception("yuk - it's a mess, use tidy!")

    # --- unreachable legacy implementation, kept for reference ---
    pq = PyQuery(pq)
    # First pass: unwrap top-level <font> tags.
    for font in pq('font'):
        font = PyQuery(font)
        font.replaceWith(font.html())
        print('font replaced:', font[:60])

    # Second pass: catch fonts that were nested one level deeper.
    for font in pq('font'):
        font = PyQuery(font)
        font.replaceWith(font.html())
        print('font 2 replaced:', font[:60])

    return pq
开发者ID:satyadevi-nyros,项目名称:eracks,代码行数:30,代码来源:scrape_utils.py

示例2: ReadURL

# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import html [as 别名]
def ReadURL(url):
  """Fetch *url* with PyQuery, retrying up to 3 times.

  Returns:
      The page HTML as a string, or '' when every attempt failed or the
      page parsed to empty markup.
  """
  trytime = 0
  pq = None
  while trytime < 3:
    try:
      pq = PyQuery(url=url)
      break
    except Exception:
      # BUG FIX: the original re-raised here, which made the retry loop
      # and the back-off sleep below unreachable.  Log, wait, retry.
      print('Exception!', url)
      trytime += 1
      time.sleep(SLEEP_BETWEEN_REQUEST)
  if pq is None or pq.html() is None:
    return ''
  return pq.html()
开发者ID:fmacias64,项目名称:deepdive_ocr_app,代码行数:17,代码来源:sdspider.py

示例3: scrape

# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import html [as 别名]
def scrape(slug, url, name, title=None):
    """Scrape the page at *url* into a QuickPage identified by *slug*.

    Fetches the page, runs it through HTML Tidy (dropping <font> tags),
    re-parses with html5lib into an lxml tree, rewrites every <img> src to
    a locally mirrored copy, and stores the cleaned ``td#content`` HTML.

    Args:
        slug: path-style identifier; the part before the first '/' is the
            image bucket passed to getimage().
        url: page to fetch.
        name: page name stored on the QuickPage.
        title: optional page title; falls back to *name*.
    """
    f = urlopen(url)
    try:
        doc = f.read()
    finally:
        # BUG FIX: the handle was previously never closed.
        f.close()

    doc, errs = tidy_document(
        doc,
        options={
            "output-html": 1,
            "clean": 1,
            "drop-font-tags": 1,
        },
    )
    if errs:
        # Tidy warnings are informational; the tidied doc is still used.
        print(errs)

    # html5lib tolerates the tidied soup; the lxml treebuilder gives
    # PyQuery something it can query.
    doc = html5lib.parse(doc, treebuilder="lxml")
    html.xhtml_to_html(doc)
    jQuery = PyQuery([doc])

    td = jQuery("td#content")
    assert len(td) == 1

    # Mirror every image locally and point the tag at the mirrored copy.
    for img in td("img"):
        img = PyQuery(img)
        src = img.attr("src")
        rslt = getimage(src, slug.split("/")[0])
        img.attr("src", rslt)
        if trace:
            print(rslt)

    # TODO(review): links inside the content are not fixed up yet.

    content = PyQuery(td[0])
    content = no_namespaces(content.html())

    print(slug, content[:60])

    if dbteeth:
        qp, created = create_or_update(
            QuickPage,
            keys=dict(slug=slug),
            fields=dict(
                name=name,
                title=title if title else name,
                content=content,
            ),
        )
开发者ID:satyadevi-nyros,项目名称:eracks,代码行数:62,代码来源:scrape_pages.py

示例4: test_mount_tag

# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import html [as 别名]
def test_mount_tag():
    """Mounting a custom tag creates a cached dom and renders into the root."""
    container = PyQuery('<root></root>')
    custom_tag = {
        'name': 'custom',
        'html': '<custom><text>{opts.txt}</text></custom>',
    }
    mounted = vdom.mount_tag(container, custom_tag, {'txt': 'hello world'})
    # The mount must yield a dom carrying a uuid ...
    assert mounted and mounted.uuid
    # ... which is registered in the dom cache ...
    assert vdom.get_dom(mounted.uuid)
    # ... and something must have been rendered into the container.
    assert container.html()
开发者ID:pombredanne,项目名称:riotpy,代码行数:9,代码来源:test_virtal_dom.py

示例5: fixLinks

# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import html [as 别名]
        def fixLinks(text, parser):
            """Rewrite <a>/<link> hrefs for static hosting.

            Site-relative links lose their query string, rss index pages
            are mapped to ``rss/index.rss``, and trailing ``/index.html``
            is stripped from non-absolute URLs.

            Returns:
                The re-serialized document as utf-8 bytes.
            """
            d = PyQuery(bytes(bytearray(text, encoding='utf-8')), parser=parser)
            for element in d('a, link'):
                e = PyQuery(element)
                href = e.attr('href')

                # Anchors without an href cannot be rewritten.
                if href is None:
                    continue

                print('// Drop queryString in included src')
                print('from: ', href)
                result = urlparse(href)

                # Relative links keep only path + optional #fragment (this
                # drops any ?query); absolute http(s) links pass through.
                # (The original's `if scheme == 'https': href = href` branch
                # was a no-op and has been removed.)
                if result.scheme == '':
                    href = result.path + (('#' + result.fragment) if result.fragment != '' else '')
                print('to: ', href)

                new_href = re.sub(r'(rss/index\.html)|(rss/?)$', 'rss/index.rss', href)
                if not abs_url_regex.search(href):
                    new_href = re.sub(r'/index\.html$', '/', new_href)

                if href != new_href:
                    e.attr('href', new_href)
                    print("\t", href, "=>", new_href)

            if parser == 'html':
                return d.html(method='html').encode('utf8')
            return d.__unicode__().encode('utf8')
开发者ID:copywrite,项目名称:buster,代码行数:32,代码来源:buster.py

示例6: _split

# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import html [as 别名]
def _split(inputfile, outputdir):
    """Split a reveal.js HTML deck into one file per slide.

    Each top-level '.stack' <section> gets its own zero-padded numbered
    sub-directory of *outputdir* and its child <section>s are dumped
    there; plain top-level slides go straight into *outputdir*.
    """
    # BUG FIX: context manager guarantees the handle is closed (the
    # original leaked it if read() raised).
    with open(inputfile, 'r') as source:
        html = source.read()

    if not os.path.isdir(outputdir):
        os.mkdir(outputdir)

    idx_slide = 0
    idx_section = 0

    parsed = PyQuery(html)

    for section in parsed('section'):
        slide = PyQuery(section)
        if slide.has_class('stack'):
            # A stack: number it, create its directory, dump its children.
            idx_section += 1
            stack_path = os.path.join(outputdir, '%02d' % idx_section)
            os.mkdir(stack_path)
            for sub_slide in PyQuery(slide.html())('section'):
                idx_slide += 1
                _dump_slide(sub_slide, idx_slide, stack_path)
        else:
            # Children of stacks were already handled above — skip them.
            if not slide.parent().has_class('stack'):
                idx_slide += 1
                _dump_slide(slide, idx_slide, outputdir)
开发者ID:tiry,项目名称:reveal-js-tools,代码行数:28,代码来源:split.py

示例7: _enhance_text

# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import html [as 别名]
    def _enhance_text(self):
        """
        Transforms a simplified text into a valid mail.template text.

        Replaces the placeholder elements of ``simplified_text`` with the
        raw template code held by this record's keywords: first the
        auto-generated keywords (matched by element id), then the
        user-added ``code`` keywords (matched by their [short_code] text).
        :return: mail.template text
        """
        self.ensure_one()
        # Parse and set back the keywords into raw template code
        html_text = PyQuery(self.simplified_text.replace('\n', ''))

        def sort_keywords(kw):
            # Replace first if/for-clauses, then var, then code
            index = kw.position
            if kw.type == 'if' or 'for' in kw.type:
                index += 2*len(self.body_html) * kw.nested_position
                # Take if and for in the appearing order in the text
                index -= kw.position
            elif kw.type == 'var':
                index += len(self.body_html)
            return index

        keywords = self.keyword_ids.sorted(sort_keywords, reverse=True)
        # Replace automatic-generated keywords
        for keyword in keywords:
            keyword_text = html_text('#' + keyword.html_id)
            keyword_text.replace_with(keyword.final_text)

        # Replace user added keywords
        template_text = html_text.html()
        for keyword in keywords.filtered(lambda k: k.type == 'code'):
            to_replace = u"[{}]".format(keyword.short_code)
            template_text = template_text.replace(to_replace, keyword.raw_code)
        # Prettify via BeautifulSoup, then return only the <body> contents.
        final_text = PyQuery(BeautifulSoup(template_text).prettify())
        return final_text('body').html()
开发者ID:maxime-beck,项目名称:compassion-modules,代码行数:35,代码来源:communication_revision.py

示例8: sanitize_description

# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import html [as 别名]
def sanitize_description(value):
    """Strip play-meta markup from *value* and return the bare description.

    Keeps only what follows the last opening <span>, then drops the
    closing tag.
    """
    doc = PyQuery(value)
    doc = doc.remove('span.playMetaText')
    for selector in ('span.playMetaText', 'time', 'strong'):
        doc.remove(selector)

    tail = doc.html().split('<span>')[-1:][0]
    return tail.replace('</span>', '')
开发者ID:peppelorum,项目名称:SVT-oppetarkiv-crawler,代码行数:10,代码来源:__init__.py

示例9: sanitize_html2

# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import html [as 别名]
def sanitize_html2(value):
    """Remove meta elements and return what follows the last <span>.

    NOTE: returns a single-item *list* (the ``[-1:]`` slice), not a
    string — callers rely on that shape.
    """
    doc = PyQuery(value)
    doc = doc.remove("span.playMetaText")
    for selector in ("span.playMetaText", "time", "strong"):
        doc.remove(selector)

    return doc.html().split("<span>")[-1:]
开发者ID:peppelorum,项目名称:WeLovePublicService-VHS,代码行数:10,代码来源:span.py

示例10: get_pastes

# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import html [as 别名]
    def get_pastes ( self ):
        """Fetch the pastebin archive page.

        Returns a (status, payload) tuple: ``(self.OK, <PyQuery of the
        paste links>)`` on success, otherwise one of CONNECTION_FAIL,
        ACCESS_DENIED or OTHER_ERROR paired with None.
        """
        Logger ().log ( 'Getting pastes', True )
        try:
            page = PyQuery ( url = self.PASTES_URL )
        except KeyboardInterrupt:
            raise
        except:
            # Any network/parse failure at this point counts as a
            # connection error.
            return self.CONNECTION_FAIL,None


        """
        There are a set of encoding issues which, coupled with some bugs in etree (such as in the Raspbian packages) can
        trigger encoding exceptions here. As a workaround, we try every possible encoding first, and even if that fails,
        we resort to a very hacky workaround whereby we manually get the page and attempt to encode it as utf-8. It's
        ugly, but it works for now.
        """
        try:
            page_html = page.html ()
        except KeyboardInterrupt:
            raise
        except:
            # Fallback 1: retry serialization with every encoding Python
            # knows about, keeping the first one that works.
            worked = False
            for enc in all_python_encodings():
                try:
                    page_html = page.html(encoding=enc)
                    worked = True
                    break
                except KeyboardInterrupt:
                    raise
                except:
                    pass
            if not worked:
                # One last try... fetch the raw page ourselves and force
                # it through utf-8.
                try:
                    f = urllib.request.urlopen(Crawler.PASTES_URL)
                    page_html = PyQuery(str(f.read()).encode('utf8')).html()
                    f.close()
                except KeyboardInterrupt:
                    raise
                except:
                    return self.OTHER_ERROR, None
        # Pastebin serves a block page (not an HTTP error) when rate-limited.
        if re.match ( r'Pastebin\.com - Access Denied Warning', page_html, re.IGNORECASE ) or 'blocked your IP' in page_html:
            return self.ACCESS_DENIED,None
        else:
            return self.OK,page('.maintable img').next('a')
开发者ID:Jorl17,项目名称:Pastebin-Crawler,代码行数:47,代码来源:pastebin_crawler.py

示例11: clean_body

# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import html [as 别名]
def clean_body(body):
	"""Convert WordPress-style HTML *body* into markdown-ish text.

	Unwraps <p> tags into blank-line-separated paragraphs, maps the
	WordPress alignment classes/inline float styles onto Bootstrap pull
	classes, normalizes <br> variants to markdown hard line breaks and
	trims surrounding blank lines / tabs.
	"""
	# NOTE(review): the original also fetched Site.objects.get_current()
	# into an unused local; that dead lookup has been removed.
	html = PyQuery('<body>' + body + '</body>')

	# Unwrap paragraphs into text separated by blank lines.
	for p in html('p'):
		p = PyQuery(p)
		p.replaceWith('\n\n%s\n\n' % p.html())

	# WordPress alignment classes -> Bootstrap; inline floats -> classes.
	html('.alignright').addClass('pull-right').removeClass('alignright')
	html('.alignleft').addClass('pull-left').removeClass('alignleft')
	html('[style="float: left;"]').removeAttr('style').addClass('alignleft')
	html('[style="float: right;"]').removeAttr('style').addClass('alignright')

	# BUG FIX: the original ran '\n\n\n' / '\r\r\r' collapsing loops on the
	# incoming argument here, then immediately overwrote it with the
	# serialized markup below — pure dead work, removed.
	body = html.html()

	# <br> variants become markdown hard breaks; normalize newline pairs.
	body = body.replace('<br />', '  \n')
	body = body.replace('<br/>', '  \n')
	body = body.replace('<br>', '  \n')
	body = body.replace('\r\n', '\n')
	body = body.replace('\n\r', '\n')

	# Collapse runs of 3+ newlines down to blank-line separators.
	while body.find('\n\n\n') > -1:
		body = body.replace('\n\n\n', '\n\n')

	# Trim surrounding whitespace in the original's exact order:
	# leading/trailing '\n', then '\r', then leading '\t'.
	body = body.lstrip('\n').rstrip('\n')
	body = body.lstrip('\r').rstrip('\r')
	body = body.lstrip('\t')

	return body
开发者ID:iamsteadman,项目名称:bambu-tools,代码行数:47,代码来源:xmlrpc.py

示例12: extract

# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import html [as 别名]
    def extract(self):
        """Extract a ContentItem from ``self.html`` (a Sohu article page).

        Strips comments and site boilerplate from div#contentText, then
        collects image URLs, title, content, author and release time.
        :return: populated ContentItem
        """
        # Drop HTML comments before parsing.
        self.html = re.sub('<!--.*?-->', '', self.html)
        doc = PyQuery(self.html)
        content_node = doc('div#contentText')

        # Strip scripts/styles and the long tail of Sohu ad / share /
        # navigation blocks embedded inside the article body.
        content_node.remove('script')
        content_node.remove('style')
        content_node.remove('.line')
        content_node.remove('#shareIn')
        content_node.remove('.tagHotg')
        content_node.remove('.blank8')
        content_node.remove('."editShare clear"')
        content_node.remove('select')
        #content_node.remove('table[width = "100%"]')('td[align = "center"]')
        content_node.remove('div[class = "jingbian_travel01_04"]')
        content_node.remove('div[class = "txt2"]')
        content_node.remove('iframe')
        content_node.remove('embed')
        content_node.remove('td[style = "font-size: 14px; font-weight: bold;"]')
        content_node.remove('table[style = "margin-right: 20px;"]')
        content_node.remove('digi_perpage_bottom')
        content_node.remove('div[class = "extract clear"]')
        content_node.remove('table[bgcolor = "#eeeeee"]')
        content_node.remove('img[alt = "搜狐教育频道"]')
        content_node.remove('table[bgcolor = "#e2e2e2"]')
        content_node.remove('table[bgcolor = "#66ccff"]')
        content_node.remove('div[class = "digi_digest"]')
        item = ContentItem()
        # Collect absolute image URLs, skipping .gif (ads/spacers); kept
        # images are padded with <br> so they sit on their own lines.
        imgs = content_node('img')
        img_all = []
        for img in imgs:
            if".gif" in img.get('src'):
                continue
            else:  
                imgs.eq(imgs.index(img)).append('<br>')
                imgs.eq(imgs.index(img)).before('<br>')
                img_all.append(self.getRealURI(img.get('src')))
        item['image_urls'] = img_all

        item['title'] = self.title = doc('h1').text()
        item['content'] = self.content = content_node.__unicode__()
        # The release time lives in a JS variable, not in the markup.
        t = re.compile(u'var club_artinputdate = "(.*?)";')
        release_time = t.search(doc.html())
        if release_time:
            item['release_time'] = self.release_time = release_time.group(1)
#        item['release_switch_time'] = time.mktime(time.strptime(t.search(doc.html()).group(1),'%Y-%m-%d %H:%M:%S'))
        item['source'] = u'搜狐'
        author = doc('div[class = "function clear"]')
        self.author = author('div.l')('a').text()
        item['author'] = self.author
        item['pic_url'] = ''

        return item
开发者ID:hw20686832,项目名称:iCrawler,代码行数:55,代码来源:zj_yule_tz.py

示例13: sanitize_description

# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import html [as 别名]
def sanitize_description(value):
    """Strip play-meta markup from *value*; return '' when nothing remains."""
    doc = PyQuery(value)
    doc = doc.remove('span.playMetaText')
    for selector in ('span.playMetaText', 'span.playCount', 'time', 'strong'):
        doc.remove(selector)

    markup = doc.html()

    # Everything may have been removed — degrade to an empty string.
    if markup is None:
        return ""

    tail = markup.split('<span>')[-1:][0]
    return tail.replace('</span>', '').strip()
开发者ID:Pehrsons,项目名称:SvtCrawler,代码行数:15,代码来源:__init__.py

示例14: fixLinks

# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import html [as 别名]
 def fixLinks(text, parser):
     """Normalize anchor hrefs for static hosting.

     Relative links get a trailing '/index.html' (or any 'index.html'
     substring) collapsed to '/'; absolute links are left alone.

     Returns:
         The re-serialized document as utf-8 bytes.
     """
     d = PyQuery(bytes(bytearray(text, encoding='utf-8')), parser=parser)
     for element in d('a'):
         e = PyQuery(element)
         href = e.attr('href')
         # BUG FIX: anchors without an href made
         # abs_url_regex.search(None) raise TypeError — skip them.
         if href is None:
             continue
         if not abs_url_regex.search(href):
             new_href = re.sub(r'/index\.html$', '/', href)
             new_href = re.sub(r'index.html', '/', new_href)
             e.attr('href', new_href)
             print("\t", href, "=>", new_href)
     if parser == 'html':
         return d.html(method='html').encode('utf8')
     return d.__unicode__().encode('utf8')
开发者ID:invictusjs,项目名称:buster,代码行数:15,代码来源:buster.py

示例15: render_md5

# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import html [as 别名]
 def render_md5(self, post_content):
     """Render *post_content* (markdown) into HTML on ``self.body``.

     After conversion, any <img> src that is a bare filename (no '/') is
     rewritten to a full URL under the configured blog URL and this
     post's output path.
     """
     config = Config()
     self.body = markdown2.markdown(
         post_content,
         extras=config.mdextras,
     )
     # rewrite relative img-srcs to full paths.
     d = PyQuery(self.body)
     for img in d.find('img'):
         if '/' not in img.attrib['src']:
             img.attrib['src'] = '{}{}/{}'.format(config.blogurl,
                                                  self.outputpath,
                                                  img.attrib['src'])
     # NOTE(review): d.html() returns the inner HTML of the first parsed
     # element — presumably acceptable for these posts; verify for
     # multi-root bodies.
     self.body = d.html()
开发者ID:hnrd,项目名称:blogtopoid,代码行数:16,代码来源:blogtopoid.py


注:本文中的pyquery.PyQuery.html方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。