当前位置: 首页>>代码示例>>Python>>正文


Python PyQuery.__unicode__方法代码示例

本文整理汇总了Python中pyquery.PyQuery.__unicode__方法的典型用法代码示例。如果您正苦于以下问题:Python PyQuery.__unicode__方法的具体用法?Python PyQuery.__unicode__怎么用?Python PyQuery.__unicode__使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在pyquery.PyQuery的用法示例。


在下文中一共展示了PyQuery.__unicode__方法的11个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: fixLinks

# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import __unicode__ [as 别名]
        def fixLinks(text, parser):
            d = PyQuery(bytes(bytearray(text, encoding='utf-8')), parser=parser)
            for element in d('a, link'):
                e = PyQuery(element)
                href = e.attr('href')

                if href is None:
                    continue

                print '// Drop queryString in included src'
                print 'from: ', href
                result = urlparse(href)

                if result.scheme == 'https':
                    href = href
                elif result.scheme == '':
                    href = result.path + (('#' + result.fragment) if result.fragment != '' else '')
                print 'to: ', href
  
                new_href = re.sub(r'(rss/index\.html)|(rss/?)$', 'rss/index.rss', href)
                if not abs_url_regex.search(href):
                    new_href = re.sub(r'/index\.html$', '/', new_href)

                if href != new_href:
                    e.attr('href', new_href)
                    print "\t", href, "=>", new_href

            if parser == 'html':
                return d.html(method='html').encode('utf8')
            return d.__unicode__().encode('utf8')
开发者ID:copywrite,项目名称:buster,代码行数:32,代码来源:buster.py

示例2: extract

# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import __unicode__ [as 别名]
    def extract(self):
        """Extract title, body, images and release date from a pconline article.

        Returns a populated ContentItem; also mirrors title/content/release_time
        onto self.
        """
        item = ContentItem()

        # Strip HTML comments before locating the article body.
        self.html = re.sub('<!--.*?-->', '', self.html)
        body_html = self.hxs.select("//div[@class = 'art_con']").extract()
        content_node = PyQuery(body_html[0])

        # Boilerplate sections that must not leak into the extracted content.
        for junk in ('div[class = "pconline_page"]',
                     'div[class = "pc3g"]',
                     'div[class = "pageTips"]',
                     'div[class = "art_nav_box mt10"]',
                     'div[class = "art_bottom"]',
                     'div[class = "art_con_top"]'):
            content_node.remove(junk)

        item['image_urls'] = [self.getRealURI(img.get('src'))
                              for img in content_node('img')
                              if not img.get('src').endswith('.gif')]
        item['title'] = self.title = self.hxs.select("//h1/text()").extract()[0]
        if not item['title']:
            # Fallback title location used by an alternate page template.
            item['title'] = self.title = self.hxs.select("//div[@id = 'UC_newsInfoDetail_lbl_newsTitle']/text()").extract()[0]
        item['content'] = self.content = content_node.__unicode__()

        # The release date sits in the page header span, e.g. 2012年01月01日.
        header_html = self.hxs.select("//div[@class = 'art_con_top']").extract()[0]
        header_text = PyQuery(header_html)('span').text()
        date_pattern = re.compile(u'20\d\d年\d\d月\d\d日')
        item['release_time'] = self.release_time = date_pattern.search(header_text).group()
        item['source'] = u'pconline'
        item['author'] = ''
        item['pic_url'] = ''

        return item
开发者ID:hw20686832,项目名称:iCrawler,代码行数:34,代码来源:zj_pconline.py

示例3: extract

# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import __unicode__ [as 别名]
    def extract(self):
        """Extract title, body and the lead image from a rayli.com.cn page.

        Returns a populated ContentItem; also mirrors title/content onto self.
        """
        # Drop HTML comments, then parse the cleaned page.
        self.html = re.sub('<!--.*?-->', '', self.html)
        doc = PyQuery(self.html)

        body = doc('div#rightdiv1')
        body.remove('span.white12')
        body_html = body.__unicode__()

        # The lead image URL is embedded in inline script text of the form
        # leftsmallimgurl[1]="...";  -- pull it out with a regex.
        image_urls = []
        lead_pattern = re.compile('leftsmallimgurl\[1\]\=\"(.*?)\"\;')
        found = lead_pattern.findall(doc.__unicode__())
        if found:
            lead_html = '<br/><img src="' + found[0] + '"/><br/>'
            image_urls.append(self.getRealURI(found[0]))
        else:
            lead_html = ''

        item = ContentItem()
        item['image_urls'] = image_urls
        item['title'] = self.title = doc('h1').text()
        # Prepend the lead image so it appears at the top of the stored body.
        item['content'] = self.content = lead_html + body_html

        item['release_time'] = ''
        item['source'] = u"瑞丽服饰网"
        item['author'] = ''
        item['pic_url'] = ''

        self.title = item['title']
        self.content = item['content']

        return item
开发者ID:hw20686832,项目名称:iCrawler,代码行数:34,代码来源:lw_xw.py

示例4: extract

# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import __unicode__ [as 别名]
    def extract(self):
        """Extract a forum post (title, body, images, author) from a tiexue.net page.

        Returns a populated ContentItem; release_time is intentionally left blank.
        """
        item = ContentItem()
        # Remove HTML comments before any XPath work.
        self.html = re.sub('<!--.*?-->', '', self.html)

        tz_title=self.hxs.select("//h1/text()").extract()
        content=self.hxs.select("//ul[@class='content']/li/div").extract()
        tz_content=''
        # If any candidate div contains "fromposty", the post body is the third
        # div; otherwise it is the second. (NOTE(review): presumably the
        # "fromposty" marker indicates a quoted-post layout — confirm against
        # live pages. The else branch re-selects on every iteration until a
        # match breaks the loop.)
        for con in content:
            if "fromposty" in con:
                tz_content=self.hxs.select("//ul[@class='content']/li/div")[2].extract()
                break
            else:
                tz_content=self.hxs.select("//ul[@class='content']/li/div")[1].extract()
        
        release_time=self.hxs.select("//div[@class='gray']/text()").extract()

        # Collect image sources from the raw body HTML, skipping .gif/.GIF.
        imgs=PyQuery(tz_content)
        ob=re.compile('src="(.*?)"')
        imgs=ob.findall(imgs.__unicode__())
        img_all=[]
        for img in imgs:
            if ".gif" in img:
                continue
            if ".GIF" in img:
                continue
            else:
                img_all.append(self.getRealURI(img))
                
        author=self.hxs.select("//td[@class='bbsname']/b/span/a/text()").extract()
        # Strip the inline color style from divs styled #FCFCCC (near-white —
        # presumably text hidden against the background; removing the style
        # makes it visible in the stored content).
        tz_content = PyQuery(tz_content)
        cont_div = tz_content('div[style = "color:#FCFCCC"]')
        for cont in cont_div:
            cont_div.eq(cont_div.index(cont)).removeAttr('style')
        tz_content = tz_content.__unicode__()
        item['image_urls'] = img_all
        item['title'] = self.title = tz_title[0].strip()
        item['content'] = self.content = tz_content
        item['release_time'] = ''
        item['source'] = u"铁血网"
        item['author'] = author[0]
    
        item['pic_url'] = ''
#        item['release_switch_time'] = self.release_switch_time = time.mktime(time.strptime(self.release_time,u'%Y-%m-%d %H:%M'))
        
        return item
开发者ID:hw20686832,项目名称:iCrawler,代码行数:47,代码来源:lw_tz.py

示例5: fixLinks

# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import __unicode__ [as 别名]
 def fixLinks(text):
     d = PyQuery(text, parser='html')
     for element in d('a'):
         e = PyQuery(element)
         href = e.attr('href')
         if not abs_url_regex.search(href):
             new_href = re.sub(r'/index\.html$', '/', href)
             e.attr('href', new_href)
             print "\t", href, "=>", new_href
     return d.__unicode__().encode('utf8')
开发者ID:shaunlebron,项目名称:buster,代码行数:12,代码来源:buster.py

示例6: fixLinks

# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import __unicode__ [as 别名]
 def fixLinks(text, parser):
     d = PyQuery(bytes(bytearray(text, encoding='utf-8')), parser=parser)
     for element in d('a'):
         e = PyQuery(element)
         href = e.attr('href')
         if not abs_url_regex.search(href):
             new_href = re.sub(r'/index\.html$', '/', href)
             new_href = re.sub(r'index.html', '/', new_href)
             e.attr('href', new_href)
             print "\t", href, "=>", new_href
     if parser == 'html':
         return d.html(method='html').encode('utf8')
     return d.__unicode__().encode('utf8')
开发者ID:invictusjs,项目名称:buster,代码行数:15,代码来源:buster.py

示例7: fix_share_links

# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import __unicode__ [as 别名]
 def fix_share_links(text,parser):
     td_regex = re.compile(target_domain + '|' )
     
     assert target_domain, "target domain must be specified --target_domain=<http://your-host-url>"
     d = PyQuery(bytes(bytearray(text, encoding='utf-8')), parser=parser)
     for share_class in ['.icon-twitter','.icon-facebook','.icon-google-plus']:
         for element in d(share_class):
             e = PyQuery(element)
             href = e.attr('href')
             new_href = re.sub(domain, target_domain, href)
             e.attr('href', new_href)
             print "\t", href, "=>", new_href
     if parser == 'html':
         return d.html(method='html').encode('utf8')
     return d.__unicode__().encode('utf8')
开发者ID:danmaclean,项目名称:buster,代码行数:17,代码来源:buster.py

示例8: extract

# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import __unicode__ [as 别名]
    def extract(self):
        """Extract title, images and cleaned body from a Sina news page.

        Returns a populated ContentItem; also mirrors title/content onto self.
        """
        # Remove HTML comments before parsing.
        self.html = re.sub('<!--.*?-->', '', self.html)
        doc = PyQuery(self.html)
        content_node = doc('div.kb_zw')
        if not content_node:
            # Fallback body container used by an alternate page template.
            content_node = PyQuery(self.hxs.select("//div[@class = 'zw_text']").extract()[0])

        # Strip scripts, styles, embedded frames, an ad container and form
        # controls from the article body.
        for selector in ('script',
                         'style',
                         'iframe',
                         'div[style = "float:left; width:303px; height:250px; display:inline; margin:10px 10px 10px 10px;"]',
                         'input'):
            content_node.remove(selector)

        item = ContentItem()
        # Try the title locations of the three known layouts in turn
        # (`is None` replaces the original non-idiomatic `== None`).
        item['title'] = self.title = doc('td[align = "center"]')('b').text()
        if item['title'] is None:
            item['title'] = self.title = doc('div.zw_bt').text()
        if item['title'] is None:
            item['title'] = self.title = doc('h1.zw_title').text()

        item['release_time'] = ''
        item['source'] = u"新浪"
        item['author'] = ''
        item['pic_url'] = ''

        # Collect image URLs (skipping .gif) and pad each image with <br>
        # so it stands on its own line in the stored content.
        imgs = content_node('img')
        image_urls = []
        for img in imgs:
            src = img.get('src')
            # Bug fix: test for a missing src FIRST -- the original checked
            # '".gif" in img.get("src")' before the None check, raising
            # TypeError on <img> tags without a src attribute.
            if not src:
                continue
            if ".gif" in src:
                continue
            imgs.eq(imgs.index(img)).before('<br>')
            imgs.eq(imgs.index(img)).append('<br>')
            image_urls.append(self.getRealURI(src))
        item['image_urls'] = image_urls

        content = content_node.__unicode__()
        item['content'] = self.content = content
        return item
开发者ID:hw20686832,项目名称:iCrawler,代码行数:48,代码来源:zj_a006.py

示例9: fixLinks

# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import __unicode__ [as 别名]
        def fixLinks(text, parser):
            d = PyQuery(bytes(bytearray(text, encoding='utf-8')), parser=parser)
            for element in d('a, link'):
                e = PyQuery(element)
                href = e.attr('href')

                if href is None:
                    continue

                new_href = re.sub(r'(rss/index\.html)|((?<!\.)rss/?)$', 'rss/index.rss', href)
                if not abs_url_regex.search(href):
                    new_href = re.sub(r'/index\.html$', '/', new_href)

                if href != new_href:
                    e.attr('href', new_href)
                    print "\t", href, "=>", new_href

            if parser == 'html':
                return d.html(method='html').encode('utf8')
            return d.__unicode__().encode('utf8')
开发者ID:dianita9902,项目名称:buster,代码行数:22,代码来源:buster.py

示例10: fixLinks

# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import __unicode__ [as 别名]
 def fixLinks(text, parser):
     d = PyQuery(bytes(bytearray(text, encoding='utf-8')), parser=parser)
     for element in d('a'):
         e = PyQuery(element)
         href = e.attr('href')
         print href
         if href is None:
             continue
         new_href = re.sub(r'(rss/index\.html)|(rss/?)$', 'rss/index.rss', href)
         if not abs_url_regex.search(href):
             new_href = re.sub(r'/index\.html$', '/', new_href)
         if href != new_href:
             e.attr('href', new_href)
             print "\t", href, "=>", new_href
     # remove ?v=XXXXXXXXX in css
     for element in d('link'):
         e = PyQuery(element)
         href = e.attr('href')
         if href is None:
             continue
         if re.match(r'http://fonts',href) is not None:
             continue
         new_href = re.sub(r'\?.*', '',href)  
         if href != new_href:
             e.attr('href',new_href)
             print "\t", href, "=>", new_href     
     # remove ?v=XXXXXXXXX in js                  
     for element in d('script'):
         e = PyQuery(element)
         src = e.attr('src')
         if src is None:
             continue
         new_src = re.sub(r'\?.*', '',src) 
         if src != new_src:
             e.attr('src',new_src)
             print "\t", src, "=>", new_src
     ################### 
     if parser == 'html':
         return d.html(method='html').encode('utf8')
     return d.__unicode__().encode('utf8')
开发者ID:ash0080,项目名称:buster,代码行数:42,代码来源:buster.py

示例11: extract

# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import __unicode__ [as 别名]
    def extract(self):
        """Extract a tiexue.net forum post: title, body, images, release time.

        Returns a populated ContentItem; also mirrors title/content onto self.
        """
        item = ContentItem()
        # Remove HTML comments before any XPath work.
        self.html = re.sub('<!--.*?-->', '', self.html)

        titles = self.hxs.select("//h1/text()").extract()
        bodies = self.hxs.select("//div[@class='text']").extract()

        # The metadata line carries a timestamp matching "20##...##:##".
        meta = self.hxs.select("//div[@class='user']/ul/li/text()").extract()
        timestamps = re.compile(u'20\d\d.*:\d\d').findall(meta[0])

        # Image sources from the body, minus animated gifs (either case).
        srcs = self.hxs.select("//div[@class='text']/div/div/p/a/img/@src").extract()
        images = [self.getRealURI(src) for src in srcs
                  if ".gif" not in src and ".GIF" not in src]

        item['image_urls'] = images
        item['title'] = self.title = titles[0]

        # Strip the inline near-white color style (#f9f9f9) from body divs —
        # presumably text hidden against the background.
        body = PyQuery(bodies[0])
        hidden = body('div[style = "color:#f9f9f9"]')
        for div in hidden:
            hidden.eq(hidden.index(div)).removeAttr('style')
        item['content'] = self.content = body.__unicode__()

        item['release_time'] = timestamps[0]
        item['source'] = u"铁血网"
        item['author'] = ''
        item['pic_url'] = ''

        return item
开发者ID:hw20686832,项目名称:iCrawler,代码行数:40,代码来源:lw_tz_post2.py


注:本文中的pyquery.PyQuery.__unicode__方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。