当前位置: 首页>>代码示例>>Python>>正文


Python utils.stripHtml函数代码示例

本文整理汇总了Python中utils.utils.stripHtml函数的典型用法代码示例。如果您正苦于以下问题:Python stripHtml函数的具体用法?Python stripHtml怎么用?Python stripHtml使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。


在下文中一共展示了stripHtml函数的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: __getThreadPage

 def __getThreadPage( self ):
         """Fetch every thread row on the current forum listing page, extract
         per-thread metadata (title, uri, author, reply/view counts, last-post
         date) and queue one cloned task per thread.

         Returns False as soon as max_posts_count is exceeded, True otherwise.
         Updates self.last_timestamp with the newest last-post time seen.
         """
         # One <tr> per thread: anchors with class 'ForumPostHead' inside the
         # 'ViewAllThread' table identify thread rows.
         threads = [x.findParent('tr') for x in self.soup.find('table',id=re.compile('ViewAllThread')).findAll('a','ForumPostHead')]
         for thread in threads:
             try:
                 thread_info = thread.findAll('td',recursive=False)
                 # Expect exactly 6 cells per row; skip malformed rows.
                 if not len(thread_info)==6:
                     log.info(self.log_msg('Not enough fiels'))
                     continue
                 # Last cell renders as "<date>   by\n<author>"; split on newlines.
                 last_post_info = [x.strip() for x in stripHtml(thread_info[-1].renderContents()).split('\n')]
                 # The literal trailing '   by' in the format string consumes the
                 # suffix left on the date line by the split above.
                 thread_time = datetime.strptime( last_post_info[0],'%m/%d/%Y %I:%M:%S %p   by')
                 #page['edate_thread_last_post_date'] = datetime.strftime(thread_time,"%Y-%m-%dT%H:%M:%SZ")
                 self.last_timestamp = max(thread_time , self.last_timestamp )
             except:
                 # Any parse failure (including an unparseable date) skips the row.
                 log.exception(self.log_msg('Todays Post, so ignoring'))
                 continue
             if  self.total_posts_count > self.max_posts_count:
                 log.info(self.log_msg('Reaching maximum post,Return false'))
                 return False
             self.total_posts_count = self.total_posts_count + 1
             try:
                 # Skip threads already covered by the stored session info,
                 # unless we are under the post-count budget for re-fetching.
                 if checkSessionInfo('Search',self.session_info_out, thread_time,\
                                     self.task.instance_data.get('update')) and \
                                     self.max_posts_count >= self.total_posts_count:
                     continue
                 temp_task=self.task.clone()
                 try:
                     # Title and target uri are mandatory; without them the
                     # cloned task cannot be scheduled, so skip the thread.
                     title_tag = thread_info[1].find('a','ForumPostHead')
                     temp_task.pagedata['title']= stripHtml(title_tag.renderContents())
                     temp_task.instance_data[ 'uri' ] =  title_tag['href']
                     log.info(stripHtml(title_tag.renderContents()))
                 except:
                     log.info(self.log_msg('Cannot find the uri'))
                     continue
                 try:
                     temp_task.pagedata['et_author_name'] = stripHtml(thread_info[2].renderContents())
                 except:
                     log.info(self.log_msg('Cannot find author name'))
                 try:
                     # Last element of the split last-post cell is the author name.
                     temp_task.pagedata['et_thread_last_post_author']  = last_post_info[-1]
                 except:
                     log.info(self.log_msg('Cannot find the replies count'))
                 try:
                     # Cell 3 holds the reply count, cell 4 the view count.
                     view_reply = {'ei_thread_replies_count':3,'ei_thread_views_count':4}
                     for each in view_reply.keys():
                         temp_task.pagedata[each] = int(stripHtml(thread_info[view_reply[each]].renderContents()))
                 except:
                     log.info(self.log_msg('Cannot find the views count'))
                 try:
                     temp_task.pagedata['edate_last_post_date']=  datetime.strftime(thread_time,"%Y-%m-%dT%H:%M:%SZ")
                 except:
                     log.info(self.log_msg('Cannot find the last posted'))
                 self.linksOut.append( temp_task )
                 log.info(self.log_msg('Task Added'))
             except:
                 log.info(self.log_msg('Cannot add the Task'))
         return True
开发者ID:jsyadav,项目名称:CrawlerFramework,代码行数:60,代码来源:csharpcornerconnector.py

示例2: __getData

 def __getData(self, post):
     """Build the page dict (data, title, posted_date, author) for one post.

     Parsing failures for individual fields are logged and defaulted; if the
     post body itself cannot be extracted the method returns None.
     """
     page = {}
     try:
         # Re-parse the fragment after repairing malformed '/>>' sequences.
         cleaned_markup = post.__str__().replace('/>>','/>')
         fragment = BeautifulSoup(cleaned_markup)
         header_table = fragment.find('table')
         if header_table:
             # Detach the header table so it does not leak into page['data'].
             header_table.extract()
         try:
             page['data'] = stripHtml(fragment.renderContents())
             page['title'] = ''
         except:
             log.exception(self.log_msg('Data not found for the url %s'%self.currenturi))
             return

         try:
             raw_date = stripHtml(header_table.findAll('strong')[-1].renderContents())
             # Drop ordinal suffixes (1st/2nd/3rd/4th...) before parsing.
             normalized = re.sub("(\d+)(st|nd|rd|th)", r"\1", raw_date).strip()
             parsed = datetime.strptime(normalized, "%d %B %Y")
             page['posted_date'] = datetime.strftime(parsed, "%Y-%m-%dT%H:%M:%SZ")
         except:
             log.exception(self.log_msg('Posted date not found'))
             page['posted_date'] = datetime.strftime(datetime.utcnow(), "%Y-%m-%dT%H:%M:%SZ")
         try:
             page['et_author_name'] = stripHtml(header_table.findAll('strong')[0].renderContents())
         except:
             log.exception(self.log_msg('author name not found'))
     except:
         log.exception(self.log_msg('post tag not found'))

     return page
开发者ID:jsyadav,项目名称:CrawlerFramework,代码行数:30,代码来源:bankguideconnector.py

示例3: __getData

 def __getData(self, post):
     """Return the page dict for one comment post.

     Extracts posted_date (falling back to utcnow on parse failure), the
     comment body, and the author name. Returns None when the comment body
     cannot be found.
     """
     page = {}
     try:
         # Fix: the original read `date_str = date_str = stripHtml(...)`;
         # the redundant duplicate assignment has been removed.
         date_str = stripHtml(post.find('a',href = re.compile('comment\d+')).\
                     renderContents()).replace('PST','').replace('PDT','').\
                     replace('at','').strip()
         page['posted_date']= datetime.strptime(date_str,"%d %b %Y %I:%M %p").\
                             strftime("%Y-%m-%dT%H:%M:%SZ") 
     except:
         log.exception(self.log_msg('posted_date not be found in %s'% self.currenturi))
         #log.info(date_str)
         page['posted_date'] = datetime.strftime(datetime.utcnow(),"%Y-%m-%dT%H:%M:%SZ")
     try:
         # Comment body lives in the matching 'single_comment' div.
         page['data'] = stripHtml(post.find('div','single_comment',id = re.compile('comment\d+_show')).\
                         renderContents())
         page['title'] = ''
     except:
         log.info(self.log_msg('post  not found in %s'% self.currenturi))
         return
     try:
         page['et_author_name'] = stripHtml(post.find('a').renderContents())
     except:
         log.exception(self.log_msg('Author name not found'))
     return page
开发者ID:jsyadav,项目名称:CrawlerFramework,代码行数:26,代码来源:gamasutraconnector.py

示例4: __getData

 def __getData(self, post, is_question):
     """Return the page dict for one forum post.

     is_question selects the entity tag and whether the thread title gets a
     'Re:' prefix. Returns False when no post body could be extracted.
     """
     page = {'entity':'question' if is_question else 'answer'}
     try:
         body_tag = post.find('div', id=re.compile('post_message_\d+'))
         # Detach quoted blocks so only the author's own text remains.
         for quote_label in body_tag.findAll('div', text='Quote:'):
             quote_label.findParent('div').extract()
         page['data'] = stripHtml(body_tag.renderContents())
         page['title'] = stripHtml(self.soup.find('td', 'navbar').renderContents())
         if not is_question:
             page['title'] = 'Re:' + page['title']
     except:
         log.exception(self.log_msg('Data not found'))
         page['data'] = ''
     if not page['data']:
         log.info(self.log_msg("Data is not found for discarding this Post"))
         return False
     try:
         page['et_author_name'] = stripHtml(post.find('a', 'bigusername').renderContents())
     except:
         log.info(self.log_msg('author name not found'))
     try:
         raw_date = stripHtml(post.find('td', 'thead').renderContents())
         # Remove ordinal suffixes (1st/2nd/...) so strptime can parse.
         raw_date = re.sub("(\d+)(st|nd|rd|th)", r"\1", raw_date).strip()
         parsed = datetime.strptime(raw_date, '%B %d, %Y, %I:%M %p')
         page['posted_date'] = datetime.strftime(parsed, "%Y-%m-%dT%H:%M:%SZ")
     except:
         log.exception(self.log_msg('posted date not found'))
         page['posted_date'] = datetime.strftime(datetime.utcnow(), "%Y-%m-%dT%H:%M:%SZ")

     return page
开发者ID:jsyadav,项目名称:CrawlerFramework,代码行数:30,代码来源:htcpediaconnector.py

示例5: __getData

 def __getData(self, post):
     """Return the page dict for one answer post in a WebMD exchange thread.

     Pre-fills uri, title ('Re: ' + thread topic) and thread topic, then
     extracts body, author (and who they replied to) and posted date.
     """
     page = {'entity':'answer', 'uri':self.currenturi, 'title': 'Re: ' + self.__thread_topic, 'et_thread_topic':self.__thread_topic}
     try:
         page['data'] = stripHtml(post.find('div', 'post_fmt').renderContents())
     except:
         log.info(self.log_msg('Data not found for the url %s'%self.currenturi))
         # NOTE(review): returns True on missing data while sibling connectors
         # return None/False here — confirm the caller treats True as "skip".
         return True
     try:
         # Header is either "<name> responded:" or
         # "<name> replied to <other> 's ...".
         author_tag_str = stripHtml(post.find('div', 'post_hdr_fmt').renderContents())
         if 'responded:' in author_tag_str:
             page['et_author_name'] = author_tag_str.replace('responded:', '').strip()
         else:
             author_split = author_tag_str.split('replied to')
             page['et_author_name'] = author_split[0].strip()
             page['et_data_replied_to'] = author_split [1].split(" 's ")[0].strip()
     except:
         log.info(self.log_msg('Authors info not avbl'))
     try:
         # Date is embedded in a document.write(DateDelta('...GMT...')) call;
         # strip the JS wrapper and the GMT suffix before parsing.
         date_str = stripHtml(post.find('div', 'posted_fmt').renderContents()).split('GMT')[0].strip().replace("document.write(DateDelta('", '').strip()
         page['posted_date'] = datetime.strftime(datetime.strptime(date_str, '%a %B %d %Y %H:%M:%S'), "%Y-%m-%dT%H:%M:%SZ")
     except:
         log.info(self.log_msg('posted_date not found in url %s'%self.currenturi))
     return page
开发者ID:jsyadav,项目名称:CrawlerFramework,代码行数:25,代码来源:webmdexchangesconnector.py

示例6: __getData

 def __getData(self, post):
     """Return the page dict for a single phpBB-style post.

     Uses self.__is_question to pick the entity tag and title prefix.
     Returns False when no post body could be extracted.
     """
     page = {'entity':'question' if self.__is_question else 'answer'}
     try:
         body_tag = post.find('span', 'postbody')
         # Strip quoted replies so only the author's own text remains.
         for quoted in body_tag.findAll('div', 'quote_container'):
             quoted.extract()
         page['data'] = stripHtml(body_tag.renderContents())
         page['title'] = stripHtml(self.soup.find('a','maintitle').renderContents())
         if not self.__is_question:
             page['title'] = 'Re:' + page['title']
     except:
         log.exception(self.log_msg('Data not found'))
         page['data'] = ''
     if not page['data']:
         log.info(self.log_msg("Data is not found for discarding this Post"))
         return False
     try:
         page['et_author_name'] = stripHtml(post.find('span', 'name').renderContents())
     except:
         log.info(self.log_msg('author name not found'))
     try:
         # Second 'postdetails' span holds "Posted: <date> Post subject: ...".
         details = stripHtml(post.findAll('span', 'postdetails')[1].renderContents())
         raw_date = details.split('Post subject:')[0].replace('Posted:','').strip()
         parsed = datetime.strptime(raw_date, '%a %b %d, %Y %I:%M %p')
         page['posted_date'] = datetime.strftime(parsed, "%Y-%m-%dT%H:%M:%SZ")
     except:
         log.exception(self.log_msg('posted date not found'))
         page['posted_date'] = datetime.strftime(datetime.utcnow(), "%Y-%m-%dT%H:%M:%SZ")

     return page
开发者ID:jsyadav,项目名称:CrawlerFramework,代码行数:29,代码来源:blackberryblastforumsconnector.py

示例7: __getData

    def __getData(self, review, post_type ):
        """Return the page dict for one mailing-list style post.

        Parses title/date/author from the header <td> cells, then extracts
        the body while filtering out quoted lines. Title defaults to the
        first 50 chars of the body when the header title is empty.
        """
        page = {'title':'','posted_date':datetime.strftime(datetime.utcnow(),"%Y-%m-%dT%H:%M:%SZ")}
        try:
            page['et_data_post_type'] = post_type
            # Header cells: [0]=posted date, [1]=title, [2]=author.
            title_info = review.findAll('td')
            page['title'] = stripHtml(title_info[1].renderContents())
            page['posted_date'] = datetime.strftime(datetime.strptime(stripHtml\
                            (title_info[0].renderContents()),'%Y-%m-%d %H:%M'),\
                                                            "%Y-%m-%dT%H:%M:%SZ")

            page['et_author_name'] = stripHtml(title_info[2].renderContents())
        except:
            log.info(self.log_msg('title or posted date not found'))
        try:
            td_tag = review.findNext('tr')
            div_tag = td_tag.find('div')
            if div_tag:
                div_tag.extract()
            # Keep only lines that are not empty, not quoted ('>' prefix) and
            # not "X wrote:" / Polish "napisal(a):" attribution lines.
            page['data'] = '\n'.join([x for x in  stripHtml(td_tag.renderContents()).split('\n') if not x.strip()=='' and not x.strip().startswith('>') and not re.match('.*wrote:$',x.strip()) and not re.search('napisa.a:$',x.strip()) and not re.search('napisa.\(a\):$',x.strip())])
        except:
            log.exception(self.log_msg('Posted date not found for this post'))
            page['data'] = ''
        try:
            # Fall back to a truncated body as the title.
            if page['title']=='':
                if len(page['data']) > 50:
                    page['title'] = page['data'][:50] + '...'
                else:
                    page['title'] = page['data']
        except:
            log.exception(self.log_msg('title not found'))
            page['title'] = ''
        return page
开发者ID:jsyadav,项目名称:CrawlerFramework,代码行数:34,代码来源:moneygrupydyskusyjneconnector.py

示例8: __get_data

    def __get_data(self, post, is_original_post):
        """Build the page dict for one Wellsphere post or comment.

        Returns False when the extracted body is the 'Add as friend' widget
        text, i.e. no real post content was found.
        """
        page = {'entity':'question' if is_original_post else 'answer'}
        page['uri'] = self.currenturi

        # Posted date is not available on the page; stamp with utcnow.
        page['posted_date'] = datetime.strftime(datetime.utcnow(), "%Y-%m-%dT%H:%M:%SZ")

        author = self.__get_author_info(post, is_original_post)
        if author['name']:
            page['et_author_name'] = author['name']
        if author['location']:
            page['et_author_location'] = author['location']

        if is_original_post:
            # The original post body is the 4th non-empty text node.
            text_nodes = [piece.strip() for piece in post.findAll(text=True) if piece.strip()]
            page['data'] = stripHtml(text_nodes[3])
        else:
            #page['data'] = stripHtml([e.strip() for e in post.findAll(text=True) if e.strip()][1])
            page['data'] = stripHtml(str(post.find('div', 'textLine12 text12 commentBodyBlock')))

        if page['data'] == 'Add as friend':
            log.info(self.log_msg('post data could not be found in url %s' % self.currenturi))
            return False

        thread_title = stripHtml(str(self.soup.find('h1', 'text16 global-mfc pm0all textLine12')))
        if is_original_post:
            page['title'] = thread_title
        else:
            page['title'] = 'Re: ' + thread_title
        return page
开发者ID:jsyadav,项目名称:CrawlerFramework,代码行数:28,代码来源:wellsphereconnector.py

示例9: _getUserInfo

    def _getUserInfo(self,author_profile_link):
        """Fetch an ibibo user profile page and populate age, gender and
        location on self.current_page.

        author_profile_link: a myaccount.ibibo.com MyIbibo.aspx?uId=... url.
        Returns True on success, False when the profile fetch fails.
        """
        try:
            # Extract the uId and rewrite it into the public profile url.
            user_profile_id = re.findall("http://myaccount\.ibibo\.com/MyIbibo\.aspx\?uId=(.*)$",author_profile_link)[0]
            self.currenturi = "http://my.ibibo.com/Profile/view/" + user_profile_id
            res=self._getHTML(self.currenturi)
            self.rawpage=res['result']
            self._setCurrentPage()
            try:
                # Age is the text node immediately before the info span.
                self.current_page['ei_author_age']= str(int(stripHtml(self.soup.find('span',attrs={'id':'UserAgeSexLocationInfo'}).previous)))
            except:
                log.info(self.log_msg("Error occured while fetching author's age"))
           
            try:
                # The next span renders "Gender, Location ..."; first word is gender.
                self.current_page['et_author_gender']= stripHtml(self.soup.find('span',attrs={'id':'UserAgeSexLocationInfo'}).findNext('span').renderContents().replace(',','').replace('\n',' ').strip().split()[0])

            except:
                log.info(self.log_msg("Error occured while fetching author's gender"))

            try:
                # Remaining words of the same span form the location.
                self.current_page['et_author_location']= ' '.join(stripHtml(self.soup.find('span',attrs={'id':'UserAgeSexLocationInfo'}).findNext('span').renderContents().replace(',','').replace('\n',' ').strip().split()[1:]))
            except:
                log.info(self.log_msg("Error occured while fetching author's location"))

            log.debug("Fetched user info from the url %s" %author_profile_link)
            return True
        except:
            log.exception(self.log_msg("Exception occured while fetching user profile"))
            return False
开发者ID:jsyadav,项目名称:CrawlerFramework,代码行数:28,代码来源:ibiboopinionsconnector.py

示例10: __getData

 def __getData(self, post):
     """Extract one reseller review into a page dict.

     Returns None when the review body cannot be located; other fields are
     logged and defaulted individually.
     """
     page = {}
     try:
         # Body is the <p> immediately preceding the footer div.
         body_tag = post.find('div','footer').findPrevious('p')
         page['data'] = stripHtml(body_tag.renderContents())
         page['title'] = ''
     except:
         log.exception(self.log_msg('data not found'))
         return
     try:
         raw_date = stripHtml(post.find('div','wraptocenter').renderContents())
         raw_date = raw_date.split('posted')[-1].strip()
         parsed = datetime.strptime(raw_date, '%b-%d-%Y')
         page['posted_date'] = datetime.strftime(parsed, "%Y-%m-%dT%H:%M:%SZ")
     except:
         log.exception(self.log_msg('posted_date nt found %s'%self.currenturi))
         page['posted_date'] = datetime.strftime(datetime.utcnow(), "%Y-%m-%dT%H:%M:%SZ")
     try:
         author_link = post.find('div','user').find('a')
         page['et_author_name'] = stripHtml(author_link.renderContents())
     except:
         log.exception(self.log_msg('author_name not found %s'%self.currenturi))
     try:
         # Rating renders as "<score>/<max>"; keep the numerator.
         rating_text = stripHtml(post.find('div','rating').renderContents())
         page['ef_rating_overall'] = float(rating_text.split('/')[0])
     except:
         log.exception(self.log_msg('rating tag not found'))

     return page
开发者ID:jsyadav,项目名称:CrawlerFramework,代码行数:27,代码来源:resellerratingsconnector.py

示例11: __getAuthorInfo

 def __getAuthorInfo(self,page):
     '''Fetch the Naver KIN profile for page['et_author_name'] and enrich the
     page dict with question/answer counts, activity percentages, energy and
     rank. Always returns the (possibly enriched) page dict.
     '''
     try:
         self.currenturi = 'http://kin.naver.com/userinfo/index.php?member_id=%s'%page['et_author_name']
         log.info(self.currenturi)
         if not self.__setSoup():
             return page
     except:
         log.info(self.log_msg('author url not found'))
         return page
     try:
         # 'info_count' <dt>s pair with <dd>s holding numeric counts;
         # order assumed: questions, answers, referrals.
         aut_info = [int(re.sub('[^\d]','',stripHtml(x.findNext('dd').renderContents()))) for x in  self.soup.find('dl','info_count').findAll('dt')]
         page['ei_author_questions_count'] =aut_info[0]
         page['ei_author_answers_count'] =aut_info[1]
         page['ei_author_referals_count'] =aut_info[2]
     except:
         log.info(self.log_msg('author info count not found'))
     try:
         # 'info_graph' percentages; [:-1] drops the trailing '%' sign.
         aut_info = [float(stripHtml(x.findNext('dd').renderContents()[:-1])) for x in  self.soup.find('dl','info_graph').findAll('dd','graph')]
         page['ef_author_questioning_percentage'] = aut_info[0]
         page['ef_author_answering_percentage'] = aut_info[1]
         page['ef_author_writing_percentage'] = aut_info[2]
     except:
         log.info(self.log_msg('Author info not found , float '))
     try:
         # 'info_rank' <dd>s: [0]=energy, [1]=rank, digits only.
         aut_info = [stripHtml(x.renderContents()) for x in self.soup.find('dl','info_rank').findAll('dd')]
         page['ei_author_energy'] =int( re.sub('[^\d]','',aut_info[0]))
         page['ei_author_rank'] =int(re.sub('[^\d]','',aut_info[1]))
     except:
         log.info(self.log_msg('rank not found'))
     return page
开发者ID:jsyadav,项目名称:CrawlerFramework,代码行数:32,代码来源:naverconnector.py

示例12: __getData

    def __getData(self, review, post_type ):
        """Return the page dict for one PunBB-style forum post.

        Extracts author block (name, profile link, title, member-since,
        post count), optionally fetches the author's profile page for
        opinion/comment/rating stats (restoring self.soup/self.currenturi
        afterwards), then the posted date and body.
        """
        page = {'et_data_post_type':post_type}
##        try:
##            unique_kye = review.find('div','dt').findPrevious('a')['name']
##        except:
##            log.info(self.log_msg('unique not found'))
##            return False
        try:
            post_info = review.find('div','postleft')
            author_info = post_info.find('dt')
            page['et_author_name'] = stripHtml(author_info.renderContents())
            profile = author_info.find('a',href=True)
            if profile:
                page['et_author_profile'] = self.base_url + profile['href']
            page['et_author_title'] = stripHtml(post_info.find('dd','usertitle').renderContents())
            # Polish labels: 'Zarejestrowany:' = registered-since, 'Posty:' = posts.
            aut_info = ['Zarejestrowany:','Posty:']
            for each in aut_info:
                # NOTE(review): if the label is absent, find() returns None and
                # the startswith call below raises, aborting the whole author
                # block via the outer except — confirm this is intended.
                info_str = post_info.find('dd',text= re.compile( each+'.*'))#
                if info_str.startswith(aut_info[0]):
                    date_str = info_str.replace(aut_info[0],'').strip()
                    page['edate_author_member_since'] =  datetime.strftime(datetime.strptime(date_str, '%Y-%m-%d'),"%Y-%m-%dT%H:%M:%SZ")
                if info_str.startswith(aut_info[1]):
                    page['ei_author_posts_count'] = int(info_str.replace(aut_info[1],'').strip())
        except:
            log.info(self.log_msg('post info not found'))
        # Save soup/uri so the profile-page fetch below cannot clobber the
        # thread page we are in the middle of parsing.
        prev_soup = copy.copy(self.soup)
        prev_uri = self.currenturi
        try:
            self.currenturi = page['et_author_profile']
            if self.__setSoup():
                # Stats paragraphs on the profile page, skipping the first.
                author_stat = [int(stripHtml(x.find('span').renderContents())) for x in self.soup.find('div',id='column_center').findAll('p')[1:]]
                page['ei_author_opinions_count'] = author_stat[0]
                page['ei_author_comments_count'] = author_stat[1]
                page['ei_author_rating'] = author_stat[3]
        except:
            log.info(self.log_msg('Author info not found'))
        self.soup =copy.copy(prev_soup)
        self.currenturi = prev_uri
        try:
            date_str = stripHtml(review.find('h2').find('a').renderContents())
            page['posted_date'] = datetime.strftime(datetime.strptime(date_str,'%Y-%m-%d %H:%M:%S'),"%Y-%m-%dT%H:%M:%SZ")
        except:
            log.info(self.log_msg('Posted date not found'))
            page['posted_date'] = datetime.strftime(datetime.utcnow(),"%Y-%m-%dT%H:%M:%SZ")
        try:
            page['data'] =  stripHtml(review.find('div','postmsg').renderContents())
        except:
            log.exception(self.log_msg('Posted date not found for this post'))
            page['data'] = ''
        try:
            # Title defaults to a truncated body.
            if len(page['data']) > 50:
                page['title'] = page['data'][:50] + '...'
            else:
                page['title'] = page['data']
        except:
            log.exception(self.log_msg('title not found'))
            page['title'] = ''
        return page
开发者ID:jsyadav,项目名称:CrawlerFramework,代码行数:60,代码来源:oceanconnector.py

示例13: __getThreads

 def __getThreads( self ):
     """Get thread information from the listing page and create tasks.

     NOTE(review): Python 2 only ('except Exception, exce' syntax). The
     snippet as shown ends inside the per-thread loop (no task append or
     final return is visible) — it appears truncated by the source page.
     """
     # Each 'SUBJECT_STYLE' anchor marks one thread row; take its parent <tr>.
     threads = [each.findParent('tr') for each in self.soup.findAll('a', 
                                                         'SUBJECT_STYLE')]
     if not threads:
         log.info(self.log_msg('No Results in url %s'%self.currenturi))
         return False
     for thread in threads:
         if self.__total_threads_count > self.__max_threads_count:
             log.info(self.log_msg('Reaching maximum post,Return false'))
             return False
         self.__total_threads_count +=  1
         thread_info = thread.findAll('td', recursive=False)
         try:
             # Last cell holds the last-post date; collapse whitespace first.
             date_str = re.sub('\s+', ' ', stripHtml(thread_info[-1].\
                                                         renderContents()))
             thread_time = datetime.strptime(date_str, "%m-%d-%Y %I:%M %p")
         except Exception, exce:
             log.info(self.log_msg('Posted date not found in url %s'\
                                                         %self.currenturi))
             continue
         # Stop entirely once we reach already-crawled threads.
         if checkSessionInfo('Search', self.session_info_out, thread_time, 
                                     self.task.instance_data.get('update')):
             log.info(self.log_msg('Session info return True'))
             return False
         self.__last_timestamp = max(thread_time, self.__last_timestamp)
         temp_task =  self.task.clone()
         try:                
             # Title text after the last ':' (strips the board prefix).
             temp_task.pagedata['title'] = stripHtml(thread.find('a', \
                     'SUBJECT_STYLE').renderContents().strip()).split(':')\
                                                             [-1].strip()
         except Exception, exce:
             log.info(self.log_msg('Thread title not available'))
开发者ID:jsyadav,项目名称:CrawlerFramework,代码行数:34,代码来源:ficoforumsconnector.py

示例14: __getData

 def __getData(self, review):
     """Parse one Gizmodo comment div into a page dict.

     Returns False when the comment has no body text; otherwise returns the
     dict with title, author, posted_date, data and (optionally) the author
     this comment replies to.
     """
     page = {'title':self.__title_str}  # Title Changed
     author_anchor = review.find('a', 'avatar_link', href=True)
     if author_anchor:
         page['et_author_name'] = stripHtml(author_anchor.renderContents())
         page['et_author_profile'] = author_anchor['href']
     try:
         raw_date = stripHtml(review.find('a', 'avatar_time').renderContents())
         posted = datetime.strptime(raw_date, '%m/%d/%y')
     except:
         log.info(self.log_msg('posted date cannot be parsed in url %s'%self.currenturi))
         posted = datetime.utcnow()
     page['posted_date'] = datetime.strftime(posted, "%Y-%m-%dT%H:%M:%SZ")
     try:
         body_text = stripHtml(review.find('span' ,'ctedit').renderContents())
         # Replies start with '@name:'; record the addressee and strip the
         # prefix from the body.
         addressed = re.search('@\s*.+?:', body_text)
         if addressed:
             prefix = addressed.group() # Variable Fixed
             page['et_data_replied_author'] = prefix[1:-1].strip()
             body_text = body_text.replace(prefix, '', 1).strip()
         page['data'] = body_text
     except:
         log.info(self.log_msg('Data not found in url %s'%self.currenturi)) # Url Fixed
         page['data'] = ''
     if not page['data']:
         log.info(self.log_msg('Empty data is found for url %s'%self.currenturi)) # URL Fixed
         return False
     return page
开发者ID:jsyadav,项目名称:CrawlerFramework,代码行数:30,代码来源:gizmodoconnector.py

示例15: _getParentPage

    def _getParentPage(self,parent_uri):#NAMING CONVENTION IS WRONG
            """Parse the parent (question) page into a page dict: data, title,
            author name and view count.

            NOTE(review): the snippet as shown is truncated — the outer 'try:'
            has no visible matching except clause, so this block is not
            syntactically complete in this view.
            """
            ##J- I think these needs to be in a try except- if th title fails or rating fails - coz the html changed---what crash?
            ## a try-except-raise
            try:
                page={}
                try:
                    # Collapse runs of blank lines in the question body.
                    page['data']= re.sub('\n{2,}','\n',stripHtml(self.soup.find('dd',{'class':'rContent'}).renderContents()))
                except:
                    log.exception(self.log_msg('data could not be parsed'))
                    # NOTE(review): 'e' is unbound here — this bare except never
                    # binds it, so this raise itself throws NameError. Confirm
                    # whether 'except Exception, e' was intended.
                    raise e

                try:
                    page['title'] = stripHtml(self.soup.find('strong',{'id':'q_title'}).renderContents())
                except Exception, e:
                    log.exception(self.log_msg('could not parse page title'))
                    raise e

                try:
                    page['et_author_name'] = stripHtml(self.soup.find('p',{'class':'nickArea'}).a.renderContents())
                except:
                    log.info('could not parse author name')

                try:
                    page['ei_num_views'] = int(self.soup.find('span',{'id':'viewCount'}).renderContents())
                except Exception, e:
                    log.info(self.log_msg('could not parse number of views'))
开发者ID:jsyadav,项目名称:CrawlerFramework,代码行数:26,代码来源:daumknowledgeconnector.py


注:本文中的utils.utils.stripHtml函数示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。