

Python Menu.sortLinks Method Code Examples

This article collects typical usage examples of the Menu.Menu.sortLinks method in Python. If you are wondering what Menu.sortLinks does, how to call it, or where to find working examples, the selected code example below should help. You can also explore other usage examples of the containing class, Menu.Menu.


One code example of the Menu.sortLinks method is shown below; examples are sorted by popularity by default. You can upvote the examples you like or find useful, and your votes help the system recommend better Python code examples.

Example 1: __init__

# Required import: from Menu import Menu [as alias]
# Or: from Menu.Menu import sortLinks [as alias]
# Imports the class below relies on (Python 2). The Evaluate class used in
# extractArticle is assumed to be defined elsewhere in the shrinkr project
# and is not shown in this excerpt.
import robotparser
import urllib2
from collections import OrderedDict
from urlparse import urlparse

from bs4 import BeautifulSoup, Comment, NavigableString

from Menu import Menu

class ContentExtractor:

    def __init__(self, extractorUrl=None):
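        # Initial state: tags to strip, candidate containers and their scores,
        # a robots.txt parser, and the user-agent string sent with each request.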
        self.garbageTags = ['script', 'style', 'noscript', 'form', 'input']
        self.articleContainer = None
        self.containers = {}
        self.url = ''
        self.extractorUrl = extractorUrl
        self.robotparser = robotparser.RobotFileParser()
        self.userAgentString = 'Shrinkr/0.9 (http://shrinkr.jonathanjanssens.com/about.php)'
        self.urlComponents = urlparse(self.url)
        self.head = ''
        self.allLinks = []

    def read(self, url):
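        # Fetch the page after normalising the URL, refuse to proceed if
        # robots.txt disallows our user agent, then parse and clean the HTML.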
        self.url = url
        self.fixUrl()
        self.robotparser.set_url('%s://%s/robots.txt' % (self.urlComponents.scheme, self.urlComponents.netloc))
        self.robotparser.read()
        if self.robotparser.can_fetch(self.userAgentString, self.url) is False:
            raise RuntimeError('Shrinkr is blocked by the robots.txt file for this site.')
        opener = urllib2.build_opener()
        opener.addheaders = [('User-agent', self.userAgentString)]
        html = opener.open(self.url)
        self.soup = BeautifulSoup(html)
        self.clean()

    def fixUrl(self):
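        # Prepend http:// when the URL has no scheme, then re-parse its components.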
        if self.url[:4] != 'http':
            self.url = 'http://%s' % (self.url)
        self.urlComponents = urlparse(self.url)


    def clean(self):
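        # Drop HTML comments and unwanted tags, detach the <head>, collect every
        # anchor with an href, and rewrite relative links to absolute ones.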
        comments = self.soup.find_all(text=lambda text: isinstance(text, Comment))
        for comment in comments:
            comment.extract()
        for tag in self.garbageTags:
            for element in self.soup.find_all(tag):
                element.decompose()
        self.head = self.soup.find('head').extract()
        self.allLinks = self.soup.find_all('a', href=True)
        self.fixRelativeUrls()

    def makeMenu(self):
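        # Build a Menu from all collected links, order it via Menu.sortLinks(),
        # render it to HTML, and load the controls template for the original URL.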
        self.menu = Menu(self.allLinks, self.urlComponents.netloc, extractorUrl=self.extractorUrl)
        self.menu.sortLinks()
        self.menuHTML = self.menu.make()
        with open('tpl/controls.tpl', 'r') as html:
            self.controlsHTML = html.read().replace('{{ orginal_url }}', self.url)

    def extractArticle(self):
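        # Score every tag (paragraphs directly, other tags as containers of
        # their children), sort containers by score, and keep the top scorer.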
        if self.extractorUrl is not None:
            self.extractLinkedArticles()
        for tag in self.soup.find_all()[::-1]:
            evaluate = Evaluate()
            if tag.name == 'p':
                evaluate.asParagraph(tag)
            else:
                for child in tag.children:
                    if isinstance(child, NavigableString):
                        text = unicode(child).strip()
                        if len(text) > 10:
                            evaluate.asParagraph(child)
                            continue
                    else:
                        evaluate.asContainer(child)
                evaluate.asContainer(tag)
                self.containers[tag] = evaluate.score
        self.containers = OrderedDict(sorted(self.containers.items(), key=lambda t: t[1])) # sort based on value (ASC)
        
        self.articleContainer = self.containers.popitem()[0]

    def getExtractedArticle(self, prependHead=True):
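        # Assemble the final article: inject the menu, controls, CSS and JS,
        # and optionally prepend the original <head> markup.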
        article = ''
        # append menu and controls to self.articleContainer
        self.articleContainer.insert(1, self.menuHTML)
        self.articleContainer.insert(2, self.controlsHTML)
        # insert the css into self.head and append the js at the end of the article container
        with open('static/shrinkr.css', 'r') as css:
            self.head.insert(2, '<style>%s</style>' % css.read())
        with open('static/shrinkr.js', 'r') as js:
            self.articleContainer.insert(len(self.articleContainer.contents), '<script>%s</script>' % js.read())
        #
        if prependHead is True:
            article = u'\n'.join(map(unicode, self.head))
        article += u'\n'.join(map(unicode, self.articleContainer))
        return article

    def getExtractedArticleText(self):
        return self.articleContainer.get_text()

    def fixRelativeUrls(self):
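        # Resolve fragment- and query-only hrefs against the current page and
        # all other relative hrefs against the site root.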
        for a in self.soup.find_all('a', href=True):
            if a['href'][:4] != 'http':
                if a['href'][:1] == '#' or a['href'][:1] == '?':
                    a['href'] = '%s://%s%s%s' % (self.urlComponents.scheme, self.urlComponents.netloc, self.urlComponents.path, a['href'])
                else:
                    a['href'] = '%s://%s/%s' % (self.urlComponents.scheme, self.urlComponents.netloc, a['href'])

#......... remaining code omitted here .........
Developer ID: gvsurenderreddy, Project: shrinkr, Lines of code: 103, Source file: ContentExtractor.py
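
Below is a minimal, hypothetical usage sketch of the class above, showing where Menu.sortLinks is exercised. The URL is purely illustrative, and it assumes the shrinkr project's Evaluate class, tpl/ templates and static/ assets are available on disk:

extractor = ContentExtractor()
extractor.read('http://example.com/some-article')  # fetch, honour robots.txt, parse and clean
extractor.makeMenu()                               # builds the Menu and calls Menu.sortLinks()
extractor.extractArticle()                         # scores containers and picks the best one
print extractor.getExtractedArticle()              # article HTML with menu, controls, css and js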

