

Python Article.parser_version Method Code Examples

This article collects typical usage examples of the Python method article.Article.parser_version. If you are unsure what Article.parser_version does or how to call it, the curated example below may help. You can also explore other usage examples from the containing class, article.Article.


One code example of the Article.parser_version method is shown below.
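Before the full example, here is a minimal sketch of the typical pattern: Article.parser_version() is called on the class to obtain the current parser/grammar version, which is then compared against the version stored with a previously parsed article to decide whether it needs reparsing. The ArticleRow name and its parser_version column are taken from the example below; needs_reparse is a hypothetical helper added here purely for illustration.

from article import Article

def needs_reparse(article_row):
    """ Return True if the stored parse predates the current parser version """
    # Article.parser_version() is invoked on the class, as in Example 1 below
    return article_row.parser_version < Article.parser_version()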

Example 1: go

# Required import: from article import Article [as alias]
# The parser_version method is then called on the Article class
    def go(self, reparse=False, limit=0, urls=None):
        """ Run a scraping pass from all roots in the scraping database """

        version = Article.parser_version()

        # Go through the roots and scrape them, inserting into the articles table
        with SessionContext(commit=True) as session:

            if urls is None and not reparse:

                def iter_roots():
                    """ Iterate the roots to be scraped """
                    for r in session.query(Root).filter(Root.scrape == True).all():
                        yield r

                # Use a multiprocessing pool to scrape the roots

                pool = Pool(4)
                pool.imap_unordered(self._scrape_single_root, iter_roots())
                pool.close()
                pool.join()

                # noinspection PyComparisonWithNone
                def iter_unscraped_articles():
                    """ Go through any unscraped articles and scrape them """
                    # Note that the query(ArticleRow) below cannot be directly changed
                    # to query(ArticleRow.root, ArticleRow.url) since
                    # ArticleRow.root is a joined subrecord
                    seq = 0
                    for a in (
                        session.query(ArticleRow)
                        .filter(ArticleRow.scraped == None)
                        .filter(ArticleRow.root_id != None)
                        .yield_per(100)
                    ):
                        yield ArticleDescr(seq, a.root, a.url)
                        seq += 1

                # Use a multiprocessing pool to scrape the articles

                pool = Pool(8)
                pool.imap_unordered(
                    self._scrape_single_article, iter_unscraped_articles()
                )
                pool.close()
                pool.join()

            # noinspection PyComparisonWithNone
            def iter_unparsed_articles(reparse, limit):
                """ Go through articles to be parsed """
                # Fetch 100 rows at a time
                # Note that the query(ArticleRow) below cannot be directly changed
                # to query(ArticleRow.root, ArticleRow.url) since
                # ArticleRow.root is a joined subrecord
                q = session.query(ArticleRow).filter(ArticleRow.scraped != None)
                if reparse:
                    # Reparse articles that were originally parsed with an older
                    # grammar and/or parser version
                    q = q.filter(ArticleRow.parser_version < version).order_by(
                        ArticleRow.parsed
                    )
                else:
                    # Only parse articles that have no parse tree
                    q = q.filter(ArticleRow.tree == None)
                q = q.filter(ArticleRow.root_id != None).yield_per(100)
                if limit > 0:
                    # Impose a limit on the query, if given
                    q = q.limit(limit)
                for seq, a in enumerate(q):
                    yield ArticleDescr(seq, a.root, a.url)

            def iter_urls(urls):
                """ Iterate through the text file whose name is given in urls """
                seq = 0
                with open(urls, "r") as f:
                    for url in f:
                        url = url.strip()
                        if url:
                            a = (
                                session.query(ArticleRow)
                                .filter(ArticleRow.url == url)
                                .one_or_none()
                            )
                            if a is not None:
                                # Found the article: yield it
                                yield ArticleDescr(seq, a.root, a.url)
                                seq += 1

            # Use a multiprocessing pool to parse the articles.
            # Let the pool work on chunks of articles, recycling the
            # processes after each chunk to contain memory creep.

            CPU_COUNT = cpu_count()
            # Distribute the load between the CPUs, although never exceeding
            # 100 articles per CPU per process cycle
            if limit > 0:
                CHUNK_SIZE = min(100 * CPU_COUNT, limit)
            else:
                CHUNK_SIZE = 100 * CPU_COUNT
            if urls is None:
#......... the rest of the code is omitted here .........
Developer ID: vthorsteinsson, Project: Reynir, Code lines: 103, Source file: scraper.py
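The listing above is cut off before the chunked parsing loop, but the comments near its end describe the pattern: work through the articles CHUNK_SIZE at a time, creating a fresh multiprocessing Pool for each chunk so that worker processes are recycled and memory creep stays bounded. The following is a standalone sketch of that general pattern under the assumptions stated in those comments, not the omitted Reynir code; process_in_chunks and its arguments are illustrative names.

from itertools import islice
from multiprocessing import Pool, cpu_count

def process_in_chunks(work_func, items, chunk_size):
    """ Consume an iterable in chunks, recycling the pool between chunks """
    # Illustrative sketch only -- the actual loop in scraper.py is omitted above
    it = iter(items)
    while True:
        chunk = list(islice(it, chunk_size))
        if not chunk:
            break
        # A fresh pool per chunk releases worker memory once the chunk is done;
        # imap_unordered plus close()/join() mirrors how the example above
        # dispatches work without consuming the result iterator
        pool = Pool(cpu_count())
        pool.imap_unordered(work_func, chunk)
        pool.close()
        pool.join()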


Note: The article.Article.parser_version examples in this article were collected from open-source code and documentation platforms such as GitHub and MSDocs. Copyright of the source code remains with the original authors; refer to the corresponding project's license before distributing or reusing it.