This article compiles typical usage examples of the Python method article.Article.parser_version. If you have been wondering what Article.parser_version does and how to use it in practice, the selected code example below may help; you can also explore the containing class, article.Article, in more depth.
The following presents 1 code example of the Article.parser_version method.
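As context for the example, here is a minimal sketch of the contract the code below relies on. This is an assumption for illustration, not the actual implementation in article.Article: it only assumes that parser_version() is callable on the class and returns a value that can be ordered against the parser_version value stored with each article row.

class Article:
    @staticmethod
    def parser_version():
        """ Return the current parser/grammar version; articles stored
            with an older version are candidates for re-parsing """
        # Hypothetical return value: the real method presumably derives
        # this from the parser or grammar actually in use
        return "1.0.0"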
Example 1: go
# Required import: from article import Article [as alias]
# Or: from article.Article import parser_version [as alias]
# The example below also uses Pool and cpu_count from the standard
# multiprocessing module, plus SessionContext, Root, ArticleRow and
# ArticleDescr from the surrounding scraper codebase (not shown here)
def go(self, reparse=False, limit=0, urls=None):
""" Run a scraping pass from all roots in the scraping database """
version = Article.parser_version()
# Go through the roots and scrape them, inserting into the articles table
with SessionContext(commit=True) as session:
if urls is None and not reparse:
def iter_roots():
""" Iterate the roots to be scraped """
for r in session.query(Root).filter(Root.scrape == True).all():
yield r
# Use a multiprocessing pool to scrape the roots
pool = Pool(4)
pool.imap_unordered(self._scrape_single_root, iter_roots())
pool.close()
pool.join()
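            # At this point every root front page has been scraped and
            # any newly discovered article URLs have been inserted into
            # the articles table, ready for the scraping phase below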
# noinspection PyComparisonWithNone
def iter_unscraped_articles():
""" Go through any unscraped articles and scrape them """
# Note that the query(ArticleRow) below cannot be directly changed
# to query(ArticleRow.root, ArticleRow.url) since
# ArticleRow.root is a joined subrecord
seq = 0
for a in (
session.query(ArticleRow)
.filter(ArticleRow.scraped == None)
.filter(ArticleRow.root_id != None)
.yield_per(100)
):
yield ArticleDescr(seq, a.root, a.url)
seq += 1
# Use a multiprocessing pool to scrape the articles
pool = Pool(8)
pool.imap_unordered(
self._scrape_single_article, iter_unscraped_articles()
)
pool.close()
pool.join()
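            # All previously unscraped articles have now been fetched;
            # the generator functions below select which articles to (re)parse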
# noinspection PyComparisonWithNone
def iter_unparsed_articles(reparse, limit):
""" Go through articles to be parsed """
# Fetch 100 rows at a time
# Note that the query(ArticleRow) below cannot be directly changed
# to query(ArticleRow.root, ArticleRow.url) since
# ArticleRow.root is a joined subrecord
q = session.query(ArticleRow).filter(ArticleRow.scraped != None)
if reparse:
# Reparse articles that were originally parsed with an older
# grammar and/or parser version
q = q.filter(ArticleRow.parser_version < version).order_by(
ArticleRow.parsed
)
else:
# Only parse articles that have no parse tree
q = q.filter(ArticleRow.tree == None)
q = q.filter(ArticleRow.root_id != None).yield_per(100)
if limit > 0:
# Impose a limit on the query, if given
q = q.limit(limit)
for seq, a in enumerate(q):
yield ArticleDescr(seq, a.root, a.url)
def iter_urls(urls):
""" Iterate through the text file whose name is given in urls """
seq = 0
with open(urls, "r") as f:
for url in f:
url = url.strip()
if url:
a = (
session.query(ArticleRow)
.filter(ArticleRow.url == url)
.one_or_none()
)
if a is not None:
# Found the article: yield it
yield ArticleDescr(seq, a.root, a.url)
seq += 1
# Use a multiprocessing pool to parse the articles.
# Let the pool work on chunks of articles, recycling the
# processes after each chunk to contain memory creep.
CPU_COUNT = cpu_count()
# Distribute the load between the CPUs, although never exceeding
# 100 articles per CPU per process cycle
if limit > 0:
CHUNK_SIZE = min(100 * CPU_COUNT, limit)
else:
CHUNK_SIZE = 100 * CPU_COUNT
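        # Example: on an 8-core machine with no limit given,
        # CHUNK_SIZE = 100 * 8 = 800 articles per process cycle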
if urls is None:
#......... (the rest of the code is omitted here) .........
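For completeness, a usage sketch follows. The class name Scraper and its construction are assumptions for illustration; only the go() keyword arguments are taken from the signature shown above.

# Hypothetical driver code; "Scraper" is an assumed name for the
# class that defines the go() method above
if __name__ == "__main__":
    scraper = Scraper()
    # Parse at most 50 articles that have no parse tree yet
    scraper.go(limit=50)
    # Re-parse articles parsed with an older grammar/parser version
    scraper.go(reparse=True)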