本文整理汇总了Python中article.Article.scrape_from_url方法的典型用法代码示例。如果您正苦于以下问题:Python Article.scrape_from_url方法的具体用法?Python Article.scrape_from_url怎么用?Python Article.scrape_from_url使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类article.Article
的用法示例。
在下文中一共展示了Article.scrape_from_url方法的2个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: page
# 需要导入模块: from article import Article [as 别名]
# 或者: from article.Article import scrape_from_url [as 别名]
def page():
    """ Handler for a page displaying the parse of an arbitrary web
        page by URL or an already scraped article by UUID.

        Query parameters:
            url -- web page URL to scrape and parse (takes priority)
            id  -- UUID of an already scraped article

        Returns a rendered "page.html" template, or a redirect to the
        main route if neither parameter is usable. """
    url = request.args.get("url", None)
    uuid = request.args.get("id", None)
    if url:
        # Cap the URL length to guard against abuse
        url = url.strip()[0:_MAX_URL_LENGTH]
    if uuid:
        uuid = uuid.strip()[0:_MAX_UUID_LENGTH]
    if url:
        # URL has priority, if both are specified
        uuid = None
    if not url and not uuid:
        # !!! TODO: Separate error page
        return redirect(url_for("routes.main"))
    with SessionContext(commit=True) as session:
        if uuid:
            a = ArticleProxy.load_from_uuid(uuid, session)
        elif url.startswith("http:") or url.startswith("https:"):
            # Only scrape plain HTTP(S) URLs
            a = ArticleProxy.scrape_from_url(url, session)  # Forces a new scrape
        else:
            a = None
        if a is None:
            # !!! TODO: Separate error page
            return redirect(url_for("routes.main"))
        # Prepare the article for display (may cause it to be parsed and stored)
        a.prepare(session, verbose=True, reload_parser=True)
        register = a.create_register(session, all_names=True)
        # Fetch names of article topics, if any
        topics = (
            session.query(ArticleTopic).filter(ArticleTopic.article_id == a.uuid).all()
        )
        topics = [dict(name=t.topic.name, id=t.topic.identifier) for t in topics]
        # Fetch similar (related) articles, if any
        DISPLAY = 10  # Display at most 10 matches
        similar = Search.list_similar_to_article(session, a.uuid, n=DISPLAY)
        return render_template(
            "page.html", article=a, register=register, topics=topics, similar=similar
        )
示例2: scrape_article
# 需要导入模块: from article import Article [as 别名]
# 或者: from article.Article import scrape_from_url [as 别名]
def scrape_article(self, url, helper):
    """ Scrape a single article, retrieving its HTML and metadata.

        Parameters:
            url    -- the article URL to fetch
            helper -- site-specific scraping helper; its skip_url()
                      decides whether the URL should be scraped at all

        Side effects: stores the scraped article in the database via
        the session; logs progress and timing. Returns None. """
    if helper.skip_url(url):
        logging.info("Skipping article {0}".format(url))
        return
    # Fetch the root URL and scrape all child URLs that refer
    # to the same domain suffix and we haven't seen before
    logging.info("Scraping article {0}".format(url))
    t0 = time.time()
    with SessionContext(commit=True) as session:
        a = Article.scrape_from_url(url, session)
        if a is not None:
            # Persist the scraped article; the session commits on exit
            a.store(session)
    t1 = time.time()
    logging.info("Scraping completed in {0:.2f} seconds".format(t1 - t0))