本文整理汇总了Python中scrapy.Request.meta["article_type"]方法的典型用法代码示例。如果您正苦于以下问题：Python Request.meta["article_type"]方法的具体用法？Python Request.meta["article_type"]怎么用？Python Request.meta["article_type"]使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类scrapy.Request的用法示例。
在下文中一共展示了Request.meta["article_type"]方法的1个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: parse_node
# Required import: from scrapy import Request
# Note: `meta` is a dict attribute of a Request instance and "article_type" is a key stored in it; it is not an importable name.
def parse_node(self, response, node):
    """Parse one article XML node into a PDF-scraping request or a built item.

    Namespaces are stripped from ``node`` first. Depending on whether
    ``response.meta["rich"]`` is truthy, the article type, DOIs, publication
    date and journal title are read with one of two sets of XPath
    expressions.

    Args:
        response: The response the node came from. Its ``meta`` dict is read
            for the ``"rich"`` flag and, when no follow-up request is made,
            is filled with the extracted fields before the item is built.
        node: Selector for a single article element.

    Returns:
        ``None`` when the article type is missing or is not in
        ``self.allowed_article_types``; a ``Request`` to the DOI resolver
        (callback ``self.scrape_for_pdf``) when at least one DOI was found
        and the journal title is in ``self.OPEN_ACCESS_JOURNALS``; otherwise
        whatever ``self.build_item_rich`` / ``self.build_item_jats`` returns.
    """
    node.remove_namespaces()
    is_rich = response.meta.get("rich")

    # Every XPath alternative below is relative (".//") so that matches stay
    # inside this node even when the node belongs to a larger document.
    # NOTE(review): an XPath union yields nodes in document order, so
    # extract_first() returns whichever title element appears first, not
    # necessarily the abbreviated one -- confirm that this is intended.
    if is_rich:
        article_type = node.xpath('./ArticleID/@Type').extract_first()
        dois = node.xpath('.//DOI/text()').extract()
        date_published = self._get_date_published_rich(node)
        journal_title = node.xpath(
            './/JournalShortTitle/text()|.//JournalTitle/text()'
        ).extract_first()
    else:
        article_type = node.xpath('@article-type').extract_first()
        dois = node.xpath(
            './/article-id[@pub-id-type="doi"]/text()'
        ).extract()
        date_published = self._get_published_date(node)
        journal_title = node.xpath(
            './/abbrev-journal-title/text()|.//journal-title/text()'
        ).extract_first()

    # Lazy %-style arguments: the message is only built if the record is
    # actually emitted.
    self.logger.info("Got article_type %s", article_type)

    if article_type is None or article_type not in self.allowed_article_types:
        # Filter out non-interesting article types.
        return None

    if dois and journal_title in self.OPEN_ACCESS_JOURNALS:
        # We should get the PDF only for open access journals.
        link = "http://dx.doi.org/" + dois[0]
        request = Request(link, callback=self.scrape_for_pdf)
        request.meta["record"] = node.extract()
        request.meta["article_type"] = article_type
        request.meta["dois"] = dois
        request.meta["rich"] = is_rich
        request.meta["date_published"] = date_published
        request.meta["journal_title"] = journal_title
        return request

    # No PDF to fetch: carry the extracted fields on the current response
    # and build the item right away.
    response.meta["record"] = node.extract()
    response.meta["article_type"] = article_type
    response.meta["dois"] = dois
    response.meta["date_published"] = date_published
    response.meta["journal_title"] = journal_title
    if is_rich:
        return self.build_item_rich(response)
    return self.build_item_jats(response)