

Python Request.meta["urls"] Code Examples

This article collects typical usage examples of `Request.meta["urls"]` from the Python library scrapy (strictly speaking, `meta` is a dict attribute of `Request`, and `"urls"` is simply a key the spiders below store in it). If you are wondering what `Request.meta["urls"]` is for, how to use it, or what it looks like in real code, the curated examples here should help. You can also explore further usage examples of the containing class, scrapy.Request.


The following shows five code examples of `Request.meta["urls"]`, ordered by popularity.
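All five examples share one pattern: the spider stores data (a list of URLs, a parsed record) on `request.meta` before yielding the request, and the callback reads it back from `response.meta`, because Scrapy copies the request's meta dict onto the response it produces. Below is a minimal self-contained sketch of that round trip; the spider name and start URL are placeholders, not taken from the examples.

# Minimal sketch of the request.meta -> response.meta round trip;
# the spider name and start URL are placeholders.
from scrapy import Request, Spider


class MetaDemoSpider(Spider):
    name = "meta_demo"
    start_urls = ["https://example.org/"]

    def parse(self, response):
        urls = response.xpath("//a/@href").extract()
        if not urls:
            return
        request = Request(response.urljoin(urls[0]),
                          callback=self.parse_detail)
        request.meta["urls"] = urls  # travels with the request
        yield request

    def parse_detail(self, response):
        # Scrapy copies the originating request's meta onto the response.
        yield {"url": response.url, "sibling_urls": response.meta["urls"]}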

Example 1: parse

# Required import: from scrapy import Request
# (this example also uses the standard-library json module)
# Usage pattern: request.meta["urls"] = ...
    def parse(self, response):
        """Parse Philpapers JSON file into a HEP record."""

        jsonresponse = json.loads(response.body_as_unicode())
        for jsonrecord in jsonresponse:
            urls_in_record = jsonrecord.get("links")
            if urls_in_record:
                link = urls_in_record[0]
                request = Request(link, callback=self.scrape_for_pdf)
                request.meta["urls"] = urls_in_record
                request.meta["jsonrecord"] = jsonrecord
                yield request
            else:
                # No links in this record: build the item directly,
                # stashing the data on response.meta instead.
                response.meta["urls"] = []
                response.meta["jsonrecord"] = jsonrecord
                yield self.build_item(response)
Author: gitter-badger, Project: hepcrawl, Lines: 18, Source: phil_spider.py
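The source never shows `scrape_for_pdf`; below is a plausible sketch of the callback side, reading back the meta keys set in `parse` above. The meta key names match the example, but the body and the XPath are assumptions.

    # Hypothetical callback for Example 1; not part of phil_spider.py.
    def scrape_for_pdf(self, response):
        """Scrape the splash page for PDF links, then build the record."""
        # meta["urls"] and meta["jsonrecord"] set in parse() are still
        # available on response.meta for build_item() to consume.
        pdf_links = response.xpath(
            "//a[contains(@href, '.pdf')]/@href").extract()
        response.meta["direct_links"] = pdf_links
        return self.build_item(response)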

Example 2: parse_node

# Required import: from scrapy import Request
# Usage pattern: request.meta["urls"] = ...
    def parse_node(self, response, node):
        """Parse Alpha web page into a HEP record."""
        authors = self.get_authors(node)
        title = node.xpath("./td[3]/span/span/text()").extract()
        date = node.xpath("./td[4]/span/span/text()").extract()
        urls = self.get_splash_links(node)

        response.meta["node"] = node
        response.meta["authors"] = authors
        response.meta["title"] = title
        response.meta["date"] = date
        if not urls:
            return self.build_item(response)

        request = Request(urls[0], callback=self.scrape_for_pdf)
        request.meta["node"] = node
        request.meta["authors"] = authors
        request.meta["urls"] = urls
        request.meta["title"] = title
        request.meta["date"] = date
        return request
Author: ksachs, Project: hepcrawl, Lines: 23, Source: t2k_spider.py
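The five separate `request.meta[...]` assignments in Example 2 can equivalently be passed as a single `meta` dict to the `Request` constructor, keeping the request construction in one expression:

        # Equivalent ending for Example 2, using Request's meta keyword.
        request = Request(
            urls[0],
            callback=self.scrape_for_pdf,
            meta={
                "node": node,
                "authors": authors,
                "urls": urls,
                "title": title,
                "date": date,
            },
        )
        return request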

Example 3: parse_node

# Required import: from scrapy import Request
# Usage pattern: request.meta["urls"] = ...
    def parse_node(self, response, node):
        """Iterate through all the record nodes in the XML.

        With each node it checks if direct link exists, and sends
        a request to scrape the direct link or calls build_item() to build
        the HEPrecord.
        """
        urls_in_record = self.get_urls_in_record(node)
        direct_link = self.find_direct_links(urls_in_record)

        if not direct_link and urls_in_record:
            # Probably all links lead to same place, so take first
            link = urls_in_record[0]
            request = Request(link, callback=self.scrape_for_pdf)
            request.meta["urls"] = urls_in_record
            request.meta["record"] = node.extract()
            return request
        elif direct_link:
            response.meta["direct_link"] = direct_link
            response.meta["urls"] = urls_in_record
            response.meta["record"] = node.extract()
            return self.build_item(response)
Author: bittirousku, Project: hepcrawl, Lines: 24, Source: base_spider.py
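Example 3 keeps the full `urls_in_record` list in meta but only ever requests the first entry, on the stated assumption that all links lead to the same place. If that assumption can fail, the list carried in `meta["urls"]` makes a fallback straightforward: attach an `errback` to the request and retry the next URL on failure. A hedged sketch of such a fallback follows; it is not in base_spider.py, and the original `Request` would also need `errback=self.errback_try_next_url`.

    # Hypothetical fallback: on download failure, try the next URL
    # from the list carried in meta["urls"].
    def errback_try_next_url(self, failure):
        meta = failure.request.meta
        remaining = meta["urls"][1:]
        if not remaining:
            return
        request = Request(
            remaining[0],
            callback=self.scrape_for_pdf,
            errback=self.errback_try_next_url,
        )
        request.meta["urls"] = remaining
        request.meta["record"] = meta["record"]
        return request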

Example 4: parse_node

# Required import: from scrapy import Request
# Usage pattern: request.meta["urls"] = ...
    def parse_node(self, response, node):
        """Parse MAGIC web page into a HEP record."""

        urls = self.get_splash_links(node)
        title = node.xpath(".//a/text()").extract_first()
        author_date = node.xpath(".//br/following-sibling::text()").extract()
        try:
            date = author_date[1].strip().strip("()")
        except IndexError:
            date = ''

        if not urls:
            response.meta["title"] = title
            response.meta["date"] = date
            return self.build_item(response)

        request = Request(urls[0], callback=self.scrape_for_pdf)
        request.meta["urls"] = urls
        request.meta["title"] = title
        request.meta["date"] = date
        request.meta["handle_httpstatus_list"] = self.ERROR_CODES
        return request
Author: gitter-badger, Project: hepcrawl, Lines: 24, Source: magic_spider.py
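The `handle_httpstatus_list` meta key in Example 4 tells Scrapy's built-in HttpErrorMiddleware to deliver responses with those status codes (whatever `self.ERROR_CODES` contains, e.g. `[404, 500]`) to the callback instead of discarding them, so the spider can still build a record when a splash page is unreachable. A sketch of how the callback might use that follows; the real `scrape_for_pdf` is not shown in the source, so the body is an assumption.

    # Hypothetical status check inside scrape_for_pdf.
    def scrape_for_pdf(self, response):
        if response.status in self.ERROR_CODES:
            # Splash page failed: build the record from meta alone.
            return self.build_item(response)
        response.meta["direct_links"] = response.xpath(
            "//a[contains(@href, '.pdf')]/@href").extract()
        return self.build_item(response)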

Example 5: parse_node

# Required import: from scrapy import Request
# Usage pattern: request.meta["urls"] = ...
    def parse_node(self, response, node):
        """Iterate through all the record nodes in the XML.

        With each node it checks if splash page link exists, and sends
        a request to scrape the abstract or calls `build_item` to build
        the HEPrecord.
        """
        urls_in_record = self.get_urls_in_record(node)
        direct_links, splash_links = self.find_direct_links(urls_in_record)
        if not splash_links:
            response.meta["urls"] = urls_in_record
            response.meta["record"] = node.extract()
            if direct_links:
                response.meta["direct_links"] = direct_links
            return self.build_item(response)

        link = splash_links[0]
        request = Request(link, callback=self.scrape_for_abstract)
        request.meta["urls"] = urls_in_record
        request.meta["record"] = node.extract()
        if direct_links:
            request.meta["direct_links"] = direct_links
        return request
Author: gitter-badger, Project: hepcrawl, Lines: 25, Source: dnb_spider.py
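Every example ultimately funnels its accumulated meta into `build_item`. A plausible sketch of how such a method might read those keys back is shown below; the field names follow the examples, but the dict-based record is an assumption, since the real spiders build a HEP record.

    # Hypothetical build_item: assemble a record from response.meta.
    def build_item(self, response):
        meta = response.meta
        return {
            "urls": meta.get("urls", []),
            "direct_links": meta.get("direct_links", []),
            "record": meta.get("record"),
            "title": meta.get("title"),
            "date": meta.get("date"),
        }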


Note: the scrapy.Request.meta["urls"] examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other open-source code and documentation platforms. The snippets were selected from open-source projects contributed by many programmers; copyright in the source code belongs to the original authors. Consult the corresponding project's License before distributing or using the code, and do not repost this article without permission.