当前位置: 首页>>代码示例>>Python>>正文


Python scrapy.Item方法代码示例

本文整理汇总了Python中scrapy.Item方法的典型用法代码示例。如果您正苦于以下问题:Python scrapy.Item方法的具体用法?Python scrapy.Item怎么用?Python scrapy.Item使用的例子?那么, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在scrapy的用法示例。


在下文中一共展示了scrapy.Item方法的12个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: test_process_item

# 需要导入模块: import scrapy [as 别名]
# 或者: from scrapy import Item [as 别名]
def test_process_item(self):
    """Verify that process_item routes known items through logger.info
    and unknown item classes through logger.warn.

    Each logger method is mocked to raise a sentinel exception, so
    reaching the right logging call is observable as that exception.
    """
    known_item = self._get_item()

    mock_spider = MagicMock()
    mock_spider.name = "link"

    # Known item type: process_item must reach logger.info.
    self.pipe.logger.info = MagicMock(side_effect=Exception("info"))
    with self.assertRaises(Exception) as ctx:
        self.pipe.process_item(known_item, mock_spider)
    self.assertEqual(str(ctx.exception), "info")

    # Unknown item type: process_item must fall through to logger.warn.
    class WeirdItem(Item):
        pass

    self.pipe.logger.warn = MagicMock(side_effect=Exception("warn"))
    with self.assertRaises(Exception) as ctx:
        self.pipe.process_item(WeirdItem(), mock_spider)
    self.assertEqual(str(ctx.exception), "warn")
开发者ID:istresearch,项目名称:scrapy-cluster,代码行数:26,代码来源:test_pipelines.py

示例2: process_spider_output

# 需要导入模块: import scrapy [as 别名]
# 或者: from scrapy import Item [as 别名]
def process_spider_output(self, response, result, spider):
    """Spider-middleware hook: run Stanford NER over the configured item
    field(s) and attach the accumulated entities under ``self.output_field``.

    Non-item results (e.g. Requests) are passed through untouched, as are
    items when ``self.field_to_process`` is neither a list nor a string.
    """
    for element in result:
        if not isinstance(element, (Item, dict)):
            yield element
            continue

        if isinstance(self.field_to_process, list):
            text = ' '.join(
                [element[field] for field in self.field_to_process]
            )
        elif isinstance(self.field_to_process, string_types):
            text = element[self.field_to_process]
        else:
            # BUG FIX: the original yielded the element here and then
            # fell through to the tagging code below with a stale or
            # undefined ``text``; skip straight to the next element.
            yield element
            continue

        tagger = StanfordNERTagger(
            model_filename=self.classifier,
            path_to_jar=self.jar_file
        )
        token_entity_pairs = tagger.tag(
            tokens=self.tokenizer(s=text)
        )
        accumulated = self.accumulate(token_entity_pairs)
        # setdefault: do not clobber a value the spider already set.
        element.setdefault(self.output_field, accumulated)
        yield element
开发者ID:vu3jej,项目名称:scrapy-corenlp,代码行数:26,代码来源:middlewares.py

示例3: process_spider_output

# 需要导入模块: import scrapy [as 别名]
# 或者: from scrapy import Item [as 别名]
def process_spider_output(self, response, result, spider):
    """Spider-middleware hook: stamp every scraped item with the scrape date.

    The timestamp comes from ``response.meta['stored_meta']['timestamp']``
    when a stored response provides one, otherwise "now".  Items carrying a
    humanized update date also get a parsed ``updated_time``.

    Must yield an iterable of Request, dict or Item objects.
    """
    ts = datetime.now()
    stored_meta = response.meta.get('stored_meta')
    if stored_meta and 'timestamp' in stored_meta:
        ts = datetime.fromtimestamp(stored_meta['timestamp'])

    for i in result:
        if isinstance(i, (dict, Item)):
            # BUG FIX: 'scraped_time' was assigned twice in a row; the
            # first (raw datetime) assignment was dead code, immediately
            # overwritten by the formatted string. Keep only the
            # effective assignment.
            i['scraped_time'] = ts.strftime('%d/%m/%Y')

            if 'DataAtualizacaoHumanizada' in i:
                updated = parse(i['DataAtualizacaoHumanizada'],
                                languages=['pt'],
                                settings={'RELATIVE_BASE': ts})
                i['updated_time'] = updated.strftime('%d/%m/%Y')
        yield i
开发者ID:pauloromeira,项目名称:realestate-scraper,代码行数:24,代码来源:middlewares.py

示例4: find_validators

# 需要导入模块: import scrapy [as 别名]
# 或者: from scrapy import Item [as 别名]
def find_validators(self, item):
    """Return the validators registered for *item*'s concrete class,
    falling back to the generic ``Item`` validators when none exist."""
    specific = self.validators.get(type(item).__name__, [])
    return specific if specific else self.validators.get(Item.__name__, [])
开发者ID:scrapinghub,项目名称:spidermon,代码行数:5,代码来源:pipelines.py

示例5: _add_errors_to_item

# 需要导入模块: import scrapy [as 别名]
# 或者: from scrapy import Item [as 别名]
def _add_errors_to_item(self, item, errors):
    """Merge validation *errors* into *item* under ``self.errors_field``.

    Supports both scrapy Items and plain dicts: for an Item the errors
    field is first declared on the item *class* (scrapy rejects writes
    to undeclared fields), for a dict the key is simply added.

    ``errors`` is expected to map field names to lists of messages.
    """
    try:
        # scrapy Item path: accessing ``__class__.fields`` / ``_values``
        # raises AttributeError for a plain dict, which selects the
        # fallback branch below.
        if self.errors_field not in item.__class__.fields:
            item.__class__.fields[self.errors_field] = Field()
        if self.errors_field not in item._values:
            item[self.errors_field] = defaultdict(list)
    except AttributeError:
        # The item is just a dict object instead of a Scrapy.Item object
        if self.errors_field not in item:
            item[self.errors_field] = defaultdict(list)
    # Accumulate messages per field; defaultdict(list) makes += safe
    # even for fields not seen before.
    for field_name, messages in errors.items():
        item[self.errors_field][field_name] += messages
开发者ID:scrapinghub,项目名称:spidermon,代码行数:14,代码来源:pipelines.py

示例6: test_default_item

# 需要导入模块: import scrapy [as 别名]
# 或者: from scrapy import Item [as 别名]
def test_default_item(self):
    """A bare ``Item`` with no declared schema must pass through the
    validation pipeline unchanged."""
    stats = self._get_stats_for_docs(valid_docs, True)
    pipeline = JsonSchemaValidatePipeline(stats)
    empty_item = Item()
    result = pipeline.process_item(empty_item, None)
    assert result == empty_item
开发者ID:scrapy-plugins,项目名称:scrapy-jsonschema,代码行数:8,代码来源:test_pipeline.py

示例7: parse_data

# 需要导入模块: import scrapy [as 别名]
# 或者: from scrapy import Item [as 别名]
def parse_data(self, data):
    """Recursively convert *data* into JSON-friendly primitives.

    Mappings (dicts and scrapy Items) and lists are converted
    element-wise; bytes are decoded, datetimes become ISO strings,
    numbers pass through, and anything else is stringified.
    """
    if isinstance(data, (dict, scrapy.Item)):
        return {
            self.parse_data(key): self.parse_data(value)
            for key, value in data.items()
        }
    if isinstance(data, list):
        return [self.parse_data(entry) for entry in data]
    if isinstance(data, bytes):
        return to_unicode(data)
    if isinstance(data, datetime):
        return data.isoformat()
    if isinstance(data, (int, float)):
        return data
    return str(data)
开发者ID:scrapinghub,项目名称:scrapy-autounit,代码行数:17,代码来源:cli.py

示例8: get_scrapy_item_classes

# 需要导入模块: import scrapy [as 别名]
# 或者: from scrapy import Item [as 别名]
def get_scrapy_item_classes():
    """
    Get a list of tuples containing (1) the class name and (2) the class for all of the Scrapy item
    classes defined in the crawling module.
    :return: A list of tuples containing (1) the class name and (2) the class for all of the Scrapy item
    classes defined in the crawling module.
    """
    # Imported for its side effect of defining the crawling item classes
    # before introspection runs (keep this before the scan).
    import lib.inspection.web.crawling.item
    import scrapy
    matches = IntrospectionHelper.get_all_classes_of_type(
        to_find=scrapy.Item,
        path="lib/inspection/web/crawling",
    )
    # De-duplicate via set, then hand back a plain list.
    return list(set(matches))
开发者ID:lavalamp-,项目名称:ws-backend-community,代码行数:15,代码来源:introspection.py

示例9: process_spider_exception

# 需要导入模块: import scrapy [as 别名]
# 或者: from scrapy import Item [as 别名]
def process_spider_exception(self, response, exception, spider):
    """Spider-middleware hook invoked when the spider or another
    middleware's process_spider_input() raises.

    Returning ``None`` leaves the exception to Scrapy's default
    handling; an iterable of Response, dict or Item objects could be
    returned instead to recover from the error.
    """
    return None
开发者ID:pauloromeira,项目名称:realestate-scraper,代码行数:9,代码来源:middlewares.py

示例10: create_item_class

# 需要导入模块: import scrapy [as 别名]
# 或者: from scrapy import Item [as 别名]
def create_item_class(class_name, fields_list):
    """Generic Item class creator populated from a list.

    Dynamically builds a ``DictItem`` subclass named *class_name* that
    declares one ``Field`` per entry of *fields_list*.
    """
    declared_fields = {field_name: Field() for field_name in fields_list}
    return type(str(class_name), (DictItem,), {'fields': declared_fields})
开发者ID:entrepreneur-interet-general,项目名称:OpenScraper,代码行数:10,代码来源:items.py

示例11: hmset_dict

# 需要导入模块: import scrapy [as 别名]
# 或者: from scrapy import Item [as 别名]
def hmset_dict(self, key, item):
    """Store *item* (a dict or scrapy Item) as the redis hash *key*.

    Raises TypeError for unsupported item types and ValueError for an
    empty item, since HMSET needs at least one field/value pair.
    """
    if not isinstance(item, (dict, Item)):
        raise TypeError("Error type: %s" % type(item))
    if not item:
        raise ValueError("item is empty")

    # Flatten {f1: v1, f2: v2} into (f1, v1, f2, v2) for HMSET.
    flattened = chain.from_iterable(item.items())
    return self.cli.hmset(key, *flattened)
开发者ID:Karmenzind,项目名称:fp-server,代码行数:11,代码来源:proxy.py

示例12: from_crawler

# 需要导入模块: import scrapy [as 别名]
# 或者: from scrapy import Item [as 别名]
def from_crawler(cls, crawler):
    """Build the validation pipeline from crawler settings.

    Returns a ``PassThroughPipeline`` when ``SPIDERMON_ENABLED`` is off.
    Otherwise loads JSON-schema and schematics validators from the
    ``SPIDERMON_VALIDATION_SCHEMAS`` / ``SPIDERMON_VALIDATION_MODELS``
    settings, keyed by item class name (a bare list/tuple setting applies
    to the base ``Item`` class).

    Raises NotConfigured when a setting has an unsupported type or when
    no validators could be loaded at all.
    """
    spidermon_enabled = crawler.settings.getbool("SPIDERMON_ENABLED")
    if not spidermon_enabled:
        return PassThroughPipeline()

    validators = defaultdict(list)
    allowed_types = (list, tuple, dict)

    def set_validators(loader, schema):
        # A bare list/tuple of schema paths applies to every Item.
        if type(schema) in (list, tuple):
            schema = {Item: schema}
        for obj, paths in schema.items():
            key = obj.__name__
            paths = paths if type(paths) in (list, tuple) else [paths]
            objects = [loader(v) for v in paths]
            validators[key].extend(objects)

    for loader, name in [
        (cls._load_jsonschema_validator, "SPIDERMON_VALIDATION_SCHEMAS"),
        (cls._load_schematics_validator, "SPIDERMON_VALIDATION_MODELS"),
    ]:
        res = crawler.settings.get(name)
        if not res:
            continue
        if type(res) not in allowed_types:
            # BUG FIX: the two implicitly-concatenated string literals
            # lacked a separating space and produced the garbled message
            # "...list/tupleis required".
            raise NotConfigured(
                "Invalid <{}> type for <{}> settings, dict or list/tuple "
                "is required".format(type(res), name)
            )
        set_validators(loader, res)

    if not validators:
        raise NotConfigured("No validators were found")

    return cls(
        validators=validators,
        stats=crawler.stats,
        drop_items_with_errors=crawler.settings.getbool(
            "SPIDERMON_VALIDATION_DROP_ITEMS_WITH_ERRORS"
        ),
        add_errors_to_items=crawler.settings.getbool(
            "SPIDERMON_VALIDATION_ADD_ERRORS_TO_ITEMS"
        ),
        errors_field=crawler.settings.get("SPIDERMON_VALIDATION_ERRORS_FIELD"),
    )
开发者ID:scrapinghub,项目名称:spidermon,代码行数:47,代码来源:pipelines.py


注:本文中的scrapy.Item方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。