本文整理汇总了Python中scrapy.Item方法的典型用法代码示例。如果您正苦于以下问题:Python scrapy.Item方法的具体用法?Python scrapy.Item怎么用?Python scrapy.Item使用的例子?那么, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类scrapy
的用法示例。
在下文中一共展示了scrapy.Item方法的12个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: test_process_item
# 需要导入模块: import scrapy [as 别名]
# 或者: from scrapy import Item [as 别名]
def test_process_item(self):
    """process_item must log known items via logger.info and unknown
    item classes via logger.warn; both logger methods are mocked to
    raise so the test can observe which path was taken.
    """
    item = self._get_item()
    spider = MagicMock()
    spider.name = "link"
    # Known item: the pipeline is expected to log through logger.info.
    # Using assertRaises instead of the try/assertFalse(True)/except
    # pattern: the old sentinel AssertionError was itself caught by the
    # broad `except Exception`, producing a confusing failure message.
    self.pipe.logger.info = MagicMock(side_effect=Exception("info"))
    with self.assertRaises(Exception) as ctx:
        self.pipe.process_item(item, spider)
    self.assertEqual(str(ctx.exception), "info")

    # Unknown item class: the pipeline is expected to warn instead.
    class WeirdItem(Item):
        pass

    item2 = WeirdItem()
    self.pipe.logger.warn = MagicMock(side_effect=Exception("warn"))
    with self.assertRaises(Exception) as ctx:
        self.pipe.process_item(item2, spider)
    self.assertEqual(str(ctx.exception), "warn")
示例2: process_spider_output
# 需要导入模块: import scrapy [as 别名]
# 或者: from scrapy import Item [as 别名]
def process_spider_output(self, response, result, spider):
    """Annotate scraped items with named entities from their text fields.

    For each Item/dict in *result*, the text from ``self.field_to_process``
    (a single field name or a list of field names) is run through a
    Stanford NER tagger and the accumulated entities are stored under
    ``self.output_field``.  Requests and items whose
    ``field_to_process`` has an unsupported type pass through untouched.
    """
    # Lazily built once: the original constructed a new tagger for
    # every single element, which is loop-invariant work.
    tagger = None
    for element in result:
        if not isinstance(element, (Item, dict)):
            yield element
            continue
        if isinstance(self.field_to_process, list):
            text = ' '.join(
                element[field] for field in self.field_to_process
            )
        elif isinstance(self.field_to_process, string_types):
            text = element[self.field_to_process]
        else:
            # BUG FIX: the original fell through after this yield and
            # ran the tagger with `text` undefined (NameError on the
            # first element, stale text afterwards) and then yielded
            # the element a second time.
            yield element
            continue
        if tagger is None:
            tagger = StanfordNERTagger(
                model_filename=self.classifier,
                path_to_jar=self.jar_file
            )
        token_entity_pairs = tagger.tag(
            tokens=self.tokenizer(s=text)
        )
        accumulated = self.accumulate(token_entity_pairs)
        element.setdefault(self.output_field, accumulated)
        yield element
示例3: process_spider_output
# 需要导入模块: import scrapy [as 别名]
# 或者: from scrapy import Item [as 别名]
def process_spider_output(self, response, result, spider):
    """Stamp each scraped item/dict with its scrape time.

    ``scraped_time`` is formatted as dd/mm/YYYY from either the
    timestamp stored in ``response.meta['stored_meta']`` (replay case)
    or the current time.  When the item carries a humanized Portuguese
    update date ('DataAtualizacaoHumanizada'), it is parsed relative to
    the scrape time into ``updated_time``.  Non-item results (e.g.
    Requests) are yielded unchanged.
    """
    ts = datetime.now()
    stored_meta = response.meta.get('stored_meta')
    if stored_meta and 'timestamp' in stored_meta:
        ts = datetime.fromtimestamp(stored_meta['timestamp'])
    for i in result:
        if isinstance(i, (dict, Item)):
            # BUG FIX: the original assigned the raw datetime and then
            # immediately overwrote it with the formatted string; the
            # first assignment was dead code and has been removed.
            i['scraped_time'] = ts.strftime('%d/%m/%Y')
            if 'DataAtualizacaoHumanizada' in i:
                updated = parse(i['DataAtualizacaoHumanizada'],
                                languages=['pt'],
                                settings={'RELATIVE_BASE': ts})
                i['updated_time'] = updated.strftime('%d/%m/%Y')
        yield i
示例4: find_validators
# 需要导入模块: import scrapy [as 别名]
# 或者: from scrapy import Item [as 别名]
def find_validators(self, item):
    """Return the validators registered for *item*'s class, falling
    back to validators registered for the generic ``Item`` base class.
    """
    def lookup(cls):
        # Validators are keyed by class *name*, not by the class object.
        return self.validators.get(cls.__name__, [])

    # PEP8 E731: replaced the original `find = lambda x: ...` assignment
    # with a named nested function; behavior is unchanged.
    return lookup(item.__class__) or lookup(Item)
示例5: _add_errors_to_item
# 需要导入模块: import scrapy [as 别名]
# 或者: from scrapy import Item [as 别名]
def _add_errors_to_item(self, item, errors):
try:
if self.errors_field not in item.__class__.fields:
item.__class__.fields[self.errors_field] = Field()
if self.errors_field not in item._values:
item[self.errors_field] = defaultdict(list)
except AttributeError:
# The item is just a dict object instead of a Scrapy.Item object
if self.errors_field not in item:
item[self.errors_field] = defaultdict(list)
for field_name, messages in errors.items():
item[self.errors_field][field_name] += messages
示例6: test_default_item
# 需要导入模块: import scrapy [as 别名]
# 或者: from scrapy import Item [as 别名]
def test_default_item(self):
    """A bare ``Item`` with no matching schema must pass through the
    validation pipeline unchanged."""
    pipeline = JsonSchemaValidatePipeline(
        self._get_stats_for_docs(valid_docs, True)
    )
    empty_item = Item()
    assert pipeline.process_item(empty_item, None) == empty_item
示例7: parse_data
# 需要导入模块: import scrapy [as 别名]
# 或者: from scrapy import Item [as 别名]
def parse_data(self, data):
    """Recursively normalise *data* into JSON-friendly primitives.

    Mappings (dicts and scrapy Items) and lists are converted
    element-by-element; bytes are decoded to text, datetimes become
    ISO-8601 strings, numbers pass through, and anything else falls
    back to ``str``.
    """
    if isinstance(data, (dict, scrapy.Item)):
        return {
            self.parse_data(key): self.parse_data(value)
            for key, value in data.items()
        }
    if isinstance(data, list):
        return [self.parse_data(entry) for entry in data]
    if isinstance(data, bytes):
        return to_unicode(data)
    if isinstance(data, datetime):
        return data.isoformat()
    if isinstance(data, (int, float)):
        return data
    return str(data)
示例8: get_scrapy_item_classes
# 需要导入模块: import scrapy [as 别名]
# 或者: from scrapy import Item [as 别名]
def get_scrapy_item_classes():
    """Collect every Scrapy item class defined in the crawling module.

    :return: A de-duplicated list of tuples of (1) the class name and
        (2) the class object for each ``scrapy.Item`` subclass found
        under ``lib/inspection/web/crawling``.
    """
    # Imported locally so the crawling module (and scrapy) are only
    # loaded when this helper is actually used.
    import lib.inspection.web.crawling.item
    import scrapy
    found = IntrospectionHelper.get_all_classes_of_type(
        to_find=scrapy.Item,
        path="lib/inspection/web/crawling",
    )
    return list(set(found))
示例9: process_spider_exception
# 需要导入模块: import scrapy [as 别名]
# 或者: from scrapy import Item [as 别名]
def process_spider_exception(self, response, exception, spider):
    """Spider-middleware exception hook.

    Called when the spider or another middleware's
    ``process_spider_input`` raises.  Returning ``None`` lets Scrapy
    continue its default exception handling; this middleware does not
    intercept errors.
    """
    return None
示例10: create_item_class
# 需要导入模块: import scrapy [as 别名]
# 或者: from scrapy import Item [as 别名]
def create_item_class(class_name, fields_list):
    """Dynamically build a ``DictItem`` subclass named *class_name*
    whose declared fields are the names listed in *fields_list*."""
    declared_fields = {field_name: Field() for field_name in fields_list}
    return type(str(class_name), (DictItem,), {'fields': declared_fields})
示例11: hmset_dict
# 需要导入模块: import scrapy [as 别名]
# 或者: from scrapy import Item [as 别名]
def hmset_dict(self, key, item):
    """Store *item* (a dict or scrapy ``Item``) as a Redis hash at *key*.

    Raises ``TypeError`` for unsupported item types and ``ValueError``
    for an empty item, since HMSET requires at least one field/value
    pair.
    """
    if not isinstance(item, (dict, Item)):
        raise TypeError("Error type: %s" % type(item))
    if not item:
        raise ValueError("item is empty")
    # Flatten {k1: v1, k2: v2} into (k1, v1, k2, v2) positional args.
    flattened = chain.from_iterable(item.items())
    return self.cli.hmset(key, *flattened)
示例12: from_crawler
# 需要导入模块: import scrapy [as 别名]
# 或者: from scrapy import Item [as 别名]
def from_crawler(cls, crawler):
    """Build the validation pipeline from the crawler's settings.

    Returns a no-op ``PassThroughPipeline`` when SPIDERMON_ENABLED is
    false.  Otherwise loads JSON-schema and schematics validators from
    the ``SPIDERMON_VALIDATION_SCHEMAS`` / ``SPIDERMON_VALIDATION_MODELS``
    settings and raises ``NotConfigured`` on a badly-typed setting or
    when no validators end up registered.
    """
    spidermon_enabled = crawler.settings.getbool("SPIDERMON_ENABLED")
    if not spidermon_enabled:
        return PassThroughPipeline()

    validators = defaultdict(list)
    allowed_types = (list, tuple, dict)

    def set_validators(loader, schema):
        # A bare list/tuple of schema paths applies to every Item class.
        if type(schema) in (list, tuple):
            schema = {Item: schema}
        for obj, paths in schema.items():
            key = obj.__name__
            paths = paths if type(paths) in (list, tuple) else [paths]
            objects = [loader(v) for v in paths]
            validators[key].extend(objects)

    for loader, name in [
        (cls._load_jsonschema_validator, "SPIDERMON_VALIDATION_SCHEMAS"),
        (cls._load_schematics_validator, "SPIDERMON_VALIDATION_MODELS"),
    ]:
        res = crawler.settings.get(name)
        if not res:
            continue
        if type(res) not in allowed_types:
            # BUG FIX: the original message concatenated to
            # "...list/tupleis required" (missing space between the
            # two string-literal parts).
            raise NotConfigured(
                "Invalid <{}> type for <{}> settings, dict or list/tuple "
                "is required".format(type(res), name)
            )
        set_validators(loader, res)

    if not validators:
        raise NotConfigured("No validators were found")

    return cls(
        validators=validators,
        stats=crawler.stats,
        drop_items_with_errors=crawler.settings.getbool(
            "SPIDERMON_VALIDATION_DROP_ITEMS_WITH_ERRORS"
        ),
        add_errors_to_items=crawler.settings.getbool(
            "SPIDERMON_VALIDATION_ADD_ERRORS_TO_ITEMS"
        ),
        errors_field=crawler.settings.get("SPIDERMON_VALIDATION_ERRORS_FIELD"),
    )