This article collects typical usage examples of the Python method scrapy.exceptions.DropItem. If you are unsure what exceptions.DropItem does, how to use it, or what real-world usage looks like, the curated code samples below should help. You can also read further about the containing module, scrapy.exceptions.
Below are 15 code examples of exceptions.DropItem, sorted roughly by popularity.
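As a quick orientation before the examples: DropItem is raised inside an item pipeline's process_item method to discard an item; Scrapy logs the drop and skips the remaining pipelines for that item. The following is a minimal, hypothetical sketch (the pipeline class PriceValidationPipeline and the price field are illustrative, not taken from the examples below):

from scrapy.exceptions import DropItem

class PriceValidationPipeline:
    def process_item(self, item, spider):
        # Drop items without a usable price; everything else passes through unchanged.
        if not item.get('price'):
            raise DropItem("Missing price in %s" % item)
        return item

The pipeline only takes effect once it is enabled in the project's settings.py, e.g. ITEM_PIPELINES = {'myproject.pipelines.PriceValidationPipeline': 300} (the module path and priority value here are placeholders).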
Example 1: process_item
# Required import: from scrapy import exceptions [as alias]
# Or: from scrapy.exceptions import DropItem [as alias]
def process_item(self, item, spider):
    if spider.name not in ['meituan']:
        return item
    if self.filter_dic.get(item['restaurant_name']) == item['address']:
        print(item['restaurant_name'])
        print(item['address'])
        raise DropItem("Duplicate item found: %s" % item)
    else:
        self.filter_dic[item['restaurant_name']] = item['address']
    try:
        item['lng'], item['lat'] = gaode_to_baidu(float(item['lng']), float(item['lat']))
        item['province_code'] = pinyin.get(item['province'])
        item['city_code'] = pinyin.get(item['city'])
        item['region_code'] = pinyin.get(item['region'])
        item['area_code'] = pinyin.get(item['area'])
    except BaseException as e:
        print(e)
    return item
Example 2: item_completed
# Required import: from scrapy import exceptions [as alias]
# Or: from scrapy.exceptions import DropItem [as alias]
def item_completed(self, results, item, info):
    result = {}
    for n, r in enumerate(results):
        ok, x = r
        if ok:
            result[x["url"]] = x["path"]
        else:
            result[item[self.URLS_NAME][n]] = x.getErrorMessage()
    # TODO: Save the result
    # file_paths = [x['path'] for ok, x in results if ok]
    # if not file_paths:
    #     raise DropItem("Item contains no files")
    # item['image_paths'] = file_paths
    # return item
    return super(GroupDownPipelineMinix, self).item_completed(results, item, info)
Example 3: process_item
# Required import: from scrapy import exceptions [as alias]
# Or: from scrapy.exceptions import DropItem [as alias]
def process_item(self, item, spider):
    if not isinstance(item, JsonSchemaItem):
        return item
    errors = list(item.validator.iter_errors(dict(item)))
    paths_messages = []
    for error in errors:
        absolute_path = list(error.absolute_path)
        # error path is not available when required field is not filled
        # so we parse error message. Nasty.
        required_match = self.REQUIRED_RE.search(error.message)
        if required_match:
            absolute_path.append(required_match.group(1))
        path = '.'.join(map(str, absolute_path))
        self.stats.inc_value(self.STAT_FMT.format(field=path))
        paths_messages.append((path, error.message))
    if errors:
        error_msg = ''
        for path, message in paths_messages:
            error_msg += u'{}: {}\n'.format(path, message)
        raise DropItem(u'schema validation failed: \n {}'.format(error_msg))
    return item
Example 4: _itemproc_finished
# Required import: from scrapy import exceptions [as alias]
# Or: from scrapy.exceptions import DropItem [as alias]
def _itemproc_finished(self, output, item, response, spider):
    """ItemProcessor finished for the given ``item`` and returned ``output``
    """
    self.slot.itemproc_size -= 1
    if isinstance(output, Failure):
        ex = output.value
        if isinstance(ex, DropItem):
            logkws = self.logformatter.dropped(item, ex, response, spider)
            logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
            return self.signals.send_catch_log_deferred(
                signal=signals.item_dropped, item=item, response=response,
                spider=spider, exception=output.value)
        else:
            logger.error('Error processing %(item)s', {'item': item},
                         exc_info=failure_to_exc_info(output),
                         extra={'spider': spider})
            return self.signals.send_catch_log_deferred(
                signal=signals.item_error, item=item, response=response,
                spider=spider, failure=output)
    else:
        logkws = self.logformatter.scraped(output, response, spider)
        logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
        return self.signals.send_catch_log_deferred(
            signal=signals.item_scraped, item=output, response=response,
            spider=spider)
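Example 4 is taken from Scrapy's own engine code: when a pipeline raises DropItem, the scraper logs the drop and emits the signals.item_dropped signal instead of treating it as an error. A minimal sketch of listening for that signal, assuming a made-up extension class DropCounter registered via the EXTENSIONS setting:

from scrapy import signals

class DropCounter:
    @classmethod
    def from_crawler(cls, crawler):
        ext = cls()
        # The handler is called once per dropped item, with the raised
        # DropItem instance passed in as ``exception``.
        crawler.signals.connect(ext.item_dropped, signal=signals.item_dropped)
        return ext

    def item_dropped(self, item, response, exception, spider):
        spider.crawler.stats.inc_value('custom/dropped_items')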
Example 5: process_item
# Required import: from scrapy import exceptions [as alias]
# Or: from scrapy.exceptions import DropItem [as alias]
def process_item(self, item, spider):
    item['organization'] = spider.organization
    if 'event_time' in item:
        item['event_time']['date_format'] = spider.date_format
    loader = EventLoader(**item)
    # see if there is a custom filter for the item
    if not spider.item_filter(item):
        raise DropItem('Custom item filter did not allow this event')
    if 'event_time' in loader.item:
        time = loader.item['event_time']
        if self.time_utils.time_range_is_between(time['start_timestamp'], time['end_timestamp'], spider.start_timestamp, spider.end_timestamp):
            return loader.item
        else:
            raise DropItem('Event is not in the configured timeframe')
    else:
        return loader.item
Example 6: process_exception
# Required import: from scrapy import exceptions [as alias]
# Or: from scrapy.exceptions import DropItem [as alias]
def process_exception(self, request, exception, spider):
    if isinstance(exception, (IgnoreRequest, DropItem)):
        return
    if not self._is_enabled_for_request(request):
        return
    autoextract = request.meta.pop(AUTOEXTRACT_META_KEY)
    stop_time = time.time()
    latency = time.time() - autoextract['timing']['start_ts']
    autoextract['timing'].update({'end_ts': stop_time, 'latency': latency})
    # Make sure to log all unknown failures
    logger.warning('AutoExtract failure after %.3fs for %s: %s',
                   latency,
                   autoextract['original_url'],
                   repr(exception),
                   extra={'spider': spider})
    request.meta['autoextract'] = autoextract
    ex_class = global_object_name(exception.__class__)
    self.inc_metric('autoextract/errors/total_count', spider=spider)
    self.inc_metric('autoextract/errors/type_count/%s' % ex_class, spider=spider)
Example 7: process_item
# Required import: from scrapy import exceptions [as alias]
# Or: from scrapy.exceptions import DropItem [as alias]
def process_item(self, item, spider):
    if spider.name in ['RssCrawler', 'GdeltCrawler']:
        # Search the CurrentVersion table for a version of the article
        try:
            self.cursor.execute(self.compare_versions, (item['url'],))
        except (pymysql.err.OperationalError, pymysql.ProgrammingError, pymysql.InternalError,
                pymysql.IntegrityError, TypeError) as error:
            self.log.error("Something went wrong in rss query: %s", error)
        # Save the result of the query. Must be done before the add,
        # otherwise the result will be overwritten in the buffer
        old_version = self.cursor.fetchone()
        if old_version is not None and (datetime.datetime.strptime(
                item['download_date'], "%y-%m-%d %H:%M:%S") -
                old_version[3]) \
                < datetime.timedelta(hours=self.delta_time):
            # Compare the two download dates. index 3 of old_version
            # corresponds to the download_date attribute in the DB
            raise DropItem("Article in DB too recent. Not saving.")
    return item
Example 8: process_item
# Required import: from scrapy import exceptions [as alias]
# Or: from scrapy.exceptions import DropItem [as alias]
def process_item(self, item, spider):
    def raise_if_missing(name, item):
        if name not in item:
            raise DropItem(
                'The required field "{}" is missing in: {}.'.format(name, item)
            )

    # Required fields for all items
    for required in ("id", "title", "link"):
        raise_if_missing(required, item)

    # Required fields for FeedEntryItems
    if isinstance(item, FeedEntryItem):
        for required in ("updated",):
            raise_if_missing(required, item)

    return item
Example 9: parse
# Required import: from scrapy import exceptions [as alias]
# Or: from scrapy.exceptions import DropItem [as alias]
def parse(self, response):
    """
    Default callback function with response for the crawled url
    https://doc.scrapy.org/en/latest/topics/spiders.html#scrapy.spiders.Spider.parse
    """
    response = response.replace(body=re.sub(r"<br\s*[\/]?>", "\n", response.body.decode('utf-8')))
    property_key = response.url.split('=')[1].replace('&', '')
    # logging.debug("Parsing property_key: %s", property_key)
    if 'No Data at this time' in response.text:
        msg = "No data for " + response.url
        logging.warning(msg)
        raise DropItem(msg)
    else:
        property_info = self.parse_property_info(response)
        property_values = self.parse_property_values(response)
        property_sales = self.parse_property_sales(response)
        property_info['sales'] = property_sales
        property_info['values'] = property_values
        property_info['property_key'] = property_key
        yield Property(property_info)
Example 10: process_item
# Required import: from scrapy import exceptions [as alias]
# Or: from scrapy.exceptions import DropItem [as alias]
def process_item(self, item, spider):
    """Main function that processes the URL item (first phase)."""
    # validate URL length
    if len(item['raw']) > MAX_URL_LEN:
        item['raw'] = item['raw'][:MAX_URL_LEN]
        logger.error('Raw URL too long, truncate it! %r', item['raw'])
    # parse raw URL
    purl = get_parsed_url(item['raw'])
    if purl is None or purl.hostname is None:
        raise DropItem('Invalid URL')
    site_id = belongs_to_site(purl.hostname, self.site_tuples)
    if site_id is None:
        raise DropItem('Offsite domain: %s', item)
    item['site_id'] = site_id
    # insert URL into table
    try:
        get_or_create_murl(spider.session, item, spider.platform_id)
    except SQLAlchemyError as e:
        logger.error(e)
        spider.session.rollback()
        raise DropItem('Fail to insert database of url: %s', item)
    return item
Example 11: checkInvalidKeys
# Required import: from scrapy import exceptions [as alias]
# Or: from scrapy.exceptions import DropItem [as alias]
def checkInvalidKeys(self, item):
    """ Checks Keys For Invalid Entries Such as None/Empty """
    allowedKeys = {
        'None': ["image"],
        'Empty': ["image"]
    }
    for key in item:
        try:
            if (item[key] == None or item[key] == "Error") and key not in allowedKeys['None']:
                raise DropItem("Required Key " + str(key) + " is None")
            if type(item[key]) is str and key not in allowedKeys['Empty']:
                if len(item[key]) == 0:
                    raise DropItem("Required Key " + str(key) + " is Empty")
        except DropItem:
            pass
        except Exception as e:
            logger.error(__name__ + " Exception: " + str(e))
            continue
Example 12: _process_verified_item
# Required import: from scrapy import exceptions [as alias]
# Or: from scrapy.exceptions import DropItem [as alias]
def _process_verified_item(self, item, spider):
    if item['incr'] == '-inf' or item['incr'] < 0:
        raise DropItem('item verification has failed')
    self.redis_con.zadd(item['queue'], item['verified_time'], item['url'])
Example 13: _process_speed_item
# Required import: from scrapy import exceptions [as alias]
# Or: from scrapy.exceptions import DropItem [as alias]
def _process_speed_item(self, item, spider):
    if item['incr'] == '-inf' or item['incr'] < 0:
        raise DropItem('item verification has failed')
    self.redis_con.zadd(item['queue'], item['response_time'], item['url'])
Example 14: process_item
# Required import: from scrapy import exceptions [as alias]
# Or: from scrapy.exceptions import DropItem [as alias]
def process_item(self, item, spider):
    if item['level'] == 2 and item['set_name'] is None:
        raise DropItem('set name is empty')
    return item
Example 15: get_media_requests
# Required import: from scrapy import exceptions [as alias]
# Or: from scrapy.exceptions import DropItem [as alias]
def get_media_requests(self, item, info):
    # check for mandatory fields
    for x in ["vendor", "url"]:
        if x not in item:
            raise DropItem(
                "Missing required field '%s' for item: %s" % (x, item))

    # resolve dynamic redirects in urls
    for x in ["mib", "sdk", "url"]:
        if x in item:
            split = urlparse.urlsplit(item[x])
            # remove username/password if only one provided
            if (split.username or split.password) and not (split.username and split.password):
                item[x] = urlparse.urlunsplit(
                    (split[0], split[1][split[1].find("@") + 1:], split[2], split[3], split[4]))
            if split.scheme == "http":
                item[x] = urllib.urlopen(item[x]).geturl()

    # check for filtered url types in path
    url = urlparse.urlparse(item["url"])
    if any(url.path.endswith(x) for x in [".pdf", ".php", ".txt", ".doc", ".rtf", ".docx", ".htm", ".html", ".md5", ".sha1", ".torrent"]):
        raise DropItem("Filtered path extension: %s" % url.path)
    elif any(x in url.path for x in ["driver", "utility", "install", "wizard", "gpl", "login"]):
        raise DropItem("Filtered path type: %s" % url.path)

    # generate list of url's to download
    item[self.files_urls_field] = [item[x] for x in ["mib", "url"] if x in item]

    # pass vendor so we can generate the correct file path and name
    return [Request(x, meta={"ftp_user": "anonymous", "ftp_password": "chrome@example.com", "vendor": item["vendor"]}) for x in item[self.files_urls_field]]

# overrides function from FilesPipeline