

Python exceptions.DropItem Code Examples

This article collects typical code examples of the Python method scrapy.exceptions.DropItem. If you are unsure what exceptions.DropItem does, how to use it, or what it looks like in real-world code, the curated examples below should help. You can also explore further usage examples from the scrapy.exceptions module.


The following presents 15 code examples of exceptions.DropItem, drawn from open-source projects and sorted by popularity.
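Before the examples, here is a minimal sketch of the typical pattern: an item pipeline raises DropItem from process_item to discard an item and stop further pipeline processing. The class name, item field, and module path below are hypothetical placeholders.

from scrapy.exceptions import DropItem

class DuplicatesPipeline:
    """Minimal sketch: drop any item whose 'id' has already been seen."""

    def __init__(self):
        self.seen_ids = set()

    def process_item(self, item, spider):
        if item['id'] in self.seen_ids:
            # Raising DropItem stops this item here; Scrapy logs it as
            # dropped and fires the item_dropped signal.
            raise DropItem("Duplicate item found: %r" % item)
        self.seen_ids.add(item['id'])
        return item

The pipeline is enabled through ITEM_PIPELINES in settings.py, e.g. ITEM_PIPELINES = {'myproject.pipelines.DuplicatesPipeline': 300}, where 'myproject' is a placeholder for your project package.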

Example 1: process_item

# Module to import: from scrapy import exceptions [as alias]
# Or: from scrapy.exceptions import DropItem [as alias]
def process_item(self, item, spider):
        if spider.name not in ['meituan']:
            return item
        if self.filter_dic.get(item['restaurant_name']) == item['address']:
            print(item['restaurant_name'])
            print(item['address'])
            raise DropItem("Duplicate item found: %s" % item)
        else:
            self.filter_dic[item['restaurant_name']] = item['address']
            try:
                item['lng'], item['lat'] = gaode_to_baidu(float(item['lng']), float(item['lat']))
                item['province_code'] = pinyin.get(item['province'])
                item['city_code'] = pinyin.get(item['city'])
                item['region_code'] = pinyin.get(item['region'])
                item['area_code'] = pinyin.get(item['area'])
            except BaseException as e:
                print(e)
            return item 
Developer: piaotiejun, Project: restaurant, Lines: 20, Source: pipelines.py

Example 2: item_completed

# Module to import: from scrapy import exceptions [as alias]
# Or: from scrapy.exceptions import DropItem [as alias]
def item_completed(self, results, item, info):
        result = {}
        for n, r in enumerate(results):
            ok, x = r
            if ok:
                result[x["url"]] = x["path"]
            else:
                result[item[self.URLS_NAME][n]] = x.getErrorMessage()
        # TODO: Save the result

        # file_paths = [x['path'] for ok, x in results if ok]
        # if not file_paths:
        #     raise DropItem("Item contains no files")
        # item['image_paths'] = file_paths
        # return item

        return super(GroupDownPipelineMinix, self).item_completed(results, item, info) 
Developer: xgfone, Project: snippet, Lines: 19, Source: pipelines.py

Example 3: process_item

# Module to import: from scrapy import exceptions [as alias]
# Or: from scrapy.exceptions import DropItem [as alias]
def process_item(self, item, spider):
        if not isinstance(item, JsonSchemaItem):
            return item

        errors = list(item.validator.iter_errors(dict(item)))
        paths_messages = []
        for error in errors:
            absolute_path = list(error.absolute_path)
            # error path is not available when required field is not filled
            # so we parse error message. Nasty.
            required_match = self.REQUIRED_RE.search(error.message)
            if required_match:
                absolute_path.append(required_match.group(1))
            path = '.'.join(map(str, absolute_path))
            self.stats.inc_value(self.STAT_FMT.format(field=path))
            paths_messages.append((path, error.message))
        if errors:
            error_msg = ''
            for path, message in paths_messages:
                error_msg += u'{}: {}\n'.format(path, message)
            raise DropItem(u'schema validation failed: \n {}'.format(error_msg))

        return item 
Developer: scrapy-plugins, Project: scrapy-jsonschema, Lines: 25, Source: pipeline.py
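For context, scrapy-jsonschema validates items that declare a JSON Schema directly on the item class; when validation fails, the pipeline above raises DropItem with the collected error messages. A sketch of such an item based on the project's README (the schema fields here are illustrative, not taken from the pipeline above):

from scrapy_jsonschema.item import JsonSchemaItem

class ProductItem(JsonSchemaItem):
    # Illustrative schema; the DropItem above fires when validation fails.
    jsonschema = {
        "$schema": "http://json-schema.org/draft-04/schema#",
        "type": "object",
        "properties": {
            "name": {"type": "string"},
            "price": {"type": "number", "minimum": 0},
        },
        "required": ["name", "price"],
    }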

Example 4: _itemproc_finished

# Module to import: from scrapy import exceptions [as alias]
# Or: from scrapy.exceptions import DropItem [as alias]
def _itemproc_finished(self, output, item, response, spider):
        """ItemProcessor finished for the given ``item`` and returned ``output``
        """
        self.slot.itemproc_size -= 1
        if isinstance(output, Failure):
            ex = output.value
            if isinstance(ex, DropItem):
                logkws = self.logformatter.dropped(item, ex, response, spider)
                logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
                return self.signals.send_catch_log_deferred(
                    signal=signals.item_dropped, item=item, response=response,
                    spider=spider, exception=output.value)
            else:
                logger.error('Error processing %(item)s', {'item': item},
                             exc_info=failure_to_exc_info(output),
                             extra={'spider': spider})
                return self.signals.send_catch_log_deferred(
                    signal=signals.item_error, item=item, response=response,
                    spider=spider, failure=output)
        else:
            logkws = self.logformatter.scraped(output, response, spider)
            logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
            return self.signals.send_catch_log_deferred(
                signal=signals.item_scraped, item=output, response=response,
                spider=spider) 
Developer: wistbean, Project: learn_python3_spider, Lines: 27, Source: scraper.py
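As this engine code shows, a DropItem raised in a pipeline is logged through the log formatter and then fires the item_dropped signal instead of the usual item_scraped. A short sketch of an extension that listens for that signal (the extension class is hypothetical; the signal and handler signature follow the Scrapy signals documentation):

from scrapy import signals

class DroppedItemMonitor:
    @classmethod
    def from_crawler(cls, crawler):
        ext = cls()
        crawler.signals.connect(ext.item_dropped, signal=signals.item_dropped)
        return ext

    def item_dropped(self, item, response, exception, spider):
        # 'exception' is the DropItem instance raised by the pipeline.
        spider.logger.info('Dropped item %r: %s', item, exception)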

Example 5: process_item

# Module to import: from scrapy import exceptions [as alias]
# Or: from scrapy.exceptions import DropItem [as alias]
def process_item(self, item, spider):
        item['organization'] = spider.organization
        if 'event_time' in item:
            item['event_time']['date_format'] = spider.date_format
        loader = EventLoader(**item)
        # see if there is a custom filter for the item
        if not spider.item_filter(item):
            raise DropItem('Custom item filter did not allow this event')
        if 'event_time' in loader.item:
            time = loader.item['event_time']
            if self.time_utils.time_range_is_between(time['start_timestamp'], time['end_timestamp'], spider.start_timestamp, spider.end_timestamp):
                return loader.item
            else:
                raise DropItem('Event is not in the configured timeframe')
        else:
            return loader.item 
Developer: In2ItChicago, Project: In2ItChicago, Lines: 18, Source: pipelines.py

Example 6: process_exception

# Module to import: from scrapy import exceptions [as alias]
# Or: from scrapy.exceptions import DropItem [as alias]
def process_exception(self, request, exception, spider):
        if isinstance(exception, (IgnoreRequest, DropItem)):
            return
        if not self._is_enabled_for_request(request):
            return

        autoextract = request.meta.pop(AUTOEXTRACT_META_KEY)
        stop_time = time.time()
        latency = stop_time - autoextract['timing']['start_ts']
        autoextract['timing'].update({'end_ts': stop_time, 'latency': latency})

        # Make sure to log all unknown failures
        logger.warning('AutoExtract failure after %.3fs for %s: %s',
                       latency,
                       autoextract['original_url'],
                       repr(exception),
                       extra={'spider': spider})

        request.meta['autoextract'] = autoextract
        ex_class = global_object_name(exception.__class__)
        self.inc_metric('autoextract/errors/total_count', spider=spider)
        self.inc_metric('autoextract/errors/type_count/%s' % ex_class, spider=spider) 
Developer: scrapinghub, Project: scrapy-autoextract, Lines: 24, Source: middlewares.py

Example 7: process_item

# Module to import: from scrapy import exceptions [as alias]
# Or: from scrapy.exceptions import DropItem [as alias]
def process_item(self, item, spider):
        if spider.name in ['RssCrawler', 'GdeltCrawler']:
            # Search the CurrentVersion table for a version of the article
            try:
                self.cursor.execute(self.compare_versions, (item['url'],))
            except (pymysql.err.OperationalError, pymysql.ProgrammingError, pymysql.InternalError,
                    pymysql.IntegrityError, TypeError) as error:
                self.log.error("Something went wrong in rss query: %s", error)

            # Save the result of the query. Must be done before the add,
            #   otherwise the result will be overwritten in the buffer
            old_version = self.cursor.fetchone()

            if old_version is not None and (datetime.datetime.strptime(
                    item['download_date'], "%y-%m-%d %H:%M:%S") -
                                            old_version[3]) \
                    < datetime.timedelta(hours=self.delta_time):
                # Compare the two download dates. index 3 of old_version
                # corresponds to the download_date attribute in the DB
                raise DropItem("Article in DB too recent. Not saving.")

        return item 
Developer: fhamborg, Project: news-please, Lines: 24, Source: pipelines.py

Example 8: process_item

# Module to import: from scrapy import exceptions [as alias]
# Or: from scrapy.exceptions import DropItem [as alias]
def process_item(self, item, spider):
        def raise_if_missing(name, item):
            if name not in item:
                raise DropItem(
                    'The required field "{}" is missing in: {}.'.format(name, item)
                )

        # Required fields for all items
        for required in ("id", "title", "link"):
            raise_if_missing(required, item)

        # Required fields for FeedEntryItems
        if isinstance(item, FeedEntryItem):
            for required in ("updated",):
                raise_if_missing(required, item)

        return item 
Developer: PyFeeds, Project: PyFeeds, Lines: 19, Source: pipelines.py

Example 9: parse

# Module to import: from scrapy import exceptions [as alias]
# Or: from scrapy.exceptions import DropItem [as alias]
def parse(self, response):
        """
        Default callback function with response for the crawled url
        https://doc.scrapy.org/en/latest/topics/spiders.html#scrapy.spiders.Spider.parse
        """
        response = response.replace(body=re.sub(r"<br\s*[\/]?>", "\n", response.body.decode('utf-8')))
        property_key = response.url.split('=')[1].replace('&', '')
        # logging.debug("Parsing property_key: %s", property_key)
        if 'No Data at this time' in response.text:
            msg = "No data for " + response.url
            logging.warning(msg)
            raise DropItem(msg)
        else:
            property_info = self.parse_property_info(response)
            property_values = self.parse_property_values(response)
            property_sales = self.parse_property_sales(response)
            property_info['sales'] = property_sales
            property_info['values'] = property_values
            property_info['property_key'] = property_key
            yield Property(property_info) 
Developer: codefornola, Project: assessor-scraper, Lines: 22, Source: assessment_spider.py

Example 10: process_item

# Module to import: from scrapy import exceptions [as alias]
# Or: from scrapy.exceptions import DropItem [as alias]
def process_item(self, item, spider):
        """Main function that process URL item (first phase)."""
        # validate URL length
        if len(item['raw']) > MAX_URL_LEN:
            item['raw'] = item['raw'][:MAX_URL_LEN]
            logger.error('Raw URL too long, truncate it! %r', item['raw'])
        # parse raw URL
        purl = get_parsed_url(item['raw'])
        if purl is None or purl.hostname is None:
            raise DropItem('Invalid URL')
        site_id = belongs_to_site(purl.hostname, self.site_tuples)
        if site_id is None:
            raise DropItem('Offsite domain: %s' % item)
        item['site_id'] = site_id
        # insert URL into table
        try:
            get_or_create_murl(spider.session, item, spider.platform_id)
        except SQLAlchemyError as e:
            logger.error(e)
            spider.session.rollback()
            raise DropItem('Failed to insert url into database: %s' % item)
        return item 
Developer: IUNetSci, Project: hoaxy-backend, Lines: 24, Source: pipelines.py

Example 11: checkInvalidKeys

# Module to import: from scrapy import exceptions [as alias]
# Or: from scrapy.exceptions import DropItem [as alias]
def checkInvalidKeys(self, item):
        """ Checks Keys For Invalid Entries Such as None/Empty """        
        allowedKeys = {
            'None': ["image"],
            'Empty': ["image"]
        }
        for key in item:
            try:
                if (item[key] is None or item[key] == "Error") and key not in allowedKeys['None']:
                    raise DropItem("Required Key " + str(key) + " is None")

                if isinstance(item[key], str) and key not in allowedKeys['Empty']:
                    if len(item[key]) == 0:
                        raise DropItem("Required Key " + str(key) + " is Empty")
            except DropItem:
                # Note: the DropItem is swallowed here, so invalid keys are
                # flagged but the item itself is not actually dropped.
                pass
            except Exception as e:
                logger.error(__name__ + " Exception: " + str(e))
                continue 
Developer: vipulgupta2048, Project: scrape, Lines: 21, Source: pipelines.py

Example 12: _process_verified_item

# Module to import: from scrapy import exceptions [as alias]
# Or: from scrapy.exceptions import DropItem [as alias]
def _process_verified_item(self, item, spider):
        if item['incr'] == '-inf' or item['incr'] < 0:
            raise DropItem('item verification has failed')

        self.redis_con.zadd(item['queue'], item['verified_time'], item['url']) 
Developer: SpiderClub, Project: haipproxy, Lines: 7, Source: pipelines.py
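Note that the zadd call above uses the redis-py 2.x positional signature (name, score, member). In redis-py 3.0 and later, members and scores must be passed as a {member: score} mapping, so a version of the same method written against the newer client would look roughly like this (the same applies to _process_speed_item in the next example):

# Requires: from scrapy.exceptions import DropItem
def _process_verified_item(self, item, spider):
        if item['incr'] == '-inf' or item['incr'] < 0:
            raise DropItem('item verification has failed')
        # redis-py >= 3.0 expects members and scores as a mapping
        self.redis_con.zadd(item['queue'], {item['url']: item['verified_time']})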

Example 13: _process_speed_item

# Module to import: from scrapy import exceptions [as alias]
# Or: from scrapy.exceptions import DropItem [as alias]
def _process_speed_item(self, item, spider):
        if item['incr'] == '-inf' or item['incr'] < 0:
            raise DropItem('item verification has failed')

        self.redis_con.zadd(item['queue'], item['response_time'], item['url']) 
Developer: SpiderClub, Project: haipproxy, Lines: 7, Source: pipelines.py

Example 14: process_item

# Module to import: from scrapy import exceptions [as alias]
# Or: from scrapy.exceptions import DropItem [as alias]
def process_item(self, item, spider):
		if item['level'] == 2 and item['set_name'] is None:
			raise DropItem('set name is empty')
		return item 
Developer: czs0x55aa, Project: video_url_crawler_demo, Lines: 6, Source: pipelines.py

Example 15: get_media_requests

# Module to import: from scrapy import exceptions [as alias]
# Or: from scrapy.exceptions import DropItem [as alias]
def get_media_requests(self, item, info):
        # check for mandatory fields
        for x in ["vendor", "url"]:
            if x not in item:
                raise DropItem(
                    "Missing required field '%s' for item: %s" % (x, item))

        # resolve dynamic redirects in urls
        for x in ["mib", "sdk", "url"]:
            if x in item:
                split = urlparse.urlsplit(item[x])
                # remove username/password if only one provided
                if (split.username or split.password) and not (split.username and split.password):
                    item[x] = urlparse.urlunsplit(
                        (split[0], split[1][split[1].find("@") + 1:], split[2], split[3], split[4]))

                if split.scheme == "http":
                    item[x] = urllib.urlopen(item[x]).geturl()

        # check for filtered url types in path
        url = urlparse.urlparse(item["url"])
        if any(url.path.endswith(x) for x in [".pdf", ".php", ".txt", ".doc", ".rtf", ".docx", ".htm", ".html", ".md5", ".sha1", ".torrent"]):
            raise DropItem("Filtered path extension: %s" % url.path)
        elif any(x in url.path for x in ["driver", "utility", "install", "wizard", "gpl", "login"]):
            raise DropItem("Filtered path type: %s" % url.path)

        # generate list of url's to download
        item[self.files_urls_field] = [item[x]
                                       for x in ["mib", "url"] if x in item]

        # pass vendor so we can generate the correct file path and name
        return [Request(x, meta={"ftp_user": "anonymous", "ftp_password": "chrome@example.com", "vendor": item["vendor"]}) for x in item[self.files_urls_field]]

    # overrides function from FilesPipeline 
Developer: firmadyne, Project: scraper, Lines: 36, Source: pipelines.py
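The snippet above is Python 2 code (the urlparse module and urllib.urlopen no longer exist in Python 3). A hedged sketch of just the redirect-resolution step ported to the Python 3 standard library, keeping the same item fields:

from urllib.parse import urlsplit, urlunsplit
from urllib.request import urlopen

def resolve_redirects(item):
    # Same logic as above: strip lone credentials, then follow HTTP redirects.
    for x in ("mib", "sdk", "url"):
        if x in item:
            split = urlsplit(item[x])
            if (split.username or split.password) and not (split.username and split.password):
                netloc = split.netloc[split.netloc.find("@") + 1:]
                item[x] = urlunsplit((split.scheme, netloc, split.path, split.query, split.fragment))
            if split.scheme == "http":
                item[x] = urlopen(item[x]).geturl()
    return item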


Note: The scrapy.exceptions.DropItem examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets come from open-source projects contributed by their respective authors; copyright remains with the original authors, and redistribution or reuse should follow each project's license. Do not repost without permission.