当前位置: 首页>>代码示例>>Python>>正文


Python ProxyList.load_url方法代码示例

本文整理汇总了Python中grab.proxylist.ProxyList.load_url方法的典型用法代码示例。如果您正苦于以下问题:Python ProxyList.load_url方法的具体用法?Python ProxyList.load_url怎么用?Python ProxyList.load_url使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在grab.proxylist.ProxyList的用法示例。


在下文中一共展示了ProxyList.load_url方法的5个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: Spider

# 需要导入模块: from grab.proxylist import ProxyList [as 别名]
# 或者: from grab.proxylist.ProxyList import load_url [as 别名]

#.........这里部分代码省略.........
                    task.url = urljoin(self.base_url, task.url)
                    # If task has grab_config object then update it too
                    if task.grab_config:
                        task.grab_config["url"] = task.url
        except Exception as ex:
            self.stat.collect("task-with-invalid-url", task.url)
            if raise_error:
                raise
            else:
                logger.error("", exc_info=ex)
                return False

        # TODO: keep original task priority if it was set explicitly
        self.task_queue.put(task, task.priority, schedule_time=task.schedule_time)
        return True

    def stop(self):
        """
        This method set internal flag which signal spider
        to stop processing new task and shuts down.
        """

        logger_verbose.debug("Method `stop` was called")
        self.work_allowed = False

    def load_proxylist(self, source, source_type=None, proxy_type="http", auto_init=True, auto_change=True, **kwargs):
        self.proxylist = ProxyList()
        if isinstance(source, BaseProxySource):
            self.proxylist.set_source(source)
        elif isinstance(source, six.string_types):
            if source_type == "text_file":
                self.proxylist.load_file(source, proxy_type=proxy_type)
            elif source_type == "url":
                self.proxylist.load_url(source, proxy_type=proxy_type)
            else:
                raise SpiderMisuseError(
                    "Method `load_proxylist` received " "invalid `source_type` argument: %s" % source_type
                )
        else:
            raise SpiderMisuseError("Method `load_proxylist` received " "invalid `source` argument: %s" % source)

        self.proxylist_enabled = True
        self.proxy = None
        if not auto_change and auto_init:
            self.proxy = self.proxylist.get_random_proxy()
        self.proxy_auto_change = auto_change

    def process_next_page(self, grab, task, xpath, resolve_base=False, **kwargs):
        """
        Generate task for next page.

        :param grab: Grab instance
        :param task: Task object which should be assigned to next page url
        :param xpath: xpath expression which calculates list of URLS
        :param **kwargs: extra settings for new task object

        Example::

            self.follow_links(grab, 'topic', '//div[@class="topic"]/a/@href')
        """
        try:
            # next_url = grab.xpath_text(xpath)
            next_url = grab.doc.select(xpath).text()
        except IndexError:
            return False
        else:
开发者ID:mawentao007,项目名称:reading_grab,代码行数:70,代码来源:base.py

示例2: Spider

# 需要导入模块: from grab.proxylist import ProxyList [as 别名]
# 或者: from grab.proxylist.ProxyList import load_url [as 别名]

#.........这里部分代码省略.........
                    if task.grab_config:
                        task.grab_config['url'] = task.url
        except Exception as ex:
            self.stat.collect('task-with-invalid-url', task.url)
            if raise_error:
                raise
            else:
                logger.error('', exc_info=ex)
                return False

        # TODO: keep original task priority if it was set explicitly
        self.task_queue.put(task, task.priority, schedule_time=task.schedule_time)
        return True

    def stop(self):
        """
        This method set internal flag which signal spider
        to stop processing new task and shuts down.
        """

        logger_verbose.debug('Method `stop` was called')
        self.work_allowed = False

    def load_proxylist(self, source, source_type=None, proxy_type='http',
                       auto_init=True, auto_change=True,
                       **kwargs):
        self.proxylist = ProxyList()
        if isinstance(source, BaseProxySource):
            self.proxylist.set_source(source)
        elif isinstance(source, six.string_types):
            if source_type == 'text_file':
                self.proxylist.load_file(source, proxy_type=proxy_type)
            elif source_type == 'url':
                self.proxylist.load_url(source, proxy_type=proxy_type)
            else:
                raise SpiderMisuseError('Method `load_proxylist` received '
                                        'invalid `source_type` argument: %s'
                                        % source_type) 
        else:
            raise SpiderMisuseError('Method `load_proxylist` received '
                                    'invalid `source` argument: %s'
                                    % source) 

        self.proxylist_enabled = True
        self.proxy = None
        if not auto_change and auto_init:
            self.proxy = self.proxylist.get_random_proxy()
        self.proxy_auto_change = auto_change

    def process_next_page(self, grab, task, xpath,
                          resolve_base=False, **kwargs):
        """
        Generate task for next page.

        :param grab: Grab instance
        :param task: Task object which should be assigned to next page url
        :param xpath: xpath expression which calculates list of URLS
        :param **kwargs: extra settings for new task object

        Example::

            self.follow_links(grab, 'topic', '//div[@class="topic"]/a/@href')
        """
        try:
            # next_url = grab.xpath_text(xpath)
            next_url = grab.doc.select(xpath).text()
开发者ID:Michael-F-Bryan,项目名称:grab,代码行数:70,代码来源:base.py

示例3: Spider

# 需要导入模块: from grab.proxylist import ProxyList [as 别名]
# 或者: from grab.proxylist.ProxyList import load_url [as 别名]

#.........这里部分代码省略.........
                    if self.is_valid_for_cache(result):
                        with self.stat.log_time('cache'):
                            with self.stat.log_time('cache.write'):
                                self.cache.save_response(result['task'].url,
                                                         result['grab'])

                    # print '[process network results]'
                    self.process_network_result(result)
                    # print '[done]'
                    self.stat.inc('spider:request')

                # print '[transport iterate results - end]'

            logger_verbose.debug('Work done')
        except KeyboardInterrupt:
            print('\nGot ^C signal in process %d. Stopping.' % os.getpid())
            self.interrupted = True
            raise
        finally:
            # This code is executed when main cycles is breaked
            self.stat.stop_timer('total')
            self.shutdown()

    def load_proxylist(self, source, source_type=None, proxy_type='http',
                       auto_init=True, auto_change=True,
                       **kwargs):
        self.proxylist = ProxyList()
        if isinstance(source, BaseProxySource):
            self.proxylist.set_source(source)
        elif isinstance(source, six.string_types):
            if source_type == 'text_file':
                self.proxylist.load_file(source, proxy_type=proxy_type)
            elif source_type == 'url':
                self.proxylist.load_url(source, proxy_type=proxy_type)
            else:
                raise SpiderMisuseError('Method `load_proxylist` received '
                                        'invalid `source_type` argument: %s'
                                        % source_type) 
        else:
            raise SpiderMisuseError('Method `load_proxylist` received '
                                    'invalid `source` argument: %s'
                                    % source) 

        self.proxylist_enabled = True
        self.proxy = None
        if not auto_change and auto_init:
            self.proxy = self.proxylist.get_random_proxy()
        self.proxy_auto_change = auto_change

    def process_handler_result(self, result, task=None):
        """
        Process result received from the task handler.

        Result could be:
        * None
        * Task instance
        * Data instance.
        """

        if isinstance(result, Task):
            self.add_task(result)
        elif isinstance(result, Data):
            handler = self.find_data_handler(result)
            try:
                data_result = handler(**result.storage)
                if data_result is None:
开发者ID:smant,项目名称:grab,代码行数:70,代码来源:base.py

示例4: Spider

# 需要导入模块: from grab.proxylist import ProxyList [as 别名]
# 或者: from grab.proxylist.ProxyList import load_url [as 别名]

#.........这里部分代码省略.........
        This method set internal flag which signal spider
        to stop processing new task and shuts down.
        """
        self.work_allowed = False

    def load_proxylist(self, source, source_type=None, proxy_type='http',
                       auto_init=True, auto_change=True):
        """
        Load proxy list.

        :param source: Proxy source.
            Accepts string (file path, url) or ``BaseProxySource`` instance.
        :param source_type: The type of the specified source.
            Should be one of the following: 'text_file' or 'url'.
        :param proxy_type:
            Should be one of the following: 'socks4', 'socks5' or'http'.
        :param auto_change:
            If set to `True` then automatical random proxy rotation
            will be used.


        Proxy source format should be one of the following (for each line):
            - ip:port
            - ip:port:login:password

        """
        self.proxylist = ProxyList()
        if isinstance(source, BaseProxySource):
            self.proxylist.set_source(source)
        elif isinstance(source, six.string_types):
            if source_type == 'text_file':
                self.proxylist.load_file(source, proxy_type=proxy_type)
            elif source_type == 'url':
                self.proxylist.load_url(source, proxy_type=proxy_type)
            else:
                raise SpiderMisuseError('Method `load_proxylist` received '
                                        'invalid `source_type` argument: %s'
                                        % source_type)
        else:
            raise SpiderMisuseError('Method `load_proxylist` received '
                                    'invalid `source` argument: %s'
                                    % source)

        self.proxylist_enabled = True
        self.proxy = None
        if not auto_change and auto_init:
            self.proxy = self.proxylist.get_random_proxy()
        self.proxy_auto_change = auto_change

    def process_next_page(self, grab, task, xpath,
                          resolve_base=False, **kwargs):
        """
        Generate task for next page.

        :param grab: Grab instance
        :param task: Task object which should be assigned to next page url
        :param xpath: xpath expression which calculates list of URLS
        :param **kwargs: extra settings for new task object

        Example::

            self.follow_links(grab, 'topic', '//div[@class="topic"]/a/@href')
        """
        try:
            # next_url = grab.xpath_text(xpath)
            next_url = grab.doc.select(xpath).text()
开发者ID:lorien,项目名称:grab,代码行数:70,代码来源:base.py

示例5: test_web_proxy_source

# 需要导入模块: from grab.proxylist import ProxyList [as 别名]
# 或者: from grab.proxylist.ProxyList import load_url [as 别名]
 def test_web_proxy_source(self):
     plist = ProxyList()
     self.server.response['data'] = DEFAULT_PLIST_DATA
     plist.load_url(self.server.get_url())
     self.assertEqual(2, plist.size())
开发者ID:lorien,项目名称:grab,代码行数:7,代码来源:proxy.py


注:本文中的grab.proxylist.ProxyList.load_url方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。