本文整理汇总了 Python 中 grab.proxylist.ProxyList.set_source 方法的典型用法代码示例。如果您正苦于以下问题：Python ProxyList.set_source 方法的具体用法？Python ProxyList.set_source 怎么用？Python ProxyList.set_source 使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类 grab.proxylist.ProxyList 的用法示例。
在下文中一共展示了ProxyList.set_source方法的4个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: Spider
# 需要导入模块: from grab.proxylist import ProxyList [as 别名]
# 或者: from grab.proxylist.ProxyList import set_source [as 别名]
#.........这里部分代码省略.........
task.name,
task.url,
)
raise SpiderError(msg)
else:
task.url = urljoin(self.base_url, task.url)
# If task has grab_config object then update it too
if task.grab_config:
task.grab_config["url"] = task.url
except Exception as ex:
self.stat.collect("task-with-invalid-url", task.url)
if raise_error:
raise
else:
logger.error("", exc_info=ex)
return False
# TODO: keep original task priority if it was set explicitly
self.task_queue.put(task, task.priority, schedule_time=task.schedule_time)
return True
def stop(self):
    """
    Set the internal flag that signals the spider to stop
    processing new tasks and shut down.

    Only ``self.work_allowed`` is changed here; whatever loop reads that
    flag is responsible for actually stopping.
    """
    logger_verbose.debug("Method `stop` was called")
    self.work_allowed = False
def load_proxylist(self, source, source_type=None, proxy_type="http", auto_init=True, auto_change=True, **kwargs):
    """
    Load a proxy list and enable proxy usage for this spider.

    :param source: a ``BaseProxySource`` instance, or a string that is
        interpreted according to ``source_type``.
    :param source_type: required when ``source`` is a string; must be
        "text_file" (``source`` is passed to ``ProxyList.load_file``) or
        "url" (``source`` is passed to ``ProxyList.load_url``).
    :param proxy_type: forwarded to ``load_file`` / ``load_url``.
    :param auto_init: if true and ``auto_change`` is false, a random proxy
        is picked immediately and stored in ``self.proxy``.
    :param auto_change: stored in ``self.proxy_auto_change``.
    :param **kwargs: accepted but not used by this method.
    :raises SpiderMisuseError: if ``source`` is neither a
        ``BaseProxySource`` nor a string, or if ``source_type`` is not one
        of the supported values for a string ``source``.
    """
    # Build the list in a local variable so that a rejected argument does
    # not replace an already configured ``self.proxylist`` with an empty one.
    proxylist = ProxyList()
    if isinstance(source, BaseProxySource):
        proxylist.set_source(source)
    elif isinstance(source, six.string_types):
        if source_type == "text_file":
            proxylist.load_file(source, proxy_type=proxy_type)
        elif source_type == "url":
            proxylist.load_url(source, proxy_type=proxy_type)
        else:
            # The value is wrapped in a one-element tuple: a tuple-valued
            # argument would otherwise be unpacked by ``%`` and raise
            # TypeError instead of the intended SpiderMisuseError.
            raise SpiderMisuseError(
                "Method `load_proxylist` received " "invalid `source_type` argument: %s" % (source_type,)
            )
    else:
        raise SpiderMisuseError(
            "Method `load_proxylist` received " "invalid `source` argument: %s" % (source,)
        )

    self.proxylist = proxylist
    self.proxylist_enabled = True
    self.proxy = None
    if not auto_change and auto_init:
        self.proxy = self.proxylist.get_random_proxy()
    self.proxy_auto_change = auto_change
def process_next_page(self, grab, task, xpath, resolve_base=False, **kwargs):
"""
Generate task for next page.
:param grab: Grab instance
:param task: Task object which should be assigned to next page url
:param xpath: xpath expression which calculates list of URLS
:param **kwargs: extra settings for new task object
Example::
self.follow_links(grab, 'topic', '//div[@class="topic"]/a/@href')
"""
try:
示例2: Spider
# 需要导入模块: from grab.proxylist import ProxyList [as 别名]
# 或者: from grab.proxylist.ProxyList import set_source [as 别名]
#.........这里部分代码省略.........
)
return True
def stop(self):
    """
    Set the internal flag that signals the spider to stop
    processing new tasks and shut down.

    Only ``self.work_allowed`` is changed here; whatever loop reads that
    flag is responsible for actually stopping.
    """
    self.work_allowed = False
def load_proxylist(self, source, source_type=None, proxy_type='http',
                   auto_init=True, auto_change=True):
    """
    Load proxy list.

    :param source: Proxy source.
        Accepts string (file path, url) or ``BaseProxySource`` instance.
    :param source_type: The type of the specified source.
        Should be one of the following: 'text_file' or 'url'.
        Only consulted when ``source`` is a string.
    :param proxy_type:
        Should be one of the following: 'socks4', 'socks5' or 'http'.
    :param auto_init:
        If true and ``auto_change`` is false, a random proxy is picked
        immediately and stored in ``self.proxy``.
    :param auto_change:
        If set to `True` then automatic random proxy rotation
        will be used.
    :raises SpiderMisuseError: if ``source`` is neither a
        ``BaseProxySource`` nor a string, or if ``source_type`` is not
        'text_file' or 'url' for a string ``source``.

    Proxy source format should be one of the following (for each line):
        - ip:port
        - ip:port:login:password
    """
    # Build the list in a local variable so that a rejected argument does
    # not replace an already configured ``self.proxylist`` with an empty one.
    proxylist = ProxyList()
    if isinstance(source, BaseProxySource):
        proxylist.set_source(source)
    elif isinstance(source, six.string_types):
        if source_type == 'text_file':
            proxylist.load_file(source, proxy_type=proxy_type)
        elif source_type == 'url':
            proxylist.load_url(source, proxy_type=proxy_type)
        else:
            # The value is wrapped in a one-element tuple: a tuple-valued
            # argument would otherwise be unpacked by ``%`` and raise
            # TypeError instead of the intended SpiderMisuseError.
            raise SpiderMisuseError('Method `load_proxylist` received '
                                    'invalid `source_type` argument: %s'
                                    % (source_type,))
    else:
        raise SpiderMisuseError('Method `load_proxylist` received '
                                'invalid `source` argument: %s'
                                % (source,))

    self.proxylist = proxylist
    self.proxylist_enabled = True
    self.proxy = None
    if not auto_change and auto_init:
        self.proxy = self.proxylist.get_random_proxy()
    self.proxy_auto_change = auto_change
def process_next_page(self, grab, task, xpath,
resolve_base=False, **kwargs):
"""
Generate task for next page.
:param grab: Grab instance
:param task: Task object which should be assigned to next page url
:param xpath: xpath expression which calculates list of URLS
:param **kwargs: extra settings for new task object
Example::
示例3: Spider
# 需要导入模块: from grab.proxylist import ProxyList [as 别名]
# 或者: from grab.proxylist.ProxyList import set_source [as 别名]
#.........这里部分代码省略.........
else:
warn('Class attribute `Spider::base_url` is deprecated. '
'Use Task objects with absolute URLs')
task.url = urljoin(self.base_url, task.url)
# If task has grab_config object then update it too
if task.grab_config:
task.grab_config['url'] = task.url
except Exception as ex:
self.stat.collect('task-with-invalid-url', task.url)
if raise_error:
raise
else:
logger.error('', exc_info=ex)
return False
# TODO: keep original task priority if it was set explicitly
self.task_queue.put(task, task.priority, schedule_time=task.schedule_time)
return True
def stop(self):
    """
    Set the internal flag that signals the spider to stop
    processing new tasks and shut down.

    Only ``self.work_allowed`` is changed here; whatever loop reads that
    flag is responsible for actually stopping.
    """
    logger_verbose.debug('Method `stop` was called')
    self.work_allowed = False
def load_proxylist(self, source, source_type=None, proxy_type='http',
                   auto_init=True, auto_change=True,
                   **kwargs):
    """
    Load a proxy list and enable proxy usage for this spider.

    :param source: a ``BaseProxySource`` instance, or a string that is
        interpreted according to ``source_type``.
    :param source_type: required when ``source`` is a string; must be
        'text_file' (``source`` is passed to ``ProxyList.load_file``) or
        'url' (``source`` is passed to ``ProxyList.load_url``).
    :param proxy_type: forwarded to ``load_file`` / ``load_url``.
    :param auto_init: if true and ``auto_change`` is false, a random proxy
        is picked immediately and stored in ``self.proxy``.
    :param auto_change: stored in ``self.proxy_auto_change``.
    :param **kwargs: accepted but not used by this method.
    :raises SpiderMisuseError: if ``source`` is neither a
        ``BaseProxySource`` nor a string, or if ``source_type`` is not one
        of the supported values for a string ``source``.
    """
    # Build the list in a local variable so that a rejected argument does
    # not replace an already configured ``self.proxylist`` with an empty one.
    proxylist = ProxyList()
    if isinstance(source, BaseProxySource):
        proxylist.set_source(source)
    elif isinstance(source, six.string_types):
        if source_type == 'text_file':
            proxylist.load_file(source, proxy_type=proxy_type)
        elif source_type == 'url':
            proxylist.load_url(source, proxy_type=proxy_type)
        else:
            # The value is wrapped in a one-element tuple: a tuple-valued
            # argument would otherwise be unpacked by ``%`` and raise
            # TypeError instead of the intended SpiderMisuseError.
            raise SpiderMisuseError('Method `load_proxylist` received '
                                    'invalid `source_type` argument: %s'
                                    % (source_type,))
    else:
        raise SpiderMisuseError('Method `load_proxylist` received '
                                'invalid `source` argument: %s'
                                % (source,))

    self.proxylist = proxylist
    self.proxylist_enabled = True
    self.proxy = None
    if not auto_change and auto_init:
        self.proxy = self.proxylist.get_random_proxy()
    self.proxy_auto_change = auto_change
def process_next_page(self, grab, task, xpath,
resolve_base=False, **kwargs):
"""
Generate task for next page.
:param grab: Grab instance
:param task: Task object which should be assigned to next page url
:param xpath: xpath expression which calculates list of URLS
:param **kwargs: extra settings for new task object
Example::
示例4: Spider
# 需要导入模块: from grab.proxylist import ProxyList [as 别名]
# 或者: from grab.proxylist.ProxyList import set_source [as 别名]
#.........这里部分代码省略.........
# Each result could be valid or failed
# Result format: {ok, grab, grab_config_backup, task, emsg}
# print '[transport iterate results - start]'
for result in self.transport.iterate_results():
if self.is_valid_for_cache(result):
with self.stat.log_time('cache'):
with self.stat.log_time('cache.write'):
self.cache.save_response(result['task'].url,
result['grab'])
# print '[process network results]'
self.process_network_result(result)
# print '[done]'
self.stat.inc('spider:request')
# print '[transport iterate results - end]'
logger_verbose.debug('Work done')
except KeyboardInterrupt:
print('\nGot ^C signal in process %d. Stopping.' % os.getpid())
self.interrupted = True
raise
finally:
# This code is executed when main cycles is breaked
self.stat.stop_timer('total')
self.shutdown()
def load_proxylist(self, source, source_type=None, proxy_type='http',
                   auto_init=True, auto_change=True,
                   **kwargs):
    """
    Load a proxy list and enable proxy usage for this spider.

    :param source: a ``BaseProxySource`` instance, or a string that is
        interpreted according to ``source_type``.
    :param source_type: required when ``source`` is a string; must be
        'text_file' (``source`` is passed to ``ProxyList.load_file``) or
        'url' (``source`` is passed to ``ProxyList.load_url``).
    :param proxy_type: forwarded to ``load_file`` / ``load_url``.
    :param auto_init: if true and ``auto_change`` is false, a random proxy
        is picked immediately and stored in ``self.proxy``.
    :param auto_change: stored in ``self.proxy_auto_change``.
    :param **kwargs: accepted but not used by this method.
    :raises SpiderMisuseError: if ``source`` is neither a
        ``BaseProxySource`` nor a string, or if ``source_type`` is not one
        of the supported values for a string ``source``.
    """
    # Build the list in a local variable so that a rejected argument does
    # not replace an already configured ``self.proxylist`` with an empty one.
    proxylist = ProxyList()
    if isinstance(source, BaseProxySource):
        proxylist.set_source(source)
    elif isinstance(source, six.string_types):
        if source_type == 'text_file':
            proxylist.load_file(source, proxy_type=proxy_type)
        elif source_type == 'url':
            proxylist.load_url(source, proxy_type=proxy_type)
        else:
            # The value is wrapped in a one-element tuple: a tuple-valued
            # argument would otherwise be unpacked by ``%`` and raise
            # TypeError instead of the intended SpiderMisuseError.
            raise SpiderMisuseError('Method `load_proxylist` received '
                                    'invalid `source_type` argument: %s'
                                    % (source_type,))
    else:
        raise SpiderMisuseError('Method `load_proxylist` received '
                                'invalid `source` argument: %s'
                                % (source,))

    self.proxylist = proxylist
    self.proxylist_enabled = True
    self.proxy = None
    if not auto_change and auto_init:
        self.proxy = self.proxylist.get_random_proxy()
    self.proxy_auto_change = auto_change
def process_handler_result(self, result, task=None):
"""
Process result received from the task handler.
Result could be:
* None
* Task instance
* Data instance.
"""
if isinstance(result, Task):
self.add_task(result)