本文整理汇总了Python中grab.proxylist.ProxyList.load_url方法的典型用法代码示例。如果您正苦于以下问题：Python ProxyList.load_url方法的具体用法？Python ProxyList.load_url怎么用？Python ProxyList.load_url使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类 grab.proxylist.ProxyList 的用法示例。
在下文中一共展示了ProxyList.load_url方法的5个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: Spider
# 需要导入模块: from grab.proxylist import ProxyList [as 别名]
# 或者: from grab.proxylist.ProxyList import load_url [as 别名]
#.........这里部分代码省略.........
task.url = urljoin(self.base_url, task.url)
# If task has grab_config object then update it too
if task.grab_config:
task.grab_config["url"] = task.url
except Exception as ex:
self.stat.collect("task-with-invalid-url", task.url)
if raise_error:
raise
else:
logger.error("", exc_info=ex)
return False
# TODO: keep original task priority if it was set explicitly
self.task_queue.put(task, task.priority, schedule_time=task.schedule_time)
return True
def stop(self):
    """Ask the spider to shut down.

    Drops the internal ``work_allowed`` flag; the main processing loop
    checks this flag and stops picking up new tasks once it is cleared.
    """
    self.work_allowed = False
    logger_verbose.debug("Method `stop` was called")
def load_proxylist(self, source, source_type=None, proxy_type="http", auto_init=True, auto_change=True, **kwargs):
    """Attach a proxy list to the spider.

    :param source: a ``BaseProxySource`` instance, or a string pointing
        at the proxies (interpreted according to ``source_type``)
    :param source_type: ``"text_file"`` or ``"url"`` when ``source`` is
        a string; ignored for ``BaseProxySource`` instances
    :param proxy_type: protocol of the listed proxies (default ``"http"``)
    :param auto_init: pick one random proxy up front when rotation is off
    :param auto_change: rotate to a random proxy automatically
    :raises SpiderMisuseError: on an unrecognized ``source`` or
        ``source_type`` value
    """
    plist = ProxyList()
    self.proxylist = plist
    if isinstance(source, BaseProxySource):
        plist.set_source(source)
    elif isinstance(source, six.string_types):
        # Dispatch on the declared source type.
        loaders = {
            "text_file": plist.load_file,
            "url": plist.load_url,
        }
        loader = loaders.get(source_type)
        if loader is None:
            raise SpiderMisuseError(
                "Method `load_proxylist` received "
                "invalid `source_type` argument: %s" % source_type
            )
        loader(source, proxy_type=proxy_type)
    else:
        raise SpiderMisuseError(
            "Method `load_proxylist` received "
            "invalid `source` argument: %s" % source
        )
    self.proxylist_enabled = True
    self.proxy = None
    if auto_init and not auto_change:
        self.proxy = plist.get_random_proxy()
    self.proxy_auto_change = auto_change
def process_next_page(self, grab, task, xpath, resolve_base=False, **kwargs):
"""
Generate task for next page.
:param grab: Grab instance
:param task: Task object which should be assigned to next page url
:param xpath: xpath expression which calculates list of URLS
:param **kwargs: extra settings for new task object
Example::
self.follow_links(grab, 'topic', '//div[@class="topic"]/a/@href')
"""
try:
# next_url = grab.xpath_text(xpath)
next_url = grab.doc.select(xpath).text()
except IndexError:
return False
else:
示例2: Spider
# 需要导入模块: from grab.proxylist import ProxyList [as 别名]
# 或者: from grab.proxylist.ProxyList import load_url [as 别名]
#.........这里部分代码省略.........
if task.grab_config:
task.grab_config['url'] = task.url
except Exception as ex:
self.stat.collect('task-with-invalid-url', task.url)
if raise_error:
raise
else:
logger.error('', exc_info=ex)
return False
# TODO: keep original task priority if it was set explicitly
self.task_queue.put(task, task.priority, schedule_time=task.schedule_time)
return True
def stop(self):
    """Signal the spider to stop.

    Clears ``work_allowed`` so the main loop stops accepting new tasks
    and proceeds to shut down.
    """
    logger_verbose.debug('Method `stop` was called')
    self.work_allowed = False
def load_proxylist(self, source, source_type=None, proxy_type='http',
                   auto_init=True, auto_change=True,
                   **kwargs):
    """Configure the spider's proxy list.

    :param source: ``BaseProxySource`` instance or a string location
    :param source_type: ``'text_file'`` or ``'url'`` for string sources
    :param proxy_type: protocol of the proxies (default ``'http'``)
    :param auto_init: select an initial proxy when rotation is disabled
    :param auto_change: enable automatic random proxy rotation
    :raises SpiderMisuseError: for invalid ``source``/``source_type``
    """
    self.proxylist = ProxyList()
    # Guard-clause dispatch: reject bad inputs first, then load.
    if isinstance(source, BaseProxySource):
        self.proxylist.set_source(source)
    elif not isinstance(source, six.string_types):
        raise SpiderMisuseError('Method `load_proxylist` received '
                                'invalid `source` argument: %s' % source)
    elif source_type == 'text_file':
        self.proxylist.load_file(source, proxy_type=proxy_type)
    elif source_type == 'url':
        self.proxylist.load_url(source, proxy_type=proxy_type)
    else:
        raise SpiderMisuseError('Method `load_proxylist` received '
                                'invalid `source_type` argument: %s'
                                % source_type)
    self.proxylist_enabled = True
    self.proxy_auto_change = auto_change
    # Pre-select a single proxy only when rotation is off.
    self.proxy = (self.proxylist.get_random_proxy()
                  if auto_init and not auto_change else None)
def process_next_page(self, grab, task, xpath,
resolve_base=False, **kwargs):
"""
Generate task for next page.
:param grab: Grab instance
:param task: Task object which should be assigned to next page url
:param xpath: xpath expression which calculates list of URLS
:param **kwargs: extra settings for new task object
Example::
self.follow_links(grab, 'topic', '//div[@class="topic"]/a/@href')
"""
try:
# next_url = grab.xpath_text(xpath)
next_url = grab.doc.select(xpath).text()
示例3: Spider
# 需要导入模块: from grab.proxylist import ProxyList [as 别名]
# 或者: from grab.proxylist.ProxyList import load_url [as 别名]
#.........这里部分代码省略.........
if self.is_valid_for_cache(result):
with self.stat.log_time('cache'):
with self.stat.log_time('cache.write'):
self.cache.save_response(result['task'].url,
result['grab'])
# print '[process network results]'
self.process_network_result(result)
# print '[done]'
self.stat.inc('spider:request')
# print '[transport iterate results - end]'
logger_verbose.debug('Work done')
except KeyboardInterrupt:
print('\nGot ^C signal in process %d. Stopping.' % os.getpid())
self.interrupted = True
raise
finally:
# This code is executed when main cycles is breaked
self.stat.stop_timer('total')
self.shutdown()
def load_proxylist(self, source, source_type=None, proxy_type='http',
                   auto_init=True, auto_change=True,
                   **kwargs):
    """Set up proxy support for the spider.

    :param source: proxy source — a ``BaseProxySource`` object or a
        string (file path or URL, per ``source_type``)
    :param source_type: ``'text_file'`` or ``'url'`` for string sources
    :param proxy_type: proxy protocol, ``'http'`` by default
    :param auto_init: when rotation is off, pick one proxy immediately
    :param auto_change: rotate proxies randomly per request
    :raises SpiderMisuseError: on invalid ``source``/``source_type``
    """
    proxylist = ProxyList()
    if isinstance(source, BaseProxySource):
        proxylist.set_source(source)
    elif isinstance(source, six.string_types):
        if source_type == 'text_file':
            proxylist.load_file(source, proxy_type=proxy_type)
        elif source_type == 'url':
            proxylist.load_url(source, proxy_type=proxy_type)
        else:
            raise SpiderMisuseError('Method `load_proxylist` received '
                                    'invalid `source_type` argument: %s'
                                    % source_type)
    else:
        raise SpiderMisuseError('Method `load_proxylist` received '
                                'invalid `source` argument: %s'
                                % source)
    self.proxylist = proxylist
    self.proxylist_enabled = True
    self.proxy = None
    if not auto_change and auto_init:
        self.proxy = proxylist.get_random_proxy()
    self.proxy_auto_change = auto_change
def process_handler_result(self, result, task=None):
"""
Process result received from the task handler.
Result could be:
* None
* Task instance
* Data instance.
"""
if isinstance(result, Task):
self.add_task(result)
elif isinstance(result, Data):
handler = self.find_data_handler(result)
try:
data_result = handler(**result.storage)
if data_result is None:
示例4: Spider
# 需要导入模块: from grab.proxylist import ProxyList [as 别名]
# 或者: from grab.proxylist.ProxyList import load_url [as 别名]
#.........这里部分代码省略.........
This method set internal flag which signal spider
to stop processing new task and shuts down.
"""
self.work_allowed = False
def load_proxylist(self, source, source_type=None, proxy_type='http',
                   auto_init=True, auto_change=True):
    """Load proxy list.

    :param source: proxy source.
        Accepts a string (file path or URL) or a ``BaseProxySource``
        instance.
    :param source_type: the type of the specified source.
        Should be one of the following: ``'text_file'`` or ``'url'``.
    :param proxy_type: should be one of the following:
        ``'socks4'``, ``'socks5'`` or ``'http'``.
    :param auto_init: if set to `True` and ``auto_change`` is `False`,
        one random proxy is selected up front.
    :param auto_change: if set to `True` then automatic random proxy
        rotation will be used.
    :raises SpiderMisuseError: on invalid ``source``/``source_type``.

    Proxy source format should be one of the following (for each line):

    - ip:port
    - ip:port:login:password
    """
    plist = ProxyList()
    if isinstance(source, BaseProxySource):
        plist.set_source(source)
    elif isinstance(source, six.string_types):
        # String sources are disambiguated by source_type.
        if source_type == 'text_file':
            plist.load_file(source, proxy_type=proxy_type)
        elif source_type == 'url':
            plist.load_url(source, proxy_type=proxy_type)
        else:
            raise SpiderMisuseError('Method `load_proxylist` received '
                                    'invalid `source_type` argument: %s'
                                    % source_type)
    else:
        raise SpiderMisuseError('Method `load_proxylist` received '
                                'invalid `source` argument: %s'
                                % source)
    self.proxylist = plist
    self.proxylist_enabled = True
    self.proxy = None
    if auto_init and not auto_change:
        self.proxy = plist.get_random_proxy()
    self.proxy_auto_change = auto_change
def process_next_page(self, grab, task, xpath,
resolve_base=False, **kwargs):
"""
Generate task for next page.
:param grab: Grab instance
:param task: Task object which should be assigned to next page url
:param xpath: xpath expression which calculates list of URLS
:param **kwargs: extra settings for new task object
Example::
self.follow_links(grab, 'topic', '//div[@class="topic"]/a/@href')
"""
try:
# next_url = grab.xpath_text(xpath)
next_url = grab.doc.select(xpath).text()
示例5: test_web_proxy_source
# 需要导入模块: from grab.proxylist import ProxyList [as 别名]
# 或者: from grab.proxylist.ProxyList import load_url [as 别名]
def test_web_proxy_source(self):
    """ProxyList.load_url parses a proxy list served over HTTP."""
    self.server.response['data'] = DEFAULT_PLIST_DATA
    proxy_list = ProxyList()
    proxy_list.load_url(self.server.get_url())
    # DEFAULT_PLIST_DATA contains exactly two proxy entries.
    self.assertEqual(2, proxy_list.size())