本文整理汇总了 Python 中 grab.proxylist.ProxyList 类的典型用法代码示例。如果您正苦于以下问题：Python ProxyList 类的具体用法？Python ProxyList 怎么用？Python ProxyList 使用的例子？那么恭喜您，这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了 ProxyList 类的 15 个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的 Python 代码示例。
示例1: main
def main(**kwargs):
    """Run ``click`` in a child process for every URL, through one proxy.

    A proxy list is loaded from ``/web/proxy-us.txt`` and a single random
    proxy is picked; its address is passed to every child process together
    with the URL to visit.  The function then waits until every started
    child has finished.

    ``**kwargs`` is accepted but not used.
    """
    logging.basicConfig(level=logging.DEBUG)
    pl = ProxyList()
    pl.load_file('/web/proxy-us.txt')
    proxy = pl.get_random_proxy()
    urls = [
        'http://jabbim.com',
    ]
    pool = []
    for url in urls:
        # NOTE(review): `Process` is presumably multiprocessing.Process and
        # `click` a module-level worker -- neither is visible in this snippet.
        pro = Process(target=click, kwargs={'url': url,
                                            'proxy': proxy.get_address()})
        # Remember when the child was started so a timeout can be enforced.
        pro._start_time = time.time()
        pro.daemon = True
        pro.start()
        pool.append(pro)

    # Falsy value disables the timeout check below.
    TIMEOUT = None
    # Wait for *every* child, not only the one started last: the children
    # are daemonic, so returning while some of them are still running would
    # cut them short.  Iterating over the pool also copes with an empty
    # `urls` list.
    for pro in pool:
        while pro.is_alive():
            time.sleep(0.5)
            if TIMEOUT and time.time() - pro._start_time > TIMEOUT:
                pro.terminate()
示例2: __init__
def __init__(self, document_body=None,
             transport=None, **kwargs):
    """
    Create Grab instance.

    :param document_body: if not None, it is passed to
        ``setup_document`` once the instance is initialised.
    :param transport: stored as ``transport_param``; ``self.transport``
        itself starts out as None.
    :param kwargs: if non-empty, forwarded to ``setup``.
    """
    self.meta = {}
    self._doc = None
    self.config = default_config()
    self.config['common_headers'] = self.common_headers()
    self.cookies = CookieManager()
    self.proxylist = ProxyList()
    # Explicit placeholders for the remaining attributes
    # (the original comment: "makes pylint happy").
    self.exception = self.request_counter = None
    self.request_head = self.request_body = self.request_method = None
    self.transport_param = transport
    self.transport = None
    self.reset()
    if kwargs:
        self.setup(**kwargs)
    if document_body is None:
        return
    self.setup_document(document_body)
示例3: load_proxylist
def load_proxylist(self, source, source_type=None, proxy_type='http',
                   auto_init=True, auto_change=True,
                   **kwargs):
    """
    Replace ``self.proxylist`` with a fresh ``ProxyList`` fed from `source`.

    :param source: a ``BaseProxySource`` instance or a string.
    :param source_type: only consulted when `source` is a string;
        must be 'text_file' or 'url'.
    :param proxy_type: forwarded to ``load_file`` / ``load_url``.
    :param auto_init: if true and `auto_change` is false, a random proxy
        is picked right away and stored in ``self.proxy``.
    :param auto_change: stored in ``self.proxy_auto_change``.
    :param kwargs: accepted but not used by this method.
    :raises SpiderMisuseError: if `source` is neither a proxy source nor
        a string, or if `source_type` is not one of the supported values.
    """
    self.proxylist = ProxyList()
    if isinstance(source, BaseProxySource):
        self.proxylist.set_source(source)
    elif isinstance(source, six.string_types):
        if source_type == 'text_file':
            self.proxylist.load_file(source, proxy_type=proxy_type)
        elif source_type == 'url':
            self.proxylist.load_url(source, proxy_type=proxy_type)
        else:
            # Wrap the value in a 1-tuple: a bare tuple operand would be
            # unpacked by `%` and raise TypeError instead of this error.
            raise SpiderMisuseError('Method `load_proxylist` received '
                                    'invalid `source_type` argument: %s'
                                    % (source_type,))
    else:
        raise SpiderMisuseError('Method `load_proxylist` received '
                                'invalid `source` argument: %s'
                                % (source,))

    self.proxylist_enabled = True
    self.proxy = None
    if not auto_change and auto_init:
        self.proxy = self.proxylist.get_random_proxy()
    self.proxy_auto_change = auto_change
示例4: test_get_next_proxy
def test_get_next_proxy(self):
    """Proxies come back in file order, wrap around, and restart on reload."""
    plist = ProxyList()
    plist_path = self.generate_plist_file('foo:1\nbar:1')
    plist.load_file(plist_path)
    # Two entries in the file: the third call wraps around to the first.
    for expected_host in ('foo', 'bar', 'foo'):
        self.assertEqual(plist.get_next_proxy().host, expected_host)
    # Loading the file again starts the sequence from the beginning.
    plist.load_file(plist_path)
    self.assertEqual(plist.get_next_proxy().host, 'foo')
示例5: test_get_next_proxy
def test_get_next_proxy(self):
    """Proxies come back in file order, wrap around, and restart on reload."""
    with temp_file() as plist_path:
        proxies = ProxyList()
        self.generate_plist_file(plist_path, 'foo:1\nbar:1')
        proxies.load_file(plist_path)
        # Two entries in the file: the third call wraps around to the first.
        for expected_host in ('foo', 'bar', 'foo'):
            self.assertEqual(proxies.get_next_proxy().host, expected_host)
        # Loading the file again starts the sequence from the beginning.
        proxies.load_file(plist_path)
        self.assertEqual(proxies.get_next_proxy().host, 'foo')
示例6: load_proxylist
def load_proxylist(self, source, source_type, proxy_type='http',
                   auto_init=True, auto_change=True,
                   **kwargs):
    """
    Build ``self.proxylist`` from `source` and enable proxy usage.

    `source`, `source_type`, `proxy_type` and any extra keyword arguments
    are handed to the ``ProxyList`` constructor unchanged.  When
    `auto_change` is false and `auto_init` is true, a proxy is picked
    immediately via ``get_random()`` and stored in ``self.proxy``;
    otherwise ``self.proxy`` is left as None.
    """
    self.proxylist = ProxyList(
        source,
        source_type,
        proxy_type=proxy_type,
        **kwargs
    )
    self.proxylist_enabled = True
    self.proxy = None
    pick_fixed_proxy = not auto_change and auto_init
    if pick_fixed_proxy:
        self.proxy = self.proxylist.get_random()
    self.proxy_auto_change = auto_change
示例7: __init__
def __init__(self, document_body=None,
             transport='pycurl', **kwargs):
    """
    Create Grab instance.

    :param document_body: if not None, it is passed to
        ``setup_document`` after the instance has been reset.
    :param transport: handed to ``setup_transport``.
    :param kwargs: if non-empty, forwarded to ``setup``.
    """
    self.meta, self._doc = {}, None
    self.config = default_config()
    self.config['common_headers'] = self.common_headers()
    self.cookies = CookieManager()
    self.proxylist = ProxyList()
    # The transport is set up before reset() is called.
    self.setup_transport(transport)
    self.reset()
    if kwargs:
        self.setup(**kwargs)
    if document_body is None:
        return
    self.setup_document(document_body)
示例8: load_proxylist
def load_proxylist(self, source, source_type=None, proxy_type='http',
                   auto_init=True, auto_change=True):
    """
    Load proxy list.

    :param source: Proxy source.
        Accepts string (file path, url) or ``BaseProxySource`` instance.
    :param source_type: The type of the specified source.
        Should be one of the following: 'text_file' or 'url'.
        Only consulted when `source` is a string.
    :param proxy_type:
        Should be one of the following: 'socks4', 'socks5' or 'http'.
    :param auto_init:
        If true and `auto_change` is false, a random proxy is picked
        immediately and stored in ``self.proxy``.
    :param auto_change:
        If set to `True` then automatic random proxy rotation
        will be used.
    :raises SpiderMisuseError: if `source` is neither a proxy source nor
        a string, or if `source_type` is not one of the supported values.

    Proxy source format should be one of the following (for each line):
        - ip:port
        - ip:port:login:password
    """
    self.proxylist = ProxyList()
    if isinstance(source, BaseProxySource):
        self.proxylist.set_source(source)
    elif isinstance(source, six.string_types):
        if source_type == 'text_file':
            self.proxylist.load_file(source, proxy_type=proxy_type)
        elif source_type == 'url':
            self.proxylist.load_url(source, proxy_type=proxy_type)
        else:
            # Wrap the value in a 1-tuple: a bare tuple operand would be
            # unpacked by `%` and raise TypeError instead of this error.
            raise SpiderMisuseError('Method `load_proxylist` received '
                                    'invalid `source_type` argument: %s'
                                    % (source_type,))
    else:
        raise SpiderMisuseError('Method `load_proxylist` received '
                                'invalid `source` argument: %s'
                                % (source,))

    self.proxylist_enabled = True
    self.proxy = None
    if not auto_change and auto_init:
        self.proxy = self.proxylist.get_random_proxy()
    self.proxy_auto_change = auto_change
示例9: test_basic
def test_basic(self):
    """A freshly created ``ProxyList`` reports a size of zero."""
    empty_list = ProxyList()
    reported_size = empty_list.size()
    self.assertEqual(0, reported_size)
示例10: Spider
#.........这里部分代码省略.........
queue.put(
task, priority=task.priority, schedule_time=task.schedule_time
)
return True
def stop(self):
    """
    Set the internal flag that signals the spider to stop
    processing new tasks and shut down.

    The only thing done here is setting ``work_allowed`` to False.
    """
    self.work_allowed = False
def load_proxylist(self, source, source_type=None, proxy_type='http',
                   auto_init=True, auto_change=True):
    """
    Load proxy list.

    :param source: Proxy source.
        Accepts string (file path, url) or ``BaseProxySource`` instance.
    :param source_type: The type of the specified source.
        Should be one of the following: 'text_file' or 'url'.
        Only consulted when `source` is a string.
    :param proxy_type:
        Should be one of the following: 'socks4', 'socks5' or 'http'.
    :param auto_init:
        If true and `auto_change` is false, a random proxy is picked
        immediately and stored in ``self.proxy``.
    :param auto_change:
        If set to `True` then automatic random proxy rotation
        will be used.
    :raises SpiderMisuseError: if `source` is neither a proxy source nor
        a string, or if `source_type` is not one of the supported values.

    Proxy source format should be one of the following (for each line):
        - ip:port
        - ip:port:login:password
    """
    self.proxylist = ProxyList()
    if isinstance(source, BaseProxySource):
        self.proxylist.set_source(source)
    elif isinstance(source, six.string_types):
        if source_type == 'text_file':
            self.proxylist.load_file(source, proxy_type=proxy_type)
        elif source_type == 'url':
            self.proxylist.load_url(source, proxy_type=proxy_type)
        else:
            # Wrap the value in a 1-tuple: a bare tuple operand would be
            # unpacked by `%` and raise TypeError instead of this error.
            raise SpiderMisuseError('Method `load_proxylist` received '
                                    'invalid `source_type` argument: %s'
                                    % (source_type,))
    else:
        raise SpiderMisuseError('Method `load_proxylist` received '
                                'invalid `source` argument: %s'
                                % (source,))

    self.proxylist_enabled = True
    self.proxy = None
    if not auto_change and auto_init:
        self.proxy = self.proxylist.get_random_proxy()
    self.proxy_auto_change = auto_change
def process_next_page(self, grab, task, xpath,
resolve_base=False, **kwargs):
"""
Generate task for next page.
:param grab: Grab instance
:param task: Task object which should be assigned to next page url
:param xpath: xpath expression which calculates list of URLS
:param **kwargs: extra settings for new task object
示例11: test_file_proxy_source
def test_file_proxy_source(self):
    """Loading the generated proxy file yields a list of size 2."""
    with temp_file() as plist_path:
        proxies = ProxyList()
        self.generate_plist_file(plist_path)
        proxies.load_file(plist_path)
        loaded_count = proxies.size()
        self.assertEqual(2, loaded_count)
示例12: Grab
class Grab(DeprecatedThings):
    # Fixed set of instance attribute names.
    __slots__ = (
        'request_head',
        'request_body',
        # 'request_log',
        'proxylist',
        'config',
        'transport',
        'transport_param',
        'request_method',
        'request_counter',
        '__weakref__',
        'cookies',
        'meta',
        'exception',
        # Dirty hack to make it possible to inherit Grab from
        # multiple base classes with __slots__
        '_doc',
    )

    # Attributes which should be processed when a clone
    # of a Grab instance is being created
    clonable_attributes = (
        'request_head',
        'request_body',
        # 'request_log',
        'proxylist',
    )

    # Complex config items which point to mutable objects
    mutable_config_keys = copy(MUTABLE_CONFIG_KEYS)

    #
    # Public methods
    #
def __init__(self, document_body=None,
             transport=None, **kwargs):
    """
    Create Grab instance.

    :param document_body: if not None, it is passed to
        ``setup_document`` once the instance is initialised.
    :param transport: stored as ``transport_param``; ``self.transport``
        itself starts out as None.
    :param kwargs: if non-empty, forwarded to ``setup``.
    """
    self.meta = {}
    self._doc = None
    self.config = default_config()
    self.config['common_headers'] = self.common_headers()
    self.cookies = CookieManager()
    self.proxylist = ProxyList()
    # Explicit placeholders for the remaining attributes
    # (the original comment: "makes pylint happy").
    self.exception = self.request_counter = None
    self.request_head = self.request_body = self.request_method = None
    self.transport_param = transport
    self.transport = None
    self.reset()
    if kwargs:
        self.setup(**kwargs)
    if document_body is None:
        return
    self.setup_document(document_body)
def _get_doc(self):
    """Return the stored document, creating a ``Document`` if none is set."""
    if self._doc is not None:
        return self._doc
    self._doc = Document(self)
    return self._doc

def _set_doc(self, obj):
    """Store `obj` as the current document."""
    self._doc = obj

# Expose the getter/setter pair above as the ``doc`` attribute.
doc = property(_get_doc, _set_doc)
def setup_transport(self, transport_param, reset=False):
    """
    Instantiate the transport and store it in ``self.transport``.

    :param transport_param: None (``DEFAULT_TRANSPORT`` is used), a
        string (a key of ``TRANSPORT_ALIAS`` or a dotted
        ``module.path.ClassName``), or a callable that returns a
        transport instance when called without arguments.
    :param reset: must be true to replace a transport that is already
        set up.
    :raises error.GrabMisuseError: if a transport is already set and
        `reset` is false, if a string contains no dot after alias
        resolution, or if `transport_param` is neither a string nor
        a callable.
    """
    if self.transport is not None and not reset:
        raise error.GrabMisuseError(
            'Transport is already set up. Use'
            ' setup_transport(..., reset=True) to explicitly setup'
            ' new transport')
    if transport_param is None:
        transport_param = DEFAULT_TRANSPORT
    if isinstance(transport_param, six.string_types):
        if transport_param in TRANSPORT_ALIAS:
            transport_param = TRANSPORT_ALIAS[transport_param]
        if '.' not in transport_param:
            raise error.GrabMisuseError('Unknown transport: %s'
                                        % transport_param)
        mod_path, cls_name = transport_param.rsplit('.', 1)
        try:
            cls = TRANSPORT_CACHE[(mod_path, cls_name)]
        except KeyError:
            # A non-empty `fromlist` makes __import__ return the module
            # named by `mod_path` itself rather than its top-level package.
            mod = __import__(mod_path, globals(), locals(), ['foo'])
            cls = getattr(mod, cls_name)
            TRANSPORT_CACHE[(mod_path, cls_name)] = cls
        self.transport = cls()
    # The builtin callable() replaces `collections.Callable`, an alias
    # that was removed from the `collections` module in Python 3.10.
    elif callable(transport_param):
        self.transport = transport_param()
    else:
        raise error.GrabMisuseError('Option `transport` should be string '
                                    'or class or callable. Got %s'
                                    % type(transport_param))
def reset(self):
"""
Reset all attributes which could be modified during previous request
#.........这里部分代码省略.........
示例13: Grab
class Grab(DeprecatedThings):
    # Fixed set of instance attribute names.
    __slots__ = (
        'request_head',
        'request_log',
        'request_body',
        'proxylist',
        'config',
        'transport',
        'transport_param',
        'request_method',
        'request_counter',
        '__weakref__',
        'cookies',
        'meta',
        # Dirty hack to make it possible to inherit Grab from
        # multiple base classes with __slots__
        '_doc',
    )

    # Attributes which should be processed when a clone
    # of a Grab instance is being created
    clonable_attributes = (
        'request_head',
        'request_log',
        'request_body',
        'proxylist',
    )

    # Complex config items which point to mutable objects
    mutable_config_keys = copy(MUTABLE_CONFIG_KEYS)

    #
    # Public methods
    #
def __init__(self, document_body=None,
             transport='grab.transport.curl.CurlTransport', **kwargs):
    """
    Create Grab instance.

    :param document_body: if not None, it is passed to
        ``setup_document`` after the instance has been reset.
    :param transport: handed to ``setup_transport``.
    :param kwargs: if non-empty, forwarded to ``setup``.
    """
    self.meta, self._doc = {}, None
    self.config = default_config()
    self.config['common_headers'] = self.common_headers()
    self.cookies = CookieManager()
    self.proxylist = ProxyList()
    # The transport is set up before reset() is called.
    self.setup_transport(transport)
    self.reset()
    if kwargs:
        self.setup(**kwargs)
    if document_body is None:
        return
    self.setup_document(document_body)
def _get_doc(self):
    """Return the stored document, creating a ``Document`` if none is set."""
    if self._doc is not None:
        return self._doc
    self._doc = Document(self)
    return self._doc

def _set_doc(self, obj):
    """Store `obj` as the current document."""
    self._doc = obj

# Expose the getter/setter pair above as the ``doc`` attribute.
doc = property(_get_doc, _set_doc)
def setup_transport(self, transport_param):
    """
    Remember `transport_param` and instantiate the transport it names.

    :param transport_param: a dotted ``module.path.ClassName`` string or
        a callable that returns a transport instance when called without
        arguments.  The value is also stored in ``self.transport_param``.
    :raises error.GrabMisuseError: if `transport_param` is neither a
        string nor a callable.
    :raises ValueError: if a string without any dot is given (the
        module/class split below cannot be unpacked).
    """
    self.transport_param = transport_param
    if isinstance(transport_param, six.string_types):
        mod_path, cls_name = transport_param.rsplit('.', 1)
        try:
            cls = TRANSPORT_CACHE[(mod_path, cls_name)]
        except KeyError:
            # A non-empty `fromlist` makes __import__ return the module
            # named by `mod_path` itself rather than its top-level package.
            mod = __import__(mod_path, globals(), locals(), ['foo'])
            cls = getattr(mod, cls_name)
            TRANSPORT_CACHE[(mod_path, cls_name)] = cls
        self.transport = cls()
    # The builtin callable() replaces `collections.Callable`, an alias
    # that was removed from the `collections` module in Python 3.10.
    elif callable(transport_param):
        self.transport = transport_param()
    else:
        raise error.GrabMisuseError('Option `transport` should be string '
                                    'or callable. Got %s'
                                    % type(transport_param))
def reset(self):
    """
    Reset all attributes which could be modified during the previous
    request or which are not initialized yet if this is a new Grab
    instance.

    The request head, log, body and method are set to None and the
    transport's own ``reset()`` is called.

    This method is automatically called before each network request.
    """
    self.request_head = None
    self.request_log = None
    self.request_body = None
    self.request_method = None
    self.transport.reset()
def clone(self, **kwargs):
"""
Create clone of Grab instance.
Cloned instance will have the same state: cookies, referrer, response
document data
:param **kwargs: overrides settings of cloned grab instance
"""
g = Grab(transport=self.transport_param)
g.config = self.dump_config()
#.........这里部分代码省略.........
示例14: test_web_proxy_source
def test_web_proxy_source(self):
    """Loading the default proxy data over HTTP yields a list of size 2."""
    proxies = ProxyList()
    # Make the test server answer with the default proxy list data.
    self.server.response['data'] = DEFAULT_PLIST_DATA
    source_url = self.server.get_url()
    proxies.load_url(source_url)
    self.assertEqual(2, proxies.size())
示例15: Spider
#.........这里部分代码省略.........
% (task.name, task.url)
raise SpiderError(msg)
else:
warn('Class attribute `Spider::base_url` is deprecated. '
'Use Task objects with absolute URLs')
task.url = urljoin(self.base_url, task.url)
# If task has grab_config object then update it too
if task.grab_config:
task.grab_config['url'] = task.url
except Exception as ex:
self.stat.collect('task-with-invalid-url', task.url)
if raise_error:
raise
else:
logger.error('', exc_info=ex)
return False
# TODO: keep original task priority if it was set explicitly
self.task_queue.put(task, task.priority, schedule_time=task.schedule_time)
return True
def stop(self):
    """
    Set the internal flag that signals the spider to stop
    processing new tasks and shut down.

    The call is logged at debug level before ``work_allowed``
    is set to False.
    """
    logger_verbose.debug('Method `stop` was called')
    self.work_allowed = False
def load_proxylist(self, source, source_type=None, proxy_type='http',
                   auto_init=True, auto_change=True,
                   **kwargs):
    """
    Replace ``self.proxylist`` with a fresh ``ProxyList`` fed from `source`.

    :param source: a ``BaseProxySource`` instance or a string.
    :param source_type: only consulted when `source` is a string;
        must be 'text_file' or 'url'.
    :param proxy_type: forwarded to ``load_file`` / ``load_url``.
    :param auto_init: if true and `auto_change` is false, a random proxy
        is picked right away and stored in ``self.proxy``.
    :param auto_change: stored in ``self.proxy_auto_change``.
    :param kwargs: accepted but not used by this method.
    :raises SpiderMisuseError: if `source` is neither a proxy source nor
        a string, or if `source_type` is not one of the supported values.
    """
    self.proxylist = ProxyList()
    if isinstance(source, BaseProxySource):
        self.proxylist.set_source(source)
    elif isinstance(source, six.string_types):
        if source_type == 'text_file':
            self.proxylist.load_file(source, proxy_type=proxy_type)
        elif source_type == 'url':
            self.proxylist.load_url(source, proxy_type=proxy_type)
        else:
            # Wrap the value in a 1-tuple: a bare tuple operand would be
            # unpacked by `%` and raise TypeError instead of this error.
            raise SpiderMisuseError('Method `load_proxylist` received '
                                    'invalid `source_type` argument: %s'
                                    % (source_type,))
    else:
        raise SpiderMisuseError('Method `load_proxylist` received '
                                'invalid `source` argument: %s'
                                % (source,))

    self.proxylist_enabled = True
    self.proxy = None
    if not auto_change and auto_init:
        self.proxy = self.proxylist.get_random_proxy()
    self.proxy_auto_change = auto_change
def process_next_page(self, grab, task, xpath,
resolve_base=False, **kwargs):
"""
Generate task for next page.
:param grab: Grab instance
:param task: Task object which should be assigned to next page url
:param xpath: xpath expression which calculates list of URLS
:param **kwargs: extra settings for new task object