本文整理汇总了Python中wpull.backport.logging.__函数的典型用法代码示例。如果您正苦于以下问题：Python __函数的具体用法？Python __怎么用？Python __使用的例子？那么恭喜您，这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了__函数的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: snapshot
def snapshot(self, remote, html_path=None, render_path=None):
    '''Save the loaded page as HTML and/or as a rendered file.

    Generator-based coroutine: every ``yield`` waits on a request made
    through ``remote`` and the page content is delivered by raising
    ``tornado.gen.Return``.

    Args:
        remote: Object whose ``eval`` and ``call`` methods query and drive
            the page.
        html_path: When truthy, the page content is written to this path
            encoded as UTF-8. The parent directory is created if missing.
        render_path: When truthy, the page is asked to render itself to
            this path.

    Each file produced is also handed to ``_add_warc_snapshot`` when
    ``self._warc_recorder`` is set (HTML as ``text/html``, the rendering
    as ``application/pdf``).
    '''
    page_content = yield remote.eval('page.content')
    page_url = yield remote.eval('page.url')

    if html_path:
        _logger.debug(__('Saving snapshot to {0}.', html_path))

        parent_dir = os.path.abspath(os.path.dirname(html_path))
        parent_missing = not os.path.exists(parent_dir)

        if parent_missing:
            os.makedirs(parent_dir)

        with open(html_path, 'wb') as html_file:
            html_file.write(page_content.encode('utf-8'))

        if self._warc_recorder:
            self._add_warc_snapshot(html_path, 'text/html', page_url)

    if render_path:
        _logger.debug(__('Saving snapshot to {0}.', render_path))

        yield remote.call('page.render', render_path)

        if self._warc_recorder:
            self._add_warc_snapshot(
                render_path, 'application/pdf', page_url
            )

    raise tornado.gen.Return(page_content)
示例2: process
def process(self, session: AppSession):
    '''Discover plugins, attach them to the session and activate them.

    A ``PluginManager`` is created on ``session.plugin_manager`` that
    searches the package's ``application/plugins`` directory plus the
    optional ``session.args.plugin_script`` file. Every plugin found is
    logged (debug level for plugins under the internal directory, info
    level otherwise), given a reference to the session, and, if its
    ``should_activate()`` returns true, activated and connected to the
    hooks.
    '''
    self._debug_log_registered_hooks(session)

    builtin_dir = get_package_filename(os.path.join('application', 'plugins'))
    search_dirs = [builtin_dir]
    script_files = []

    if session.args.plugin_script:
        script_files.append(session.args.plugin_script)

    session.plugin_manager = PluginManager(
        plugin_locator=PluginLocator(search_dirs, script_files)
    )
    session.plugin_manager.collectPlugins()

    for info in session.plugin_manager.getAllPlugins():
        # Bundled plugins are routine, so they are only reported at debug
        # level; anything else is reported at info level.
        if info.path.startswith(builtin_dir):
            report = _logger.debug
        else:
            report = _logger.info

        report(__(
            _('Found plugin {name} from {filename}.'),
            filename=info.path,
            name=info.name
        ))

        info.plugin_object.app_session = session

        if not info.plugin_object.should_activate():
            continue

        session.plugin_manager.activatePluginByName(info.name)
        self._connect_plugin_hooks(session, info.plugin_object)
示例3: _polling_sleep
def _polling_sleep(cls, resource_monitor, log=False):
    '''Sleep in 60 second steps until the resource monitor reports no problem.

    Generator-based coroutine (uses ``yield from asyncio.sleep``).

    Args:
        resource_monitor: Object whose ``check()`` returns a falsy value
            when resources are fine, otherwise an info object with
            ``path`` and ``free`` attributes.
        log (bool): Whether to log. When true, a warning is emitted on
            the first poll and every 15th poll after it, and a message is
            logged once the situation clears after at least one wait.
    '''
    for poll_number in itertools.count():
        shortage = resource_monitor.check()

        if not shortage:
            # Only announce recovery if we actually had to wait.
            if log and poll_number:
                _logger.info(_('Situation cleared.'))
            return

        if log and poll_number % 15 == 0:
            if shortage.path:
                notice = __(
                    _('Low disk space on {path} ({size} free).'),
                    path=shortage.path,
                    size=wpull.string.format_size(shortage.free)
                )
            else:
                notice = __(
                    _('Low memory ({size} free).'),
                    size=wpull.string.format_size(shortage.free)
                )

            _logger.warning(notice)
            _logger.warning(_('Waiting for operator to clear situation.'))

        yield from asyncio.sleep(60)
示例4: _scrape_document
def _scrape_document(self, request, response, url_item):
    '''Ask the script callback for extra URLs and add them to the item.

    The request URL info, the response body info and the body's file name
    are converted with ``to_script_native_type`` and passed to
    ``callbacks.get_urls``. Whatever it returns is fed, entry by entry,
    to ``_add_hooked_url``.
    '''
    convert = self.to_script_native_type

    url_info_dict = convert(request.url_info.to_dict())
    document_info_dict = convert(response.body.to_dict())
    filename = convert(response.body.content_file.name)

    hooked_urls = self.callbacks.get_urls(
        filename, url_info_dict, document_info_dict
    )

    _logger.debug(__('Hooked scrape returned {0}', hooked_urls))

    if not hooked_urls:
        return

    if convert(1) not in hooked_urls:
        # Ordinary iterable of URL dicts.
        for entry in hooked_urls:
            self._add_hooked_url(url_item, entry)
        return

    # Lua doesn't have sequences: walk the 1-based keys until a lookup
    # yields None.
    for position in itertools.count(1):
        entry = hooked_urls[convert(position)]

        _logger.debug(__('Got lua new url info {0}', entry))

        if entry is None:
            break

        self._add_hooked_url(url_item, entry)
示例5: _check_resource_monitor
def _check_resource_monitor(self):
    '''Block in 60 second steps while the resource monitor reports a shortage.

    Generator-based coroutine (uses ``yield From(trollius.sleep(...))``).
    Does nothing when no resource monitor is configured. A warning is
    logged on the first poll and every 15th poll after it; a message is
    logged when the shortage clears after at least one wait.
    '''
    monitor = self._resource_monitor

    if not monitor:
        return

    for tick in itertools.count():
        status = self._resource_monitor.check()

        if not status:
            if tick:
                _logger.info(_('Situation cleared.'))
            return

        if tick % 15 == 0:
            if status.path:
                _logger.warning(__(
                    _('Low disk space on {path} ({size} free).'),
                    path=status.path,
                    size=wpull.string.format_size(status.free)
                ))
            else:
                _logger.warning(__(
                    _('Low memory ({size} free).'),
                    size=wpull.string.format_size(status.free)
                ))

            _logger.warning(_('Waiting for operator to clear situation.'))

        yield From(trollius.sleep(60))
示例6: _read_input_urls
def _read_input_urls(cls, session: AppSession, default_scheme='http'):
    '''Yield parsed URL infos for every URL supplied by the user.

    URLs come from ``session.args.urls`` followed, when
    ``session.args.input_file`` is set, by the entries of the input file
    (read as HTML links if ``session.args.force_html`` is set, otherwise
    as lines).

    Args:
        session: The application session holding the arguments and the
            factory.
        default_scheme (str): Passed to ``URLInfo.parse`` as
            ``default_scheme``.

    Yields:
        The value returned by ``wpull.url.URLInfo.parse``, passed through
        the URL rewriter when one is available.
    '''
    sources = session.args.urls or ()
    # FIXME: url rewriter isn't created yet
    rewriter = session.factory.get('URLRewriter')

    if session.args.input_file:
        reader = (
            cls._input_file_as_html_links if session.args.force_html
            else cls._input_file_as_lines
        )
        sources = itertools.chain(sources, reader(session))

    base_url = session.args.base

    for raw_url in sources:
        _logger.debug(__('Parsing URL {0}', raw_url))

        if base_url:
            raw_url = wpull.url.urljoin(base_url, raw_url)

        parsed = wpull.url.URLInfo.parse(
            raw_url, default_scheme=default_scheme
        )

        _logger.debug(__('Parsed URL {0}', parsed))

        if rewriter:
            # TODO: this logic should be a hook
            parsed = rewriter.rewrite(parsed)
            _logger.debug(__('Rewritten URL {0}', parsed))

        yield parsed
示例7: _make_socket
def _make_socket(self):
    '''Create the TCP socket and wrap it in an IOStream.

    Generator-based coroutine: the address lookup is awaited with
    ``yield``. The resolved address is stored on
    ``self._resolved_address``, the socket on ``self._socket`` and the
    stream (``SSLIOStream`` when ``self._ssl`` is set, ``IOStream``
    otherwise) on ``self._io_stream``. The socket is bound first when
    ``self._params.bind_address`` is set.
    '''
    host, port = self._original_address

    family, self._resolved_address = yield self._resolver.resolve(
        host, port
    )

    self._socket = socket.socket(family, socket.SOCK_STREAM)

    _logger.debug(__('Socket to {0}/{1}.', family, self._resolved_address))

    local_address = self._params.bind_address

    if local_address:
        _logger.debug(__('Binding socket to {0}', local_address))
        self._socket.bind(local_address)

    # Options common to the plain and the SSL stream.
    stream_options = {
        'max_buffer_size': self._params.buffer_size,
        'rw_timeout': self._params.read_timeout,
    }

    if self._ssl:
        stream = SSLIOStream(
            self._socket,
            ssl_options=self._params.ssl_options or {},
            server_hostname=host,
            **stream_options
        )
    else:
        stream = IOStream(self._socket, **stream_options)

    self._io_stream = stream
    self._io_stream.set_close_callback(self._stream_closed_callback)
示例8: resolve_all
def resolve_all(self, host, port=0):
    '''Resolve hostname and return a list of results.

    Generator-based coroutine; the result is delivered by raising
    ``Return``. The cache is consulted first when enabled and is updated
    after a network lookup.

    Args:
        host (str): The hostname.
        port (int): The port number.

    Returns:
        list: A list of tuples where each tuple contains the family and
        the socket address. See :meth:`resolve` for the socket address
        format.

    Raises:
        DNSNotFound: If the lookup produced no results.
    '''
    _logger.debug(__('Lookup address {0} {1}.', host, port))

    host = self._lookup_hook(host, port)

    addresses = (
        self._get_cache(host, port, self._family) if self._cache else None
    )

    if addresses is None:
        addresses = yield From(self._resolve_from_network(host, port))

        if self._cache:
            self._put_cache(host, port, addresses)

    if not addresses:
        message = (
            "DNS resolution for {0} did not return any results."
            .format(repr(host))
        )
        raise DNSNotFound(message)

    _logger.debug(__('Resolved addresses: {0}.', addresses))

    raise Return(addresses)
示例9: process
def process(self, item_session: ItemSession, request, response, file_writer_session):
    '''Process PhantomJS.

    Only responses with status code 200 that ``HTMLReader`` supports are
    handled. The driver is run up to ``WPULL_PHANTOMJS_TRIES`` times
    (environment variable, default 5): a crash triggers another attempt,
    while a timeout or a successful run ends processing. A warning is
    logged if every attempt crashed.

    Coroutine.
    '''
    if response.status_code != 200:
        return

    if not HTMLReader.is_supported(request=request, response=response):
        return

    _logger.debug('Starting PhantomJS processing.')

    self._file_writer_session = file_writer_session

    # FIXME: this is a quick hack for crashes. See #137.
    max_tries = int(os.environ.get('WPULL_PHANTOMJS_TRIES', 5))

    for _attempt in range(max_tries):
        try:
            yield from self._run_driver(item_session, request, response)
        except asyncio.TimeoutError:
            _logger.warning(_('Waiting for page load timed out.'))
            return
        except PhantomJSCrashed as error:
            _logger.exception(__('PhantomJS crashed: {}', error))
            continue

        # The driver finished without crashing; nothing left to do.
        return

    # Reached only when every attempt ended in a crash.
    _logger.warning(__(
        _('PhantomJS failed to fetch ‘{url}’. I am sorry.'),
        url=request.url_info.url
    ))
示例10: control
def control(self, remote):
    '''Scroll the page.

    Generator-based coroutine: each ``yield`` waits on a remote call or a
    sleep. The page is scrolled down by one viewport height up to
    ``self._num_scrolls`` times, then any still-pending resources are
    waited for, and finally the page is scrolled back to the top.

    Args:
        remote: Object providing ``call``, ``eval`` and a
            ``resource_counter`` with ``values()`` and ``pending``.
    '''
    num_scrolls = self._num_scrolls

    if self._smart_scroll:
        # Smart scrolling skips scrolling entirely on pages the remote
        # reports as not dynamic.
        is_page_dynamic = yield remote.call('isPageDynamic')

        if not is_page_dynamic:
            num_scrolls = 0

    url = yield remote.eval('page.url')
    total_scroll_count = 0

    for scroll_count in range(num_scrolls):
        _logger.debug(__('Scrolling page. Count={0}.', scroll_count))

        # Snapshot of the counters before scrolling, compared below with
        # the values seen after the wait.
        pre_scroll_counter_values = remote.resource_counter.values()

        scroll_position = yield remote.eval('page.scrollPosition')
        # Move down by the second component of the viewport size
        # (presumably its height -- confirm against the caller).
        scroll_position['top'] += self._viewport_size[1]

        yield self.scroll_to(remote, 0, scroll_position['top'])

        total_scroll_count += 1

        self._log_action('wait', self._wait_time)
        yield wpull.async.sleep(self._wait_time)

        post_scroll_counter_values = remote.resource_counter.values()

        _logger.debug(__(
            'Counter values pre={0} post={1}',
            pre_scroll_counter_values,
            post_scroll_counter_values
        ))

        # With smart scrolling, unchanged counters end the scrolling.
        if post_scroll_counter_values == pre_scroll_counter_values \
                and self._smart_scroll:
            break

    # Wait for pending resources; the number of waits is bounded by the
    # pending count observed when the loop starts.
    for dummy in range(remote.resource_counter.pending):
        if remote.resource_counter.pending:
            self._log_action('wait', self._wait_time)
            yield wpull.async.sleep(self._wait_time)
        else:
            break

    # Return to the top of the page.
    yield self.scroll_to(remote, 0, 0)

    _logger.info(__(
        gettext.ngettext(
            'Scrolled page {num} time.',
            'Scrolled page {num} times.',
            total_scroll_count,
        ), num=total_scroll_count
    ))

    if self._warc_recorder:
        self._add_warc_action_log(url)
示例11: run
def run(self):
    '''Run one PhantomJS driver process for the current item's URL.

    Generator-based coroutine. Temporary paths for the scrape snapshot,
    the action log and the event log are allocated, a driver is built
    from ``self._params`` and started, and the process is awaited
    (bounded by ``self._params.load_time`` when that is set). Afterwards
    the action log and all snapshots are added to the WARC when a
    recorder is configured.

    Raises:
        PhantomJSCrashed: If the driver process exits with a non-zero
            return code.
    '''
    html_snapshot_path = self._get_temp_path('phantom', suffix='.html')
    action_log_path = self._get_temp_path('phantom-action', suffix='.txt')
    event_log_path = self._get_temp_path('phantom-event', suffix='.txt')

    # The scrape snapshot always comes first, followed by the
    # user-requested snapshot paths.
    snapshot_paths = [html_snapshot_path]
    snapshot_paths.extend(self._get_snapshot_paths())

    url = self._item_session.url_record.url
    params = self._params

    driver = self._phantomjs_driver_factory(
        params=PhantomJSDriverParams(
            url=url,
            snapshot_paths=snapshot_paths,
            wait_time=params.wait_time,
            num_scrolls=params.num_scrolls,
            smart_scroll=params.smart_scroll,
            snapshot=params.snapshot,
            viewport_size=params.viewport_size,
            paper_size=params.paper_size,
            event_log_filename=event_log_path,
            action_log_filename=action_log_path,
            custom_headers=params.custom_headers,
            page_settings=params.page_settings,
        )
    )

    _logger.info(__(_('PhantomJS fetching ‘{url}’.'), url=url))

    with contextlib.closing(driver):
        yield from driver.start()

        # FIXME: we don't account that things might be scrolling and
        # downloading so it might not be a good idea to timeout like
        # this
        if self._params.load_time:
            yield from asyncio.wait_for(
                driver.process.wait(), self._params.load_time
            )
        else:
            yield from driver.process.wait()

        exit_code = driver.process.returncode

        if exit_code != 0:
            raise PhantomJSCrashed(
                'PhantomJS exited with code {}'.format(exit_code)
            )

    if self._warc_recorder:
        self._add_warc_action_log(action_log_path, url)

        for snapshot_path in snapshot_paths:
            self._add_warc_snapshot(snapshot_path, url)

    _logger.info(__(_('PhantomJS fetched ‘{url}’.'), url=url))
示例12: write_record
def write_record(self, record):
    '''Append the record to the WARC file.

    The record is tagged with the ID of the warcinfo record and appended
    to ``self._warc_filename`` (through gzip when
    ``self._params.compress`` is set). While writing, a small journal
    file ``<warc>-wpullinc`` holds the size of the WARC file before the
    write. If writing fails, the WARC file is cut back to that size and
    the error is re-raised. When a CDX file is configured, the record's
    offset and size are passed to ``_write_cdx_field``.

    Args:
        record: The WARC record. It must have a ``fields`` mapping and
            yield its serialized bytes when iterated.

    Raises:
        OSError, IOError: If the record could not be written.
    '''
    # FIXME: probably not a good idea to modifiy arguments passed to us
    # TODO: add extra gzip headers that wget uses
    record.fields['WARC-Warcinfo-ID'] = self._warcinfo_record.fields[
        WARCRecord.WARC_RECORD_ID]

    _logger.debug(__('Writing WARC record {0}.',
                     record.fields['WARC-Type']))

    if self._params.compress:
        open_func = gzip.GzipFile
    else:
        open_func = open

    # Use getsize to get actual file size. Avoid tell() because it may
    # not be the raw file position.
    if os.path.exists(self._warc_filename):
        before_offset = os.path.getsize(self._warc_filename)
    else:
        before_offset = 0

    journal_filename = self._warc_filename + '-wpullinc'

    with open(journal_filename, 'w') as journal_file:
        journal_file.write('wpull-journal-version:1\n')
        journal_file.write('offset:{}\n'.format(before_offset))

    try:
        with open_func(self._warc_filename, mode='ab') as out_file:
            for data in record:
                out_file.write(data)
    except (OSError, IOError):
        _logger.info(__(
            _('Rolling back file {filename} to length {length}.'),
            filename=self._warc_filename, length=before_offset
        ))

        # Open for update WITHOUT truncating: mode 'wb' would first empty
        # the file and truncate() would then only pad it with zero bytes,
        # destroying every record written before this one.
        if os.path.exists(self._warc_filename):
            with open(self._warc_filename, mode='r+b') as out_file:
                out_file.truncate(before_offset)

        # Bare raise keeps the original traceback intact.
        raise
    finally:
        os.remove(journal_filename)

    after_offset = os.path.getsize(self._warc_filename)

    if self._cdx_filename:
        raw_file_offset = before_offset
        raw_file_record_size = after_offset - before_offset

        self._write_cdx_field(
            record, raw_file_record_size, raw_file_offset
        )
示例13: _load_ca_certs
def _load_ca_certs(cls, session: AppSession, clean: bool=True):
    '''Load the Certificate Authority certificates.

    Certificates are gathered from the bundled ``ca-bundle.pem`` (when
    ``args.use_internal_ca_certs`` is set), from every regular file in
    ``args.ca_directory`` and from ``args.ca_certificate``. They are
    written to a temporary ``.pem`` file whose name is stored on
    ``session.ca_certs_filename``. If that attribute is already set, it
    is returned unchanged and nothing is loaded.

    Args:
        session: The application session.
        clean: Whether to remove the temporary file at interpreter exit.

    Returns:
        str: The filename of the combined certificate file.
    '''
    args = session.args

    if session.ca_certs_filename:
        return session.ca_certs_filename

    certs = set()

    if args.use_internal_ca_certs:
        pem_filename = os.path.join(
            os.path.dirname(__file__), '..', '..', 'cert', 'ca-bundle.pem'
        )
        certs.update(cls._read_pem_file(pem_filename, from_package=True))

    if args.ca_directory:
        if os.path.isdir(args.ca_directory):
            for filename in os.listdir(args.ca_directory):
                # listdir() returns bare names; they must be joined with
                # the directory or they are looked up relative to the
                # current working directory.
                cert_path = os.path.join(args.ca_directory, filename)

                if os.path.isfile(cert_path):
                    certs.update(cls._read_pem_file(cert_path))
        else:
            _logger.warning(__(
                _('Certificate directory {path} does not exist.'),
                path=args.ca_directory
            ))

    if args.ca_certificate:
        if os.path.isfile(args.ca_certificate):
            certs.update(cls._read_pem_file(args.ca_certificate))
        else:
            _logger.warning(__(
                _('Certificate file {path} does not exist.'),
                path=args.ca_certificate
            ))

    # mkstemp() returns an already open descriptor; keep it so it can be
    # written through and closed instead of being leaked.
    certs_fd, certs_filename = tempfile.mkstemp(
        suffix='.pem', prefix='tmp-wpull-')
    session.ca_certs_filename = certs_filename

    def clean_certs_file():
        os.remove(certs_filename)

    if clean:
        atexit.register(clean_certs_file)

    with os.fdopen(certs_fd, 'w+b') as certs_file:
        for cert in certs:
            certs_file.write(cert)

    _logger.debug('CA certs loaded.')

    # Return the filename on the first call too, matching the cached path.
    return certs_filename
示例14: _process_url_item
def _process_url_item(self, url_item):
    '''Run the processor on one item, logging the start and the end.

    Generator-based coroutine: the processor call is awaited with
    ``yield``.

    Args:
        url_item (:class:`.item.URLItem`): The item to process.

    This function calls :meth:`.processor.BaseProcessor.process`.
    '''
    def session_message(template):
        # Attributes are read at call time so the closing message shows
        # their state after processing.
        return __(template, url_item.url_record, url_item.url_info)

    _logger.debug(session_message('Begin session for {0} {1}.'))

    yield self._processor.process(url_item)

    _logger.debug(session_message('End session for {0} {1}.'))
示例15: _read_content
def _read_content(self, response, original_url_info):
    '''Read response and parse the contents into the pool.

    At most 4096 bytes of the response body are read and handed to the
    robots.txt pool. If the pool rejects the data with ``ValueError``, a
    warning is logged and the URL is accepted as a blank robots.txt.

    Args:
        response: The response whose ``body`` is read.
        original_url_info: URL info of the robots.txt that was requested;
            its ``url`` attribute is used in log messages.
    '''
    data = response.body.read(4096)
    url_info = original_url_info

    try:
        self._robots_txt_pool.load_robots_txt(url_info, data)
    except ValueError:
        # The template uses the named field {url}, so the value has to be
        # passed by keyword; a positional argument raises KeyError when
        # the message is formatted.
        _logger.warning(__(
            _('Failed to parse {url} for robots exclusion rules. '
              'Ignoring.'), url=url_info.url))
        self._accept_as_blank(url_info)
    else:
        _logger.debug(__('Got a good robots.txt for {0}.',
                         url_info.url))