本文整理汇总了Python中warcio.archiveiterator.ArchiveIterator类的典型用法代码示例。如果您正苦于以下问题:Python archiveiterator.ArchiveIterator类的具体用法?Python archiveiterator.ArchiveIterator怎么用?Python archiveiterator.ArchiveIterator使用的例子?那么,这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该类所在模块warcio.archiveiterator的用法示例。
在下文中一共展示了archiveiterator.ArchiveIterator类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: test_capture_http_proxy
# 需要导入模块: from warcio import archiveiterator [as 别名]
# 或者: from warcio.archiveiterator import ArchiveIterator [as 别名]
def test_capture_http_proxy(self):
    """Capture one proxied GET and verify the two WARC records it produces.

    The response record is expected first, then the request record; both
    must carry the target URI and a WARC-Proxy-Host header built from
    ``self.port``. After those two records the iterator must be exhausted.
    """
    with capture_http() as warc_writer:
        res = requests.get("http://example.com/test", proxies=self.proxies, verify=False)

    records = ArchiveIterator(warc_writer.get_stream())

    response = next(records)
    response_headers = response.rec_headers
    assert response.rec_type == 'response'
    assert response_headers['WARC-Target-URI'] == "http://example.com/test"
    body_text = response.content_stream().read().decode('utf-8')
    assert body_text == 'Proxied: /http://example.com/test'
    assert response_headers['WARC-Proxy-Host'] == 'http://localhost:{0}'.format(self.port)

    request = next(records)
    request_headers = request.rec_headers
    assert request.rec_type == 'request'
    assert request_headers['WARC-Target-URI'] == "http://example.com/test"
    assert request_headers['WARC-Proxy-Host'] == 'http://localhost:{0}'.format(self.port)

    # no further records may follow the response/request pair
    with raises(StopIteration):
        assert next(records)
示例2: test_request_response_concur
# 需要导入模块: from warcio import archiveiterator [as 别名]
# 或者: from warcio.archiveiterator import ArchiveIterator [as 别名]
def test_request_response_concur(self, is_gzip, builder_factory):
    """Write a request/response pair and check the WARC-Concurrent-To link.

    After reading the pair back, the two record IDs must differ and the
    request's WARC-Concurrent-To header must equal the response's record ID.
    """
    writer = BufferWARCWriter(gzip=is_gzip)
    builder = builder_factory(writer, builder_cls=RecordBuilder)

    written_resp = sample_response(builder)
    written_req = sample_request(builder)

    # test explicitly calling ensure_digest with block digest enabled on a record
    writer.ensure_digest(written_resp, block=True, payload=True)
    writer.write_request_response_pair(written_req, written_resp)

    # exactly two records are expected back: response first, then request
    parsed = list(ArchiveIterator(writer.get_stream()))
    read_resp, read_req = parsed

    resp_id = read_resp.rec_headers.get_header('WARC-Record-ID')
    req_id = read_req.rec_headers.get_header('WARC-Record-ID')
    concurrent_to = read_req.rec_headers.get_header('WARC-Concurrent-To')

    assert resp_id != req_id
    assert resp_id == concurrent_to
示例3: test_response_warc_1_1
# 需要导入模块: from warcio import archiveiterator [as 别名]
# 或者: from warcio.archiveiterator import ArchiveIterator [as 别名]
def test_response_warc_1_1(self, is_gzip, builder_factory):
    """Write a single WARC/1.1 response and check protocol and date format."""
    writer = BufferWARCWriter(gzip=is_gzip, warc_version='WARC/1.1')
    builder = builder_factory(writer, warc_version='WARC/1.1')
    writer.write_record(sample_response(builder))

    recs = list(ArchiveIterator(writer.get_stream()))
    assert len(recs) == 1

    headers = recs[0].rec_headers
    assert headers.protocol == 'WARC/1.1'

    # ISO 8601 date with fractional seconds (microseconds)
    assert '.' in headers['WARC-Date']
    assert len(headers['WARC-Date']) == 27
示例4: test_identity
# 需要导入模块: from warcio import archiveiterator [as 别名]
# 或者: from warcio.archiveiterator import ArchiveIterator [as 别名]
def test_identity(self):
    """ read(write(record)) should yield record """
    payload = b'foobar'
    writer = BufferWARCWriter(gzip=True)

    record = writer.create_warc_record(
        'http://example.com/',
        'request',
        payload=BytesIO(payload),
        warc_headers_dict={'Foo': 'Bar'},
        http_headers=StatusAndHeaders('GET / HTTP/1.1', {}, is_http_request=True),
    )
    writer.write_record(record)

    # attributes that must survive the write/read round trip unchanged
    compared = ('rec_type', 'rec_headers', 'content_type', 'length', 'http_headers')

    for new_rec in ArchiveIterator(writer.get_stream()):
        for attr in compared:
            assert getattr(new_rec, attr) == getattr(record, attr)
        assert new_rec.raw_stream.read() == payload
示例5: test_get
# 需要导入模块: from warcio import archiveiterator [as 别名]
# 或者: from warcio.archiveiterator import ArchiveIterator [as 别名]
def test_get(self):
    """Capture a GET against the local server and verify both WARC records.

    The response record must come first and its body must decode to the
    same JSON the live response returned; the request record follows. Both
    must carry the target URL and the loopback WARC-IP-Address.
    """
    url = 'http://localhost:{0}/get?foo=bar'.format(self.port)

    with capture_http() as warc_writer:
        res = requests.get(url, headers={'Host': 'httpbin.org'})

    assert res.json()['args'] == {'foo': 'bar'}

    records = ArchiveIterator(warc_writer.get_stream())

    response = next(records)
    assert response.rec_type == 'response'
    assert response.rec_headers['WARC-Target-URI'] == url
    assert response.rec_headers['WARC-IP-Address'] == '127.0.0.1'
    assert res.json() == json.loads(response.content_stream().read().decode('utf-8'))

    request = next(records)
    assert request.rec_type == 'request'
    assert request.rec_headers['WARC-Target-URI'] == url
    assert request.rec_headers['WARC-IP-Address'] == '127.0.0.1'
示例6: test_get_cache_to_file
# 需要导入模块: from warcio import archiveiterator [as 别名]
# 或者: from warcio.archiveiterator import ArchiveIterator [as 别名]
def test_get_cache_to_file(self):
    """Capture a response body of ``BUFF_SIZE * 2`` bytes and read it back.

    The recorded response payload must equal the bytes the live request
    returned; the request record follows with the same URL and IP headers.
    """
    expected_size = BUFF_SIZE * 2
    warc_writer = BufferWARCWriter(gzip=False)
    url = 'http://localhost:{0}/bytes/{1}'.format(self.port, expected_size)

    with capture_http(warc_writer):
        res = requests.get(url, headers={'Host': 'httpbin.org'})

    assert len(res.content) == expected_size

    records = ArchiveIterator(warc_writer.get_stream())

    response = next(records)
    assert response.rec_type == 'response'
    assert response.rec_headers['WARC-Target-URI'] == url
    assert response.rec_headers['WARC-IP-Address'] == '127.0.0.1'
    assert res.content == response.content_stream().read()

    request = next(records)
    assert request.rec_type == 'request'
    assert request.rec_headers['WARC-Target-URI'] == url
    assert request.rec_headers['WARC-IP-Address'] == '127.0.0.1'
示例7: test_post_json
# 需要导入模块: from warcio import archiveiterator [as 别名]
# 或者: from warcio.archiveiterator import ArchiveIterator [as 别名]
def test_post_json(self):
    """Capture a JSON POST and verify the recorded response and request.

    The response record body must decode to the same JSON as the live
    response; the request record must carry the JSON content type and the
    serialized posted document as its body.
    """
    posted = {'some': {'data': 'posted'}}
    warc_writer = BufferWARCWriter(gzip=False)

    with capture_http(warc_writer):
        res = requests.post('http://localhost:{0}/post'.format(self.port),
                            headers={'Host': 'httpbin.org'},
                            json=posted)

    assert res.json()['json'] == {'some': {'data': 'posted'}}

    records = ArchiveIterator(warc_writer.get_stream())

    # response
    response = next(records)
    assert response.rec_type == 'response'
    recorded_body = response.content_stream().read().decode('utf-8')
    assert res.json() == json.loads(recorded_body)

    # request
    request = next(records)
    assert request.rec_type == 'request'
    assert request.http_headers['Content-Type'] == 'application/json'
    sent_body = request.content_stream().read().decode('utf-8')
    assert sent_body == '{"some": {"data": "posted"}}'
示例8: test_warc_1_1
# 需要导入模块: from warcio import archiveiterator [as 别名]
# 或者: from warcio.archiveiterator import ArchiveIterator [as 别名]
def test_warc_1_1(self):
    """Capture a GET into a WARC/1.1 file and check protocol and date format.

    The capture is written uncompressed to ``example3.warc`` under
    ``self.temp_dir``. The first record read back must report the
    ``WARC/1.1`` protocol and a 27-character WARC-Date that contains
    fractional seconds. The capture file is removed afterwards, including
    when an assertion fails.
    """
    full_path = os.path.join(self.temp_dir, 'example3.warc')
    url = 'http://localhost:{0}/get?foo=bar'.format(self.port)

    try:
        with capture_http(full_path, append=False, warc_version='1.1', gzip=False):
            requests.get(url)

        with open(full_path, 'rb') as stream:
            # response
            response = next(ArchiveIterator(stream))
            assert response.rec_headers.protocol == 'WARC/1.1'

            warc_date = response.rec_headers['WARC-Date']
            # ISO 8601 date with fractional seconds (microseconds)
            assert '.' in warc_date
            assert len(warc_date) == 27
    finally:
        # clean up even on failure; guard against the file never having
        # been created so a cleanup error cannot mask the real one
        if os.path.exists(full_path):
            os.remove(full_path)
示例9: test_remote
# 需要导入模块: from warcio import archiveiterator [as 别名]
# 或者: from warcio.archiveiterator import ArchiveIterator [as 别名]
def test_remote(self):
    """Capture two remote GETs and compare the recorded (URI, type, has-IP) triples.

    Six records are expected: a response/request pair for each of
    example.com, google.com and www.google.com, each with a
    WARC-IP-Address header present.
    """
    with capture_http(warc_version='1.1', gzip=True) as writer:
        requests.get('http://example.com/')
        requests.get('https://google.com/')

    expected = []
    for uri in ('http://example.com/', 'https://google.com/', 'https://www.google.com/'):
        for rec_type in ('response', 'request'):
            expected.append((uri, rec_type, True))

    actual = []
    for record in ArchiveIterator(writer.get_stream()):
        headers = record.rec_headers
        actual.append((headers['WARC-Target-URI'],
                       record.rec_type,
                       'WARC-IP-Address' in headers))

    assert actual == expected
示例10: makeReport
# 需要导入模块: from warcio import archiveiterator [as 别名]
# 或者: from warcio.archiveiterator import ArchiveIterator [as 别名]
def makeReport (fd):
    """
    Yield the known bugs that apply to the WARC read from fd and are unfixed.

    Only ``warcinfo`` records are inspected, and each payload is parsed as
    JSON. A record whose ``tool`` is ``crocoite-errata`` adds the errata it
    lists to the set of fixes already applied. Any other warcinfo record
    supplies package versions from ``data['software']['self']``; every entry
    of the externally defined ``bugs`` iterable that contains that version
    mapping and whose ``uuid`` is not yet marked as fixed is yielded.

    Warcinfo records whose payload is not valid JSON are skipped silently.
    """
    already_fixed = set()

    for record in ArchiveIterator(fd):
        if record.rec_type != 'warcinfo':
            continue

        # guard only the parse, so a JSONDecodeError raised anywhere else
        # is not swallowed by accident
        try:
            data = json.load(record.raw_stream)
        except json.decoder.JSONDecodeError:
            continue

        # errata records precede everything else and indicate which
        # ones were fixed already
        if data['tool'] == 'crocoite-errata':
            already_fixed.update(data['parameters']['errata'])
            continue

        have_versions = {
            pkg['projectName']: parse_version(pkg['version'])
            for pkg in data['software']['self']
        }
        yield from (
            bug for bug in bugs
            if have_versions in bug and bug.uuid not in already_fixed
        )
示例11: errataFix
# 需要导入模块: from warcio import archiveiterator [as 别名]
# 或者: from warcio.archiveiterator import ArchiveIterator [as 别名]
def errataFix (args):
    """
    Apply ``args.errata`` to every record of ``args.input``, writing to ``args.output``.

    A ``warcinfo`` record describing this run (software info, the tool name
    ``crocoite-errata`` and the applied erratum's uuid) is written first,
    followed by each input record after it has passed through
    ``errata.applyFix``. The output is gzip-compressed. Finally
    ``errata.stats`` is dumped as one JSON line on stdout.
    """
    errata = args.errata

    with args.input as infd, args.output as outfd:
        writer = WARCWriter(outfd, gzip=True)

        run_info = {
            'software': getSoftwareInfo(),
            'tool': 'crocoite-errata', # not the name of the cli tool
            'parameters': {'errata': [errata.uuid]},
        }
        encoded_info = json.dumps(run_info, indent=2).encode('utf-8')
        info_headers = {'Content-Type': makeContentType(jsonMime, 'utf-8')}
        info_record = writer.create_warc_record(
            '', 'warcinfo',
            payload=BytesIO(encoded_info),
            warc_headers_dict=info_headers)
        writer.write_record(info_record)

        for original in ArchiveIterator(infd):
            writer.write_record(errata.applyFix(original))

    # NOTE(review): indentation was lost in this listing; emitting the stats
    # after the input/output files are closed is a reconstruction - confirm.
    out = sys.stdout
    json.dump(errata.stats, out)
    out.write('\n')
    out.flush()
示例12: load_and_write
# 需要导入模块: from warcio import archiveiterator [as 别名]
# 或者: from warcio.archiveiterator import ArchiveIterator [as 别名]
def load_and_write(self, stream, output):
    """Copy every record from ``stream`` into a gzip-compressed WARC at ``output``.

    Records are parsed with ``arc2warc=True`` and ``verify_http=False``.

    Returns the number of records written.
    """
    written = 0
    with open(output, 'wb') as out:
        writer = WARCWriter(filebuf=out, gzip=True)
        records = ArchiveIterator(stream,
                                  no_record_parse=False,
                                  arc2warc=True,
                                  verify_http=False)
        # enumerate from 1 so the last index equals the record count
        for written, record in enumerate(records, start=1):
            writer.write_record(record)
    return written
示例13: _create_record_iter
# 需要导入模块: from warcio import archiveiterator [as 别名]
# 或者: from warcio.archiveiterator import ArchiveIterator [as 别名]
def _create_record_iter(self, input_):
    """Return an ArchiveIterator over ``input_`` with ``arc2warc`` enabled.

    Record parsing is disabled exactly when ``self.record_parse`` is falsy.
    """
    skip_parse = not self.record_parse
    return ArchiveIterator(input_, no_record_parse=skip_parse, arc2warc=True)
示例14: process_one
# 需要导入模块: from warcio import archiveiterator [as 别名]
# 或者: from warcio.archiveiterator import ArchiveIterator [as 别名]
def process_one(self, filename):
    """Check the digests of every record in ``filename`` and print findings.

    Each record's content stream is read in full, then the record's digest
    checker is consulted. A failed check sets ``self.exit_value`` to 1 and
    prints the reported problems. When ``self.verbose`` is set, passed and
    unchecked records are reported as well. The filename is printed once,
    before the first reported record.
    """
    header_shown = False
    with open(filename, 'rb') as stream:
        records = ArchiveIterator(stream, check_digests=True)
        for record in records:
            headers = record.rec_headers
            has_digest = (headers.get_header('WARC-Payload-Digest') or
                          headers.get_header('WARC-Block-Digest'))

            # the stream must be read in full before the result is inspected
            _read_entire_stream(record.content_stream())

            rec_id = headers.get_header('WARC-Record-ID')
            rec_type = headers.get_header('WARC-Type')
            rec_offset = records.get_record_offset()

            status = record.digest_checker.passed
            note = None
            problems = []

            if status is False:
                self.exit_value = 1
                problems = list(record.digest_checker.problems)
            elif self.verbose:
                if status is True:
                    note = 'digest pass'
                elif status is None:
                    if not has_digest:
                        note = 'no digest to check'
                    elif rec_type == 'revisit':
                        note = 'digest present but not checked (revisit)'
                    else:  # pragma: no cover
                        # should not happen
                        note = 'digest present but not checked'

            # nothing to report for this record
            if not (note or problems):
                continue

            if not header_shown:
                print(filename)
                header_shown = True
            print(' ', 'offset', rec_offset, 'WARC-Record-ID', rec_id, rec_type)
            if note:
                print(' ', note)
            for problem in problems:
                print(' ', problem)
示例15: extract
# 需要导入模块: from warcio import archiveiterator [as 别名]
# 或者: from warcio.archiveiterator import ArchiveIterator [as 别名]
def extract(self, payload_only, headers_only):
    """Write the record at ``self.offset`` in ``self.filename`` to stdout.

    With ``payload_only`` only the record's content stream is written.
    Otherwise the WARC headers and any HTTP headers are written, followed
    by the raw record stream unless ``headers_only`` is set. Streams are
    copied in chunks of ``self.READ_SIZE``.
    """
    with open(self.filename, 'rb') as fh:
        fh.seek(int(self.offset))
        record = next(iter(ArchiveIterator(fh)))

        # prefer the binary stdout layer; fall back when there is none
        try:
            sink = sys.stdout.buffer
        except AttributeError:  # pragma: no cover
            sink = sys.stdout

        if payload_only:
            source = record.content_stream()
        else:
            sink.write(record.rec_headers.to_bytes())
            if record.http_headers:
                sink.write(record.http_headers.to_bytes())
            if headers_only:
                return
            source = record.raw_stream

        # copy the chosen stream until a read comes back empty
        while True:
            chunk = source.read(self.READ_SIZE)
            if not chunk:
                break
            sink.write(chunk)