当前位置: 首页>>代码示例>>Python>>正文


Python archiveiterator.ArchiveIterator方法代码示例

本文整理汇总了Python中warcio.archiveiterator.ArchiveIterator方法的典型用法代码示例。如果您正苦于以下问题:Python archiveiterator.ArchiveIterator方法的具体用法?Python archiveiterator.ArchiveIterator怎么用?Python archiveiterator.ArchiveIterator使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在warcio.archiveiterator的用法示例。


在下文中一共展示了archiveiterator.ArchiveIterator方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: test_capture_http_proxy

# 需要导入模块: from warcio import archiveiterator [as 别名]
# 或者: from warcio.archiveiterator import ArchiveIterator [as 别名]
def test_capture_http_proxy(self):
        """Capture a proxied GET and verify the recorded response/request pair.

        The captured stream is expected to hold a 'response' record followed
        by a 'request' record, both carrying the target URI and a
        WARC-Proxy-Host header, and no further records.
        """
        target_uri = "http://example.com/test"
        proxy_host = 'http://localhost:{0}'.format(self.port)

        with capture_http() as warc_writer:
            res = requests.get(target_uri, proxies=self.proxies, verify=False)

        records = ArchiveIterator(warc_writer.get_stream())

        # first record: the response
        response = next(records)
        body = response.content_stream().read().decode('utf-8')
        assert response.rec_type == 'response'
        assert response.rec_headers['WARC-Target-URI'] == target_uri
        assert body == 'Proxied: /http://example.com/test'
        assert response.rec_headers['WARC-Proxy-Host'] == proxy_host

        # second record: the request
        request = next(records)
        assert request.rec_type == 'request'
        assert request.rec_headers['WARC-Target-URI'] == target_uri
        assert request.rec_headers['WARC-Proxy-Host'] == proxy_host

        # nothing may follow the two records checked above
        with raises(StopIteration):
            assert next(records)
开发者ID:webrecorder,项目名称:warcio,代码行数:20,代码来源:test_capture_http_proxy.py

示例2: test_request_response_concur

# 需要导入模块: from warcio import archiveiterator [as 别名]
# 或者: from warcio.archiveiterator import ArchiveIterator [as 别名]
def test_request_response_concur(self, is_gzip, builder_factory):
        """Write a request/response pair and check that the two are linked.

        After reading the pair back, the records must have distinct
        WARC-Record-ID values, and the request's WARC-Concurrent-To header
        must equal the response's WARC-Record-ID.
        """
        writer = BufferWARCWriter(gzip=is_gzip)
        builder = builder_factory(writer, builder_cls=RecordBuilder)

        resp = sample_response(builder)
        req = sample_request(builder)

        # explicitly call ensure_digest with block digest enabled on a record
        writer.ensure_digest(resp, block=True, payload=True)
        writer.write_request_response_pair(req, resp)

        # the two records read back are unpacked as response first, request second
        read_resp, read_req = list(ArchiveIterator(writer.get_stream()))

        resp_id = read_resp.rec_headers.get_header('WARC-Record-ID')
        req_id = read_req.rec_headers.get_header('WARC-Record-ID')
        concurrent_to = read_req.rec_headers.get_header('WARC-Concurrent-To')

        assert resp_id != req_id
        assert resp_id == concurrent_to
开发者ID:webrecorder,项目名称:warcio,代码行数:25,代码来源:test_writer.py

示例3: test_response_warc_1_1

# 需要导入模块: from warcio import archiveiterator [as 别名]
# 或者: from warcio.archiveiterator import ArchiveIterator [as 别名]
def test_response_warc_1_1(self, is_gzip, builder_factory):
        """Write one WARC/1.1 response record and check its protocol and date.

        The record read back must report protocol 'WARC/1.1' and a WARC-Date
        that contains a '.' and is 27 characters long.
        """
        version = 'WARC/1.1'
        writer = BufferWARCWriter(gzip=is_gzip, warc_version=version)
        builder = builder_factory(writer, warc_version=version)

        writer.write_record(sample_response(builder))

        recs = list(ArchiveIterator(writer.get_stream()))
        assert len(recs) == 1

        headers = recs[0].rec_headers
        assert headers.protocol == version

        # ISO 8601 date with fractional seconds (microseconds)
        warc_date = headers['WARC-Date']
        assert '.' in warc_date
        assert len(warc_date) == 27
开发者ID:webrecorder,项目名称:warcio,代码行数:21,代码来源:test_writer.py

示例4: test_identity

# 需要导入模块: from warcio import archiveiterator [as 别名]
# 或者: from warcio.archiveiterator import ArchiveIterator [as 别名]
def test_identity(self):
        """ read(write(record)) should yield record

        Writes a single gzip-compressed request record, reads the stream back
        and compares type, WARC headers, content type, length, HTTP headers
        and raw payload against the record that was written. Exactly one
        record must come back.
        """
        payload = b'foobar'
        writer = BufferWARCWriter(gzip=True)
        httpHeaders = StatusAndHeaders('GET / HTTP/1.1', {}, is_http_request=True)
        warcHeaders = {'Foo': 'Bar'}
        record = writer.create_warc_record('http://example.com/', 'request',
                payload=BytesIO(payload),
                warc_headers_dict=warcHeaders, http_headers=httpHeaders)

        writer.write_record(record)

        records_read = 0
        for new_rec in ArchiveIterator(writer.get_stream()):
            records_read += 1
            assert new_rec.rec_type == record.rec_type
            assert new_rec.rec_headers == record.rec_headers
            assert new_rec.content_type == record.content_type
            assert new_rec.length == record.length
            assert new_rec.http_headers == record.http_headers
            # the payload is read inside the loop, while this record is current
            assert new_rec.raw_stream.read() == payload

        # the comparisons above only run per yielded record; without this
        # check an empty iterator would let the test pass vacuously
        assert records_read == 1
开发者ID:webrecorder,项目名称:warcio,代码行数:21,代码来源:test_writer.py

示例5: test_get

# 需要导入模块: from warcio import archiveiterator [as 别名]
# 或者: from warcio.archiveiterator import ArchiveIterator [as 别名]
def test_get(self):
        """Capture a GET against the local server and verify both records.

        The recorded response body must decode to the same JSON the live
        response returned, and both records must carry the request URL and
        the WARC-IP-Address '127.0.0.1'.
        """
        url = 'http://localhost:{0}/get?foo=bar'.format(self.port)
        with capture_http() as warc_writer:
            res = requests.get(url, headers={'Host': 'httpbin.org'})

        live_json = res.json()
        assert live_json['args'] == {'foo': 'bar'}

        records = ArchiveIterator(warc_writer.get_stream())

        # first record: the response
        response = next(records)
        response_headers = response.rec_headers
        assert response.rec_type == 'response'
        assert response_headers['WARC-Target-URI'] == url
        assert response_headers['WARC-IP-Address'] == '127.0.0.1'
        captured_body = response.content_stream().read().decode('utf-8')
        assert live_json == json.loads(captured_body)

        # second record: the request
        request = next(records)
        request_headers = request.rec_headers
        assert request.rec_type == 'request'
        assert request_headers['WARC-Target-URI'] == url
        assert request_headers['WARC-IP-Address'] == '127.0.0.1'
开发者ID:webrecorder,项目名称:warcio,代码行数:20,代码来源:test_capture_http.py

示例6: test_get_cache_to_file

# 需要导入模块: from warcio import archiveiterator [as 别名]
# 或者: from warcio.archiveiterator import ArchiveIterator [as 别名]
def test_get_cache_to_file(self):
        """Capture a response body of BUFF_SIZE * 2 bytes and verify it.

        The body stored in the response record must be byte-identical to the
        content the live response returned.
        """
        expected_size = BUFF_SIZE * 2
        url = 'http://localhost:{0}/bytes/{1}'.format(self.port, expected_size)

        warc_writer = BufferWARCWriter(gzip=False)
        with capture_http(warc_writer):
            res = requests.get(url, headers={'Host': 'httpbin.org'})

        assert len(res.content) == expected_size

        records = ArchiveIterator(warc_writer.get_stream())

        # first record: the response
        response = next(records)
        response_headers = response.rec_headers
        assert response.rec_type == 'response'
        assert response_headers['WARC-Target-URI'] == url
        assert response_headers['WARC-IP-Address'] == '127.0.0.1'
        assert res.content == response.content_stream().read()

        # second record: the request
        request = next(records)
        request_headers = request.rec_headers
        assert request.rec_type == 'request'
        assert request_headers['WARC-Target-URI'] == url
        assert request_headers['WARC-IP-Address'] == '127.0.0.1'
开发者ID:webrecorder,项目名称:warcio,代码行数:22,代码来源:test_capture_http.py

示例7: test_post_json

# 需要导入模块: from warcio import archiveiterator [as 别名]
# 或者: from warcio.archiveiterator import ArchiveIterator [as 别名]
def test_post_json(self):
        """Capture a JSON POST and verify the recorded response and request.

        The response record must decode to the JSON the live response
        returned. The request record must carry an 'application/json'
        Content-Type and the serialized JSON body.
        """
        posted = {'some': {'data': 'posted'}}
        warc_writer = BufferWARCWriter(gzip=False)

        with capture_http(warc_writer):
            res = requests.post('http://localhost:{0}/post'.format(self.port),
                                headers={'Host': 'httpbin.org'},
                                json=posted)

        live_json = res.json()
        assert live_json['json'] == {'some': {'data': 'posted'}}

        records = ArchiveIterator(warc_writer.get_stream())

        # response
        response = next(records)
        assert response.rec_type == 'response'
        captured_body = response.content_stream().read().decode('utf-8')
        assert live_json == json.loads(captured_body)

        # request
        request = next(records)
        assert request.rec_type == 'request'
        assert request.http_headers['Content-Type'] == 'application/json'

        sent_body = request.content_stream().read().decode('utf-8')
        assert sent_body == '{"some": {"data": "posted"}}'
开发者ID:webrecorder,项目名称:warcio,代码行数:26,代码来源:test_capture_http.py

示例8: test_warc_1_1

# 需要导入模块: from warcio import archiveiterator [as 别名]
# 或者: from warcio.archiveiterator import ArchiveIterator [as 别名]
def test_warc_1_1(self):
        """Capture to a file as WARC 1.1 and check the first record's headers.

        The first record read back from the file must report protocol
        'WARC/1.1' and a WARC-Date that contains a '.' and is 27 characters
        long. The capture file is removed afterwards, whether or not the
        assertions pass.
        """
        full_path = os.path.join(self.temp_dir, 'example3.warc')

        url = 'http://localhost:{0}/get?foo=bar'.format(self.port)

        try:
            with capture_http(full_path, append=False, warc_version='1.1', gzip=False):
                requests.get(url)

            with open(full_path, 'rb') as stream:
                # response
                ai = ArchiveIterator(stream)
                response = next(ai)
                assert response.rec_headers.protocol == 'WARC/1.1'
                warc_date = response.rec_headers['WARC-Date']

                # ISO 8601 date with fractional seconds (microseconds)
                assert '.' in warc_date
                assert len(warc_date) == 27
        finally:
            # clean up even when an assertion above fails; the existence check
            # keeps a missing file from masking the original error
            if os.path.exists(full_path):
                os.remove(full_path)
开发者ID:webrecorder,项目名称:warcio,代码行数:22,代码来源:test_capture_http.py

示例9: test_remote

# 需要导入模块: from warcio import archiveiterator [as 别名]
# 或者: from warcio.archiveiterator import ArchiveIterator [as 别名]
def test_remote(self):
        """Capture two remote GETs and check every record's URI, type and IP header.

        Each expected URI must appear as a 'response' record followed by a
        'request' record, and every record must carry a WARC-IP-Address
        header.
        """
        with capture_http(warc_version='1.1', gzip=True) as writer:
            requests.get('http://example.com/')
            requests.get('https://google.com/')

        # NOTE(review): 'https://www.google.com/' is never requested directly;
        # presumably it is reached via a redirect from google.com - confirm
        captured_uris = ('http://example.com/',
                         'https://google.com/',
                         'https://www.google.com/')
        expected = [(uri, rec_type, True)
                    for uri in captured_uris
                    for rec_type in ('response', 'request')]

        actual = []
        for record in ArchiveIterator(writer.get_stream()):
            headers = record.rec_headers
            actual.append((headers['WARC-Target-URI'],
                           record.rec_type,
                           'WARC-IP-Address' in headers))

        assert actual == expected
开发者ID:webrecorder,项目名称:warcio,代码行数:24,代码来源:test_capture_http.py

示例10: makeReport

# 需要导入模块: from warcio import archiveiterator [as 别名]
# 或者: from warcio.archiveiterator import ArchiveIterator [as 别名]
def makeReport (fd):
    """
    Yield the entries of the module-level ``bugs`` that apply to a WARC.

    Only warcinfo records whose payload parses as JSON are considered. A
    record whose 'tool' is 'crocoite-errata' adds its listed errata to the
    set of fixes already applied. Any other record contributes the package
    versions from data['software']['self'], and every bug ``b`` for which
    ``haveVersions in b`` holds and whose uuid has not been fixed yet is
    yielded.

    :param fd: Stream passed to ArchiveIterator
    """
    alreadyFixed = set ()

    for record in ArchiveIterator (fd):
        if record.rec_type != 'warcinfo':
            continue

        # keep the try body minimal, so only a parse failure is ignored and
        # a JSONDecodeError from anywhere else is not swallowed by accident
        try:
            data = json.load (record.raw_stream)
        except json.decoder.JSONDecodeError:
            # payload is not JSON, nothing to evaluate for this record
            continue

        # errata records precede everything else and indicate which
        # ones were fixed already
        if data['tool'] == 'crocoite-errata':
            alreadyFixed.update (data['parameters']['errata'])
        else:
            haveVersions = {pkg['projectName']: parse_version (pkg['version'])
                    for pkg in data['software']['self']}
            yield from (b for b in bugs
                    if haveVersions in b and b.uuid not in alreadyFixed)
开发者ID:PromyLOPh,项目名称:crocoite,代码行数:18,代码来源:tools.py

示例11: errataFix

# 需要导入模块: from warcio import archiveiterator [as 别名]
# 或者: from warcio.archiveiterator import ArchiveIterator [as 别名]
def errataFix (args):
    """
    Apply one erratum to every record of a WARC and write the result.

    A warcinfo record describing this run (software info, tool name and the
    erratum's uuid) is written first, followed by each input record after it
    has passed through errata.applyFix. Finally errata.stats is dumped as
    JSON, followed by a newline, to stdout.

    :param args: Object providing ``errata``, ``input`` and ``output``; the
        latter two are used as context managers
    """
    errata = args.errata

    with args.input as infd, args.output as outfd:
        writer = WARCWriter (outfd, gzip=True)

        info = {
                'software': getSoftwareInfo (),
                'tool': 'crocoite-errata', # not the name of the cli tool
                'parameters': {'errata': [errata.uuid]},
                }
        encoded = json.dumps (info, indent=2).encode ('utf-8')
        headers = {'Content-Type': makeContentType (jsonMime, 'utf-8')}
        infoRecord = writer.create_warc_record ('', 'warcinfo',
                payload=BytesIO (encoded),
                warc_headers_dict=headers)
        writer.write_record (infoRecord)

        for original in ArchiveIterator (infd):
            writer.write_record (errata.applyFix (original))

    out = sys.stdout
    json.dump (errata.stats, out)
    out.write ('\n')
    out.flush ()
开发者ID:PromyLOPh,项目名称:crocoite,代码行数:25,代码来源:tools.py

示例12: load_and_write

# 需要导入模块: from warcio import archiveiterator [as 别名]
# 或者: from warcio.archiveiterator import ArchiveIterator [as 别名]
def load_and_write(self, stream, output):
        """Copy every record from ``stream`` into a gzip-compressed WARC file.

        :param stream: input passed to ArchiveIterator (with arc2warc=True,
            record parsing enabled and HTTP verification disabled)
        :param output: path of the file to write, opened in binary mode
        :return: number of records written
        """
        with open(output, 'wb') as out:
            writer = WARCWriter(filebuf=out, gzip=True)
            records = ArchiveIterator(stream,
                                      no_record_parse=False,
                                      arc2warc=True,
                                      verify_http=False)

            # stays 0 when the input yields no records at all
            written = 0
            for written, record in enumerate(records, start=1):
                writer.write_record(record)

            return written
开发者ID:webrecorder,项目名称:warcio,代码行数:16,代码来源:recompressor.py

示例13: _create_record_iter

# 需要导入模块: from warcio import archiveiterator [as 别名]
# 或者: from warcio.archiveiterator import ArchiveIterator [as 别名]
def _create_record_iter(self, input_):
        """Build the ArchiveIterator used to read records from ``input_``.

        Record parsing is skipped when ``self.record_parse`` is falsy;
        ``arc2warc`` is always enabled.
        """
        skip_parse = not self.record_parse
        return ArchiveIterator(input_, no_record_parse=skip_parse, arc2warc=True)
开发者ID:webrecorder,项目名称:warcio,代码行数:6,代码来源:indexer.py

示例14: process_one

# 需要导入模块: from warcio import archiveiterator [as 别名]
# 或者: from warcio.archiveiterator import ArchiveIterator [as 别名]
def process_one(self, filename):
        """Check the digests of every record in ``filename`` and print findings.

        Sets ``self.exit_value`` to 1 when a record's digest check failed and
        prints the reported problems. With ``self.verbose`` set, passing and
        unchecked records are reported as well. The filename is printed once,
        before the first record that produces output.
        """
        filename_shown = False
        with open(filename, 'rb') as stream:
            records = ArchiveIterator(stream, check_digests=True)
            for record in records:
                headers = record.rec_headers
                has_digest = (headers.get_header('WARC-Payload-Digest') or
                              headers.get_header('WARC-Block-Digest'))

                # the content is consumed before digest_checker is inspected
                _read_entire_stream(record.content_stream())

                rec_id = headers.get_header('WARC-Record-ID')
                rec_type = headers.get_header('WARC-Type')
                rec_offset = records.get_record_offset()

                status = None
                problems = []
                passed = record.digest_checker.passed

                if passed is False:
                    self.exit_value = 1
                    problems = list(record.digest_checker.problems)
                elif passed is True and self.verbose:
                    status = 'digest pass'
                elif passed is None and self.verbose:
                    if not has_digest:
                        status = 'no digest to check'
                    elif rec_type == 'revisit':
                        status = 'digest present but not checked (revisit)'
                    else:  # pragma: no cover
                        # should not happen
                        status = 'digest present but not checked'

                if not (status or problems):
                    continue

                if not filename_shown:
                    print(filename)
                    filename_shown = True
                print(' ', 'offset', rec_offset, 'WARC-Record-ID', rec_id, rec_type)
                if status:
                    print('   ', status)
                for problem in problems:
                    print('   ', problem)
开发者ID:webrecorder,项目名称:warcio,代码行数:42,代码来源:checker.py

示例15: extract

# 需要导入模块: from warcio import archiveiterator [as 别名]
# 或者: from warcio.archiveiterator import ArchiveIterator [as 别名]
def extract(self, payload_only, headers_only):
        """Write the record at ``self.offset`` in ``self.filename`` to stdout.

        :param payload_only: when truthy, write only the record's content
            stream
        :param headers_only: when truthy (and ``payload_only`` is falsy),
            write the WARC headers and any HTTP headers but not the raw body
        """
        with open(self.filename, 'rb') as fh:
            fh.seek(int(self.offset))
            record = next(iter(ArchiveIterator(fh)))

            # use the binary buffer when stdout has one, else stdout itself
            out = getattr(sys.stdout, 'buffer', sys.stdout)

            def copy_all(source):
                # copy in READ_SIZE chunks until a read returns nothing
                while True:
                    chunk = source.read(self.READ_SIZE)
                    if not chunk:
                        break
                    out.write(chunk)

            if payload_only:
                copy_all(record.content_stream())
                return

            out.write(record.rec_headers.to_bytes())
            if record.http_headers:
                out.write(record.http_headers.to_bytes())
            if not headers_only:
                copy_all(record.raw_stream)
开发者ID:webrecorder,项目名称:warcio,代码行数:28,代码来源:extractor.py


注:本文中的warcio.archiveiterator.ArchiveIterator方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。