

Python Queue.get Method Code Examples

This article collects typical code examples of the Python asyncio.Queue.get method. If you are wondering what Queue.get does, how to call it, or what real-world usage looks like, the curated examples below should help. You can also explore further usage examples of the containing class, asyncio.Queue.


The following sections present 14 code examples of the Queue.get method, drawn from open-source projects and sorted by popularity by default.
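Before diving in, here is a minimal, self-contained sketch (not taken from the projects below) showing the basic producer/consumer contract of Queue.get using modern async/await syntax; the historical examples that follow mostly use the older yield from style:

import asyncio

async def producer(queue):
    for i in range(3):
        await queue.put(i)      # put is a coroutine; it suspends if the queue is full
    await queue.put(None)       # sentinel meaning "no more items"

async def consumer(queue):
    while True:
        item = await queue.get()    # get suspends until an item is available
        if item is None:
            break
        print('got', item)

async def main():
    queue = asyncio.Queue()
    await asyncio.gather(producer(queue), consumer(queue))

asyncio.run(main())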

Example 1: MessageHandler

# Required import: from asyncio import Queue [as alias]
# Alternatively: from asyncio.Queue import get [as alias]
class MessageHandler(ws.WS):
    def __init__(self):
        self.queue = Queue()

    def get(self):
        return self.queue.get()

    def on_message(self, websocket, message):
        return self.queue.put(message)
Developer: huobao36, Project: pulsar, Lines: 11, Source: app.py
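The class above bridges a callback API (on_message) into an awaitable one: incoming frames are enqueued, and get() hands back the next one. A hypothetical caller (the function name is illustrative, not from pulsar) would simply await it:

# Hypothetical usage sketch for the handler above.
async def wait_for_reply(handler):
    # on_message() enqueues incoming frames; get() awaits the next one.
    message = await handler.get()
    return message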

Example 2: __init__

# Required import: from asyncio import Queue [as alias]
# Alternatively: from asyncio.Queue import get [as alias]
class Listener:

    def __init__(self):
        self._messages = Queue()

    def __call__(self, channel, message):
        self._messages.put_nowait((channel, message))

    def get(self):
        return self._messages.get()
Developer: LoganTK, Project: pulsar, Lines: 12, Source: pulsards.py

Example 3: Message

# Required import: from asyncio import Queue [as alias]
# Alternatively: from asyncio.Queue import get [as alias]
class Message(ws.WS):

    def __init__(self, loop):
        self.queue = Queue(loop=loop)

    def get(self):
        return self.queue.get()

    def on_message(self, websocket, message):
        self.queue.put_nowait(message)
Developer: LoganTK, Project: pulsar, Lines: 12, Source: tests.py

Example 4: input

# Required import: from asyncio import Queue [as alias]
# Alternatively: from asyncio.Queue import get [as alias]
def input(self, fd, dst):
    q = Queue()

    def cb():
        q.put_nowait(os.read(fd, 32))

    self.loop.add_reader(fd, cb)
    try:
        while True:
            data = yield from q.get()
            if not data:
                break
            yield from send(dst, BYTES, data)
    finally:
        self.loop.remove_reader(fd)
Developer: vmagamedov, Project: pi, Lines: 17, Source: run.py
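For comparison, the same file-descriptor-to-queue bridge can be written with modern async/await. This is a sketch assuming a readable file descriptor fd, not code from the pi project:

import asyncio
import os

async def read_fd(fd):
    loop = asyncio.get_running_loop()
    queue = asyncio.Queue()
    # The reader callback runs in the event loop whenever fd is readable
    # and hands each chunk to the async side without blocking.
    loop.add_reader(fd, lambda: queue.put_nowait(os.read(fd, 32)))
    try:
        while True:
            data = await queue.get()
            if not data:  # os.read() returns b'' at EOF
                break
            print(data)
    finally:
        loop.remove_reader(fd)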

Example 5: Echo

# Required import: from asyncio import Queue [as alias]
# Alternatively: from asyncio.Queue import get [as alias]
class Echo(WS):

    def __init__(self, loop=None):
        self.queue = Queue(loop=loop)

    def get(self):
        return self.queue.get()

    def on_message(self, ws, message):
        self.queue.put_nowait(message)

    def on_ping(self, ws, body):
        ws.pong(body)
        self.queue.put_nowait('PING: %s' % body.decode('utf-8'))

    def on_pong(self, ws, body):
        self.queue.put_nowait('PONG: %s' % body.decode('utf-8'))

    def on_close(self, ws):
        self.queue.put_nowait('CLOSE')
Developer: LoganTK, Project: pulsar, Lines: 22, Source: tests.py

Example 6: _EventManager

# Required import: from asyncio import Queue [as alias]
# Alternatively: from asyncio.Queue import get [as alias]
class _EventManager(object):

    def __init__(self):
        providers = {}
        self.__registration = {}
        self.__module_functions = {}
        self.__events = Queue()

    @coroutine
    def handleEvents(self):
        while True:
            event, args, future = yield from self.__events.get()
            logger.debug("Handling event {}".format(event))
            for fn, expects in self.__registration[event[0]]:
                fire = True
                if len(event) - 1 != len(expects):
                    continue
                for i in range(len(event)-1):
                    ev = event[i+1].lower()
                    ex = expects[i]
                    if isinstance(ex, list):
                        if not any(ev == val.lower() for val in ex):
                            logger.error("Won't fire")
                            fire = False
                            break
                    else:
                        if ev.lower() != ex.lower():
                            fire = False
                            break
                if fire:
                    logger.debug("Firing event function: {} with {}".format(fn.__name__, args))
                    ret = fn(event=event, **args)
                    future.set_result(ret)

    @coroutine
    def handle_event(self, event, args):
        logger.debug('Handling event {}'.format(event))
        to_call = []
        results = []
        for fn, expects in self.__registration[event[0]]:
            fire = True
            if len(event) - 1 != len(expects):
                continue
            for i in range(len(event)-1):
                ev = event[i+1].lower()
                ex = expects[i]
                if isinstance(ex, list):
                    if not any(ev == val.lower() for val in ex):
                        logger.error("Won't fire")
                        fire = False
                        break
                else:
                    if ev.lower() != ex.lower():
                        fire = False
                        break
            if fire:
                to_call.append(fn(event=event, **args))

        if len(to_call) > 0:
            results = yield from gather(*to_call)

        return results

    def register_class(self, cls):
        methods = inspect.getmembers(cls, predicate=inspect.ismethod)
        for _, f in methods:
            fn = f
            event = getattr(fn, '__event__', None)
            if event is not None:
                logger.debug('Registering {} for {}'.format(fn.__name__, event))
                self.register_function(event, fn)

    def register_function(self, event, func):
        primary = event[0]
        expects = []
        if len(event) > 1:
            expects = event[1:]
        if primary not in self.__registration:
            self.__registration[primary] = []
        self.__registration[primary].append([func, expects])

        mod = sys.modules[func.__module__]
        if mod not in self.__module_functions:
            self.__module_functions[mod] = []

        self.__module_functions[mod].append(func)

    @coroutine
    def fire_event(self, *event, **kwargs):
        results = yield from self.handle_event(event, kwargs)

        return results

    def unregisterModuleFunctions(self, mod):
        if mod not in self.__module_functions:
            return True

        for r in self.__registration:
            # Registration entries are [func, expects] pairs; keep the pair
            # structure intact while filtering out this module's functions.
            self.__registration[r][:] = [entry for entry in self.__registration[r]
                                         if entry[0] not in self.__module_functions[mod]]

#......... remainder of the code omitted .........
Developer: Thezomg, Project: OctoBot, Lines: 103, Source: events.py
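Note that handleEvents consumes (event, args, future) triples from the queue. The producing side, which the excerpt omits, would look roughly like this sketch (the dispatch function and its signature are assumptions, not OctoBot code):

from asyncio import Future

def dispatch(events_queue, event, **args):
    # Hypothetical producer for the handleEvents() loop above: enqueue the
    # (event, args, future) triple and hand the future back to the caller,
    # which can await the handler's return value.
    future = Future()
    events_queue.put_nowait((event, args, future))
    return future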

Example 7: BrokerProtocolHandler

# Required import: from asyncio import Queue [as alias]
# Alternatively: from asyncio.Queue import get [as alias]
class BrokerProtocolHandler(ProtocolHandler):
    def __init__(self, plugins_manager: PluginManager, session: Session=None, loop=None):
        super().__init__(plugins_manager, session, loop)
        self._disconnect_waiter = None
        self._pending_subscriptions = Queue(loop=self._loop)
        self._pending_unsubscriptions = Queue(loop=self._loop)

    @asyncio.coroutine
    def start(self):
        yield from super().start()
        if self._disconnect_waiter is None:
            self._disconnect_waiter = futures.Future(loop=self._loop)

    @asyncio.coroutine
    def stop(self):
        yield from super().stop()
        if self._disconnect_waiter is not None and not self._disconnect_waiter.done():
            self._disconnect_waiter.set_result(None)

    @asyncio.coroutine
    def wait_disconnect(self):
        return (yield from self._disconnect_waiter)

    def handle_write_timeout(self):
        pass

    def handle_read_timeout(self):
        if self._disconnect_waiter is not None and not self._disconnect_waiter.done():
            self._disconnect_waiter.set_result(None)

    @asyncio.coroutine
    def handle_disconnect(self, disconnect):
        self.logger.debug("Client disconnecting")
        if self._disconnect_waiter and not self._disconnect_waiter.done():
            self.logger.debug("Setting waiter result to %r" % disconnect)
            self._disconnect_waiter.set_result(disconnect)

    @asyncio.coroutine
    def handle_connection_closed(self):
        yield from self.handle_disconnect(None)

    @asyncio.coroutine
    def handle_connect(self, connect: ConnectPacket):
        # The broker handler shouldn't receive CONNECT messages during message
        # handling, as CONNECT messages are managed by the broker on client connection
        self.logger.error('%s [MQTT-3.1.0-2] %s : CONNECT message received during messages handling' %
                          (self.session.client_id, format_client_message(self.session)))
        if self._disconnect_waiter is not None and not self._disconnect_waiter.done():
            self._disconnect_waiter.set_result(None)

    @asyncio.coroutine
    def handle_pingreq(self, pingreq: PingReqPacket):
        yield from self._send_packet(PingRespPacket.build())

    @asyncio.coroutine
    def handle_subscribe(self, subscribe: SubscribePacket):
        subscription = {'packet_id': subscribe.variable_header.packet_id, 'topics': subscribe.payload.topics}
        yield from self._pending_subscriptions.put(subscription)

    @asyncio.coroutine
    def handle_unsubscribe(self, unsubscribe: UnsubscribePacket):
        unsubscription = {'packet_id': unsubscribe.variable_header.packet_id, 'topics': unsubscribe.payload.topics}
        yield from self._pending_unsubscriptions.put(unsubscription)

    @asyncio.coroutine
    def get_next_pending_subscription(self):
        subscription = yield from self._pending_subscriptions.get()
        return subscription

    @asyncio.coroutine
    def get_next_pending_unsubscription(self):
        unsubscription = yield from self._pending_unsubscriptions.get()
        return unsubscription

    @asyncio.coroutine
    def mqtt_acknowledge_subscription(self, packet_id, return_codes):
        suback = SubackPacket.build(packet_id, return_codes)
        yield from self._send_packet(suback)

    @asyncio.coroutine
    def mqtt_acknowledge_unsubscription(self, packet_id):
        unsuback = UnsubackPacket.build(packet_id)
        yield from self._send_packet(unsuback)

    @asyncio.coroutine
    def mqtt_connack_authorize(self, authorize: bool):
        if authorize:
            connack = ConnackPacket.build(self.session.parent, CONNECTION_ACCEPTED)
        else:
            connack = ConnackPacket.build(self.session.parent, NOT_AUTHORIZED)
        yield from self._send_packet(connack)

    @classmethod
    @asyncio.coroutine
    def init_from_connect(cls, reader: ReaderAdapter, writer: WriterAdapter, plugins_manager, loop=None):
        """

        :param reader:
        :param writer:
        :param plugins_manager:
#......... remainder of the code omitted .........
Developer: FlorianLudwig, Project: hbmqtt, Lines: 103, Source: broker_handler.py
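The two get_next_pending_* coroutines simply await the corresponding queue. A broker-side consumer loop might look like the following sketch (process_subscriptions and the QoS-0 return codes are illustrative assumptions, not hbmqtt code):

@asyncio.coroutine
def process_subscriptions(handler):
    # Hypothetical consumer loop: each SUBSCRIBE queued by handle_subscribe()
    # is retrieved here and acknowledged with a SUBACK granting QoS 0.
    while True:
        subscription = yield from handler.get_next_pending_subscription()
        return_codes = [0x00 for _ in subscription['topics']]
        yield from handler.mqtt_acknowledge_subscription(
            subscription['packet_id'], return_codes)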

Example 8: __init__

# Required import: from asyncio import Queue [as alias]
# Alternatively: from asyncio.Queue import get [as alias]
class Crawler:
    def __init__(self, root_url, max_redirect):
        self.max_tasks = 10
        self.max_redirect = max_redirect
        self.q = Queue()
        self.seen_urls = set()

        # aiohttp's ClientSession does connection pooling and
        # HTTP keep-alives for us.
        self.session = aiohttp.ClientSession(loop=loop)

        # Put (URL, max_redirect) in the queue. Queue.put is a coroutine and
        # cannot be awaited from __init__, so the non-blocking variant is used.
        self.q.put_nowait((root_url, self.max_redirect))
        
    @asyncio.coroutine
    def crawl(self):
        '''Run the crawler until all work is done.'''
        workers = [asyncio.Task(self.work())
                   for _ in range(self.max_tasks)]

        # When all work is done, exit.
        yield from self.q.join()
        for w in workers:
            w.cancel()

    @asyncio.coroutine
    def work(self):
        while True:
            url, max_redirect = yield from self.q.get()

            # Download page and add new links to self.q
            yield from self.fetch(url, max_redirect)
            self.q.task_done()

    @asyncio.coroutine
    def fetch(self, url, max_redirect):
        # Handle redirects ourselves.
        response = yield from self.session.get(
            url, allow_redirects=False)

        try:
            if is_redirect(response):
                if max_redirect > 0:
                    next_url = response.headers['location']
                    if next_url in self.seen_urls:
                        # We have done this before.
                        return

                    # Remember we have seen this url.
                    self.seen_urls.add(next_url)

                    # Follow the redirect. One less redirect remains.
                    self.q.put_nowait((next_url, max_redirect - 1))
            else:
                links = yield from self.parse_links(response)
                # Python set-logic:
                for link in links.difference(self.seen_urls):
                    self.q.put_nowait((link, self.max_redirect))
                self.seen_urls.update(links)
        finally:
            # Return connection to pool.
            yield from response.release()
Developer: Chaogebruce, Project: Webcrawler, Lines: 64, Source: app_asyncio.py
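The crawl/work pair illustrates the canonical Queue fan-out pattern: q.join() blocks until every item obtained with q.get() has been matched by a q.task_done() call, after which the workers are cancelled. A minimal standalone version with modern syntax (a sketch, not part of the crawler above):

import asyncio

async def worker(queue):
    while True:
        item = await queue.get()
        try:
            print('processing', item)
        finally:
            queue.task_done()  # exactly one task_done() per get()

async def main():
    queue = asyncio.Queue()
    for i in range(5):
        queue.put_nowait(i)
    workers = [asyncio.ensure_future(worker(queue)) for _ in range(2)]
    await queue.join()  # unblocks once every item has been marked done
    for w in workers:
        w.cancel()

asyncio.run(main())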

Example 9: Cloner

# Required import: from asyncio import Queue [as alias]
# Alternatively: from asyncio.Queue import get [as alias]
class Cloner(object):
    def __init__(self, root):
        self.visited_urls = []
        self.root = self.add_scheme(root)
        if len(self.root.host) < 4:
            sys.exit('invalid target {}'.format(self.root.host))
        self.target_path = '/opt/snare/pages/{}'.format(self.root.host)

        if not os.path.exists(self.target_path):
            os.mkdir(self.target_path)

        self.new_urls = Queue()

    @staticmethod
    def add_scheme(url):
        if yarl.URL(url).scheme:
            new_url = yarl.URL(url)
        else:
            new_url = yarl.URL('http://' + url)
        return new_url

    @asyncio.coroutine
    def process_link(self, url, check_host=False):
        url = yarl.URL(url)
        if check_host:
            if (url.host != self.root.host or url.fragment
                            or url in self.visited_urls):
                return None
        if not url.is_absolute():
            url = self.root.join(url)

        yield from self.new_urls.put(url)
        return url.relative().human_repr()

    @asyncio.coroutine
    def replace_links(self, data):
        soup = BeautifulSoup(data, 'html.parser')

        # find all relative links
        for link in soup.findAll(href=True):
            res = yield from self.process_link(link['href'], check_host=True)
            if res is not None:
                link['href'] = res

        # find all images and scripts
        for elem in soup.findAll(src=True):
            res = yield from self.process_link(elem['src'])
            if res is not None:
                elem['src'] = res

        # find all action elements
        for act_link in soup.findAll(action=True):
            res = yield from self.process_link(act_link['action'])
            if res is not None:
                act_link['action'] = res

        # prevent redirects
        for redir in soup.findAll(True, attrs={'name': re.compile('redirect.*')}):
            redir['value'] = yarl.URL(redir['value']).relative().human_repr()

        return soup

    @asyncio.coroutine
    def get_body(self):
        while not self.new_urls.empty():
            current_url = yield from self.new_urls.get()
            if current_url in self.visited_urls:
                continue
            self.visited_urls.append(current_url)
            if current_url.name:
                file_name = current_url.name
            elif current_url.raw_path != '/':
                file_name = current_url.path.rsplit('/')[1]
            else:
                file_name = 'index.html'
            file_path = os.path.dirname(current_url.path)
            if file_path == '/':
                file_path = self.target_path
            else:
                file_path = os.path.join(self.target_path, file_path[1:])

            print('path: ', file_path, 'name: ', file_name)

            if file_path and not os.path.exists(file_path):
                os.makedirs(file_path)

            data = None
            try:
                with aiohttp.Timeout(10.0):
                    with aiohttp.ClientSession() as session:
                        response = yield from session.get(current_url)
                        data = yield from response.read()
            except aiohttp.ClientError as client_error:
                print(client_error)
            else:
                response.release()
                session.close()
            if data is not None:
                if re.match(re.compile(r'.*\.(html|php)'), file_name):
                    soup = yield from self.replace_links(data)
#......... remainder of the code omitted .........
Developer: mushorg, Project: snare, Lines: 103, Source: clone.py

Example 10: ProxyResponse

# Required import: from asyncio import Queue [as alias]
# Alternatively: from asyncio.Queue import get [as alias]
class ProxyResponse(object):
    '''Asynchronous wsgi response.
    '''
    _started = False
    _headers = None
    _done = False

    def __init__(self, environ, start_response):
        self._loop = environ['pulsar.connection']._loop
        self.environ = environ
        self.start_response = start_response
        self.queue = Queue()

    def __iter__(self):
        while True:
            if self._done:
                try:
                    yield self.queue.get_nowait()
                except QueueEmpty:
                    break
            else:
                # The original called pulsar's async() helper here; 'async' is a
                # reserved keyword in modern Python, so asyncio.ensure_future is
                # used as the closest equivalent.
                yield ensure_future(self.queue.get(), loop=self._loop)

    def pre_request(self, response, exc=None):
        self._started = True
        response.bind_event('data_processed', self.data_processed)
        return response

    def error(self, exc):
        if not self._started:
            request = wsgi.WsgiRequest(self.environ)
            content_type = request.content_types.best_match(
                ('text/html', 'text/plain'))
            uri = self.environ['RAW_URI']
            msg = 'Could not find %s' % uri
            logger.info(msg=msg)
            if content_type == 'text/html':
                html = wsgi.HtmlDocument(title=msg)
                html.body.append('<h1>%s</h1>' % msg)
                data = html.render()
                resp = wsgi.WsgiResponse(504, data, content_type='text/html')
            elif content_type == 'text/plain':
                resp = wsgi.WsgiResponse(504, msg, content_type='text/plain')
            else:
                resp = wsgi.WsgiResponse(504, '')
            self.start_response(resp.status, resp.get_headers())
            self._done = True
            self.queue.put_nowait(resp.content[0])

    def data_processed(self, response, exc=None, **kw):
        '''Receive data from the requesting HTTP client.'''
        status = response.get_status()
        if status == '100 Continue':
            stream = self.environ.get('wsgi.input') or io.BytesIO()
            body = yield stream.read()
            response.transport.write(body)
        if response.parser.is_headers_complete():
            if self._headers is None:
                headers = self.remove_hop_headers(response.headers)
                self._headers = Headers(headers, kind='server')
                # start the response
                self.start_response(status, list(self._headers))
            body = response.recv_body()
            if response.parser.is_message_complete():
                self._done = True
            self.queue.put_nowait(body)

    def remove_hop_headers(self, headers):
        for header, value in headers:
            if header.lower() not in wsgi.HOP_HEADERS:
                yield header, value
Developer: huobao36, Project: pulsar, Lines: 73, Source: manage.py
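The __iter__ method mixes blocking and non-blocking reads: while the response is still streaming it awaits queue.get(), and once _done is set it drains whatever remains with get_nowait(). The drain step in isolation looks like this sketch (the drain helper is an illustration, not pulsar code):

from asyncio import Queue, QueueEmpty

def drain(queue):
    # Non-blocking drain: yield queued items until the queue is empty.
    # This mirrors the get_nowait()/QueueEmpty branch of __iter__ above.
    while True:
        try:
            yield queue.get_nowait()
        except QueueEmpty:
            break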

Example 11: BasePlugin

# Required import: from asyncio import Queue [as alias]
# Alternatively: from asyncio.Queue import get [as alias]
class BasePlugin(metaclass=ABCMeta):
    '''Core plug-in functionality

    A Sphinx plug-in needs to provide a minimum set of services in order to be
    useful.  Those are defined here, with default implementations where it
    makes sense.
    '''

    # This is a handle to the data bus.  It's set when we are registered.
    _databus = None

    # Type manager handle
    _tm = None

    def __init__(self, runner, plugins, source = None):
        '''Constructor

        This is how our plugin pipeline is constructed.  Each plugin instance
        is created when the input script is read, and they are chained together,
        from source to sink, here.

        This method _must_ be called with the event loop from which it will be
        called in the future, e.g., asyncio.get_event_loop().
        '''

        # A dict that maps each destination for our data, to the type that the
        # destination can consume.
        self._sinks = {}

        # Retain a pointer to our source, and add ourselves to its list of sinks.
        self._source = source
        if source:
            # Validate that we can process data from this source
            sink_types = set(source.sources()).intersection(self.sinks())
            if len(sink_types):
                source._set_sink(self, sink_types.pop())
                
            else:
                err = "{} cannot sink '{}'".format(self, source.sources())
                _log.error(err)
                raise ImpedenceMismatchError(err)

        # Our input queue
        self._queue = Queue()

        self.runner = runner
        self._plugins = plugins

        # create_task schedules the execution of the coroutine "run", wrapped
        # in a future.
        self._task = self.runner.create_task(self.run())


    def __getattr__(self, name):
        '''Plugin Pipeline Building

        This method is called when Python can't find a requested attribute. We
        use it to create a new plugin instance to add to the pipeline.
        '''
        if name in self._plugins:
            return partial(self._plugins[name], source = self)

        else:
            raise AttributeError


    def _set_sink(self, sink, data_type):
        '''Register a sink

        Called during initialization to register a sink (destination for our
        output).
        '''
        self._sinks[sink] = data_type
        

    @coroutine
    def publish(self, data):
        '''Publish data

        Called by a plugin to publish data to its sinks.
        '''
        for sink, data_type in self._sinks.items():
            # Special case 'None', since that's our 'eof'.  See the 'done'
            # method below.
            if data:
                data = self.xform_data(data, data_type)
            yield from self._databus.publish(data, sink)


    @coroutine
    def write_data(self, data):
        '''Write data to queue
        
        Called by the databus controller to enqueue data from our source.
        '''
        yield from self._queue.put(data)
        

    @coroutine
    def read_data(self):
#......... remainder of the code omitted .........
Developer: Electrostatics, Project: APBS_Sphinx, Lines: 103, Source: base.py

Example 12: __init__

# Required import: from asyncio import Queue [as alias]
# Alternatively: from asyncio.Queue import get [as alias]
class Messagedispatcher:
    def __init__(self, communicator):
        self.communicator = communicator
        self.messages = {
            "direct": {
                "status": {
                    "class": messages.StatusDirect,
                    "queue": Queue()
                },
                "pinor": {
                    "class": messages.PinorDirect,
                    "queue": Queue()
                }
            },
            "mesh": {
                "status": {
                    "class": messages.StatusMesh,
                    "queue": Queue()
                },
                "pinor": {
                    "class": messages.PinorMesh,
                    "queue": Queue()
                },
                "return": {
                    "class": messages.ReturnMesh,
                    "queue": Queue()
                },
                "deploy": {
                    "class": messages.DeployMesh,
                    "queue": Queue()
                },
                "grid": {
                    "class": messages.GridMesh,
                    "queue": Queue()
                }
            }
        }
        self.mesh_queue = Queue()

    @coroutine
    def wait_for_message(self, *types):
        x = self.messages
        for i in types:
            x = x[i]
        q = x["queue"]
        return (yield from q.get())

    @coroutine
    def get_mesh_message(self):
        return (yield from self.mesh_queue.get())

    @coroutine
    def startup(self):
        while True:
            meshput = False
            msg = yield from self.communicator.receive()
            if msg["type"] == "mesh":
                meshput = True
            x = self.messages
            x = x[msg["type"]]
            x = x[msg["data"]["datatype"]]
            q = x["queue"]
            c = x["class"]
            emsg = c.from_json(msg)
            yield from q.put(emsg)
            if meshput:
                # print("RECEIVE:  " + str(msg) + "\n")
                yield from self.mesh_queue.put(emsg)
Developer: GPIG5, Project: drone, Lines: 67, Source: messagedispatcher.py
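Callers select a queue by walking the nested messages dict with a path of keys. Waiting for a direct status message would then look like this sketch (handle_direct_status is an illustrative name, not code from the drone project):

from asyncio import coroutine

@coroutine
def handle_direct_status(dispatcher):
    # Blocks until a message lands in messages["direct"]["status"]["queue"].
    msg = yield from dispatcher.wait_for_message("direct", "status")
    print("status message:", msg)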

Example 13: __init__

# Required import: from asyncio import Queue [as alias]
# Alternatively: from asyncio.Queue import get [as alias]
class Crawler:
    """Crawl a set of URLs.

    This manages two sets of URLs: 'urls' and 'done'.  'urls' is a set of
    URLs seen, and 'done' is a list of FetchStatistics.
    """
    def __init__(self, roots,
                 exclude=None, strict=True,  # What to crawl.
                 max_redirect=10, max_tries=4,  # Per-url limits.
                 max_tasks=10, *, loop=None):
        self.loop = loop or asyncio.get_event_loop()
        self.roots = roots
        self.exclude = exclude
        self.strict = strict
        self.max_redirect = max_redirect
        self.max_tries = max_tries
        self.max_tasks = max_tasks
        self.q = Queue(loop=self.loop)
        self.seen_urls = set()
        self.done = []
        self.session = aiohttp.ClientSession(loop=self.loop)
        self.root_domains = set()
        for root in roots:
            parts = urllib.parse.urlparse(root)
            host, port = urllib.parse.splitport(parts.netloc)
            if not host:
                continue
            if re.match(r'\A[\d\.]*\Z', host):
                self.root_domains.add(host)
            else:
                host = host.lower()
                if self.strict:
                    self.root_domains.add(host)
                else:
                    self.root_domains.add(lenient_host(host))
        for root in roots:
            self.add_url(root)
        self.t0 = time.time()
        self.t1 = None

    def close(self):
        """Close resources."""
        self.session.close()

    def host_okay(self, host):
        """Check if a host should be crawled.

        A literal match (after lowercasing) is always good.  For hosts
        that don't look like IP addresses, some approximate matches
        are okay depending on the strict flag.
        """
        host = host.lower()
        if host in self.root_domains:
            return True
        if re.match(r'\A[\d\.]*\Z', host):
            return False
        if self.strict:
            return self._host_okay_strictish(host)
        else:
            return self._host_okay_lenient(host)

    def _host_okay_strictish(self, host):
        """Check if a host should be crawled, strict-ish version.

        This checks for equality modulo an initial 'www.' component.
        """
        host = host[4:] if host.startswith('www.') else 'www.' + host
        return host in self.root_domains

    def _host_okay_lenient(self, host):
        """Check if a host should be crawled, lenient version.

        This compares the last two components of the host.
        """
        return lenient_host(host) in self.root_domains

    def record_statistic(self, fetch_statistic):
        """Record the FetchStatistic for completed / failed URL."""
        self.done.append(fetch_statistic)

    @asyncio.coroutine
    def parse_links(self, response):
        """Return a FetchStatistic and list of links."""
        links = set()
        content_type = None
        encoding = None
        body = yield from response.read()

        if response.status == 200:
            content_type = response.headers.get('content-type')
            pdict = {}

            if content_type:
                content_type, pdict = cgi.parse_header(content_type)

            encoding = pdict.get('charset', 'utf-8')
            if content_type in ('text/html', 'application/xml'):
                text = yield from response.text()

                # Replace href with (?:href|src) to follow image links.
#......... remainder of the code omitted .........
Developer: penglee87, Project: lpython, Lines: 103, Source: crawl_01.py

Example 14: Crawler

# Required import: from asyncio import Queue [as alias]
# Alternatively: from asyncio.Queue import get [as alias]
class Crawler(object):
    """Crawl a set of URLs.

    This manages two sets of URLs: 'urls' and 'done'.  'urls' is a set of
    URLs seen, and 'done' is a list of FetchStatistics.
    """

    def __init__(
        self,
        roots,
        scraper=None,
        data_handler=None,
        exclude=None,
        strict=True,  # What to crawl.
        max_redirect=5,
        max_tries=10,  # Per-url limits.
        max_tasks=10,
        max_connections_per_host=3,
        *,
        loop=None
    ):
        self.loop = loop or asyncio.get_event_loop()
        self.roots = roots
        self.max_connections_per_host = max_connections_per_host
        self.scraper = scraper
        self.data_handler = data_handler
        self.exclude = exclude
        self.strict = strict
        self.max_redirect = max_redirect
        self.max_tries = max_tries
        self.max_tasks = max_tasks
        self.q = Queue(loop=self.loop)
        self.seen_urls = set()
        self.done = []
        self.session = aiohttp.ClientSession(loop=self.loop)
        self.root_domains = set()
        for root in roots:
            parts = urllib.parse.urlparse(root)
            host, port = urllib.parse.splitport(parts.netloc)
            if not host:
                continue
            if re.match(r"\A[\d\.]*\Z", host):
                self.root_domains.add(host)
            else:
                host = host.lower()
                if self.strict:
                    self.root_domains.add(host)
                else:
                    self.root_domains.add(lenient_host(host))
        for root in roots:
            self.add_urls(root)
        self.t0 = time.time()
        self.t1 = None

    def record_statistic(
        self,
        url=None,
        next_url=None,
        status=None,
        exception=None,
        content_type=None,
        encoding=None,
        num_urls=0,
        num_new_urls=0,
    ):
        """Record the FetchStatistic for completed / failed URL."""
        fetch_statistic = FetchStatistic(
            url=url,
            next_url=next_url,
            status=status,
            size=0,
            exception=exception,
            content_type=content_type,
            encoding=encoding,
            num_urls=num_urls,
            num_new_urls=num_new_urls,
        )
        self.done.append(fetch_statistic)

    def extract_data(self, root_url, html):
        raise NotImplementedError("You need to define an extract_data method!")

    def close(self):
        """Close resources."""
        LOGGER.debug("closing resources")
        self.session.close()

    @asyncio.coroutine
    def parse_links(self, web_page_html, base_url, _content_type, _encoding):
        """Return a list of links."""
        links = set()
        tree = html.fromstring(web_page_html)
        tree.make_links_absolute(base_url)
        urls = [link[2] for link in tree.iterlinks()]
        for url in urls:
            defragmented, frag = urllib.parse.urldefrag(url)
            if verify.url_allowed(
                defragmented, self.root_domains, exclude=self.exclude
            ):  # Select Valid links, testing against regexp and root_domains
                links.add(defragmented)
#......... remainder of the code omitted .........
Developer: koolkt, Project: python_crawler, Lines: 103, Source: crawling.py


Note: The asyncio.Queue.get examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are drawn from open-source projects contributed by their respective developers; copyright remains with the original authors. Consult each project's License before distributing or using the code. Do not reproduce without permission.