当前位置: 首页>>代码示例>>Python>>正文


Python PhantomJS.set_page_load_timeout方法代码示例

本文整理汇总了Python中selenium.webdriver.PhantomJS.set_page_load_timeout方法的典型用法代码示例。如果您正苦于以下问题:Python PhantomJS.set_page_load_timeout方法的具体用法?Python PhantomJS.set_page_load_timeout怎么用?Python PhantomJS.set_page_load_timeout使用的例子?那么, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在selenium.webdriver.PhantomJS的用法示例。


在下文中一共展示了PhantomJS.set_page_load_timeout方法的3个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: WeixinPhantomjs

# 需要导入模块: from selenium.webdriver import PhantomJS [as 别名]
# 或者: from selenium.webdriver.PhantomJS import set_page_load_timeout [as 别名]
class WeixinPhantomjs(Base):
    all_uids = {docs['uid'] for docs in in_collection.find({}, {'uid': 1}) if 'uid' in docs}

    def __init__(self):
        self.start_page = START_PAGE
        self.end_page = END_PAGE
        self.weixin_url = REFER_FIRST

        # self.driver = Firefox()
        if hasattr(config, 'PHANTOMJS_PATH'):
            self.driver = PhantomJS(executable_path=getattr(config, 'PHANTOMJS_PATH'))
        else:
            self.driver = PhantomJS()

    def open_weixin_browser(self, word):
        try:
            self.driver.get(self.weixin_url)
            self.driver.set_page_load_timeout(3)

            self.driver.find_element_by_id('upquery').send_keys(word)
            self.driver.find_element_by_class_name('swz').click()
            time.sleep(3)

            urls_uids = self.extract_urls_uids(word=word)
            Article(urls_uids=urls_uids, word=word).extract()
        except Exception as e:
            storage_word.append([word, 0])
            self.logger.info('Open weixin error: type <{}>, mag <{}>'.format(e.__class__, e))
            self.close_browser()
            return True
        return False

    def get_total_pages_to_word(self):
        pages = []
        page_id_css = 'pagebar_container'

        try:
            e = self.driver.find_element_by_id(page_id_css)
            for _p in e.text.split():
                _p = _p.strip()

                if not _p.isdigit():
                    return DEFAULT_PAGES if DEFAULT_PAGES <= pages[-1] else pages[-1]
                else:
                    pages.append(int(_p))
            return 1
        except (NoSuchElementException, NoSuchWindowException, TypeError, IndexError):
            pass
        return 1

    def extract_urls_uids(self, word):
        urls_uids = []
        timestamp = [_t.get_attribute('t') for _t in self.driver.find_elements_by_css_selector('div.s-p')]
        urls_tits = [(t.get_attribute('href'), self.trim(t.text))
                     for t in self.driver.find_elements_by_css_selector('h4 a')]

        if len(urls_tits) != len(timestamp):
            return urls_uids

        for index, url_tit in enumerate(urls_tits):
            try:
                uid = self.md5(timestamp[index] + url_tit[1] + word)

                if uid not in self.__class__.all_uids:
                    self.__class__.all_uids.add(uid)
                    urls_uids.append({'url': url_tit[0], 'uid': uid})
            except (TypeError, IndexError):
                pass
        return urls_uids

    @property
    def is_forbidden(self):
        css_id = 'seccodeForm'

        try:
            if self.driver.find_element_by_id(css_id):
                return True
        except NoSuchElementException:
            pass
        return False

    def appear_element(self, by):
        try:
            # Have `click` function to specified element
            tag = WebDriverWait(self.driver, 20).until(lambda driver: driver.find_element_by_id(by))
            tag.click()
            return True
        except (TimeoutException, NoSuchWindowException, NoSuchElementException):
            pass
        return False

    def crawl_single(self, word=None, go=0):
        is_go = True
        go_page = int(go)
        next_page_css = 'sogou_page_%s'

        is_break = self.open_weixin_browser(word)
        pages = self.get_total_pages_to_word()

        for page in range(self.start_page + 1, (pages or self.end_page) + 1):
#.........这里部分代码省略.........
开发者ID:xutaoding,项目名称:csf_scraper,代码行数:103,代码来源:wx_threads.py

示例2: PagesCrawler

# 需要导入模块: from selenium.webdriver import PhantomJS [as 别名]
# 或者: from selenium.webdriver.PhantomJS import set_page_load_timeout [as 别名]
class PagesCrawler(BaseSpider):

    name = 'pages'
    link_extractor = RegexpLinkExtractor(canonicalize=False, deny_extensions=[])
    ignored_exts = set(['.' + e for e in IGNORED_EXTENSIONS])

    def __init__(self, **kw):
        args = DEFAULT_INPUT.copy()
        args.update(kw)
        self.args = args
        self.start_urls = to_list(args['start_urls'])
        self.maxdepth = int(args['maxdepth'])
        self.follow_prefixes = to_list(args['follow_prefixes'])
        self.nofollow_prefixes = to_list(args['nofollow_prefixes'])
        self.discover_prefixes = [url_to_lru_clean("http%s://%s" % (https, u.replace('http://', '').replace('https://', ''))) for u in to_list(args['discover_prefixes']) for https in ['', 's']]
        self.resolved_links = {}
        self.user_agent = args['user_agent']
        self.phantom = 'phantom' in args and args['phantom'] and args['phantom'].lower() != "false"
        if self.phantom:
            self.ph_timeout = int(args.get('phantom_timeout', PHANTOM['TIMEOUT']))
            self.ph_idle_timeout = int(args.get('phantom_idle_timeout', PHANTOM['IDLE_TIMEOUT']))
            self.ph_ajax_timeout = int(args.get('phantom_ajax_timeout', PHANTOM['AJAX_TIMEOUT']))
        self.errors = 0
        dispatcher.connect(self.closed, spider_closed)
        dispatcher.connect(self.crashed, spider_error)

    def start_requests(self):
        self.log("Starting crawl task - jobid: %s" % self.crawler.settings['JOBID'], log.INFO)
        self.log("ARGUMENTS : "+str(self.args), log.INFO)
        if self.phantom:
            self.init_phantom()
        for url in self.start_urls:
            yield self._request(url)

    def init_phantom(self):
        self.prefixfiles = os.path.join(
            scrapyd_config().get('logs_dir'),
            HYPHE_PROJECT,
            self.name,
            self.crawler.settings['JOBID']
        )
        self.log("Using path %s for PhantomJS crawl" % self.prefixfiles, log.INFO)
        phantom_args = []
        if PROXY and not PROXY.startswith(':'):
            phantom_args.append('--proxy=%s' % PROXY)
        phantom_args.append('--cookies-file=%s-phantomjs-cookie.txt' % self.prefixfiles)
        phantom_args.append('--ignore-ssl-errors=true')
        phantom_args.append('--load-images=false')
        self.capabilities = dict(DesiredCapabilities.PHANTOMJS)
        self.capabilities['phantomjs.page.settings.userAgent'] = self.user_agent
        self.capabilities['takesScreenshot'] = False
        self.capabilities['phantomjs.page.settings.javascriptCanCloseWindows'] = False
        self.capabilities['phantomjs.page.settings.javascriptCanOpenWindows'] = False
        self.phantom = PhantomJS(
            executable_path=PHANTOM['PATH'],
            service_args=phantom_args,
            desired_capabilities=self.capabilities,
            service_log_path="%s-phantomjs.log" % self.prefixfiles
        )
        self.phantom.implicitly_wait(10)
        self.phantom.set_page_load_timeout(60)
        self.phantom.set_script_timeout(self.ph_timeout + 15)

    def crashed(self, spider):
        self.errors += 1
        self.closed("CRASH")

    def closed(self, reason):
        if self.errors:
            self.log("%s error%s encountered during the crawl." %
                (self.errors, 's' if self.errors > 1 else ''), log.ERROR)
        if self.phantom:
            self.phantom.quit()
            if not self.errors:
                for f in ["phantomjs-cookie.txt", "phantomjs.log"]:
                    fi = "%s-%s" % (self.prefixfiles, f)
                    if os.path.exists(fi) and not self.errors:
                        os.remove(fi)

    def handle_response(self, response):
        lru = url_to_lru_clean(response.url)

        if self.phantom:
            self.phantom.get(response.url)

          # Collect whole DOM of the webpage including embedded iframes
            with open(os.path.join(PHANTOM["JS_PATH"], "get_iframes_content.js")) as js:
                get_bod_w_iframes = js.read()
            bod_w_iframes = self.phantom.execute_script(get_bod_w_iframes)
            response._set_body(bod_w_iframes.encode('utf-8'))

          # Try to scroll and unfold page
            self.log("Start PhantomJS scrolling and unfolding", log.INFO)
            with open(os.path.join(PHANTOM["JS_PATH"], "scrolldown_and_unfold.js")) as js:
                try:
                    signal.signal(signal.SIGALRM, timeout_alarm)
                    signal.alarm(self.ph_timeout + 30)
                    timedout = self.phantom.execute_async_script(
                        js.read(), self.ph_timeout,
                        self.ph_idle_timeout, self.ph_ajax_timeout)
#.........这里部分代码省略.........
开发者ID:SciencesPoDRIS,项目名称:hyphe,代码行数:103,代码来源:pages.py

示例3: WeixinPhantomjs

# 需要导入模块: from selenium.webdriver import PhantomJS [as 别名]
# 或者: from selenium.webdriver.PhantomJS import set_page_load_timeout [as 别名]
class WeixinPhantomjs(Base):
    def __init__(self):
        self.start_page = START_PAGE
        self.end_page = END_PAGE
        self.weixin_url = REFER_FIRST

        # self.driver = Firefox()
        if hasattr(config, 'PHANTOMJS_PATH'):
            self.driver = PhantomJS(executable_path=getattr(config, 'PHANTOMJS_PATH'))
        else:
            self.driver = PhantomJS()

        self.client = MongoClient(HOST, PORT)
        self.collection = self.client[DB][COLLECTION]
        self.all_uids = self.uids

    def open_weixin_browser(self, word):
        try:
            self.driver.get(self.weixin_url)
            self.driver.set_page_load_timeout(3)

            self.driver.find_element_by_id('upquery').send_keys(word)
            self.driver.find_element_by_class_name('swz').click()
            time.sleep(3)

            urls_uids = self.extract_urls_uids(word=word)
            Article(urls_uids=urls_uids, word=word).extract()
        except Exception as e:
            storage_word.append([word, 0])
            self.logger.info('Open weixin error: type <{}>, mag <{}>'.format(e.__class__, e))
            self.close_browser()
            return True
        return False

    def get_total_pages_to_word(self):
        pages = []
        page_id_css = 'pagebar_container'

        try:
            e = self.driver.find_element_by_id(page_id_css)
            for _p in e.text.split():
                _p = _p.strip()

                if not _p.isdigit():
                    return DEFAULT_PAGES if DEFAULT_PAGES <= pages[-1] else pages[-1]
                else:
                    pages.append(int(_p))
            return 1
        except (NoSuchElementException, NoSuchWindowException, TypeError, IndexError):
            pass
        return 1

    def get_query_words(self, word):
        query_words = []

        for docs in self.collection.find({}, {'rel': 1, 'conp': 1}).sort([('_id', 1)]):
            w = docs['conp']

            if w not in query_words:
                query_words.append(w)

            for item in docs['rel']:
                if item not in query_words:
                    query_words.append(item)

        self.client.close()

        return self.query_index(query_words, word)

    @property
    def uids(self):
        return {docs['uid'] for docs in in_collection.find({}, {'uid': 1}) if 'uid' in docs}

    def extract_urls_uids(self, word):
        urls_uids = []
        timestamp = [_t.get_attribute('t') for _t in self.driver.find_elements_by_css_selector('div.s-p')]
        urls_tits = [(t.get_attribute('href'), self.trim(t.text))
                     for t in self.driver.find_elements_by_css_selector('h4 a')]

        if len(urls_tits) != len(timestamp):
            return urls_uids

        for index, url_tit in enumerate(urls_tits):
            try:
                uid = self.md5(timestamp[index] + url_tit[1] + word)

                if uid not in self.all_uids:
                    self.all_uids.add(uid)
                    urls_uids.append({'url': url_tit[0], 'uid': uid})
            except (TypeError, IndexError):
                pass
        return urls_uids

    @staticmethod
    def query_index(words, cut_word):
        temp_words = words[START_INDEX:END_INDEX]

        try:
            index = temp_words.index(cut_word)
            return temp_words[index:], index + START_INDEX
#.........这里部分代码省略.........
开发者ID:xutaoding,项目名称:csf_scraper,代码行数:103,代码来源:wx_phantomjs.py


注:本文中的selenium.webdriver.PhantomJS.set_page_load_timeout方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。