

Python Browser.open_novisit Method Code Examples

This article collects typical usage examples of the mechanize.Browser.open_novisit method in Python. If you are wondering what exactly Browser.open_novisit does or how to use it in your own code, the hand-picked examples below should help. You can also explore further usage examples for the mechanize.Browser class that the method belongs to.


Three code examples of the Browser.open_novisit method are shown below, sorted by popularity by default. You can upvote the examples you find useful; your feedback helps the site recommend better Python code examples.
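Before the examples, a minimal sketch of the call itself may help (assuming only that the mechanize package is installed; the target URL is a placeholder). open_novisit fetches a URL much like open, but without recording the page in the browser's history or making it the current page, which is convenient for downloading auxiliary resources while crawling.

from mechanize import Browser

browser = Browser()
browser.set_handle_robots(False)  # ignore robots.txt for this sketch
browser.addheaders = [('User-agent', 'Mozilla/5.0')]

# Fetch the page without "visiting" it: the history, the current response and
# browser.geturl() are left unchanged by this request.
response = browser.open_novisit('http://example.com/')
html = response.read()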

Example 1: TextHtmlParser

# Required module: from mechanize import Browser [as alias]
# Or: from mechanize.Browser import open_novisit [as alias]
#......... preceding code omitted .........
                self.logger.error("Exception during parsing " + str(tag))
        return _results

    def parse(self, content, url=""):
        _soup = BeautifulSoup(content, self.__class__.PARSER_TYPE)
        _links = []
        _encoding = self._get_encoding(_soup)
        _links += self._find_links_with_a_href(_soup, url, _encoding)
        _links += self._find_links_with_link_href(_soup, url, _encoding)
        return [self._encode_for_transport(content), _links]

    def _encode_for_transport(self, content):
        return Base64ContentCoder.encode(content)

if __name__ == '__main__':
    link = 'http://dziecko.pl/'

    browser = Browser()
    browser.set_handle_robots(True)
    browser.addheaders = [('User-agent', 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11')]
    _response = browser.open_novisit(link)
    _parser = TextHtmlParser()
    _data = _parser.parse(_response.read(), url=link)
    print _data[1]





Developer ID: pombredanne, Project: fcs, Lines of code: 27, Source file: content_parser.py
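The helpers called inside parse (such as _find_links_with_a_href) and the Base64ContentCoder class belong to the fcs project and are not included in the snippet above. As a rough illustration only, a hypothetical link-extraction helper of that kind, assuming BeautifulSoup 4 and ignoring the encoding argument used by the original, might look like this:

from urlparse import urljoin  # urllib.parse.urljoin on Python 3


def find_links_with_a_href(soup, base_url):
    # Collect the targets of all <a href="..."> tags, resolved against the page URL.
    links = []
    for tag in soup.find_all('a', href=True):
        links.append(urljoin(base_url, tag['href']))
    return links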

Example 2: Crawler

# Required module: from mechanize import Browser [as alias]
# Or: from mechanize.Browser import open_novisit [as alias]
class Crawler(ThreadWithExc):

    CONTENT_TYPE = 'content-type'
    CONTENT_LENGTH = 'content-length'
    CLIENT_VERSION = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11'

    def __init__(self, web_server, event, port, manager_address, max_content_length=1024 * 1024, handle_robots=False):
        super(Crawler, self).__init__()
        self.link_package_queue = Queue()
        self.max_content_length = max_content_length

        self.browser = Browser()
        self.browser.set_handle_robots(handle_robots)
        self.browser.addheaders = [('User-agent', self.__class__.CLIENT_VERSION)]

        self.event = event
        self.exit_flag_lock = Lock()
        self.exit_flag = False

        self.uuid = ''
        self.manager_address = manager_address
        self.port = port
        self.web_server = web_server

        self.stats_lock = RLock()
        self.stats_reset_time = time.time()
        self.crawled_links = []

        self.logger = logging.getLogger('crawler')
        _file_handler = logging.FileHandler('crawler.log')
        _formatter = logging.Formatter('<%(asctime)s>:%(levelname)s: %(message)s')
        _file_handler.setFormatter(_formatter)
        self.logger.addHandler(_file_handler)
        self.logger.setLevel(logging.DEBUG)
        self.logger.debug('Crawler initialized')

    def put_into_link_queue(self, link_package):
        self.link_package_queue.put(link_package)

    def _analyse_header(self, response):
        _header = response.info()
        _header_dict = dict(zip(map(string.lower, _header.keys()), _header.values()))
        result = {}
        try:
            # Reject responses whose declared Content-Length exceeds the crawler's limit.
            _content_length = int(_header_dict[self.__class__.CONTENT_LENGTH])
            if _content_length > self.max_content_length:
                raise Exception("'Content-length' too big")
        except KeyError:
            raise Exception("'Content-length' unknown")
        try:
            _content_type = _header_dict[self.__class__.CONTENT_TYPE]
            result[self.__class__.CONTENT_TYPE] = _content_type
        except Exception:
            raise Exception("'Content-type' unknown")
        return result

    def _process_one_link(self, link, mime_types):
        _response = self.browser.open_novisit(link)
        _header_data = self._analyse_header(_response)
        _content_type = _header_data[self.__class__.CONTENT_TYPE]
        if not MimeContentType(_content_type).one_of(mime_types):
            raise Exception("Page skipped because does not meet MIME content type criteria.")
        _parser = ParserProvider.get_parser(_content_type)
        _data = _parser.parse(_response.read(), url=link)
        _results = dict()
        _results["url"] = link
        _results["content"], _results["links"] = (_data[0], _data[1])
        return _results

    def _crawl(self):
        while not self.link_package_queue.empty() and not self._get_exit_flag():
            start_time = time.time()
            _final_results = []
            (_id, _server_address, _mime_types, _links) = self.link_package_queue.get()
            _mime_types = [MimeContentType(x) for x in _mime_types]
            for _link in _links:
                self.logger.info("Processing url %s started..." % _link)
                try:
                    _results = self._process_one_link(_link, _mime_types)
                except Exception as e:
                    self.logger.error("Exception in %s : %s" % (_link, e))
                    _results = {"url": _link, "links": [], "content": ""}
                else:
                    self.logger.info("Processing url %s ended successfully. %s urls extracted" %
                                     (_link, len(_results['links'])))
                _final_results.append(_results)

            self.logger.info("Crawling package from %s ended." % _server_address)
            self._send_results_to_task_server(_id, _server_address, _final_results)
            end_time = time.time()
            self._add_stats(start_time, end_time, len(_final_results))

    def _add_stats(self, start_time, end_time, links_num):
        self.stats_lock.acquire()
        self.crawled_links.append((start_time, end_time, links_num))
        self.stats_lock.release()

    def _clear_stats(self):
        self.stats_lock.acquire()
#......... remainder of the code omitted .........
Developer ID: agh-glk, Project: fcs, Lines of code: 103, Source file: crawler.py
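The _analyse_header method above reads the HTTP headers of the open_novisit() response via response.info() before deciding whether the body is worth downloading. A standalone sketch of the same idea, with a hypothetical function name that is not part of the fcs project:

def check_headers(response, max_content_length=1024 * 1024):
    # response.info() exposes the HTTP headers of a mechanize response.
    headers = dict((k.lower(), v) for k, v in response.info().items())
    length = headers.get('content-length')
    if length is not None and int(length) > max_content_length:
        raise Exception("'Content-length' too big")
    if 'content-type' not in headers:
        raise Exception("'Content-type' unknown")
    return headers['content-type']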

Example 3: Browser

# Required module: from mechanize import Browser [as alias]
# Or: from mechanize.Browser import open_novisit [as alias]
import xml.etree.ElementTree as ET

from zipfile import ZipFile, is_zipfile
from bs4 import BeautifulSoup
from mechanize import Browser
from PIL import Image
import pytesseract


url = 'http://buscatextual.cnpq.br/buscatextual/download.do?metodo=apresentar&idcnpq='
idlattes = '1982919735990024'

# Retry until the captcha is read correctly and a valid ZIP archive is downloaded.
while True:
    try:
        browser = Browser()
        # Open the download page for the given Lattes id; this becomes the current page.
        html = browser.open(url + idlattes)
        soup = BeautifulSoup(html, 'html.parser')
        image = soup.find('img')
        # Fetch the captcha image without visiting it, so the download page
        # stays the browser's current page.
        data = browser.open_novisit(image['src']).read()
        filename = 'captcha.png'
        save = open(filename, 'wb')
        save.write(data)
        save.close()
        # OCR the captcha and submit it together with the Lattes id.
        captcha = pytesseract.image_to_string(Image.open(filename))
        getUrl = "http://buscatextual.cnpq.br/buscatextual/download.do?metodo=enviar&idcnpq=%s&palavra=%s" % (idlattes, captcha)
        arquivo = browser.retrieve(getUrl, '%s.zip' % idlattes)[0]
        if is_zipfile('%s.zip' % idlattes):
            print "ENTREI"
            data = ZipFile('%s.zip' % idlattes).read('curriculo.xml')
            break
    except Exception:
        # The OCR frequently fails; try again with a fresh captcha.
        pass

root = ET.fromstring(data)
Developer ID: leandrodeassis, Project: Lattes, Lines of code: 33, Source file: getXML.py
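Example 3 shows the typical reason for preferring open_novisit over open when fetching auxiliary resources: the captcha image is downloaded without replacing the browser's current page, so the download page stays current while the image is saved and OCR'd. A small hypothetical helper (not part of the original script) that captures just that pattern:

def save_resource_novisit(browser, src, filename):
    # Download a linked resource (e.g. a captcha image) without changing
    # the browser's current page or history.
    data = browser.open_novisit(src).read()
    with open(filename, 'wb') as fh:
        fh.write(data)
    return filename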


Note: The mechanize.Browser.open_novisit examples in this article were compiled by 纯净天空 from open-source code hosted on platforms such as GitHub/MSDocs. The code snippets were selected from open-source projects contributed by their respective authors, and the copyright of the source code remains with the original authors. Please consult each project's license before using or redistributing the code; do not republish this article without permission.