This article collects typical usage examples of the Python method mechanize.Browser.open_novisit. If you are wondering what Browser.open_novisit does, how to use it, or what calling it looks like in practice, the curated code samples below may help. You can also read further about the enclosing class, mechanize.Browser.
The following shows 3 code examples of the Browser.open_novisit method, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code samples.
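Before the examples, here is a minimal sketch of the call itself: open_novisit() fetches a URL the same way open() does, but without updating the browser's state, so the page is not added to the history and does not become the current response. The URL and User-Agent below are placeholders chosen for illustration, not taken from the examples.
# Minimal usage sketch (placeholder URL and User-Agent, not from the examples below).
from mechanize import Browser

browser = Browser()
browser.set_handle_robots(False)                      # skip robots.txt handling for this sketch
browser.addheaders = [('User-agent', 'Mozilla/5.0')]

# Fetch the page without recording it in the browser history.
response = browser.open_novisit('http://example.com/')
print(response.read()[:200])                          # first 200 bytes of the body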
Example 1: str
# Required module import: from mechanize import Browser [as alias]
# Or: from mechanize.Browser import open_novisit [as alias]
        self.logger.error("Exception during parsing " + str(tag))
        return _results

    def parse(self, content, url=""):
        _soup = BeautifulSoup(content, self.__class__.PARSER_TYPE)
        _links = []
        _encoding = self._get_encoding(_soup)
        _links += self._find_links_with_a_href(_soup, url, _encoding)
        _links += self._find_links_with_link_href(_soup, url, _encoding)
        return [self._encode_for_transport(content), _links]

    def _encode_for_transport(self, content):
        return Base64ContentCoder.encode(content)


if __name__ == '__main__':
    link = 'http://dziecko.pl/'
    browser = Browser()
    browser.set_handle_robots(True)
    browser.addheaders = [('User-agent', 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) \
Chrome/23.0.1271.64 Safari/537.11')]
    _response = browser.open_novisit(link)
    _parser = TextHtmlParser()
    _data = _parser.parse(_response.read(), url=link)
    print _data[1]
Example 2: Crawler
# Required module import: from mechanize import Browser [as alias]
# Or: from mechanize.Browser import open_novisit [as alias]
class Crawler(ThreadWithExc):
    CONTENT_TYPE = 'content-type'
    CONTENT_LENGTH = 'content-length'
    CLIENT_VERSION = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) \
Chrome/23.0.1271.64 Safari/537.11'

    def __init__(self, web_server, event, port, manager_address, max_content_length=1024 * 1024, handle_robots=False):
        super(Crawler, self).__init__()
        self.link_package_queue = Queue()
        self.max_content_length = max_content_length
        self.browser = Browser()
        self.browser.set_handle_robots(handle_robots)
        self.browser.addheaders = [('User-agent', self.__class__.CLIENT_VERSION)]
        self.event = event
        self.exit_flag_lock = Lock()
        self.exit_flag = False
        self.uuid = ''
        self.manager_address = manager_address
        self.port = port
        self.web_server = web_server
        self.stats_lock = RLock()
        self.stats_reset_time = time.time()
        self.crawled_links = []
        self.logger = logging.getLogger('crawler')
        _file_handler = logging.FileHandler('crawler.log')
        _formatter = logging.Formatter('<%(asctime)s>:%(levelname)s: %(message)s')
        _file_handler.setFormatter(_formatter)
        self.logger.addHandler(_file_handler)
        self.logger.setLevel(logging.DEBUG)
        self.logger.debug('Crawler initialized')

    def put_into_link_queue(self, link_package):
        self.link_package_queue.put(link_package)
    def _analyse_header(self, response):
        _header = response.info()
        _header_dict = dict(zip(map(string.lower, _header.keys()), _header.values()))
        result = {}
        try:
            # Reject responses whose declared 'Content-length' exceeds the configured limit.
            _content_length = int(_header_dict[self.__class__.CONTENT_LENGTH])
            if _content_length > self.max_content_length:
                raise Exception("'Content-length' too big")
        except KeyError:
            raise Exception("'Content-length' unknown")
        try:
            _content_type = _header_dict[self.__class__.CONTENT_TYPE]
            result[self.__class__.CONTENT_TYPE] = _content_type
        except Exception:
            raise Exception("'Content-type' unknown")
        return result
    def _process_one_link(self, link, mime_types):
        # open_novisit() fetches the page without adding it to the browser history.
        _response = self.browser.open_novisit(link)
        _header_data = self._analyse_header(_response)
        _content_type = _header_data[self.__class__.CONTENT_TYPE]
        if not MimeContentType(_content_type).one_of(mime_types):
            raise Exception("Page skipped because it does not meet the MIME content type criteria.")
        _parser = ParserProvider.get_parser(_content_type)
        _data = _parser.parse(_response.read(), url=link)
        _results = dict()
        _results["url"] = link
        _results["content"], _results["links"] = (_data[0], _data[1])
        return _results

    def _crawl(self):
        while not self.link_package_queue.empty() and not self._get_exit_flag():
            start_time = time.time()
            _final_results = []
            (_id, _server_address, _mime_types, _links) = self.link_package_queue.get()
            _mime_types = [MimeContentType(x) for x in _mime_types]
            for _link in _links:
                self.logger.info("Processing url %s started..." % _link)
                try:
                    _results = self._process_one_link(_link, _mime_types)
                except Exception as e:
                    self.logger.error("Exception in %s : %s" % (_link, e))
                    _results = {"url": _link, "links": [], "content": ""}
                else:
                    self.logger.info("Processing url %s ended successfully. %s urls extracted" %
                                     (_link, len(_results['links'])))
                _final_results.append(_results)
            self.logger.info("Crawling package from %s ended." % _server_address)
            self._send_results_to_task_server(_id, _server_address, _final_results)
            end_time = time.time()
            self._add_stats(start_time, end_time, len(_final_results))

    def _add_stats(self, start_time, end_time, links_num):
        self.stats_lock.acquire()
        self.crawled_links.append((start_time, end_time, links_num))
        self.stats_lock.release()

    def _clear_stats(self):
        self.stats_lock.acquire()
#......... the rest of the code is omitted here .........
Example 3: Browser
# Required module import: from mechanize import Browser [as alias]
# Or: from mechanize.Browser import open_novisit [as alias]
from zipfile import ZipFile, is_zipfile
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET
import pytesseract
import Image  # old PIL-style import; with Pillow this would be "from PIL import Image"

url = 'http://buscatextual.cnpq.br/buscatextual/download.do?metodo=apresentar&idcnpq='
idlattes = '1982919735990024'
while True:
    try:
        browser = Browser()
        html = browser.open(url + idlattes)
        soup = BeautifulSoup(html)
        image = soup.find('img')
        # Download the captcha image without adding it to the browser history.
        data = browser.open_novisit(image['src']).read()
        filename = 'captcha.png'
        save = open(filename, 'wb')
        save.write(data)
        save.close()
        captcha = pytesseract.image_to_string(Image.open('captcha.png'))
        getUrl = "http://buscatextual.cnpq.br/buscatextual/download.do?metodo=enviar&idcnpq=%s&palavra=%s" % (idlattes, captcha)
        arquivo = browser.retrieve(getUrl, '%s.zip' % idlattes)[0]
        if is_zipfile('%s.zip' % idlattes):
            print "ENTREI"
            data = ZipFile('%s.zip' % idlattes).read('curriculo.xml')
            break
    except:
        # OCR or the download can fail; keep retrying with a fresh captcha.
        pass
root = ET.fromstring(data)