本文整理汇总了Python中webpage.WebPage类的典型用法代码示例。如果您正苦于以下问题：Python WebPage类的具体用法？Python WebPage怎么用？Python WebPage使用的例子？那么恭喜您，这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了WebPage类的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: set_data
def set_data(self):
    """Build the form data for the class-search request.

    Parses ``self.htmldata1`` for the options of the semester drop-down,
    stores a label -> form-value mapping in ``self.semesters``, picks the
    first option whose label contains both ``self.season`` (case-insensitive)
    and ``self.year``, and stores the resulting form fields in ``self.data``.

    When no option matches, a warning and the available options are printed
    and the semester field is submitted as ``"0"``.
    """
    page = WebPage(htmldata=self.htmldata1)

    # Find all the codes for season/year in the first HTML form.
    option_xpath = """//*[@id="CLASS_SRCH_WRK2_STRM$35$"]/option"""
    self.semesters = {}
    for option in page.get_from_xpath(option_xpath):
        label, value = option.text, option.get("value")
        # Assuming ElementTree-style elements: an <option> without text or
        # without a "value" attribute yields None here, which would crash
        # on .strip(). Such options carry no usable code, so skip them.
        if label is None or value is None:
            continue
        if label.strip() and value.strip():
            self.semesters[label] = value

    # Match the requested season/year against the labels just collected.
    code = 0
    for label in self.semesters:
        if self.season.lower() in label.lower() and str(self.year) in label:
            code = self.semesters[label]
            break
    if not code:
        print_color("Warning: failed to find season/year in search options. season='%s' year='%s'"%(self.season,self.year),COLORS.RED)
        print_d("search options", self.semesters)

    self.data = {
        "ICFocus": "SSR_CLSRCH_WRK_ACAD_CAREER$2",
        "CLASS_SRCH_WRK2_STRM$35$": str(code),
        "SSR_CLSRCH_WRK_SUBJECT$0": self.department,
        "ICAction": "CLASS_SRCH_WRK2_SSR_PB_CLASS_SRCH",
        "SSR_CLSRCH_WRK_ACAD_CAREER$2": self.level,
    }
示例2: Crawler
class Crawler(object):
    """Sequential crawler driven by a URL queue and per-URL link rules.

    URLs are popped from ``self.queue``, downloaded, stored in
    ``self.webpagedb`` and scanned for links; links matching the rules
    registered for the page's URL are de-duplicated and queued.
    """

    def __init__(self):
        self.downloader = DownloadManager()
        self.webpage = None  # WebPage of the most recently parsed download
        self.init_database()
        self.rules = {}      # compiled URL regex -> list of compiled link regexes

    def init_database(self):
        """Open the queue, page-storage and duplicate-check databases."""
        self.queue = QueueDB('queue.db')
        self.webpagedb = WebpageDB('webpage.db')
        self.duplcheck = DuplCheckDB('duplcheck.db')

    def add_seeds(self, links):
        """Queue every link in *links* that the duplicate check lets through."""
        new_links = self.duplcheck.filter_dupl_urls(links)
        self.duplcheck.add_urls(new_links)
        self.queue.push_urls(new_links)

    def add_rules(self, rules):
        """Replace the rule set.

        *rules* maps a URL regex string to an iterable of link regex
        strings; everything is compiled once here.
        """
        self.rules = {}
        for url, inurls in rules.items():
            self.rules[re.compile(url)] = [re.compile(u) for u in inurls]

    def get_patterns_from_rules(self, url):
        """Return the distinct link patterns of every rule whose regex matches *url*.

        The order of the returned list is unspecified.
        """
        patns = []
        for purl, ru in self.rules.items():
            if purl.match(url) is not None:
                patns.extend(ru)
        return list(set(patns))

    def start(self):
        """Crawl until the queue yields ``None``, pausing 3 seconds per URL."""
        while 1:
            url = self.queue.pop_url()
            print(url)
            if url is None:
                print("crawling task is done.")
                break
            error_msg, url, redirected_url, html = self.downloader.download(url)
            if html is not None:
                self.webpagedb.html2db(url, html)
                self.webpage = WebPage(url, html)
                self.webpage.parse_links()
                ruptn = self.get_patterns_from_rules(url)
                print(ruptn)
                links = self.webpage.filter_links(tags=['a'], patterns=ruptn)
                self.add_seeds(links)
            # NOTE(review): source indentation was lost; the pause is assumed
            # to apply after every download, successful or not.
            self.mysleep(3)

    def mysleep(self, n):
        """Sleep *n* seconds one second at a time, printing progress."""
        for i in range(n):
            time.sleep(1)
            # Single pre-formatted argument keeps the output identical on
            # Python 2 and Python 3.
            print("sleep %s of %s" % (i, n))
示例3: Crawler
class Crawler(object):
    """Sequential crawler that stores pages and extracted articles via ``OperatorDB``.

    URLs come from ``self.dbop``; each downloaded page is stored (with
    article fields when extraction returns more than five items) and its
    links that match the rules for the page's URL are queued as new seeds.
    """

    def __init__(self):
        self.downloader = DownloadManager()
        self.webpage = None  # WebPage of the most recently parsed download
        self.rules = {}      # compiled URL regex -> list of compiled link regexes
        self.dbop = OperatorDB()

    def add_seeds(self, links):
        """Hand *links* to the database layer as new seeds."""
        self.dbop.add_seeds(links)

    def add_rules(self, rules):
        """Replace the rule set.

        *rules* maps a URL regex string to an iterable of link regex
        strings; everything is compiled once here.
        """
        self.rules = {}
        for url, inurls in rules.items():
            self.rules[re.compile(url)] = [re.compile(u) for u in inurls]

    def get_patterns_from_rules(self, url):
        """Return the distinct link patterns of every rule whose regex matches *url*.

        The order of the returned list is unspecified.
        """
        patns = []
        for purl, ru in self.rules.items():
            if purl.match(url) is not None:
                patns.extend(ru)
        return list(set(patns))

    def start(self):
        """Crawl until ``pop_url()`` yields ``None``, then close the database.

        Any exception raised while handling one URL is printed and the loop
        moves on to the next URL.
        """
        # Bound up front so the error message below can always be formatted,
        # even when the very first pop_url() call raises.
        url = None
        while 1:
            try:
                url = self.dbop.pop_url()
                print("url: %s" % url)
                if url is None:
                    print("crawling task is done.")
                    break
                error_msg, url, redirected_url, html = self.downloader.download(url)
                if html is not None:
                    self.webpage = WebPage(url, html)
                    article = self.webpage.extract()
                    if len(article) > 5:
                        addtime = "%s %s" % (article[1], article[2])
                        self.dbop.html2db(url, html,
                                          article[0],
                                          addtime,
                                          article[3],
                                          article[5])
                    else:
                        self.dbop.html2db(url, html)
                    print(self.webpage.parse_links())
                    ruptn = self.get_patterns_from_rules(url)
                    links = self.webpage.filter_links(tags=['a'],
                                                      str_patterns=ruptn)
                    self.add_seeds(links)
                self.mysleep(3)
            except Exception as err:
                # Deliberate best-effort boundary: report and keep crawling.
                print("!!error!! Exception happend! %s %s" % (url, err))
        self.dbop.close()

    def mysleep(self, n):
        """Sleep *n* seconds one second at a time, printing progress.

        start() calls this helper; it is provided here so the call cannot
        fail with AttributeError and be swallowed by the broad handler.
        """
        import time  # local import: this class shows no other use of time

        for i in range(n):
            time.sleep(1)
            print("sleep %s of %s" % (i, n))
示例4: updateView
def updateView(self):
    """Install a fresh WebPage on the web view and load the generated HTML into it."""
    web_page = WebPage(logger=None, parent=self)
    web_page.setLinkDelegationPolicy(QWebPage.DelegateAllLinks)
    # Make this object reachable from page scripts under the name "qtWindow".
    web_page.mainFrame().addToJavaScriptWindowObject("qtWindow", self)

    view = self.ui.webView
    view.setPage(web_page)

    # The base URL must end with a trailing '/', otherwise QWebView cannot
    # load files relative to it.
    view.setHtml(
        self.generateHtml(),
        QUrl.fromLocalFile(os.path.join(self.dataDir, "static/")),
    )
示例5: __init__
def __init__(self):
    """Create the Qt application (if needed), network stack, page and view."""
    logging.debug("-->")
    super(WebBrowser, self).__init__()

    # Reuse the running QApplication when there is one.
    self.app = QApplication.instance()
    if self.app is None:
        self.app = QApplication(sys.argv)
    # NOTE(review): source indentation was lost; this call is assumed to
    # apply whether or not the application was just created.
    self.app.setQuitOnLastWindowClosed(False)
    self.event_loop = QEventLoop()

    # Network stack. The proxy object is built but not installed on the
    # network manager.
    self.cookie_jar = CookieJar()
    self.proxy = QNetworkProxy(QNetworkProxy.HttpProxy, "127.0.1.1", 8888)
    self.network_manager = NetworkAccessManager()
    self.network_manager.setCookieJar(self.cookie_jar)

    # Page and view wiring.
    self.web_page = WebPage()
    self.web_page.setNetworkAccessManager(self.network_manager)
    self.web_view = QWebView()
    self.web_view.setPage(self.web_page)

    view_settings = self.web_view.settings()
    for attribute, enabled in (
        (QWebSettings.AutoLoadImages, False),
        (QWebSettings.PluginsEnabled, True),
        (QWebSettings.JavascriptEnabled, True),
        (QWebSettings.LocalContentCanAccessRemoteUrls, True),
    ):
        view_settings.setAttribute(attribute, enabled)

    # Route every finished network reply to network_reply_finished.
    self.connect(self.web_view.page().networkAccessManager(),SIGNAL("finished(QNetworkReply*)"),self.network_reply_finished)

    # Per-load state, filled in later.
    self.page_loaded_validator = None
    self.page_loaded_handler = None
    self.page_loaded_handler_kwargs = None
    self.timeout_message = None
    self.timer = None
    self.event_loop_exception = None
    logging.debug("<--")
示例6: getlinks
def getlinks(self, url, html):
    """Parse *html* fetched from *url* and return its ``<a>`` links allowed by the rules.

    The parsed page is kept on ``self.webpage``.
    """
    page = WebPage(url, html)
    self.webpage = page
    page.parse_links()
    allowed = self.get_patterns_from_rules(url)
    return page.filter_links(tags=['a'], patterns=allowed)
示例7: start
def start(self):
    """Crawl until the queue yields ``None``.

    Each downloaded page is stored, its links matching the rules for the
    URL are queued as seeds, and its links matching ``self.file_rule`` are
    recorded in ``self.files`` and passed to ``self.download_files``.
    """
    while 1:
        url = self.queue.pop_url()
        print(url)
        if url is None:
            print("crawling task is done.")
            break
        error_msg, url, redirected_url, html = self.downloader.download(url)
        if html is not None:
            self.webpagedb.html2db(url, html)
            self.webpage = WebPage(url, html)
            self.webpage.parse_links()

            # Follow-up pages to crawl.
            ruptn = self.get_patterns_from_rules(url)
            links = self.webpage.filter_links(tags=['a'], patterns=ruptn)
            print(links)
            self.add_seeds(links)

            # Files to fetch from this page.
            file_pattern = [re.compile(self.file_rule)]
            files = self.webpage.filter_links(tags=['a'], patterns=file_pattern)
            # NOTE(review): this appends the whole list as one element, so
            # self.files becomes a list of per-page lists. Confirm that
            # extend() was not intended before changing it.
            self.files.append(files)
            #TODO:
            self.download_files(files)
            print(files)
示例8: start
def start(self):
    """Crawl until ``pop_url()`` yields ``None``, then close the database.

    Each downloaded page is stored (with article fields when extraction
    returns more than five items) and its links matching the rules for the
    URL are queued. Any exception raised while handling one URL is printed
    and the loop moves on.
    """
    # Bound up front so the error message below can always be formatted,
    # even when the very first pop_url() call raises.
    url = None
    while 1:
        try:
            url = self.dbop.pop_url()
            print("url: %s" % url)
            if url is None:
                print("crawling task is done.")
                break
            error_msg, url, redirected_url, html = self.downloader.download(url)
            if html is not None:
                self.webpage = WebPage(url, html)
                article = self.webpage.extract()
                if len(article) > 5:
                    addtime = "%s %s" % (article[1], article[2])
                    self.dbop.html2db(url, html,
                                      article[0],
                                      addtime,
                                      article[3],
                                      article[5])
                else:
                    self.dbop.html2db(url, html)
                print(self.webpage.parse_links())
                ruptn = self.get_patterns_from_rules(url)
                links = self.webpage.filter_links(tags=['a'],
                                                  str_patterns=ruptn)
                self.add_seeds(links)
            self.mysleep(3)
        except Exception as err:
            # Deliberate best-effort boundary: report and keep crawling.
            print("!!error!! Exception happend! %s %s" % (url, err))
    self.dbop.close()
示例9: __init__
def __init__(self, parent, args):
    """Create the main page, wire up networking and run the bootstrap script.

    *args* supplies the parsed command-line options read below. The process
    exits with 'Can not bootstrap!' when the bundled bootstrap script cannot
    be opened or is empty.
    """
    QObject.__init__(self, parent)

    # Plain state.
    self.m_defaultPageSettings = {}
    self.m_pages = []
    self.m_verbose = args.verbose
    self.m_page = WebPage(self)
    self.m_returnValue = 0
    self.m_terminated = False

    # Values taken from args.
    self.m_scriptFile = args.script
    self.m_args = args.script_args
    self.m_filesystem = FileSystem(self)
    self.m_pages.append(self.m_page)

    do_action('PhantomInitPre')

    # Explicit HTTP proxy when one was given, system configuration otherwise.
    if args.proxy is not None:
        host, port = args.proxy[0], int(args.proxy[1])
        QNetworkProxy.setApplicationProxy(QNetworkProxy(QNetworkProxy.HttpProxy, host, port))
    else:
        QNetworkProxyFactory.setUseSystemConfiguration(True)

    # Provide WebPage with a non-standard Network Access Manager.
    main_page = self.m_page
    self.m_netAccessMan = NetworkAccessManager(self, args.disk_cache, args.cookies, args.ignore_ssl_errors)
    main_page.setNetworkAccessManager(self.m_netAccessMan)
    main_page.javaScriptConsoleMessageSent.connect(self.printConsoleMessage)

    self.m_defaultPageSettings.update({
        'loadImages': args.load_images,
        'loadPlugins': args.load_plugins,
        'javascriptEnabled': True,
        'XSSAuditingEnabled': False,
        'userAgent': main_page.userAgent(),
        'localAccessRemote': args.local_access_remote,
    })
    main_page.applySettings(self.m_defaultPageSettings)

    self.libraryPath = os.path.dirname(os.path.abspath(self.m_scriptFile))

    # Inject our properties and slots into JavaScript.
    frame = main_page.mainFrame()
    frame.addToJavaScriptWindowObject('phantom', self)
    frame.addToJavaScriptWindowObject('fs', self.m_filesystem)

    # Load and evaluate the bundled bootstrap script.
    bootstrap = QFile(':/bootstrap.js')
    if not bootstrap.open(QFile.ReadOnly):
        sys.exit('Can not bootstrap!')
    bootstrapper = str(bootstrap.readAll())
    bootstrap.close()
    if not bootstrapper:
        sys.exit('Can not bootstrap!')
    frame.evaluateJavaScript(bootstrapper)

    do_action('PhantomInitPost')
示例10: __init__
def __init__(self, args, parent=None):
    """Set up the page, its settings and the script to run.

    *args* supplies the parsed command-line options read below;
    ``args.script[0]`` is a file-like object holding the script (it is
    read and closed here) and the remaining items are the script's own
    arguments.
    """
    QObject.__init__(self, parent)

    # Variable declarations. Every attribute gets its own object so that
    # mutating one (for example the load-script cache) can never leak into
    # another.
    self.m_loadStatus = QString()
    self.m_state = QString()
    self.m_var = {}
    self.m_paperSize = {}
    self.m_loadScript_cache = {}
    self.m_verbose = args.verbose
    self.m_page = WebPage(self)
    self.m_clipRect = QRect()

    # Set up the values from args.
    script_file = args.script[0]
    self.m_script = QString.fromUtf8(script_file.read())
    self.m_scriptFile = script_file.name
    self.m_args = args.script[1:]
    self.m_upload_file = args.upload_file
    # Images load unless explicitly set to 'no'; plugins load only on 'yes'.
    autoLoadImages = args.load_images != 'no'
    pluginsEnabled = args.load_plugins == 'yes'
    script_file.close()

    # Transparent page background.
    palette = self.m_page.palette()
    palette.setBrush(QPalette.Base, Qt.transparent)
    self.m_page.setPalette(palette)

    if not args.proxy:
        QNetworkProxyFactory.setUseSystemConfiguration(True)
    else:
        proxy = QNetworkProxy(QNetworkProxy.HttpProxy, args.proxy[0], int(args.proxy[1]))
        QNetworkProxy.setApplicationProxy(proxy)

    settings = self.m_page.settings()
    settings.setAttribute(QWebSettings.AutoLoadImages, autoLoadImages)
    settings.setAttribute(QWebSettings.PluginsEnabled, pluginsEnabled)
    settings.setAttribute(QWebSettings.FrameFlatteningEnabled, True)
    settings.setAttribute(QWebSettings.OfflineStorageDatabaseEnabled, True)
    settings.setAttribute(QWebSettings.LocalStorageEnabled, True)
    settings.setLocalStoragePath(QDesktopServices.storageLocation(QDesktopServices.DataLocation))
    settings.setOfflineStoragePath(QDesktopServices.storageLocation(QDesktopServices.DataLocation))

    # Ensure we have a document.body.
    frame = self.m_page.mainFrame()
    frame.setHtml('<html><body></body></html>')
    frame.setScrollBarPolicy(Qt.Horizontal, Qt.ScrollBarAlwaysOff)
    frame.setScrollBarPolicy(Qt.Vertical, Qt.ScrollBarAlwaysOff)

    # If our script was called in a different directory, change to it so
    # that any dealings with files are relative to the script's directory.
    script_dir = os.path.dirname(self.m_scriptFile)
    if script_dir:
        os.chdir(script_dir)

    # The custom network access manager is installed only in verbose mode.
    if self.m_verbose:
        m_netAccessMan = NetworkAccessManager(self)
        self.m_page.setNetworkAccessManager(m_netAccessMan)

    # Inject our properties and slots into JavaScript.
    self.connect(self.m_page.mainFrame(), SIGNAL('javaScriptWindowObjectCleared()'), self.inject)
    self.connect(self.m_page, SIGNAL('loadFinished(bool)'), self.finish)
示例11: __init__
def __init__(self, args, parent=None):
    """Set up the page, its settings and the script to run.

    *args* supplies the parsed command-line options read below;
    ``args.script`` is a file-like object holding the script, which is
    read and closed here.
    """
    QObject.__init__(self, parent)

    # Variable declarations. Every dict attribute gets its own object so
    # that mutating one (for example the load-script cache) can never leak
    # into another.
    self.m_loadStatus = ''
    self.m_state = ''
    self.m_var = {}
    self.m_paperSize = {}
    self.m_loadScript_cache = {}
    self.m_verbose = args.verbose
    self.m_page = WebPage(self)
    self.m_clipRect = QRect()

    # Set up the values from args.
    self.m_script = args.script.read()
    self.m_scriptFile = args.script.name
    # Script directory with a trailing platform separator.
    # NOTE(review): for a script given as a bare file name dirname() is ''
    # and this becomes '/' (or '\\'), i.e. the filesystem root. Confirm how
    # m_scriptDir is consumed before changing it.
    self.m_scriptDir = os.path.dirname(args.script.name)
    if sys.platform.startswith('win'):
        self.m_scriptDir += '\\'
    else:
        self.m_scriptDir += '/'
    self.m_args = args.script_args
    self.m_upload_file = args.upload_file
    # Images load unless explicitly set to 'no'; plugins load only on 'yes'.
    autoLoadImages = args.load_images != 'no'
    pluginsEnabled = args.load_plugins == 'yes'
    args.script.close()

    # Transparent page background.
    palette = self.m_page.palette()
    palette.setBrush(QPalette.Base, Qt.transparent)
    self.m_page.setPalette(palette)

    if not args.proxy:
        QNetworkProxyFactory.setUseSystemConfiguration(True)
    else:
        proxy = QNetworkProxy(QNetworkProxy.HttpProxy, args.proxy[0], int(args.proxy[1]))
        QNetworkProxy.setApplicationProxy(proxy)

    settings = self.m_page.settings()
    settings.setAttribute(QWebSettings.AutoLoadImages, autoLoadImages)
    settings.setAttribute(QWebSettings.PluginsEnabled, pluginsEnabled)
    settings.setAttribute(QWebSettings.FrameFlatteningEnabled, True)
    settings.setAttribute(QWebSettings.OfflineStorageDatabaseEnabled, True)
    settings.setAttribute(QWebSettings.LocalStorageEnabled, True)
    settings.setLocalStoragePath(QDesktopServices.storageLocation(QDesktopServices.DataLocation))
    settings.setOfflineStoragePath(QDesktopServices.storageLocation(QDesktopServices.DataLocation))

    # Ensure we have a document.body.
    frame = self.m_page.mainFrame()
    frame.setHtml('<html><body></body></html>')
    frame.setScrollBarPolicy(Qt.Horizontal, Qt.ScrollBarAlwaysOff)
    frame.setScrollBarPolicy(Qt.Vertical, Qt.ScrollBarAlwaysOff)

    # The custom network access manager is installed only in verbose mode.
    if self.m_verbose:
        m_netAccessMan = NetworkAccessManager(args.disk_cache, self)
        self.m_page.setNetworkAccessManager(m_netAccessMan)

    # Inject our properties and slots into JavaScript.
    self.m_page.mainFrame().javaScriptWindowObjectCleared.connect(self.inject)
    self.m_page.loadFinished.connect(self.finish)
示例12: run
def run(self):
    """Worker loop: fetch URLs from the spider's task list while ``self.status`` is truthy.

    For each URL allowed by the robots check the page is fetched; on
    success its data is saved and every link not yet in the visited list
    is added to the spider's extend list.
    """
    while self.status:
        try:
            url = self.spider.task_list.get(timeout=1)
        except Empty:
            # Nothing queued within the timeout; re-check self.status.
            continue

        self.spider.increase_running()
        try:
            if not self.spider.check_robots(url):
                log.info('%s - robots forbidden : %s' % (self.name, url))
                continue
            page = WebPage(url)
            if page.fetch():
                self.spider.db.save_data(page.get_data())
                for link in page.get_link():  # retrieve links from html
                    if link not in self.spider.visited_list:  # not visited yet
                        self.spider.extend_list.add(link)
            else:
                print('%s: Page fetch failed: %s' % (self.name, page.url))
        finally:
            # Always balance increase_running(), including on the
            # robots-forbidden `continue` above and when fetching raises;
            # otherwise the running counter would never drop back.
            self.spider.decrease_running()
示例13: createWebPage
def createWebPage(self):
    """Create an additional WebPage, register it in ``self.m_pages`` and return it.

    The new page receives the default page settings, the shared network
    access manager and a ``libraryPath`` set to the script's directory.
    """
    new_page = WebPage(self)
    self.m_pages.append(new_page)
    new_page.applySettings(self.m_defaultPageSettings)
    new_page.setNetworkAccessManager(self.m_netAccessMan)
    script_path = os.path.abspath(self.m_scriptFile)
    new_page.libraryPath = os.path.dirname(script_path)
    return new_page
示例14: __init__
def __init__(self, args, parent=None):
    """Create the main page, wire up networking and run the bootstrap script.

    *args* supplies the parsed command-line options read below. The process
    exits with status 1 when the bundled bootstrap script cannot be opened
    or is empty.

    The two do_action() hooks receive a Bunch built from this method's
    local variables at that point, so the local names used here are
    visible to whatever handles those hooks.
    """
    QObject.__init__(self, parent)
    # variable declarations
    self.m_defaultPageSettings = {}
    self.m_verbose = args.verbose
    self.m_page = WebPage(self)
    self.m_returnValue = 0
    self.m_terminated = False
    # setup the values from args
    self.m_script = args.script
    self.m_scriptFile = args.script_name
    self.m_args = args.script_args
    # Hook call made before proxy and network setup.
    do_action('PhantomInitPre', Bunch(locals()))
    # Use the system proxy configuration unless a proxy was given; args.proxy
    # is indexed as (host, port).
    if not args.proxy:
        QNetworkProxyFactory.setUseSystemConfiguration(True)
    else:
        proxy = QNetworkProxy(QNetworkProxy.HttpProxy, args.proxy[0], int(args.proxy[1]))
        QNetworkProxy.setApplicationProxy(proxy)
    # Provide WebPage with a non-standard Network Access Manager
    self.m_netAccessMan = NetworkAccessManager(args.disk_cache, args.ignore_ssl_errors, self)
    self.m_page.setNetworkAccessManager(self.m_netAccessMan)
    # Forward the page's console messages to printConsoleMessage.
    self.m_page.javaScriptConsoleMessageSent.connect(self.printConsoleMessage)
    # Default settings, applied to the main page right below.
    self.m_defaultPageSettings['loadImages'] = args.load_images
    self.m_defaultPageSettings['loadPlugins'] = args.load_plugins
    self.m_defaultPageSettings['userAgent'] = self.m_page.userAgent()
    self.m_page.applySettings(self.m_defaultPageSettings)
    # inject our properties and slots into javascript
    self.m_page.mainFrame().addToJavaScriptWindowObject('phantom', self)
    # Load the bootstrap script from the ':/bootstrap.js' resource path and
    # evaluate it in the main frame.
    bootstrap = QFile(':/bootstrap.js')
    if not bootstrap.open(QFile.ReadOnly):
        qCritical('Can not bootstrap!')
        sys.exit(1)
    bootstrapper = str(bootstrap.readAll())
    bootstrap.close()
    if not bootstrapper:
        qCritical('Can not bootstrap!')
        sys.exit(1)
    self.m_page.mainFrame().evaluateJavaScript(bootstrapper)
    # Hook call made once initialisation is complete.
    do_action('PhantomInitPost', Bunch(locals()))
示例15: add_webpage
def add_webpage(self):
    """Interactively create a WebPage and, on confirmation, add it to the list.

    Prompts for every field, then asks 'Save? (y/n)' until one of y/Y/n/N
    is entered. On y/Y the page content is retrieved (a ValueError is
    logged, not raised) and the page is appended to ``self.__webpages``.
    """
    webpage = WebPage(name='', description='', url='', load_content=False)
    webpage.name = raw_input('Name: ')
    webpage.description = raw_input('Description: ')
    webpage.url = raw_input('URL: ')
    webpage.update_timeout = int(raw_input('Update timeout: '))
    webpage.request_timeout = int(raw_input('Request timeout: '))
    webpage.data_offset = int(raw_input('Data offset: '))

    while True:
        answer = raw_input('Save? (y/n)')
        if answer in ('y', 'Y'):
            try:
                webpage.current = webpage.retrieve()
            except ValueError as e:
                logger.error('[!] Error: ' + str(e))
            # NOTE(review): source indentation was lost; the page is assumed
            # to be stored even when retrieval failed.
            self.__webpages.append(webpage)
            break
        if answer in ('n', 'N'):
            break