本文整理汇总了 Python 中 Analyzer.Analyzer.addItem 方法的典型用法代码示例。如果您正苦于以下问题：Python Analyzer.addItem 方法的具体用法？Python Analyzer.addItem 怎么用？Python Analyzer.addItem 使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类 Analyzer.Analyzer 的用法示例。
在下文中一共展示了Analyzer.addItem方法的1个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: __init__
# 需要导入模块: from Analyzer import Analyzer [as 别名]
# 或者: from Analyzer.Analyzer import addItem [as 别名]
class LynCrawler:
    """Small crawler that fetches pages with mechanize and mines them with Analyzers.

    Two ``Analyzer`` instances are used: ``PageAnalyzer`` produces the data
    that gets stored for every fetched page, and ``PageSniffer`` produces the
    candidate URLs that are queued for crawling next.  Item names and
    ``reStr`` values are handed to ``Analyzer`` untouched (presumably regular
    expressions -- confirm against the Analyzer module).

    Only syntax valid on both Python 2 and Python 3 is used, so ``print`` is
    always called with a single pre-formatted string.
    """

    # Prefix put in front of sniffed URLs that are not absolute when the
    # caller does not pass its own ``parentPath``.
    DEFAULT_PARENT_PATH = r"http://www.gsmarena.com/"

    def __init__(self):
        """Create empty work queues, the two analyzers and the browser."""
        self.ToCrawl = set()    # URLs waiting to be fetched
        self.Crawled = set()    # URLs fetched and processed without error
        self.Crawling = ""      # kept for compatibility; never updated in this class
        self.PageAnalyzer = Analyzer()  # used to extract useful info
        self.PageSniffer = Analyzer()   # used to find new pages to crawl
        self.initBrowser()

    def initBrowser(self):
        """Create the mechanize browser used for every request."""
        self.browser = mechanize.Browser()
        # Per the original author, turning off robots handling and sending a
        # desktop-browser User-Agent is what gets past the site's anti-robot
        # defence.
        self.browser.set_handle_robots(False)
        self.browser.addheaders = [("User-Agent",
                                    "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11")]

    def addPageSniffItem(self, name, reStr):
        """Register item ``name`` with pattern ``reStr`` on the page sniffer."""
        self.PageSniffer.addItem(name, reStr)

    def rmPageSniffItem(self, name):
        """Remove item ``name`` from the page sniffer."""
        self.PageSniffer.rmItem(name)

    def addPageAnalyzeItem(self, name, reStr):
        """Register item ``name`` with pattern ``reStr`` on the page analyzer."""
        self.PageAnalyzer.addItem(name, reStr)

    def rmPageAnalyzeItem(self, name):
        """Remove item ``name`` from the page analyzer."""
        self.PageAnalyzer.rmItem(name)

    def sniffNewPage(self, pageContent):
        """Queue every not-yet-crawled URL the sniffer finds in ``pageContent``.

        Prints a hint and does nothing when no sniff items are registered.
        Empty result groups and empty matches are skipped.  Returns None.
        """
        if self.PageSniffer.getItemCount() == 0:
            print("No sniff items yet. Please add new items use addPageSniffItem().")
            return
        for matches in self.PageSniffer.analyzeAll(pageContent).values():
            for match in matches or ():
                if not match:
                    continue
                newPage = self.generateNewPageURL(match)
                if newPage not in self.Crawled:
                    self.ToCrawl.add(newPage)

    def analyzePage(self, pageContent):
        """Return ``PageAnalyzer.analyzeAll(pageContent)``.

        When no analyze items are registered a hint is printed and None is
        returned instead.
        """
        if self.PageAnalyzer.getItemCount() == 0:
            print("No data items for analyzing yet. Please add new items use addPageAnalyzeItem().")
            return None
        return self.PageAnalyzer.analyzeAll(pageContent)

    def start(self, initPage):
        """Crawl from ``initPage`` until the queue is empty, then return True."""
        self.ToCrawl.add(initPage)
        while self.ToCrawl:
            self.crawl(self.ToCrawl.pop())
        print("All done.")
        return True

    def crawl(self, page):
        """Fetch ``page``, store its analysis result and queue newly sniffed URLs.

        Any exception is reported on stdout and swallowed so that one bad
        page does not stop the whole crawl; such a page is not added to
        ``Crawled``.
        """
        try:
            response = self.browser.open(page)
            pageContent = response.read()
            data = self.analyzePage(pageContent)
            # analyzePage returns None when nothing is configured; do not
            # pollute the data file with literal "None" lines in that case.
            if data is not None:
                self.store(data)
            self.Crawled.add(page)
            print("page crawled: %s" % (page,))
            self.sniffNewPage(pageContent)
        except Exception as e:
            # ``e.message`` is deprecated, frequently empty and missing on
            # Python 3; printing the exception itself is portable.
            print(e)
            print("something went wrong when crawling  %s" % (page,))

    def store(self, data):
        """Append ``str(data)`` plus a newline to ``data.txt`` in the working directory."""
        with open('data.txt', 'a+') as f:
            f.write(str(data))
            f.write("\n")

    def generateNewPageURL(self, tmpURL, parentPath=""):
        """Turn a sniffed URL into an absolute one.

        URLs that already start with ``http://`` or ``https://`` are returned
        unchanged.  Anything else is treated as relative and prefixed with
        ``parentPath``, or with ``DEFAULT_PARENT_PATH`` when ``parentPath`` is
        empty.  Adjust this method if the target site needs different rules.
        """
        # Resolve the prefix up front so it is defined for every call, not
        # only when parentPath is empty.
        base = parentPath or self.DEFAULT_PARENT_PATH
        if tmpURL.startswith(("http://", "https://")):
            return tmpURL
        return base + tmpURL