This page collects typical usage examples of the Python method spider.Spider.process_page. If you are wondering what Spider.process_page does, how to use it, or where to find examples of it, the curated code samples below may help. You can also explore the containing class, spider.Spider, for more context.
Two code examples of Spider.process_page are shown below, sorted by popularity by default.
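Both examples assume only two things about the class: process_page(url) fetches a page, and the URLs attribute then contains every link discovered so far. The real implementation lives in the project's spider module and is not shown on this page; the following is only a minimal sketch of that assumed interface, built on the standard library:

# A minimal sketch of the assumed spider.Spider interface (hypothetical;
# the real class in the project's spider module may differ).
from html.parser import HTMLParser
from urllib.parse import urljoin
from urllib.request import urlopen

class Spider(HTMLParser):
    def __init__(self, baseURL):
        super().__init__()
        self.baseURL = baseURL
        self.URLs = set()  # every link found so far

    def handle_starttag(self, tag, attrs):
        # Record the target of every <a href="..."> as an absolute URL
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and value:
                    self.URLs.add(urljoin(self.baseURL, value))

    def process_page(self, url):
        # Fetch the page and feed its HTML through the parser above
        with urlopen(url) as response:
            self.feed(response.read().decode('utf-8', errors='replace'))

With that contract in mind, the examples read naturally: crawl an index page, filter the collected links by file extension, and queue the matches for download.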
Example 1: downloadArchivesList
# Required import: from spider import Spider [as alias]
# Or: from spider.Spider import process_page [as alias]
import csv
import os

def downloadArchivesList(aList, container, extension='.txt.gz', numThreads=5):
    '''Download every mailing-list archive listed in the CSV file <aList>
    (one start URL in the first column per row) into <container>.'''
    # Set up the downloader
    queue = initDownloader(numThreads)

    with open(aList, 'r', newline='') as f:
        reader = csv.reader(f)
        for row in reader:
            startURL = row[0]
            mlName = startURL.split('/')[-2]
            spider = Spider(startURL)
            spider.process_page(startURL)

            # Only the links to archive files are interesting:
            # mailing list archive file names end with '.txt.gz'
            urlList = [x for x in sorted(spider.URLs) if x.endswith(extension)]
            if urlList:
                print('%s: %d archives' % (mlName, len(urlList)))
                store = os.path.join(container, mlName)
                if not os.path.isdir(store):
                    os.makedirs(store)
                # Download each archive
                addToQ(queue, urlList, store)

    # If here, the crawl is finished. Stop the worker threads.
    stopDownloader(queue, numThreads)
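A minimal way to call this function, assuming a CSV file whose first column holds the start URL of each mailing-list index (the file name and URLs below are hypothetical):

# lists.csv -- one start URL per row, first column (hypothetical URLs):
#   https://lists.example.org/pipermail/dev-list/
#   https://lists.example.org/pipermail/users-list/
downloadArchivesList('lists.csv', '/tmp/ml-archives', numThreads=8)

Note that mlName = startURL.split('/')[-2] picks the second-to-last path segment, so each start URL needs a trailing slash for the folder to be named after the list (e.g. 'dev-list').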
Example 2: downloadArchives
# Required import: from spider import Spider [as alias]
# Or: from spider.Spider import process_page [as alias]
import os

def downloadArchives(startURL, container, lookInsideSubfolders=False, extension='.txt.gz', numThreads=5):
    '''Crawl <startURL> and find all mailing list archives (given the filename <extension>).
    Store the files in the folder with the path <container>.
    If <lookInsideSubfolders>, then go one level deeper (crawl all first-order links as well).
    '''
    # Set up the downloader
    queue = initDownloader(numThreads)

    print('Downloading archives from', startURL)

    if not lookInsideSubfolders:
        spider = Spider(startURL)
        spider.process_page(startURL)

        # Only the links to archive files are interesting:
        # mailing list archive file names end with '.txt.gz'
        urlList = [x for x in sorted(spider.URLs) if x.endswith(extension)]
        print('%d archives' % len(urlList))
        addToQ(queue, urlList, container)
    else:
        spider = Spider(startURL)
        spider.process_page(startURL)
        for link in sorted(spider.URLs):
            subspider = Spider(link)
            subspider.process_page(link)
            mlName = link.split('/')[-2]

            # Only the links to archive files are interesting:
            # mailing list archive file names end with '.txt.gz'
            urlList = [x for x in sorted(subspider.URLs) if x.endswith(extension)]
            if urlList:
                print('%s: %d archives' % (mlName, len(urlList)))
                # Create a folder for the mailing list
                store = os.path.join(container, mlName)
                if not os.path.isdir(store):
                    os.makedirs(store)
                addToQ(queue, urlList, store)

    # If here, the crawl is finished. Stop the worker threads.
    stopDownloader(queue, numThreads)
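Both examples also depend on three helpers that this page does not show: initDownloader, addToQ, and stopDownloader. Judging only from how they are called, they appear to implement a queue of (url, folder) jobs consumed by worker threads. The sketch below is an assumption about that contract, not the project's actual code:

# Hypothetical reconstruction of the downloader helpers used above.
import os
import queue
import threading
from urllib.request import urlretrieve

def _worker(q):
    # Pull (url, folder) jobs until a None sentinel arrives
    while True:
        job = q.get()
        if job is None:
            q.task_done()
            break
        url, store = job
        try:
            urlretrieve(url, os.path.join(store, url.split('/')[-1]))
        finally:
            q.task_done()

def initDownloader(numThreads):
    q = queue.Queue()
    for _ in range(numThreads):
        threading.Thread(target=_worker, args=(q,), daemon=True).start()
    return q

def addToQ(q, urlList, store):
    for url in urlList:
        q.put((url, store))

def stopDownloader(q, numThreads):
    for _ in range(numThreads):
        q.put(None)  # one sentinel per worker
    q.join()         # block until every queued job is done

Using one None sentinel per worker lets stopDownloader shut every thread down cleanly after the queue drains.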