This article collects typical usage examples of the Crawler.Crawler.crawl method in Python. If you are wondering how Crawler.crawl is used in practice, the curated code examples below should help. You can also look further into usage examples of the class this method belongs to, Crawler.Crawler.
The sections below show 12 code examples of the Crawler.crawl method, sorted by popularity by default.
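
Note that the Crawler classes referenced in these examples come from different projects and do not share a single constructor or implementation; all they have in common is that the class exposes a crawl() entry point. As a rough orientation only, a minimal breadth-first crawler of that general shape might look like the sketch below. Every name in it (including the use of requests) is an illustrative assumption, not the API of any example that follows.

from collections import deque
from urllib.parse import urljoin
import re

import requests  # assumed HTTP client for this sketch only

class Crawler:
    """Illustrative sketch; every example below ships its own, different Crawler."""

    def __init__(self, root, depth_limit=2):
        self.root = root
        self.depth_limit = depth_limit
        self.visited = set()

    def crawl(self):
        queue = deque([(self.root, 0)])
        while queue:
            url, depth = queue.popleft()
            if url in self.visited or depth > self.depth_limit:
                continue
            self.visited.add(url)
            try:
                html = requests.get(url, timeout=10).text
            except requests.RequestException:
                continue
            # Enqueue every link found on the page, one level deeper.
            for link in re.findall(r'href="([^"#]+)"', html):
                queue.append((urljoin(url, link), depth + 1))
        return self.visited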
Example 1: run

# Required import: from Crawler import Crawler [as alias]
# Or: from Crawler.Crawler import crawl [as alias]
def run(self):
    robot_url = "http://allrecipes.com/"
    root = 'http://allrecipes.com/Recipes/ViewAll.aspx?Page=1'
    depth_limit = 5
    confine_reg = ['http://allrecipes.com/Recipes/ViewAll.aspx\?Page\=[0-9]*$',
                   'http://allrecipes.com/Recipe/[a-zA-Z0-9\-]*/Detail.aspx$']
    c = Crawler(root, depth_limit, confine_reg, robot_url)
    c.crawl()
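
The confine_reg patterns above restrict the crawl to the paginated listing pages and the individual recipe detail pages of allrecipes.com. How this Crawler applies them internally is not shown; a minimal sketch of such URL confinement, using a hypothetical is_allowed() helper, could be:

import re

def is_allowed(url, confine_reg):
    # Hypothetical helper: keep a URL only if it matches at least one
    # of the confinement patterns (anchored with $ exactly as above).
    return any(re.match(pattern, url) for pattern in confine_reg)

confine_reg = [r'http://allrecipes.com/Recipes/ViewAll.aspx\?Page\=[0-9]*$',
               r'http://allrecipes.com/Recipe/[a-zA-Z0-9\-]*/Detail.aspx$']
print(is_allowed('http://allrecipes.com/Recipes/ViewAll.aspx?Page=2', confine_reg))  # True
print(is_allowed('http://allrecipes.com/cooks/profile.aspx', confine_reg))           # False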
Example 2: main

# Required import: from Crawler import Crawler [as alias]
# Or: from Crawler.Crawler import crawl [as alias]
def main(self):
    if self.config.has_option("sources", "bootstrap"):
        self.bootstrap(
            filename = self.config.get("sources", "bootstrap")
        )
    b = Base(
        endpoint = self.config.get("xserver", "endpoint"),
        base = self.config.get("xserver", "base")
    )
    c = Crawler(base = b)
    c.crawl(callback = self.callback)
    self.processCache()
    self.addTopConcepts()
    self.addLinks()
    self.write()
    self.writeTables()
    shutil.rmtree("temp")
Example 3: crawl

# Required import: from Crawler import Crawler [as alias]
# Or: from Crawler.Crawler import crawl [as alias]
def crawl(max_page):
    text.delete('1.0', END)
    text.insert(END, 'Currently Crawling Please Wait\n')
    search_engine.update()
    count = int(max_page)
    while len(Crawler.queue) > 0 and count > 0:
        queue = str(Crawler.queue.pop())
        Crawler.crawl(queue)
        count -= 1
        text.insert(END, 'Currently Crawling: ' + queue + '\n')
        search_engine.update()
    print('Crawl Finished Can Now Search')
    text.delete('1.0', END)
    text.insert(END, 'Crawl Finished Can Now Search\n')
    text.insert(END, str(len(Crawler.crawled)) + " Url's have been Crawled and Indexed \n")
    text.insert(END, str(len(Crawler.queue)) + " Total Number of Url's In Queue\n")
    search_engine.update()
    Crawler.save_lists()
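
This example drives a Crawler whose state (queue, crawled) and methods (crawl(), save_lists()) live at class level rather than on instances, so the GUI callback can use them without holding an object. A rough sketch of that design follows; only queue, crawled, crawl() and save_lists() appear in the original, and the fetch_links() helper and file names are assumptions.

import re
import urllib.request

def fetch_links(url):
    # Hypothetical helper: return the set of absolute links found on a page.
    try:
        html = urllib.request.urlopen(url, timeout=10).read().decode('utf-8', 'ignore')
    except OSError:
        return set()
    return set(re.findall(r'href="(http[^"#]+)"', html))

class Crawler:
    # Class-level state shared by every caller, as the GUI code above assumes.
    queue = set()
    crawled = set()

    @classmethod
    def crawl(cls, url):
        # Fetch one page, remember it, and queue any newly discovered links.
        cls.crawled.add(url)
        cls.queue |= fetch_links(url) - cls.crawled

    @classmethod
    def save_lists(cls):
        # Persist both sets so a later run (or the search side) can reuse them.
        with open('queue.txt', 'w') as f:
            f.write('\n'.join(cls.queue))
        with open('crawled.txt', 'w') as f:
            f.write('\n'.join(cls.crawled))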
Example 4: test_crawl_limit

# Required import: from Crawler import Crawler [as alias]
# Or: from Crawler.Crawler import crawl [as alias]
def test_crawl_limit(self):
    c = Crawler("http://a.com")
    c.SLEEP_TIME = 0
    def side_effect():
        c.process_q.pop(0)
    c._process_next_url = mock.Mock(side_effect=side_effect)
    c.render_sitemap = mock.Mock()
    c.URL_LIMIT = 10
    c.process_q = ["test"] * 5
    c.crawl()
    self.assertEqual(c._process_next_url.call_count, 5)
    c._process_next_url.call_count = 0
    c.process_q = ["test"] * 10
    c.URL_LIMIT = 5
    c.crawl()
    self.assertEqual(c._process_next_url.call_count, 5)
    c._process_next_url.call_count = 0
    c.process_q = ["test"] * 10
    c.URL_LIMIT = float("inf")
    c.crawl()
    self.assertEqual(c._process_next_url.call_count, 10)
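
The test above expects crawl() to keep processing the queue until it is empty or URL_LIMIT entries have been handled, sleeping SLEEP_TIME between URLs and rendering a sitemap at the end. A plausible shape for the loop under test, reconstructed from those attribute names only (the real implementation is not shown in this example):

import time

class Crawler:
    SLEEP_TIME = 1
    URL_LIMIT = float("inf")

    def __init__(self, root):
        self.root = root
        self.process_q = [root]

    def _process_next_url(self):
        # In the real class this would fetch the next URL and extend process_q;
        # the test replaces it with a mock whose side effect just pops the queue.
        self.process_q.pop(0)

    def render_sitemap(self):
        pass  # stubbed out by the test

    def crawl(self):
        processed = 0
        # Stop when the queue is empty or URL_LIMIT URLs have been processed.
        while self.process_q and processed < self.URL_LIMIT:
            self._process_next_url()
            processed += 1
            time.sleep(self.SLEEP_TIME)
        self.render_sitemap()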
Example 5: scrape_documents

# Required import: from Crawler import Crawler [as alias]
# Or: from Crawler.Crawler import crawl [as alias]
def scrape_documents(min_count=0):
    doc_count = 0
    s = Crawler()
    docs = s.crawl(min_count)
    while min_count <= 0 or doc_count < min_count:
        for doc in docs:
            log.debug('uploaded image doc from %s', doc.url)
            doc_count += 1
            if doc_count % 100 == 0:
                log.info('%d images and counting...', doc_count)
            yield doc
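
Because scrape_documents() is a generator, callers pull documents lazily and can stop at any point. A hedged usage sketch, assuming the function above is importable together with its Crawler and log dependencies (index_image() is a made-up placeholder):

def index_image(doc):
    # Made-up placeholder for whatever the caller does with each crawled document.
    print('indexing', doc.url)

for doc in scrape_documents(min_count=500):
    index_image(doc)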
Example 6: getWebPage

# Required import: from Crawler import Crawler [as alias]
# Or: from Crawler.Crawler import crawl [as alias]
def getWebPage(self, URL, depth):
    '''
    Retrieve all the text data from a webpage or set of webpages.
    @param URL: URL which is going to be the source
    @param depth: the depth of the links from the URL which should be searched,
                  default = 0
    @return: string of all text from all webpages.
    '''
    if int(depth) != 0:
        t = ""
        crawler = Crawler(URL, int(depth)-1)
        crawler.crawl()
        for l in crawler.links_remembered:
            text = self.Alchemy.URLGetText(str(l.dst))
            element = ET.XML(text)
            t += element.findtext("text")
    else:
        text = self.Alchemy.URLGetText(URL)
        element = ET.XML(text)
        t = element.findtext("text")
    return t.encode('ascii','ignore')
Example 7: test_crawl_inline

# Required import: from Crawler import Crawler [as alias]
# Or: from Crawler.Crawler import crawl [as alias]
def test_crawl_inline(self):
    path_rules = {
        "start": "./",
        "file": {
            "include": ["\\.py$"]
        }
    }
    rules = {
        "search_author": {
            "include": "author",
            "result": {
                "author": "author[\\s_]+=\s+'([\\w\\s]+)'"
            }
        }
    }
    result = {
        "BUILT-IN": ["FILENAME"]
    }
    output = None
    crawl_res_sync = Crawler.crawl(path_rules, rules, result, output)
    current_test_file = "./test/test_crawler.py"
    self.assertIsNotNone(crawl_res_sync.get(current_test_file))
    self.assertTrue("matches" in crawl_res_sync[current_test_file] and len(crawl_res_sync[current_test_file]) > 0)
    self.assertEqual(crawl_res_sync[current_test_file]["matches"]["search_author"]["author"][0], __author__)
Example 8: cralwer

# Required import: from Crawler import Crawler [as alias]
# Or: from Crawler.Crawler import crawl [as alias]
import argparse
import os.path

parser = argparse.ArgumentParser(description='Crawl files and execute regex rules on them')
parser.add_argument('-p', metavar='ParameterFilePath', type=argparse.FileType('r'), required=True,
                    help="path to a parameter json file. Parameter file should contain a 'crawling', 'rules' and 'result' key")
parser.add_argument('-o', metavar='OutputFilePath', type=argparse.FileType('w+'),
                    help='output file. This argument is required if no output is specified in the parameter file.\n The file must be either a .csv or .json')
parser.add_argument('-mt', metavar='Thread Numbers', type=int,
                    help='run a multi-threaded crawler (1 thread per file) and specify the number of concurrent threads')
parser.add_argument('-s', metavar='StartDirectory', type=str,
                    help='directory in which the crawling will start. This parameter is necessary if there is no "crawling" dictionary in the parameter file')
args = parser.parse_args()
if "p" not in args or args.p is None:
    parser.error(parser.format_usage())
param = FO.get_from_JSON_file(args.p.name)
if "rules" not in param or ("o" not in args and "output" not in param):
    print("rules error")
    parser.error(parser.format_usage())
if "crawling" not in param and ("s" not in args or args.s is None):
    parser.error(parser.format_usage())
elif "s" in args and args.s is not None:
    param["crawling"] = { "start": args.s}
if "o" in args and args.o is not None:
    output_name, output_extension = os.path.splitext(args.o.name)
    param["output"] = {
        "path": args.o.name,
        "type": "csv" if ".csv" in output_extension else "json"
    }
if "mt" in args and args.mt is not None:
    Crawler.crawl_multithread(param.get("crawling"), param.get("rules"), param.get("result"), param["output"], args.mt)
else:
    Crawler.crawl(param.get("crawling"), param.get("rules"), param.get("result"), param["output"])
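
The -p argument points to a JSON parameter file that combines the 'crawling', 'rules', 'result' and (optionally) 'output' keys seen inline in Example 7. The exact schema is not documented here, but a hypothetical minimal file, generated from Python for illustration, might look like this:

import json

# Hypothetical parameter file matching the keys used in Examples 7, 8 and 12.
params = {
    "crawling": {"start": "./", "file": {"include": ["\\.py$"]}},
    "rules": {
        "search_author": {
            "include": "author",
            "result": {"author": "author[\\s_]+=\\s+'([\\w\\s]+)'"}
        }
    },
    "result": {"BUILT-IN": ["FILENAME"]},
    "output": {"path": "./crawl_result.json", "type": "json"}
}

with open("search_parameters.json", "w") as f:
    json.dump(params, f, indent=2)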
Example 9: main

# Required import: from Crawler import Crawler [as alias]
# Or: from Crawler.Crawler import crawl [as alias]
def main():
    # Python 2 example: raw_input was renamed input in Python 3.
    urls = raw_input("\n Pages to crawl: ")
    maxLinksToCrawl = int(raw_input(" Maximum amount of links to crawl: "))
    crawler = Crawler(urls, maxLinksToCrawl)
    crawler.crawl()
Example 10: Crawler

# Required import: from Crawler import Crawler [as alias]
# Or: from Crawler.Crawler import crawl [as alias]
# Written by Kevin Keraudren, 14/06/2011
# (Python 2 script: the print statements below use the Python 2 syntax.)
import argparse
from Crawler import Crawler

parser = argparse.ArgumentParser(
    usage = "Usage: %(prog)s seed_url [options]" )
parser.add_argument(
    'seed',
    metavar='seed_url',
    help='url for starting the crawl' )
parser.add_argument(
    '--dir',
    default='./',
    help="root directory to store the result of the crawl" )
parser.add_argument(
    '--verbose',
    action="store_true", default=True,
    help="verbose mode" )
args = parser.parse_args()

crawler = Crawler( args.seed, rootdir=args.dir, verbose=args.verbose )
print crawler
crawler.crawl()
print crawler
print "Crawl complete"
Example 11: isinstance

# Required import: from Crawler import Crawler [as alias]
# Or: from Crawler.Crawler import crawl [as alias]
        for regex, tags in self.regexes_tags:
            if(regex.match(resource)):
                auto_tags.extend(tags)
        resources_tags.append((resource, auto_tags))
    assert isinstance(resources_tags, list)
    return resources_tags

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~~ Main ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
if __name__ == '__main__':
    from Crawler import Crawler
    crawler = Crawler(white_list=set((
        '..',
    )))
    auto_tagger = AutoTagger({
        r'^.*\.py$': ['python', 'development'],
        r'^.*\.css$': ['css', 'development'],
        r'^.*\.js$': ['javascript', 'development'],
    })
    for resource, tags in auto_tagger.process(crawler.crawl()):
        print(resource, tags)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Example 12: test_crawl_fake_directCrawl

# Required import: from Crawler import Crawler [as alias]
# Or: from Crawler.Crawler import crawl [as alias]
def test_crawl_fake_directCrawl(self):
    parameters = FileOperations.get_from_JSON_file("./test/search_parameters.json")
    Crawler.crawl(parameters["crawling"], parameters["rules"], parameters["result"], parameters["output"])
    self.assertTrue(os.path.isfile(parameters["output"]["path"]))
    result_from_file = FileOperations.get_from_JSON_file(parameters["output"]["path"])
    self.assertEqual(len(result_from_file), 3)