

Python Crawler.crawl Method Code Examples

This article collects typical usage examples of the Crawler.Crawler.crawl method in Python. If you are wondering what exactly the Python Crawler.crawl method does, how to call it, or what example code for it looks like, the hand-picked examples here may help. You can also explore further usage examples of the class it belongs to, Crawler.Crawler.


The following shows 12 code examples of the Crawler.crawl method, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
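Before working through the individual examples, here is a minimal usage sketch of the pattern most of them share. It is only an illustration and assumes a hypothetical Crawler class whose constructor takes a seed URL and whose crawl() method walks the pages reachable from it; the actual constructor arguments differ from project to project, as the examples below show.

# Minimal usage sketch (hypothetical; constructor signatures vary by project)
from Crawler import Crawler

crawler = Crawler("http://example.com")  # seed URL to start from
crawler.crawl()                          # fetch the seed page and follow its links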

Example 1: run

# Required import: from Crawler import Crawler [as alias]
# Or: from Crawler.Crawler import crawl [as alias]
 def run(self):
     robot_url = "http://allrecipes.com/"
     root = 'http://allrecipes.com/Recipes/ViewAll.aspx?Page=1'
     depth_limit = 5
      confine_reg = [r'http://allrecipes.com/Recipes/ViewAll\.aspx\?Page=[0-9]*$', r'http://allrecipes.com/Recipe/[a-zA-Z0-9\-]*/Detail\.aspx$']
     c = Crawler(root, depth_limit,confine_reg,robot_url)  
     c.crawl()     
Developer: ggppwx, Project: webCrawler, Lines: 9, Source file: Thread.py

Example 2: main

# Required import: from Crawler import Crawler [as alias]
# Or: from Crawler.Crawler import crawl [as alias]
 def main(self):
   if self.config.has_option("sources", "bootstrap"):
     self.bootstrap(
       filename = self.config.get("sources", "bootstrap")
     )
   b = Base(
     endpoint = self.config.get("xserver", "endpoint"),
     base = self.config.get("xserver", "base")
   )
   c = Crawler(base = b)
   c.crawl(callback = self.callback)
   self.processCache()
   self.addTopConcepts()
   self.addLinks()
   self.write()
   self.writeTables()
   shutil.rmtree("temp")
Developer: jindrichmynarz, Project: Skosify, Lines: 19, Source file: Skosify.py

Example 3: crawl

# Required import: from Crawler import Crawler [as alias]
# Or: from Crawler.Crawler import crawl [as alias]
    def crawl(max_page):
        text.delete('1.0', END)
        text.insert(END, 'Currently Crawling Please Wait\n')
        search_engine.update()

        count = int(max_page)
        while len(Crawler.queue) > 0 and count > 0:
            queue = str(Crawler.queue.pop())
            Crawler.crawl(queue)
            count -= 1
            text.insert(END, 'Currently Crawling: ' + queue + '\n')
            search_engine.update()

        print('Crawl Finished Can Now Search')
        text.delete('1.0', END)
        text.insert(END, 'Crawl Finished Can Now Search\n')
        text.insert(END, str(len(Crawler.crawled)) + " Url's have been Crawled and Indexed \n")
        text.insert(END, str(len(Crawler.queue)) + " Total Number of Url's In Queue\n")
        search_engine.update()

        Crawler.save_lists()
Developer: Gunn3r1995, Project: Search, Lines: 23, Source file: main.py
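The crawl() function above relies on several module-level names the excerpt does not show: text is a Tkinter Text widget, END comes from the tkinter module, search_engine is the Tk root window whose update() call keeps the GUI responsive, and Crawler stores its queue and crawled sets as class attributes. A rough sketch of the surrounding setup, written here purely as an assumption about what the full main.py provides, might look like this:

# Hypothetical surrounding setup assumed by the excerpt above (not taken from the original source)
from tkinter import Tk, Text, END
from Crawler import Crawler

search_engine = Tk()           # main window; update() is called to refresh the GUI
text = Text(search_engine)     # output area that crawl() writes progress messages into
text.pack()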

Example 4: test_crawl_limit

# Required import: from Crawler import Crawler [as alias]
# Or: from Crawler.Crawler import crawl [as alias]
    def test_crawl_limit(self):
        c = Crawler("http://a.com")
        c.SLEEP_TIME = 0

        def side_effect():
            c.process_q.pop(0)
        c._process_next_url = mock.Mock(side_effect=side_effect)
        c.render_sitemap = mock.Mock()

        c.URL_LIMIT = 10
        c.process_q = ["test"] * 5
        c.crawl()
        self.assertEqual(c._process_next_url.call_count, 5)

        c._process_next_url.call_count = 0
        c.process_q = ["test"] * 10
        c.URL_LIMIT = 5
        c.crawl()
        self.assertEqual(c._process_next_url.call_count, 5)

        c._process_next_url.call_count = 0
        c.process_q = ["test"] * 10
        c.URL_LIMIT = float("inf")
        c.crawl()
        self.assertEqual(c._process_next_url.call_count, 10)
Developer: vhamid, Project: crawler-example, Lines: 27, Source file: test_crawler.py
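Taken together, these assertions pin down the contract of crawl(): keep processing queued URLs until the queue is empty or URL_LIMIT has been reached, whichever comes first. A loop with roughly that behavior, shown here only to illustrate what the test expects rather than the project's actual implementation, could look like:

# Illustrative crawl loop matching the behavior the assertions above check (not the project's real code)
def crawl(self):
    processed = 0
    while self.process_q and processed < self.URL_LIMIT:
        self._process_next_url()   # expected to pop one URL from process_q and handle it
        processed += 1
    self.render_sitemap()          # the test also mocks this, so crawl() presumably calls it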

Example 5: scrape_documents

# Required import: from Crawler import Crawler [as alias]
# Or: from Crawler.Crawler import crawl [as alias]
def scrape_documents(min_count=0):

    doc_count = 0

    s = Crawler()
    docs = s.crawl(min_count)

    while min_count <= 0 or doc_count < min_count:
        for doc in docs:
            log.debug('uploaded image doc from %s', doc.url)
            doc_count += 1
            if doc_count % 100 == 0:
                log.info('%d images and counting...', doc_count)
            yield doc
Developer: Sebelino, Project: ir13-project, Lines: 16, Source file: Upload.py
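Because scrape_documents() is a generator, callers consume it lazily. A short, illustrative use (assuming, as the logging above suggests, that each yielded doc exposes a url attribute) could be:

# Illustrative consumption of the generator above
for doc in scrape_documents(min_count=500):
    print(doc.url)   # process each crawled document as it is yielded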

Example 6: getWebPage

# Required import: from Crawler import Crawler [as alias]
# Or: from Crawler.Crawler import crawl [as alias]
 def getWebPage(self, URL, depth):
      '''
      Retrieve all the text data from one or more webpages.

      @param URL: the URL which is going to be the source
      @param depth: the depth of links from the URL which should be searched;
      default = 0

      @return: string of all text from all webpages.
      '''
     if int(depth) != 0:
         t = ""
         crawler = Crawler(URL, int(depth)-1)
         crawler.crawl()
         for l in crawler.links_remembered:
             text = self.Alchemy.URLGetText(str(l.dst))     
             element = ET.XML(text)
             t += element.findtext("text")
     else:
         text = self.Alchemy.URLGetText(URL)     
         element = ET.XML(text)
         t = element.findtext("text")
     return t.encode('ascii','ignore')
Developer: danjamker, Project: N-Fly, Lines: 25, Source file: GetData.py

Example 7: test_crawl_inline

# Required import: from Crawler import Crawler [as alias]
# Or: from Crawler.Crawler import crawl [as alias]
 def test_crawl_inline(self):
     path_rules = {
         "start": "./",
         "file": {
             "include": ["\\.py$"]
         }
     }
     rules = {
         "search_author": {
             "include": "author",
             "result": {
                 "author": "author[\\s_]+=\s+'([\\w\\s]+)'"
             }
         }
     }
     result = {
         "BUILT-IN": ["FILENAME"]
     }
     output = None
     crawl_res_sync = Crawler.crawl(path_rules, rules, result, output)
     current_test_file = "./test/test_crawler.py"
     self.assertIsNotNone(crawl_res_sync.get(current_test_file))
     self.assertTrue("matches" in crawl_res_sync[current_test_file] and len(crawl_res_sync[current_test_file]) > 0)
     self.assertEqual(crawl_res_sync[current_test_file]["matches"]["search_author"]["author"][0], __author__)
Developer: glebedel, Project: FileCrawler, Lines: 26, Source file: test_crawler.py

Example 8: crawler

# Required import: from Crawler import Crawler [as alias]
# Or: from Crawler.Crawler import crawl [as alias]
import argparse
import os.path

# Note: the original excerpt omits these imports; FileOperations (here aliased as FO)
# and Crawler are modules from the FileCrawler project itself (aliasing assumed).
import FileOperations as FO
from Crawler import Crawler

parser = argparse.ArgumentParser(description='Crawl file and execute regex rules on them')
parser.add_argument('-p', metavar='ParameterFilePath', type=argparse.FileType('r'), required=True,
                   help="path to a parameter json file. Parameter file should contain a 'crawling', 'rules' and 'result' key")
parser.add_argument('-o', metavar='OutputFilePath', type=argparse.FileType('w+'), help='output file. This argument is required if no output is specified in parameter file.\n The file must be either a .csv or .json')
parser.add_argument('-mt', metavar='Thread Numbers', type=int, help='run a multi-threaded crawler (1 thread per file); specifies the number of concurrent threads')
parser.add_argument('-s', metavar='StartDirectory', type=str, help='directory in which the crawling will start. This parameter is necessary if there is no "crawling" dictionary in the parameter file')

args = parser.parse_args()
if "p" not in args or args.p is None:
    parser.error(parser.format_usage())
param = FO.get_from_JSON_file(args.p.name)
if "rules" not in param or ("o" not in args and "output" not in param):
    print("rules error")
    parser.error(parser.format_usage())
if "crawling" not in param and ("s" not in args or args.s is None):
    parser.error(parser.format_usage())
elif "s" in args and args.s is not None:
    param["crawling"] = { "start": args.s}
if "o" in args and args.o is not None:
    output_name, output_extension = os.path.splitext(args.o.name)
    param["output"] = {
        "path": args.o.name,
        "type": "csv" if ".csv" in output_extension else "json"
    }
if "mt" in args and args.mt is not None:
    Crawler.crawl_multithread(param.get("crawling"), param.get("rules"), param.get("result"), param["output"], args.mt)
else:
    Crawler.crawl(param.get("crawling"), param.get("rules"),  param.get("result"), param["output"])
Developer: glebedel, Project: FileCrawler, Lines: 32, Source file: filecrawler.py
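The -p argument points to a parameter JSON file whose schema is defined by the FileCrawler project. Based only on the keys this script and Examples 7 and 12 reference, a hypothetical minimal parameter file, shown here as the Python dict that json.load() would return, might look like this (all values are illustrative):

# Hypothetical parameter file contents (keys inferred from the argparse help text and Examples 7 and 12)
param = {
    "crawling": {"start": "./src"},                # where the file crawl begins
    "rules": {
        "search_author": {
            "include": "author",
            "result": {"author": "author[\\s_]+=\\s+'([\\w\\s]+)'"}
        }
    },
    "result": {"BUILT-IN": ["FILENAME"]},          # built-in fields to include in the output
    "output": {"path": "./results.json", "type": "json"}
}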

Example 9: main

# Required import: from Crawler import Crawler [as alias]
# Or: from Crawler.Crawler import crawl [as alias]
def main():
	urls = raw_input("\n Pages to crawl: ")
	maxLinksToCrawl = int(raw_input(" Maximum amount of links to crawl: "))
	
	crawler = Crawler(urls, maxLinksToCrawl)
	crawler.crawl()
Developer: helgso, Project: WebCrawler, Lines: 8, Source file: main.py

Example 10: Crawler

# Required import: from Crawler import Crawler [as alias]
# Or: from Crawler.Crawler import crawl [as alias]
# Written by Kevin Keraudren, 14/06/2011

import argparse

from Crawler import Crawler

parser = argparse.ArgumentParser(
    usage = "Usage: %(prog)s seed_url [options]" )
parser.add_argument(
    'seed',
    metavar='seed_url',        
    help='url for starting the crawl' )
parser.add_argument(
    '--dir',
    default='./',
    help="root directory to store the result of the crawl" )
parser.add_argument(
    '--verbose',
    action="store_true", default=True,
    help="verbose mode" ) 

args = parser.parse_args()

crawler = Crawler( args.seed, rootdir=args.dir, verbose=args.verbose )

print crawler
crawler.crawl()
print crawler
print "Crawl complete"

Developer: kevin-keraudren, Project: python-crawler, Lines: 31, Source file: crawl.py

Example 11: isinstance

# Required import: from Crawler import Crawler [as alias]
# Or: from Crawler.Crawler import crawl [as alias]
            for regex, tags in self.regexes_tags:
                if(regex.match(resource)):
                    auto_tags.extend(tags)

            resources_tags.append((resource, auto_tags))

        assert isinstance(resources_tags, list)

        return resources_tags
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


# ~~ Main ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
if __name__ == '__main__':
    from Crawler import Crawler

    crawler = Crawler(white_list=set((
        '..',
    )))

    auto_tagger = AutoTagger({
        r'^.*\.py$': ['python', 'development'],
        r'^.*\.css$': ['css', 'development'],
        r'^.*\.js$': ['javascript', 'development'],
    })

    for resource, tags in auto_tagger.process(crawler.crawl()):
        print(resource, tags)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Developer: EPadronU, Project: omnitag-client, Lines: 31, Source file: AutoTagger.py

Example 12: test_crawl_fake_directCrawl

# Required import: from Crawler import Crawler [as alias]
# Or: from Crawler.Crawler import crawl [as alias]
 def test_crawl_fake_directCrawl(self):
     parameters = FileOperations.get_from_JSON_file("./test/search_parameters.json")
     Crawler.crawl(parameters["crawling"], parameters["rules"], parameters["result"], parameters["output"])
     self.assertTrue(os.path.isfile(parameters["output"]["path"]))
     result_from_file = FileOperations.get_from_JSON_file(parameters["output"]["path"])
     self.assertEqual(len(result_from_file), 3)
Developer: glebedel, Project: FileCrawler, Lines: 8, Source file: test_crawler.py


Note: The Crawler.Crawler.crawl method examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets were selected from open-source projects contributed by many developers; copyright of the source code remains with the original authors. Please consult each project's license before distributing or using the code, and do not reproduce this article without permission.