本文整理汇总了PHP中Crawler::start方法的典型用法代码示例。如果您正苦于以下问题:PHP Crawler::start方法的具体用法?PHP Crawler::start怎么用?PHP Crawler::start使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类Crawler
的用法示例。
在下文中一共展示了Crawler::start方法的5个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的PHP代码示例。
示例1: testCrawl
/**
 * Smoke test for a full crawl run: logs into the test account, takes its
 * first collection, and crawls it with the page count capped at 10.
 */
public function testCrawl()
{
    $collection = Account::login("searchzen.org", "test")->collections[0];
    $limitedCrawler = new Crawler($collection);
    $limitedCrawler->pageLimit = 10;
    $limitedCrawler->start();
}
示例2: foreach
}
foreach ($links as $link) {
$this->processLink($link);
}
}
// Sleep until we're guaranteed to have something to crawl, but no less than 1 second.
$sleepTime = 1 + max(0, min($this->accessTimes) + Config::get('crawler.t_wait') - time());
Applog::log("Sleeping for {$sleepTime} seconds");
sleep($sleepTime);
}
}
/**
 * Entry point: seeds the Link table with every configured whiteList URL,
 * then enters the crawl loop. Seed URLs are not discovered from any page,
 * so their crawledPageId is recorded as 0.
 */
function start()
{
    Applog::log("Crawler started");
    $whiteList = Config::get('crawler.whiteList');
    foreach ($whiteList as $seedUrl) {
        // Normalize the URL (directory index / extension handling) before storing it.
        $cleanUrl = StringUtil::urlCleanup($seedUrl, $this->directoryIndexFile, $this->indexFileExt);
        $parts = StringUtil::parseUtf8Url($cleanUrl);
        Link::saveLink2DB($cleanUrl, $parts['host'], 0);
    }
    $this->crawlLoop();
}
}
/*
 * Bootstrap guard: only instantiate and run the crawler when this file is
 * the script actually being requested.
 */
// strpos(...) !== false instead of strstr(): we only need an existence
// check, so there is no reason to build the matched substring.
if (strpos($_SERVER['SCRIPT_NAME'], 'Crawler.php') !== false) {
    $obj = new Crawler();
    $obj->start();
}
示例3: crawl
/**
 * Starts a crawl for the account that owns the given domain.
 */
public function crawl($domain)
{
    $accountId = $this->getAccountId($domain);
    $crawler = new Crawler($accountId);
    $crawler->start();
}
示例4: Crawler
<?php
/**
 * Author: Will Smelser
 * Date: 1/10/14
 * Time: 11:09 PM
 * Project: openProjects
 */
// Surface every notice/warning while running this example.
error_reporting(E_ALL);
include 'required/class/Crawler.php';
// NOTE(review): the constructor's argument meanings are not visible here —
// presumably (target site URL, link-loader script URL, a boolean flag, then
// numeric limits such as max pages / depth / concurrency / timeout).
// Confirm against the Crawler class signature before changing the values.
$crawler = new Crawler('http://simple-seo-api.local', 'http://openprojects.local/crawler/required/loaders/Links.php', true, 999, 10, 10, 30);
// Whatever start() returns is dumped for inspection.
$result = $crawler->start();
var_dump($result);
示例5: runCrawler
/**
 * Runs the crawler with the current $SETTINGS and merges the discovered
 * URLs into $FILE, which is returned.
 *
 * Resumes a previously timed-out crawl when PSNG_TIMEOUT_TODO is present;
 * otherwise starts a fresh crawl seeded with the configured URL. If the
 * crawler hits its deadline before finishing, its todo/done/file state is
 * stashed back into $SETTINGS so a later call can resume.
 *
 * TODO update to new stuff
 *
 * @param array $FILE        existing results keyed by URL; extended and returned
 * @param array $FILES_CACHE cache consulted by handleURLCached()
 * @return array the (possibly extended) $FILE map
 */
function runCrawler($FILE, $FILES_CACHE)
{
    global $SETTINGS, $LAYOUT;
    // Prefer an explicitly configured crawler URL when it differs from the website URL.
    $urlToCrawl = isset($SETTINGS[PSNG_CRAWLER_URL]) && $SETTINGS[PSNG_CRAWLER_URL] != $SETTINGS[PSNG_WEBSITE] ? $SETTINGS[PSNG_CRAWLER_URL] : $SETTINGS[PSNG_WEBSITE];
    $url = parse_url($urlToCrawl);
    // parse_url() omits the 'path' key entirely when the URL has no path,
    // so default it to '' to avoid an undefined-index notice and an
    // undefined $path below.
    $path = isset($url['path']) ? $url['path'] : '';
    if (substr($urlToCrawl, -1) != '/' && $path == '') {
        $path .= '/';
        $urlToCrawl .= '/';
    }
    // check if we have a already started scan
    debug($SETTINGS[PSNG_TIMEOUT], 'PSNG_TIMEOUT');
    if (isset($SETTINGS[PSNG_TIMEOUT_TODO])) {
        debug($SETTINGS[PSNG_TIMEOUT_TODO], 'PSNG_TIMEOUT_TODO');
    }
    # !!! 'repair' may not be correct mk/2005-11-08
    if ($SETTINGS[PSNG_TIMEOUT] != PSNG_TIMEOUT_NONE && isset($SETTINGS[PSNG_TIMEOUT_TODO])) {
        // Timeout mode: restore the crawler's saved todo/done/file state
        // and continue from where the previous run stopped.
        debug('', "Running crawler engine from last point");
        $crawler = new Crawler($urlToCrawl, $SETTINGS[PSNG_TIMEOUT_TIME_DEADLINE]);
        $crawler->setTodo($SETTINGS[PSNG_TIMEOUT_TODO]);
        $crawler->setDone($SETTINGS[PSNG_TIMEOUT_DONE]);
        $crawler->setFiles($SETTINGS[PSNG_TIMEOUT_FILE]);
    } else {
        // Fresh crawl: only the start URL is on the todo list.
        $crawler = new Crawler($urlToCrawl, $SETTINGS[PSNG_TIMEOUT_TIME_DEADLINE]);
        $crawler->setTodo(array($urlToCrawl));
    }
    $crawler->setForbiddenKeys($SETTINGS[PSNG_DISALLOW_KEY]);
    $crawler->setForbiddenDirectories($SETTINGS[PSNG_DISALLOW_DIR]);
    $crawler->setForbiddenFiles($SETTINGS[PSNG_DISALLOW_FILE]);
    // Set the directory to forbid the crawler to follow below it
    $crawler->setDirectory($path);
    $crawler->start();
    if (!$crawler->hasFinished()) {
        // Deadline hit: persist progress into the session settings so the
        // next invocation can resume instead of restarting.
        $SETTINGS[PSNG_TIMEOUT_TODO] = $crawler->getTodo();
        $SETTINGS[PSNG_TIMEOUT_DONE] = $crawler->getDone();
        $SETTINGS[PSNG_TIMEOUT_FILE] = $crawler->getFiles();
        $SETTINGS[PSNG_TIMEOUT_ACTION] = PSNG_TIMEOUT_ACTION_WEBSITE;
    } else {
        while ($crawler->hasNext()) {
            // getNext() returns an array describing one crawled URL.
            $fileinfo = $crawler->getNext();
            // Default every field read below to '' so missing keys do not
            // trigger undefined-index notices (replaces five copy-pasted
            // isset blocks).
            foreach (array('http_status', 'file', 'lastmod', 'changefreq', 'priority') as $key) {
                if (!isset($fileinfo[$key])) {
                    $fileinfo[$key] = '';
                }
            }
            $http_status = $fileinfo['http_status'];
            // create and setup valid values
            $fileinfo = handleURL($fileinfo['file'], $fileinfo['lastmod'], $fileinfo['changefreq'], $fileinfo['priority']);
            $fileinfo = handleURLCached($FILES_CACHE, $fileinfo);
            // Mark broken links so the UI can disable and highlight them.
            if ($http_status == "404") {
                $fileinfo[PSNG_FILE_ENABLED] = '';
                $fileinfo[PSNG_HTML_STATUS] = 'class="notfound"';
            }
            // info($fileinfo, 'Fileinfo from crawler');
            // Merge with an existing filesystem-scan entry for the same URL.
            if (array_key_exists($fileinfo[PSNG_FILE_URL], $FILE)) {
                $fileinfo = handleDoubleEntryFilesystemWebsite($FILE[$fileinfo[PSNG_FILE_URL]], $fileinfo);
            }
            // info($fileinfo, 'Fileinfo after handle double entry');
            // Record where the entry was discovered. The original nested
            // if/else assigned WEBSITE in both fallback branches, so the
            // two conditions collapse into one.
            if (isset($fileinfo[PSNG_HTML_SOURCE]) && $fileinfo[PSNG_HTML_SOURCE] == PSNG_HTML_SOURCE_FS) {
                $fileinfo[PSNG_HTML_SOURCE] = PSNG_HTML_SOURCE_FS_WEBSITE;
            } else {
                $fileinfo[PSNG_HTML_SOURCE] = PSNG_HTML_SOURCE_WEBSITE;
            }
            $FILE[$fileinfo[PSNG_FILE_URL]] = $fileinfo;
        }
        $SETTINGS[PSNG_TIMEOUT_ACTION] = '';
    }
    return $FILE;
}