本文整理汇总了PHP中Crawler::setFiles方法的典型用法代码示例。如果您正苦于以下问题：PHP Crawler::setFiles方法的具体用法？PHP Crawler::setFiles怎么用？PHP Crawler::setFiles使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类Crawler的用法示例。
在下文中一共展示了Crawler::setFiles方法的1个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的PHP代码示例。
示例1: runCrawler
/**
 * Runs the crawler against the configured website and merges every crawled
 * URL into $FILE.
 *
 * The start URL is $SETTINGS[PSNG_CRAWLER_URL] when it is set and differs from
 * $SETTINGS[PSNG_WEBSITE]; otherwise $SETTINGS[PSNG_WEBSITE] is used. The path
 * component of that URL is handed to the crawler via setDirectory().
 *
 * When a timeout mode is active and an earlier run left a todo list in
 * $SETTINGS, the crawler is restored from that saved state instead of
 * starting from scratch.
 *
 * If the crawler reports that it has not finished, its todo/done/files state
 * is written back into $SETTINGS, PSNG_TIMEOUT_ACTION is set to
 * PSNG_TIMEOUT_ACTION_WEBSITE, and $FILE is returned untouched. Otherwise
 * every crawled entry is normalised, merged into $FILE (keyed by the entry's
 * PSNG_FILE_URL value) and PSNG_TIMEOUT_ACTION is reset to ''.
 *
 * TODO (from the original author): update to new stuff
 *
 * @param array $FILE        entries collected so far, keyed by URL
 * @param mixed $FILES_CACHE cached entries, passed through to handleURLCached()
 *
 * @return array $FILE, extended with the crawled entries when the crawl finished
 */
function runCrawler($FILE, $FILES_CACHE)
{
    global $SETTINGS;

    // Use the dedicated crawler URL only when it is configured and differs
    // from the website URL.
    $urlToCrawl = (isset($SETTINGS[PSNG_CRAWLER_URL]) && $SETTINGS[PSNG_CRAWLER_URL] != $SETTINGS[PSNG_WEBSITE])
        ? $SETTINGS[PSNG_CRAWLER_URL]
        : $SETTINGS[PSNG_WEBSITE];

    // parse_url() returns false for seriously malformed URLs and leaves the
    // 'path' key out when the URL has no path component, so fall back to ''
    // explicitly instead of reading a possibly missing index.
    $url = parse_url($urlToCrawl);
    $path = (is_array($url) && isset($url['path'])) ? $url['path'] : '';

    // A URL without any path gets a trailing slash so that both the crawl
    // root and the start URL point at the site root.
    if ($path === '' && substr($urlToCrawl, -1) !== '/') {
        $path = '/';
        $urlToCrawl .= '/';
    }

    // check if we have an already started scan
    debug($SETTINGS[PSNG_TIMEOUT], 'PSNG_TIMEOUT');
    if (isset($SETTINGS[PSNG_TIMEOUT_TODO])) {
        debug($SETTINGS[PSNG_TIMEOUT_TODO], 'PSNG_TIMEOUT_TODO');
    }

    $crawler = new Crawler($urlToCrawl, $SETTINGS[PSNG_TIMEOUT_TIME_DEADLINE]);

    // NOTE (mk/2005-11-08, original author): 'repair' may not be correct
    if ($SETTINGS[PSNG_TIMEOUT] != PSNG_TIMEOUT_NONE && isset($SETTINGS[PSNG_TIMEOUT_TODO])) {
        // timeout mode with saved state: continue where the last run stopped
        debug('', "Running crawler engine from last point");
        $crawler->setTodo($SETTINGS[PSNG_TIMEOUT_TODO]);
        $crawler->setDone($SETTINGS[PSNG_TIMEOUT_DONE]);
        $crawler->setFiles($SETTINGS[PSNG_TIMEOUT_FILE]);
    } else {
        // no rerun: start a fresh crawl from the start URL
        $crawler->setTodo(array($urlToCrawl));
    }

    $crawler->setForbiddenKeys($SETTINGS[PSNG_DISALLOW_KEY]);
    $crawler->setForbiddenDirectories($SETTINGS[PSNG_DISALLOW_DIR]);
    $crawler->setForbiddenFiles($SETTINGS[PSNG_DISALLOW_FILE]);
    // Set the directory to forbid the crawler to follow below it
    $crawler->setDirectory($path);
    $crawler->start();

    if (!$crawler->hasFinished()) {
        // Keep the intermediate state in $SETTINGS so a later call can resume
        // (the original comment describes this as storing it in the session).
        $SETTINGS[PSNG_TIMEOUT_TODO] = $crawler->getTodo();
        $SETTINGS[PSNG_TIMEOUT_DONE] = $crawler->getDone();
        $SETTINGS[PSNG_TIMEOUT_FILE] = $crawler->getFiles();
        $SETTINGS[PSNG_TIMEOUT_ACTION] = PSNG_TIMEOUT_ACTION_WEBSITE;

        return $FILE;
    }

    while ($crawler->hasNext()) {
        $fileinfo = $crawler->getNext();

        // Make sure every field read below exists; unset or null becomes ''.
        foreach (array('http_status', 'file', 'lastmod', 'changefreq', 'priority') as $field) {
            if (!isset($fileinfo[$field])) {
                $fileinfo[$field] = '';
            }
        }

        // Remember the status now: $fileinfo is replaced by handleURL() below.
        $http_status = $fileinfo['http_status'];

        // create and setup valid values
        $fileinfo = handleURL($fileinfo['file'], $fileinfo['lastmod'], $fileinfo['changefreq'], $fileinfo['priority']);
        $fileinfo = handleURLCached($FILES_CACHE, $fileinfo);

        // pages answering with 404 are disabled and flagged for the HTML output
        if ($http_status == "404") {
            $fileinfo[PSNG_FILE_ENABLED] = '';
            $fileinfo[PSNG_HTML_STATUS] = 'class="notfound"';
        }

        // the URL is already known (e.g. from the filesystem scan): merge both entries
        if (array_key_exists($fileinfo[PSNG_FILE_URL], $FILE)) {
            $fileinfo = handleDoubleEntryFilesystemWebsite($FILE[$fileinfo[PSNG_FILE_URL]], $fileinfo);
        }

        // Tag the origin: entries that came from the filesystem are now known
        // from both sources, everything else is website-only.
        if (isset($fileinfo[PSNG_HTML_SOURCE]) && $fileinfo[PSNG_HTML_SOURCE] == PSNG_HTML_SOURCE_FS) {
            $fileinfo[PSNG_HTML_SOURCE] = PSNG_HTML_SOURCE_FS_WEBSITE;
        } else {
            $fileinfo[PSNG_HTML_SOURCE] = PSNG_HTML_SOURCE_WEBSITE;
        }

        $FILE[$fileinfo[PSNG_FILE_URL]] = $fileinfo;
    }

    // crawl completed: no pending timeout action any more
    $SETTINGS[PSNG_TIMEOUT_ACTION] = '';

    return $FILE;
}