本文整理汇总了PHP中Zend_Search_Lucene_Document_Html::setExcludeNoFollowLinks方法的典型用法代码示例。如果您正苦于以下问题：PHP Zend_Search_Lucene_Document_Html::setExcludeNoFollowLinks方法的具体用法？PHP Zend_Search_Lucene_Document_Html::setExcludeNoFollowLinks怎么用？PHP Zend_Search_Lucene_Document_Html::setExcludeNoFollowLinks使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类Zend_Search_Lucene_Document_Html的用法示例。
在下文中一共展示了Zend_Search_Lucene_Document_Html::setExcludeNoFollowLinks方法的4个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的PHP代码示例。
示例1: testHtmlNoFollowLinks
/**
 * Verifies that the static exclude-nofollow-links switch controls which
 * links a loaded HTML document reports through getLinks().
 *
 * The switch is static class state, so the value found on entry is put back
 * when the test finishes, even if one of the assertions fails.
 */
public function testHtmlNoFollowLinks()
{
    $html = '<HTML>'
        . '<HEAD><TITLE>Page title</TITLE></HEAD>'
        . '<BODY>'
        . 'Document body.'
        . '<a href="link1.html">Link 1</a>.'
        . '<a href="link2.html" rel="nofollow">Link 1</a>.'
        . '</BODY>'
        . '</HTML>';

    $oldNoFollowValue = Zend_Search_Lucene_Document_Html::getExcludeNoFollowLinks();

    try {
        // Switch off: both the plain and the rel="nofollow" link are reported.
        Zend_Search_Lucene_Document_Html::setExcludeNoFollowLinks(false);
        $doc1 = Zend_Search_Lucene_Document_Html::loadHTML($html);
        $this->assertTrue($doc1 instanceof Zend_Search_Lucene_Document_Html);
        $this->assertEquals(array('link1.html', 'link2.html'), array_values($doc1->getLinks()));

        // Switch on: only the plain link is reported.
        Zend_Search_Lucene_Document_Html::setExcludeNoFollowLinks(true);
        $doc2 = Zend_Search_Lucene_Document_Html::loadHTML($html);
        $this->assertTrue($doc2 instanceof Zend_Search_Lucene_Document_Html);
        $this->assertEquals(array('link1.html'), array_values($doc2->getLinks()));
    } finally {
        // Do not leak the modified static flag into other tests.
        Zend_Search_Lucene_Document_Html::setExcludeNoFollowLinks($oldNoFollowValue);
    }
}
示例2: parseHtml
/**
 * Parses a fetched HTML page: passes its markup (optionally reduced to the
 * configured content areas) to addHtmlToIndex() and forwards every link found
 * on the page to processFoundLink().
 *
 * If checkForCanonical() yields a link that differs from $link, only that
 * canonical link is processed; the page itself is neither indexed nor
 * scanned for further links.
 *
 * @param string $link
 * @param Zend_Http_Response $response
 * @param string $host
 * @param string $protocol
 * @param Zend_Http_CookieJar $cookieJar
 * @param integer $depth
 * @return boolean currently always true
 */
protected function parseHtml($link, $response, $host, $protocol, $cookieJar, $depth)
{
    $html = $response->getBody();
    $canonicalLink = $this->checkForCanonical($html);
    if ($canonicalLink and $canonicalLink != $link) {
        $this->processFoundLink($canonicalLink, $protocol, $host, $link, $depth, $cookieJar);
        logger::debug(get_class($this) . ": Stopping to parse html at [ {$link} ], processing canonical link [ {$canonicalLink} ] instead");
        return true;
    }

    //TODO: robots.txt
    Zend_Search_Lucene_Document_Html::setExcludeNoFollowLinks(true);
    $doc = Zend_Search_Lucene_Document_Html::loadHTML($html, false, "utf-8");
    $links = $doc->getLinks();

    $robotsMeta = $this->getRobotsMetaInfo($html);
    if (in_array("nofollow", $robotsMeta)) {
        //no links to follow
        $links = array();
        logger::debug(get_class($this) . ": not following links on [ {$link} ] because it has robots nofollow");
    }

    if (!in_array("noindex", $robotsMeta)) {
        // Always defined, also when no start indicator is configured
        // (previously an undefined variable was read in that case).
        $documentHasDelimiter = !empty($this->searchStartIndicator)
            && strpos($html, $this->searchStartIndicator) !== false;

        //now limit to search content area if indicators are set and found in this document
        if ($documentHasDelimiter && !empty($this->searchEndIndicator)) {
            //get part before html head starts
            $top = explode("<head>", $html);

            //get html head; fall back to an empty head element when none is matched
            $htmlHead = array();
            preg_match_all('@(<head[^>]*?>.*?</head>)@si', $html, $htmlHead);
            $head = $top[0] . (isset($htmlHead[0][0]) ? $htmlHead[0][0] : "<head></head>");

            //get snippets within allowed content areas
            $htmlSnippets = array();
            $minified = str_replace(array("\r\n", "\r", "\n"), "", $html);
            $minified = preg_replace('@[ \\t\\n\\r\\f]+@', " ", $minified);
            // NOTE(review): both indicators are interpolated into the pattern unescaped,
            // so regex metacharacters or a '%' inside them alter or break the match.
            preg_match_all('%' . $this->searchStartIndicator . '(.*?)' . $this->searchEndIndicator . '%si', $minified, $htmlSnippets);

            $html = $head;
            if (isset($htmlSnippets[0]) && is_array($htmlSnippets[0])) {
                foreach ($htmlSnippets[0] as $snippet) {
                    $html .= " " . $snippet;
                }
            }
            //close html tag
            $html .= "</html>";
        }

        $this->addHtmlToIndex($html, $link, $this->getLanguageFromResponse($response), $this->getEncodingFromResponse($response), $host);
        logger::info(get_class($this) . ": Added to indexer stack [ {$link} ]");
    } else {
        $this->addNoIndexPage($link);
        logger::debug(get_class($this) . ": not indexing [ {$link} ] because it has robots noindex");
    }

    if (count($links) > 0) {
        foreach ($links as $foundLink) {
            $this->processFoundLink($foundLink, $protocol, $host, $link, $depth, $cookieJar);
        }
    } else {
        logger::debug(get_class($this) . ": No links found on page at [ {$link} ] ");
    }

    //TODO: for now we always return true - as success ... are there any unsuccessful states?
    return true;
}
示例3: __construct
/**
 * Turns on the static exclude-nofollow-links switch of
 * Zend_Search_Lucene_Document_Html and stores a configured
 * Zend_Http_Client in $this->_client.
 */
public function __construct()
{
    Zend_Search_Lucene_Document_Html::setExcludeNoFollowLinks(true);

    $clientOptions = array(
        'timeout' => 10,
        'keepalive' => true,
    );

    $this->_client = new Zend_Http_Client();
    $this->_client->setConfig($clientOptions);
}
示例4: parseHtml
/**
 * Collects meta data from a crawled page and hands the page to addHtmlToIndex().
 *
 * Pages carrying a <link rel="canonical"> element or a <meta content="nofollow">
 * element are skipped. When the configured start/end indicators occur in the
 * markup, only the text between them is indexed; text between the exclude
 * indicators is blanked out of each such snippet.
 *
 * @param string $link     URL of the page, used for logging and passed to the indexer
 * @param object $response wrapper exposing getResponse() and getCrawler()
 * @param string $host
 * @return bool FALSE when the page was skipped, TRUE once it was passed to addHtmlToIndex()
 */
private function parseHtml($link, $response, $host)
{
    $resource = $response->getResponse();
    $crawler = $response->getCrawler();
    $html = $resource->getBody();
    $language = $this->getLanguageFromResponse($resource, $html);
    $encoding = $this->getEncodingFromResponse($resource, $html);

    //page has canonical link: do not track!
    if ($crawler->filterXpath('//link[@rel="canonical"]')->count() > 0) {
        \Pimcore\Logger::debug('LuceneSearch: not indexing [ ' . $link . ' ] because it has canonical links');
        return FALSE;
    }

    //page has no follow: do not track!
    // NOTE(review): this XPath only matches a content attribute that is exactly "nofollow";
    // a combined value such as "noindex, nofollow" is not caught here.
    if ($crawler->filterXpath('//meta[@content="nofollow"]')->count() > 0) {
        \Pimcore\Logger::debug('LuceneSearch: not indexing [ ' . $link . ' ] because it has robots nofollow');
        return FALSE;
    }

    // Run each XPath query once and reuse the resulting node list.
    $countryNodes = $crawler->filterXpath('//meta[@name="country"]');
    $titleNodes = $crawler->filterXpath('//title');
    $descriptionNodes = $crawler->filterXpath('//meta[@name="description"]');
    $restrictionNodes = $crawler->filterXpath('//meta[@name="m:groups"]');

    $country = $countryNodes->count() > 0 ? $countryNodes->attr('content') : FALSE;
    $title = $titleNodes->count() > 0 ? $titleNodes->text() : '';
    $description = $descriptionNodes->count() > 0 ? $descriptionNodes->attr('content') : '';
    $restrictions = $restrictionNodes->count() > 0 ? $restrictionNodes->attr('content') : FALSE;

    \Zend_Search_Lucene_Document_Html::setExcludeNoFollowLinks(true);

    //now limit to search content area if indicators are set and found in this document
    $documentHasDelimiter = !empty($this->searchStartIndicator)
        && strpos($html, $this->searchStartIndicator) !== FALSE;

    //remove content between exclude indicators
    $documentHasExcludeDelimiter = !empty($this->searchExcludeStartIndicator)
        && strpos($html, $this->searchExcludeStartIndicator) !== FALSE;

    if ($documentHasDelimiter && !empty($this->searchEndIndicator)) {
        $htmlSnippets = array();
        // NOTE(review): the start/end indicators are interpolated unescaped, so regex
        // metacharacters or a '%' inside them alter or break the pattern.
        preg_match_all('%' . $this->searchStartIndicator . '(.*?)' . $this->searchEndIndicator . '%si', $html, $htmlSnippets);

        // The exclude pattern does not depend on the snippet, so build it once.
        // Passing '#' to preg_quote also escapes the delimiter used below.
        $excludePattern = NULL;
        if ($documentHasExcludeDelimiter && !empty($this->searchExcludeEndIndicator)) {
            $excludePattern = '#(' . preg_quote($this->searchExcludeStartIndicator, '#') . ')(.*?)('
                . preg_quote($this->searchExcludeEndIndicator, '#') . ')#si';
        }

        $html = '';
        if (isset($htmlSnippets[1]) && is_array($htmlSnippets[1])) {
            foreach ($htmlSnippets[1] as $snippet) {
                if ($excludePattern !== NULL) {
                    $snippet = preg_replace($excludePattern, ' ', $snippet);
                }
                $html .= ' ' . $snippet;
            }
        }
    }

    $this->addHtmlToIndex($html, $title, $description, $link, $language, $country, $restrictions, $encoding, $host);
    \Pimcore\Logger::debug('LuceneSearch: Added to indexer stack [ ' . $link . ' ]');
    return TRUE;
}