当前位置: 首页>>代码示例>>PHP>>正文


PHP Zend_Search_Lucene_Document_Html::loadHTML方法代码示例

本文整理汇总了PHP中Zend_Search_Lucene_Document_Html::loadHTML方法的典型用法代码示例。如果您正苦于以下问题:PHP Zend_Search_Lucene_Document_Html::loadHTML方法的具体用法?PHP Zend_Search_Lucene_Document_Html::loadHTML怎么用?PHP Zend_Search_Lucene_Document_Html::loadHTML使用的例子?那么, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在Zend_Search_Lucene_Document_Html的用法示例。


在下文中一共展示了Zend_Search_Lucene_Document_Html::loadHTML方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的PHP代码示例。

示例1: highlightMatches

 /**
  * Runs this object's DOM highlighter over an HTML fragment.
  *
  * The fragment is parsed into a Lucene HTML document, passed to
  * highlightMatchesDOM() together with a colour index that starts at 0,
  * and then serialised back to HTML.
  *
  * @param string $inputHTML HTML source to process
  *
  * @return string whatever getHTML() yields for the processed document
  */
 public function highlightMatches($inputHTML)
 {
     $colorIndex = 0;
     $htmlDocument = Zend_Search_Lucene_Document_Html::loadHTML($inputHTML);
     $this->highlightMatchesDOM($htmlDocument, $colorIndex);

     return $htmlDocument->getHTML();
 }
开发者ID:hackingman,项目名称:TubeX,代码行数:7,代码来源:Query.php

示例2: _indexate

 /**
  * Fetches one page of this site, adds it to the search index and then
  * recurses into the links found on that page.
  *
  * URLs without an http/https scheme are prefixed with HTTP_HOST. A URL is
  * processed only once (tracked in $this->_indexedUrl) and only when it
  * contains HTTP_HOST. Pages that cannot be downloaded are logged and
  * skipped instead of being indexed as empty documents.
  *
  * @param string $url absolute URL, or a path to be appended to HTTP_HOST
  *
  * @return void
  */
 protected function _indexate($url)
 {
     // Only a leading scheme makes the URL absolute; this also keeps
     // https:// URLs from being glued onto HTTP_HOST.
     if (!preg_match('#^https?://#i', $url)) {
         $url = HTTP_HOST . $url;
     }
     // Drop one trailing slash so "/page" and "/page/" count as the same URL.
     if (substr($url, -1) === '/') {
         $url = substr($url, 0, -1);
     }
     // Nothing to do for pages already handled or located outside this site.
     if (in_array($url, $this->_indexedUrl, true) || !stristr($url, HTTP_HOST)) {
         return;
     }
     array_push($this->_indexedUrl, $url);

     $html = file_get_contents($url);
     if ($html === false) {
         Zend_Registry::get('Logger')->info('Search index skipped, page could not be fetched: ' . $url, Zend_Log::INFO);
         return;
     }

     // Silence libxml parse warnings while loading, then put the previous
     // error-handling mode back instead of forcing it off.
     $previousErrorMode = libxml_use_internal_errors(true);
     $doc = Zend_Search_Lucene_Document_Html::loadHTML($html);
     libxml_clear_errors();
     libxml_use_internal_errors($previousErrorMode);

     // When the page carries <!--index--> markers, index only what is between them.
     if (preg_match('/<\\!--index-->(.*)<\\!--\\/index-->/isu', $html, $matches)) {
         $html = $matches[1];
     }
     $html = preg_replace('#<script(.*?)>(.*?)</script>#is', '', $html);
     $html = strip_tags($html);

     $doc->addField(Zend_Search_Lucene_Field::Text('content', $html, 'utf-8'));
     // An empty, unindexed 'body' field is added alongside the cleaned 'content'.
     $doc->addField(Zend_Search_Lucene_Field::UnIndexed('body', '', 'utf-8'));
     $doc->addField(Zend_Search_Lucene_Field::Text('url', $url, 'utf-8'));
     $this->_indexHandle->addDocument($doc);
     Zend_Registry::get('Logger')->info('Search index is created: ' . $url, Zend_Log::INFO);

     foreach ($doc->getLinks() as $link) {
         $parts = explode('.', $link);
         $ext = end($parts);
         // Follow links without any dot, or with a known page extension.
         if ($link === $ext || in_array($ext, array('php', 'html', 'txt', 'htm'), true)) {
             $this->_indexate($link);
         }
     }
 }
开发者ID:kytvi2p,项目名称:ZettaFramework,代码行数:33,代码来源:CronController.php

示例3: testHtml

 /**
  * Checks loading HTML from a string, highlighting a term, and reading the
  * header links and body links of an HTML fixture file.
  */
 public function testHtml()
 {
     $markup = '<HTML><HEAD><TITLE>Page title</TITLE></HEAD><BODY>Document body.</BODY></HTML>';
     $inlineDoc = Zend_Search_Lucene_Document_Html::loadHTML($markup);
     $this->assertTrue($inlineDoc instanceof Zend_Search_Lucene_Document_Html);

     // The highlighted word must show up wrapped in a styled <b> element.
     $inlineDoc->highlight('document', '#66ffff');
     $highlighted = "<b style=\"color:black;background-color:#66ffff\">Document</b> body.";
     $this->assertTrue(false !== strpos($inlineDoc->getHTML(), $highlighted));

     $fixture = dirname(__FILE__) . '/_indexSource/_files/contributing.documentation.html';
     $fileDoc = Zend_Search_Lucene_Document_Html::loadHTMLFile($fixture, true);
     $this->assertTrue($fileDoc instanceof Zend_Search_Lucene_Document_Html);

     $expectedHeaderLinks = array('index.html', 'contributing.html', 'contributing.bugs.html', 'contributing.wishlist.html');
     $this->assertTrue(array_values($fileDoc->getHeaderLinks()) == $expectedHeaderLinks);

     $expectedLinks = array('contributing.bugs.html', 'contributing.wishlist.html', 'developers.documentation.html', 'faq.translators-revision-tracking.html', 'index.html', 'contributing.html');
     $this->assertTrue(array_values($fileDoc->getLinks()) == $expectedLinks);
 }
开发者ID:jon9872,项目名称:zend-framework,代码行数:11,代码来源:DocumentTest.php

示例4: testHtml

 /**
  * Checks loading HTML from a string, the exact markup produced after
  * highlighting a term, and the header links and body links read from an
  * HTML fixture file.
  */
 public function testHtml()
 {
     $doc = Zend_Search_Lucene_Document_Html::loadHTML('<HTML><HEAD><TITLE>Page title</TITLE></HEAD><BODY>Document body.</BODY></HTML>');
     $this->assertTrue($doc instanceof Zend_Search_Lucene_Document_Html);
     $doc->highlight('document', '#66ffff');
     // Expected value goes first so a failure reports expected/actual the right way round.
     $expectedHtml = "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\" \"http://www.w3.org/TR/REC-html40/loose.dtd\">\n<html>\n<head><title>Page title</title></head>\n<body><p><b style=\"color:black;background-color:#66ffff\">Document</b> body.</p></body>\n</html>\n";
     $this->assertEquals($expectedHtml, $doc->getHTML());
     $doc = Zend_Search_Lucene_Document_Html::loadHTMLFile(dirname(__FILE__) . '/_files/_indexSource/contributing.documentation.html', true);
     $this->assertTrue($doc instanceof Zend_Search_Lucene_Document_Html);
     $this->assertTrue(array_values($doc->getHeaderLinks()) == array('index.html', 'contributing.html', 'contributing.bugs.html', 'contributing.wishlist.html'));
     $this->assertTrue(array_values($doc->getLinks()) == array('contributing.bugs.html', 'contributing.wishlist.html', 'developers.documentation.html', 'faq.translators-revision-tracking.html', 'index.html', 'contributing.html'));
 }
开发者ID:jorgenils,项目名称:zend-framework,代码行数:11,代码来源:DocumentTest.php

示例5: testHtmlNoFollowLinks

 /**
  * Checks that links marked rel="nofollow" are reported by getLinks() when
  * the exclude flag is off and left out when it is on.
  *
  * The flag is static state on Zend_Search_Lucene_Document_Html, so the
  * value found at the start is put back before the test ends, whether the
  * assertions pass or fail.
  */
 public function testHtmlNoFollowLinks()
 {
     $html = '<HTML>' . '<HEAD><TITLE>Page title</TITLE></HEAD>' . '<BODY>' . 'Document body.' . '<a href="link1.html">Link 1</a>.' . '<a href="link2.html" rel="nofollow">Link 1</a>.' . '</BODY>' . '</HTML>';
     $oldNoFollowValue = Zend_Search_Lucene_Document_Html::getExcludeNoFollowLinks();
     try {
         Zend_Search_Lucene_Document_Html::setExcludeNoFollowLinks(false);
         $doc1 = Zend_Search_Lucene_Document_Html::loadHTML($html);
         $this->assertTrue($doc1 instanceof Zend_Search_Lucene_Document_Html);
         $this->assertTrue(array_values($doc1->getLinks()) == array('link1.html', 'link2.html'));
         Zend_Search_Lucene_Document_Html::setExcludeNoFollowLinks(true);
         $doc2 = Zend_Search_Lucene_Document_Html::loadHTML($html);
         $this->assertTrue($doc2 instanceof Zend_Search_Lucene_Document_Html);
         $this->assertTrue(array_values($doc2->getLinks()) == array('link1.html'));
     } catch (Exception $e) {
         // Restore the flag before letting the failure propagate.
         Zend_Search_Lucene_Document_Html::setExcludeNoFollowLinks($oldNoFollowValue);
         throw $e;
     }
     Zend_Search_Lucene_Document_Html::setExcludeNoFollowLinks($oldNoFollowValue);
 }
开发者ID:lortnus,项目名称:zf1,代码行数:13,代码来源:DocumentTest.php

示例6: indexFile

 /**
  * Index a file.
  *
  * Builds a Lucene document for the file (an HTML document for text/html,
  * a plain document for everything else), attaches the lookup key, path,
  * size and mimetype fields, and passes it to Lucene::updateFile().
  *
  * @author Jörn Dreyer <jfd@butonic.de>
  *
  * @param string      $path the path of the file
  * @param string|null $user owner of the file; when null the current
  *                          filesystem view and \OCP\User::getUser() are used
  *
  * @return bool true when the document was passed to Lucene::updateFile(),
  *              false when the path is invalid or empty, no view could be
  *              resolved, or the file info carries no mimetype
  */
 public static function indexFile($path = '', $user = null)
 {
     if (!Filesystem::isValidPath($path)) {
         // Always answer with a bool, as the documented contract promises.
         return false;
     }
     if ($path === '') {
         //ignore the empty path element
         return false;
     }
     if (is_null($user)) {
         $view = Filesystem::getView();
         $user = \OCP\User::getUser();
     } else {
         $view = new \OC\Files\View('/' . $user . '/files');
     }
     if (!$view) {
         Util::writeLog('search_lucene', 'could not resolve filesystem view', Util::WARN);
         return false;
     }
     $root = $view->getRoot();
     $pk = md5($root . $path);
     // the cache already knows mime and other basic stuff
     $data = $view->getFileInfo($path);
     if (!isset($data['mimetype'])) {
         Util::writeLog('search_lucene', 'need mimetype for content extraction', Util::ERROR);
         return false;
     }
     $mimetype = $data['mimetype'];
     if ('text/html' === $mimetype) {
         $doc = \Zend_Search_Lucene_Document_Html::loadHTML($view->file_get_contents($path));
     } else {
         // FIXME application/msword gets no special treatment yet: the Docx
         // loader uses ZipArchive and is not compatible with OC\Files\Filesystem.
         $doc = new \Zend_Search_Lucene_Document();
     }
     // store fscacheid as unique id to lookup by when deleting
     $doc->addField(\Zend_Search_Lucene_Field::Keyword('pk', $pk));
     // Store document URL to identify it in the search results
     $doc->addField(\Zend_Search_Lucene_Field::Text('path', $path));
     $doc->addField(\Zend_Search_Lucene_Field::unIndexed('size', $data['size']));
     $doc->addField(\Zend_Search_Lucene_Field::unIndexed('mimetype', $mimetype));
     self::extractMetadata($doc, $path, $view, $mimetype);
     Lucene::updateFile($doc, $path, $user);
     return true;
 }
开发者ID:CDN-Sparks,项目名称:owncloud,代码行数:60,代码来源:indexer.php

示例7: indexWebsite

 /**
  * Rebuilds the Lucene index of a website from the content of its pages.
  *
  * An existing index has all of its documents deleted first; otherwise a
  * new index is created. Nothing is touched when the website has no pages.
  *
  * @param  string $websiteId
  * @return string the value of getIndexFileForWebsite() for this website
  * @throws CmsException with code '602' when the website does not exist
  */
 public function indexWebsite($websiteId)
 {
     $websiteService = new Website('Website');
     if (!$websiteService->existsWebsiteAlready($websiteId)) {
         throw new CmsException('602', __METHOD__, __LINE__);
     }

     // Rendering has to go through the business layer
     $renderBusiness = new BusinessRender('Render');
     $modulService = new Modul('Modul');
     $pageService = new Page('Page');
     $allPageIds = $pageService->getIdsByWebsiteId($websiteId);
     $indexFileOfWebsite = $this->getIndexFileForWebsite($websiteId);

     if (!is_array($allPageIds) || !(count($allPageIds) > 0)) {
         return $indexFileOfWebsite;
     }

     if (!file_exists($indexFileOfWebsite)) {
         $luceneIndex = \Zend_Search_Lucene::create($indexFileOfWebsite);
     } else {
         $luceneIndex = \Zend_Search_Lucene::open($indexFileOfWebsite);
         // Remove every document that is still alive in the old index.
         $documentTotal = $luceneIndex->numDocs();
         for ($docId = 0; $docId < $documentTotal; ++$docId) {
             if ($luceneIndex->isDeleted($docId)) {
                 continue;
             }
             $luceneIndex->delete($docId);
         }
     }

     foreach ($allPageIds as $pageId) {
         $pageContent = $this->getPageContent($websiteId, $pageId);
         // Second loadHTML() argument: true when content storing is enabled, else false.
         $storeContent = $this->isStoreContentEnabled() ? true : false;
         $pageDocument = \Zend_Search_Lucene_Document_Html::loadHTML($pageContent, $storeContent, 'UTF-8');
         $pageDocument->addField(\Zend_Search_Lucene_Field::unIndexed('md5', md5($pageContent)));
         $pageDocument->addField(\Zend_Search_Lucene_Field::unIndexed('pageId', $pageId));
         $luceneIndex->addDocument($pageDocument);
     }

     $luceneIndex->commit();
     $luceneIndex->optimize();
     unset($luceneIndex);

     return $indexFileOfWebsite;
 }
开发者ID:rukzuk,项目名称:rukzuk,代码行数:45,代码来源:Indexer.php

示例8: getPlainTextFromHtml

 /**
  * Reduces an HTML string to plain text.
  *
  * The markup is first round-tripped through the Lucene HTML document
  * loader; then script blocks, style blocks and "<!...-->" sections are
  * removed, the remaining tags are stripped and whitespace runs are
  * collapsed to a single space.
  *
  * @param  $html
  * @return mixed|string
  */
 protected function getPlainTextFromHtml($html)
 {
     $markup = Zend_Search_Lucene_Document_Html::loadHTML($html, false, "utf-8")->getHTML();

     // scripts, styles, and declaration/comment sections
     $unwantedBlocks = array(
         '@(<script[^>]*?>.*?</script>)@si',
         '@<style[^>]*?>.*?</style>@siU',
         '@<![\\s\\S]*?--[ \\t\\n\\r]*>@',
     );
     $tagFree = strip_tags(preg_replace($unwantedBlocks, "", $markup));

     // squeeze whitespace runs into one blank
     return preg_replace('@[ \\t\\n\\r\\f]+@', " ", $tagFree);
 }
开发者ID:weblizards-gmbh,项目名称:search-php,代码行数:20,代码来源:Crawler.php

示例9: indexCatalog

 public function indexCatalog($catalogGuid)
 {
     $index = $this->_index;
     $tblCatalog = new Kutu_Core_Orm_Table_Catalog();
     $rowsetCatalog = $tblCatalog->find($catalogGuid);
     if (count($rowsetCatalog)) {
         //check if guid exist in index, then delete
         $term = new Zend_Search_Lucene_Index_Term($catalogGuid, 'guid');
         $docIds = $index->termDocs($term);
         foreach ($docIds as $id) {
             $doc = $index->getDocument($id);
             $index->delete($id);
         }
         $rowCatalog = $rowsetCatalog->current();
         $doc = new Zend_Search_Lucene_Document();
         $doc->addField(Zend_Search_Lucene_Field::Keyword('guid', $rowCatalog->guid));
         //fill parentGuid with catalogGuid if it's kutu_doc
         if ($rowCatalog->profileGuid == 'kutu_doc') {
             $tblRelatedItem = new Kutu_Core_Orm_Table_RelatedItem();
             $rowset = $tblRelatedItem->fetchAll("itemGuid='{$rowCatalog->guid}' AND relateAs='RELATED_FILE'");
             if (count($rowset)) {
                 $row = $rowset->current();
                 $parentCatalogGuid = $row->relatedGuid;
                 $doc->addField(Zend_Search_Lucene_Field::Keyword('parentGuid', $parentCatalogGuid));
             }
         } else {
             $doc->addField(Zend_Search_Lucene_Field::Keyword('parentGuid', $rowCatalog->guid));
         }
         $doc->addField(Zend_Search_Lucene_Field::Text('profile', $rowCatalog->profileGuid));
         $doc->addField(Zend_Search_Lucene_Field::Keyword('publishedDate', $this->_filterDateTime($rowCatalog->publishedDate)));
         $doc->addField(Zend_Search_Lucene_Field::Keyword('expiredDate', $this->_filterDateTime($rowCatalog->expiredDate)));
         $doc->addField(Zend_Search_Lucene_Field::Keyword('createdBy', $rowCatalog->createdBy));
         $doc->addField(Zend_Search_Lucene_Field::Keyword('modifiedBy', $rowCatalog->modifiedBy));
         $doc->addField(Zend_Search_Lucene_Field::Keyword('createdDate', $this->_filterDateTime($rowCatalog->createdDate)));
         $doc->addField(Zend_Search_Lucene_Field::Keyword('modifiedDate', $this->_filterDateTime($rowCatalog->modifiedDate)));
         $doc->addField(Zend_Search_Lucene_Field::Keyword('status', $rowCatalog->status));
         if ($rowCatalog->profileGuid == 'kutu_doc') {
             $doc->addField(Zend_Search_Lucene_Field::Keyword('objectType', 'file'));
         } else {
             $doc->addField(Zend_Search_Lucene_Field::Keyword('objectType', 'catalog'));
         }
         $rowsetCatalogAttribute = $rowCatalog->findDependentRowsetCatalogAttribute();
         if (count($rowsetCatalogAttribute)) {
             foreach ($rowsetCatalogAttribute as $rowCatalogAttribute) {
                 switch ($rowCatalogAttribute->attributeGuid) {
                     case 'fixedTitle':
                     case 'title':
                         $doc->addField(Zend_Search_Lucene_Field::Text('title', $rowCatalogAttribute->value));
                         break;
                     case 'fixedSubTitle':
                     case 'subTitle':
                         $doc->addField(Zend_Search_Lucene_Field::Text('subtitle', $rowCatalogAttribute->value));
                         break;
                     case 'fixedContent':
                     case 'content':
                         $docHtml = Zend_Search_Lucene_Document_Html::loadHTML($rowCatalogAttribute->value);
                         $cleanedText = $docHtml->getFieldValue('body');
                         $doc->addField(Zend_Search_Lucene_Field::UnStored('content', $cleanedText));
                         break;
                     case 'fixedKeywords':
                     case 'keywords':
                         $doc->addField(Zend_Search_Lucene_Field::UnStored('keywords', $rowCatalogAttribute->value));
                         break;
                     case 'fixedDescription':
                     case 'description':
                         $doc->addField(Zend_Search_Lucene_Field::Text('description', $rowCatalogAttribute->value));
                         break;
                     case 'ptsKetua':
                         $doc->addField(Zend_Search_Lucene_Field::Text('judge', $rowCatalogAttribute->value));
                         break;
                     case 'prtNomor':
                     case 'fixedNomor':
                     case 'fixedNumber':
                     case 'nomor':
                     case 'ptsNomor':
                         $doc->addField(Zend_Search_Lucene_Field::UnStored('number', $rowCatalogAttribute->value));
                         break;
                     case 'prtTahun':
                     case 'fixedTahun':
                     case 'fixedYear':
                     case 'tahun':
                     case 'ptsTahun':
                         $doc->addField(Zend_Search_Lucene_Field::UnStored('year', $rowCatalogAttribute->value));
                         break;
                     default:
                         //check if attribute is a datetime field
                         $tblAttribute = new Kutu_Core_Orm_Table_Attribute();
                         $rowAttribute = $tblAttribute->find($rowCatalogAttribute->attributeGuid)->current();
                         if ($rowAttribute->type == 4) {
                             $doc->addField(Zend_Search_Lucene_Field::UnStored(strtolower($rowCatalogAttribute->attributeGuid), $this->_filterDateTime($rowCatalogAttribute->value)));
                         } else {
                             if ($rowAttribute->type == 2) {
                                 $docHtml = Zend_Search_Lucene_Document_Html::loadHTML($rowCatalogAttribute->value);
                                 $cleanedText = $docHtml->getFieldValue('body');
                                 $doc->addField(Zend_Search_Lucene_Field::UnStored(strtolower($rowCatalogAttribute->attributeGuid), $cleanedText));
                             } else {
                                 $doc->addField(Zend_Search_Lucene_Field::UnStored(strtolower($rowCatalogAttribute->attributeGuid), $rowCatalogAttribute->value));
                             }
                         }
                         break;
//.........这里部分代码省略.........
开发者ID:hukumonline,项目名称:gtz,代码行数:101,代码来源:ZendLucene.php

示例10: _spider

 /**
  * Crawls pages starting at $url and hands each loaded document to the
  * search indexer.
  *
  * URLs are taken from the front of a queue. Each one is sanitised, skipped
  * if already seen in this run, loaded from the on-disk cache when a cached
  * file exists (otherwise requested through $this->_client), indexed, and
  * its links are appended to the queue. Once the number of visited URLs
  * reaches $this->_maxLinks the script is terminated with exit.
  *
  * @param string $url first URL to crawl
  *
  * @return void
  */
 protected function _spider($url)
 {
     $pending = array($url);
     $visited = array();
     while (!empty($pending)) {
         $doc = null;
         $url = $this->_sanitizeUrl(array_shift($pending));
         if (!$url || in_array($url, $visited)) {
             continue;
         }
         $visited[] = $url;
         Bbx_Log::write('Spidering url ' . $url, null, Bbx_Search::LOG);

         $cachePath = APPLICATION_PATH . '/../www/cached' . $url . '.html';
         if (!file_exists($cachePath)) {
             // No cached copy: fetch the page over HTTP.
             $this->_client->setUri($this->_getAbsoluteUrl($url));
             try {
                 $response = $this->_client->request();
                 $status = $response->getStatus();
                 Bbx_Log::write('Client response code ' . $status, null, Bbx_Search::LOG);
                 if ($status == '200') {
                     $doc = Zend_Search_Lucene_Document_Html::loadHTML($response->getBody(), false, 'utf-8');
                 }
             } catch (Exception $e) {
                 Bbx_Log::write('Request failed: ' . $e->getMessage(), null, Bbx_Search::LOG);
             }
         } else {
             Bbx_Log::write('Found file in cache', null, Bbx_Search::LOG);
             try {
                 $doc = Zend_Search_Lucene_Document_Html::loadHTMLFile($cachePath, false, 'utf-8');
             } catch (Exception $e) {
                 Bbx_Log::write('Unable to open file: ' . $cachePath, null, Bbx_Search::LOG);
             }
         }

         if ($doc === null) {
             continue;
         }
         $this->_search()->indexDoc($doc, $url);
         $this->_indexed++;
         // NOTE(review): links are filtered against $this->_visited, while this
         // method only ever fills the local $visited - confirm which is intended.
         $links = array_diff($doc->getLinks(), $this->_visited);
         if (count($visited) >= $this->_maxLinks) {
             Bbx_Log::write('Reached max number of links (' . $this->_maxLinks . '), exiting', null, Bbx_Search::LOG);
             exit;
         }
         $pending = array_merge($pending, $links);
     }
 }
开发者ID:rdallasgray,项目名称:bbx,代码行数:49,代码来源:Spider.php

示例11: build

 /**
  * Fetch every page in $this->pageList over HTTP, index the filtered
  * response body, and optimize the index once the list is done.
  *
  * Exceptions raised while fetching or indexing one page are reported via
  * msg() and do not stop the remaining pages.
  *
  * @return void
  * @access public
  */
 public function build()
 {
     $processed = 0;
     foreach ($this->pageList as $page) {
         $uri = translateURL("page/{$page['id']}");
         try {
             /**
              * A customised "<controller>_indexable" template is a DEPRECATED way
              * of customising indexable content (use getExcludes() instead); it
              * also needs its own controller.
              */
             $customTemplate = ONXSHOP_PROJECT_DIR . "templates/node/page/{$page['node_controller']}_indexable.html";
             $toFetch = file_exists($customTemplate)
                 ? "request/sys/html5.node/page/{$page['node_controller']}_indexable~id={$page['id']}~"
                 : "request/sys/html5.node~id={$page['id']}~";
             msg("Fetching page {$page['id']}: {$uri} using {$toFetch}");

             $this->client->setUri($this->profile['uri'] . $toFetch);
             $response = $this->client->request();
             $usable = $response->isSuccessful() && !$response->isRedirect() && !$response->isError();
             if ($usable) {
                 $filteredBody = $this->filterHtmlDocument($response->getBody());
                 $this->index($uri, Zend_Search_Lucene_Document_Html::loadHTML($filteredBody, true));
             }
         } catch (Exception $e) {
             msg("HTTP fetch exception: " . $e->getMessage());
         }
         $processed++;
         // if ($processed == 10) break;
     }
     // All pages handled: optimize the index.
     $this->indexOptimize();
 }
开发者ID:AppChecker,项目名称:onxshop,代码行数:39,代码来源:search_index_build.php

示例12: splitWordsFromCatalog

 /**
  * Returns a word-limited excerpt for a catalog.
  *
  * The 'fixedDescription' attribute is preferred and 'fixedContent' is the
  * fallback. Both are passed through the Lucene HTML loader and their
  * 'body' field value is used. A text with fewer words than $iLimit is
  * returned whole; otherwise it is passed to getNumberOfWords().
  *
  * @param mixed $catalogGuid catalog identifier handed to getCatalogAttributeValue()
  * @param int   $iLimit      word limit
  *
  * @return string the excerpt, or '' when both texts are empty
  */
 static function splitWordsFromCatalog($catalogGuid, $iLimit)
 {
     $descHtml = Kutu_Core_Util::getCatalogAttributeValue($catalogGuid, 'fixedDescription');
     $contentHtml = Kutu_Core_Util::getCatalogAttributeValue($catalogGuid, 'fixedContent');
     $descDoc = Zend_Search_Lucene_Document_Html::loadHTML($descHtml);
     $contentDoc = Zend_Search_Lucene_Document_Html::loadHTML($contentHtml);
     $candidates = array(
         $descDoc->getFieldValue('body'),
         $contentDoc->getFieldValue('body'),
     );

     // The first non-empty candidate wins.
     foreach ($candidates as $text) {
         if (empty($text)) {
             continue;
         }
         return $iLimit > str_word_count($text)
             ? $text
             : Kutu_Core_Util::getNumberOfWords($text, $iLimit);
     }

     return '';
 }
开发者ID:psykomo,项目名称:kutump,代码行数:24,代码来源:Util.php

示例13: indexContent

 /**
  * Index with Zend_Lucene
  *
  * Opens the index under ONXSHOP_PROJECT_DIR . 'var/index' (creating it
  * when opening fails), removes documents already stored for the same URI
  * and adds the given HTML as a new document. Nothing is indexed when the
  * index directory is missing or the index cannot be opened or created.
  *
  * @param string $uri        value stored in the document's 'uri' keyword field
  * @param string $htmlString HTML to index
  *
  * @return void
  */
 function indexContent($uri, $htmlString)
 {
     require_once 'Zend/Search/Lucene.php';
     $index_location = ONXSHOP_PROJECT_DIR . 'var/index';
     // Start from "no index" so the check below never reads an undefined variable.
     $index = false;
     if (is_dir($index_location)) {
         // Open existing index
         try {
             $index = Zend_Search_Lucene::open($index_location);
         } catch (Exception $e) {
             // Create index
             try {
                 $index = Zend_Search_Lucene::create($index_location);
             } catch (Exception $e) {
                 $index = false;
             }
         }
     }
     if ($index) {
         // find and remove pages with the same URI
         $hits = $index->find("uri:" . $uri);
         foreach ($hits as $hit) {
             $index->delete($hit);
         }
         $doc = Zend_Search_Lucene_Document_Html::loadHTML($htmlString, true);
         $doc->addField(Zend_Search_Lucene_Field::Keyword('uri', $uri));
         $index->addDocument($doc);
         $index->commit();
     }
 }
开发者ID:AppChecker,项目名称:onxshop,代码行数:35,代码来源:onxshop.bootstrap.php

示例14: onIndexContent

 /**
  * Adds an article to the Jucene index.
  *
  * When the article's fulltext (or, failing that, its introtext) starts
  * with "<?xml", the XML is transformed with the jucene.xsl stylesheet and
  * an 'onIndexPmml' event is triggered for every AssociationRule element of
  * the result. Otherwise the fulltext is loaded as an HTML document and
  * added to the index directly.
  *
  * @param object $article article exposing id, introtext and fulltext
  * @param bool   $isNew   when false, entries matching 'pk:<id>' are removed
  *                        from the index first
  */
 function onIndexContent($article, $isNew = false)
 {
     //FIXME move the content type tests and following transformations to the helper
     global $mainframe;
     $pk = $article->id;
     if (!$isNew) {
         JuceneHelper::removeFromIndex('pk:' . $pk);
     }
     $index = JuceneHelper::getIndex();
     $xml_field = substr($article->fulltext, 0, 5) != '<?xml' ? $article->introtext : $article->fulltext;
     if (substr($xml_field, 0, 5) == '<?xml') {
         $dom = new DOMDocument();
         $pmml = true;
         $xslt = new DOMDocument();
         $error = false;
         //load xslt stylesheet
         if (!@$xslt->load(JPATH_SITE . DS . 'administrator' . DS . 'components' . DS . 'com_jucene' . DS . 'xslt/jucene.xsl')) {
             $error = true;
             $this->raiseMessage("XSLTLOADERROR", 'error');
         }
         $proc = new XSLTProcessor();
         if (!$proc->importStylesheet($xslt)) {
             $error = true;
             $this->raiseMessage("XSLTIMPORTERROR", 'error');
         }
         // The article object is left untouched so the caller can keep using it.
         if ($dom->loadXML($xml_field) && !$error && $pmml) {
             //simplify the document - prepare it for the indexation process
             $xslOutput = $proc->transformToXml($dom);
             //create new DOM document to preserve output and transform the XML to the indexable one
             $transXml = new DOMDocument();
             $transXml->preserveWhitespace = false;
             @$transXml->loadXML($xslOutput);
             //unset unnecessary variables
             unset($xslOutput);
             unset($dom);
             unset($xslt);
             //index every assoc rule as document with same credentials
             if (!$error) {
                 $rules = $transXml->getElementsByTagName("AssociationRule");
                 $rulesCount = $rules->length;
                 if ($rulesCount == 0) {
                     $error = true;
                     $this->raiseMessage('XMLDOCUMENTNORULES', 'error');
                 }
                 $rule_doc_position = 0;
                 foreach ($rules as $rule) {
                     $additional['rating'] = 0;
                     $additional['position'] = $rule_doc_position;
                     JPluginHelper::importPlugin('content');
                     $dispatcher =& JDispatcher::getInstance();
                     $results = $dispatcher->trigger('onIndexPmml', array($rule, $additional));
                     $rule_doc_position++;
                 }
             }
         }
     } else {
         // The encoding must be the string 'UTF-8'; a bare UTF - 8 is an arithmetic expression.
         $zendDoc = Zend_Search_Lucene_Document_Html::loadHTML($article->fulltext, false, 'UTF-8');
         $index->addDocument($zendDoc);
     }
 }
开发者ID:KIZI,项目名称:sewebar-cms,代码行数:67,代码来源:contindexpmml.php

示例15: indexFile

 /**
  * index a file
  *
  * @author Jörn Dreyer <jfd@butonic.de>
  *
  * @param string $path the path of the file
  *
  * @return bool
  */
 public static function indexFile($path = '', $user = null)
 {
     if (!Filesystem::isValidPath($path)) {
         return;
     }
     if ($path === '') {
         //ignore the empty path element
         return false;
     }
     if (is_null($user)) {
         $view = Filesystem::getView();
         $user = \OCP\User::getUser();
     } else {
         $view = new \OC\Files\View('/' . $user . '/files');
     }
     if (!$view) {
         Util::writeLog('search_lucene', 'could not resolve filesystem view', Util::WARN);
         return false;
     }
     if (!$view->file_exists($path)) {
         Util::writeLog('search_lucene', 'file vanished, ignoring', Util::DEBUG);
         return true;
     }
     $root = $view->getRoot();
     $pk = md5($root . $path);
     // the cache already knows mime and other basic stuff
     $data = $view->getFileInfo($path);
     if (isset($data['mimetype'])) {
         $mimeType = $data['mimetype'];
         // initialize plain lucene document
         $doc = new \Zend_Search_Lucene_Document();
         // index content for local files only
         $localFile = $view->getLocalFile($path);
         if ($localFile) {
             //try to use special lucene document types
             if ('text/plain' === $mimeType) {
                 $body = $view->file_get_contents($path);
                 if ($body != '') {
                     $doc->addField(\Zend_Search_Lucene_Field::UnStored('body', $body));
                 }
             } else {
                 if ('text/html' === $mimeType) {
                     //TODO could be indexed, even if not local
                     $doc = \Zend_Search_Lucene_Document_Html::loadHTML($view->file_get_contents($path));
                 } else {
                     if ('application/pdf' === $mimeType) {
                         $doc = Pdf::loadPdf($view->file_get_contents($path));
                         // commented the mimetype checks, as the zend classes only understand docx and not doc files.
                         // FIXME distinguish doc and docx, xls and xlsx, ppt and pptx, in oc core mimetype helper ...
                         //} else if ('application/msword' === $mimeType) {
                     } else {
                         if (strtolower(substr($data['name'], -5)) === '.docx') {
                             $doc = \Zend_Search_Lucene_Document_Docx::loadDocxFile($localFile);
                             //} else if ('application/msexcel' === $mimeType) {
                         } else {
                             if (strtolower(substr($data['name'], -5)) === '.xlsx') {
                                 $doc = \Zend_Search_Lucene_Document_Xlsx::loadXlsxFile($localFile);
                                 //} else if ('application/mspowerpoint' === $mimeType) {
                             } else {
                                 if (strtolower(substr($data['name'], -5)) === '.pptx') {
                                     $doc = \Zend_Search_Lucene_Document_Pptx::loadPptxFile($localFile);
                                 } else {
                                     if (strtolower(substr($data['name'], -4)) === '.odt') {
                                         $doc = Odt::loadOdtFile($localFile);
                                     } else {
                                         if (strtolower(substr($data['name'], -4)) === '.ods') {
                                             $doc = Ods::loadOdsFile($localFile);
                                         }
                                     }
                                 }
                             }
                         }
                     }
                 }
             }
         }
         // Store filecache id as unique id to lookup by when deleting
         $doc->addField(\Zend_Search_Lucene_Field::Keyword('pk', $pk));
         // Store filename
         $doc->addField(\Zend_Search_Lucene_Field::Text('filename', $data['name'], 'UTF-8'));
         // Store document path to identify it in the search results
         $doc->addField(\Zend_Search_Lucene_Field::Text('path', $path, 'UTF-8'));
         $doc->addField(\Zend_Search_Lucene_Field::unIndexed('size', $data['size']));
         $doc->addField(\Zend_Search_Lucene_Field::unIndexed('mimetype', $mimeType));
         //self::extractMetadata($doc, $path, $view, $mimeType);
         Lucene::updateFile($doc, $path, $user);
         return true;
     } else {
         Util::writeLog('search_lucene', 'need mimetype for content extraction', Util::ERROR);
         return false;
     }
//.........这里部分代码省略.........
开发者ID:omusico,项目名称:isle-web-framework,代码行数:101,代码来源:indexer.php


注:本文中的Zend_Search_Lucene_Document_Html::loadHTML方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。