本文整理汇总了PHP中Zend_Search_Lucene_Document_Html::loadHTML方法的典型用法代码示例。如果您正苦于以下问题：PHP Zend_Search_Lucene_Document_Html::loadHTML方法的具体用法？PHP Zend_Search_Lucene_Document_Html::loadHTML怎么用？PHP Zend_Search_Lucene_Document_Html::loadHTML使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类Zend_Search_Lucene_Document_Html的用法示例。
在下文中一共展示了Zend_Search_Lucene_Document_Html::loadHTML方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的PHP代码示例。
示例1: highlightMatches
/**
 * Highlight query matches inside an HTML fragment.
 *
 * Parses the fragment into a Zend_Search_Lucene_Document_Html, hands it to
 * highlightMatchesDOM() starting at colour index 0, and returns the markup
 * produced by the document afterwards.
 *
 * @param string $inputHTML HTML to highlight
 * @return string the value of the document's getHTML()
 */
public function highlightMatches($inputHTML)
{
    $htmlDocument = Zend_Search_Lucene_Document_Html::loadHTML($inputHTML);

    // Kept in a variable (not a literal) because the callee may take it by reference.
    $startColor = 0;
    $this->highlightMatchesDOM($htmlDocument, $startColor);

    return $htmlDocument->getHTML();
}
示例2: _indexate
/**
 * Index one page and recursively follow the links found on it.
 *
 * The URL is made absolute (prefixed with HTTP_HOST when it has no scheme),
 * a trailing slash is dropped, and the page is skipped when it was already
 * indexed or does not belong to HTTP_HOST. The indexed text is the part
 * between the <!--index--> ... <!--/index--> markers when present, otherwise
 * the whole page, with scripts and tags stripped.
 *
 * @param string $url absolute URL or path relative to HTTP_HOST
 * @return void
 */
protected function _indexate($url)
{
    // Anchor the scheme test at the start of the string so that https:// URLs
    // are recognised and a "http://" buried in a query string is not.
    if (!preg_match('#^https?://#i', $url)) {
        $url = HTTP_HOST . $url;
    }
    if (substr($url, -1) == '/') {
        $url = substr($url, 0, -1);
    }

    if (in_array($url, $this->_indexedUrl) || !stristr($url, HTTP_HOST)) {
        return;
    }
    // Register before fetching so that link cycles cannot recurse forever.
    array_push($this->_indexedUrl, $url);

    $html = file_get_contents($url);
    if ($html === false) {
        // Do not store an empty document for a page that could not be fetched.
        Zend_Registry::get('Logger')->warn('Search index skipped, page could not be fetched: ' . $url);
        return;
    }

    // Silence libxml warnings for sloppy markup, then restore the caller's
    // setting instead of forcing it to false.
    $previousErrorMode = libxml_use_internal_errors(true);
    $doc = Zend_Search_Lucene_Document_Html::loadHTML($html);
    libxml_clear_errors();
    libxml_use_internal_errors($previousErrorMode);

    if (preg_match('/<\\!--index-->(.*)<\\!--\\/index-->/isu', $html, $matches)) {
        $html = $matches[1];
    }
    $html = preg_replace('#<script(.*?)>(.*?)</script>#is', '', $html);
    $html = strip_tags($html);

    $doc->addField(Zend_Search_Lucene_Field::Text('content', $html, 'utf-8'));
    $doc->addField(Zend_Search_Lucene_Field::UnIndexed('body', '', 'utf-8'));
    $doc->addField(Zend_Search_Lucene_Field::Text('url', $url, 'utf-8'));
    $this->_indexHandle->addDocument($doc);
    Zend_Registry::get('Logger')->info('Search index is created: ' . $url, Zend_Log::INFO);

    foreach ($doc->getLinks() as $link) {
        $parts = explode('.', $link);
        $ext = end($parts);
        // Follow links without any dot (then $ext is the whole link) and
        // links ending in a known page extension.
        if ($link === $ext || in_array($ext, array('php', 'html', 'txt', 'htm'), true)) {
            $this->_indexate($link);
        }
    }
}
示例3: testHtml
/**
 * Checks highlighting on an inline HTML document and link extraction from
 * an HTML fixture file.
 */
public function testHtml()
{
    $inlineDoc = Zend_Search_Lucene_Document_Html::loadHTML('<HTML><HEAD><TITLE>Page title</TITLE></HEAD><BODY>Document body.</BODY></HTML>');
    $this->assertTrue($inlineDoc instanceof Zend_Search_Lucene_Document_Html);

    $inlineDoc->highlight('document', '#66ffff');
    $expectedFragment = '<b style="color:black;background-color:#66ffff">Document</b> body.';
    $fragmentOffset = strpos($inlineDoc->getHTML(), $expectedFragment);
    $this->assertTrue($fragmentOffset !== false);

    $fixturePath = dirname(__FILE__) . '/_indexSource/_files/contributing.documentation.html';
    $fileDoc = Zend_Search_Lucene_Document_Html::loadHTMLFile($fixturePath, true);
    $this->assertTrue($fileDoc instanceof Zend_Search_Lucene_Document_Html);

    $expectedHeaderLinks = array('index.html', 'contributing.html', 'contributing.bugs.html', 'contributing.wishlist.html');
    $this->assertTrue(array_values($fileDoc->getHeaderLinks()) == $expectedHeaderLinks);

    $expectedLinks = array('contributing.bugs.html', 'contributing.wishlist.html', 'developers.documentation.html', 'faq.translators-revision-tracking.html', 'index.html', 'contributing.html');
    $this->assertTrue(array_values($fileDoc->getLinks()) == $expectedLinks);
}
示例4: testHtml
/**
 * Checks the exact markup produced by highlighting an inline HTML document
 * and the links extracted from an HTML fixture file.
 */
public function testHtml()
{
    $doc = Zend_Search_Lucene_Document_Html::loadHTML('<HTML><HEAD><TITLE>Page title</TITLE></HEAD><BODY>Document body.</BODY></HTML>');
    $this->assertTrue($doc instanceof Zend_Search_Lucene_Document_Html);

    $doc->highlight('document', '#66ffff');
    $expectedHtml = "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\" \"http://www.w3.org/TR/REC-html40/loose.dtd\">\n<html>\n<head><title>Page title</title></head>\n<body><p><b style=\"color:black;background-color:#66ffff\">Document</b> body.</p></body>\n</html>\n";
    // assertEquals() takes the expected value first; with the arguments the
    // other way round a failure report labels the two values incorrectly.
    $this->assertEquals($expectedHtml, $doc->getHTML());

    $doc = Zend_Search_Lucene_Document_Html::loadHTMLFile(dirname(__FILE__) . '/_files/_indexSource/contributing.documentation.html', true);
    $this->assertTrue($doc instanceof Zend_Search_Lucene_Document_Html);
    $this->assertTrue(array_values($doc->getHeaderLinks()) == array('index.html', 'contributing.html', 'contributing.bugs.html', 'contributing.wishlist.html'));
    $this->assertTrue(array_values($doc->getLinks()) == array('contributing.bugs.html', 'contributing.wishlist.html', 'developers.documentation.html', 'faq.translators-revision-tracking.html', 'index.html', 'contributing.html'));
}
示例5: testHtmlNoFollowLinks
/**
 * Checks that rel="nofollow" links are reported by getLinks() only while
 * the "exclude nofollow links" switch is off.
 *
 * The switch is static state on Zend_Search_Lucene_Document_Html, so the
 * value found on entry is restored on exit, including when an assertion
 * fails.
 */
public function testHtmlNoFollowLinks()
{
    $html = '<HTML>' . '<HEAD><TITLE>Page title</TITLE></HEAD>' . '<BODY>' . 'Document body.' . '<a href="link1.html">Link 1</a>.' . '<a href="link2.html" rel="nofollow">Link 1</a>.' . '</BODY>' . '</HTML>';
    $oldNoFollowValue = Zend_Search_Lucene_Document_Html::getExcludeNoFollowLinks();

    try {
        Zend_Search_Lucene_Document_Html::setExcludeNoFollowLinks(false);
        $doc1 = Zend_Search_Lucene_Document_Html::loadHTML($html);
        $this->assertTrue($doc1 instanceof Zend_Search_Lucene_Document_Html);
        $this->assertTrue(array_values($doc1->getLinks()) == array('link1.html', 'link2.html'));

        Zend_Search_Lucene_Document_Html::setExcludeNoFollowLinks(true);
        $doc2 = Zend_Search_Lucene_Document_Html::loadHTML($html);
        $this->assertTrue($doc2 instanceof Zend_Search_Lucene_Document_Html);
        $this->assertTrue(array_values($doc2->getLinks()) == array('link1.html'));
    } catch (Exception $e) {
        // catch + rethrow instead of "finally" to stay runnable on PHP < 5.5.
        Zend_Search_Lucene_Document_Html::setExcludeNoFollowLinks($oldNoFollowValue);
        throw $e;
    }

    Zend_Search_Lucene_Document_Html::setExcludeNoFollowLinks($oldNoFollowValue);
}
示例6: indexFile
/**
 * Index a file.
 *
 * Builds a Lucene document for the file (an HTML document for text/html,
 * a plain document for every other mime type), attaches the pk, path, size
 * and mimetype fields plus extracted metadata, and passes it to
 * Lucene::updateFile().
 *
 * @author Jörn Dreyer <jfd@butonic.de>
 *
 * @param string      $path the path of the file
 * @param string|null $user user whose "/<user>/files" view is used; when null
 *                          the current filesystem view and \OCP\User::getUser()
 *                          are used
 *
 * @return bool true when the document was passed to Lucene::updateFile(),
 *              false when the path is invalid or empty, no view could be
 *              resolved, or the file info has no mimetype
 */
public static function indexFile($path = '', $user = null)
{
    if (!Filesystem::isValidPath($path)) {
        // Explicit false: the documented contract is bool, not null.
        return false;
    }
    if ($path === '') {
        // ignore the empty path element
        return false;
    }

    if (is_null($user)) {
        $view = Filesystem::getView();
        $user = \OCP\User::getUser();
    } else {
        $view = new \OC\Files\View('/' . $user . '/files');
    }
    if (!$view) {
        Util::writeLog('search_lucene', 'could not resolve filesystem view', Util::WARN);
        return false;
    }

    $root = $view->getRoot();
    $pk = md5($root . $path);

    // the cache already knows mime and other basic stuff
    $data = $view->getFileInfo($path);
    if (!isset($data['mimetype'])) {
        Util::writeLog('search_lucene', 'need mimetype for content extraction', Util::ERROR);
        return false;
    }
    $mimetype = $data['mimetype'];

    if ('text/html' === $mimetype) {
        $doc = \Zend_Search_Lucene_Document_Html::loadHTML($view->file_get_contents($path));
    } else {
        // No special treatment for other types yet.
        // FIXME application/msword: Zend_Search_Lucene_Document_Docx uses
        // ZipArchive ... make compatible with OC\Files\Filesystem
        $doc = new \Zend_Search_Lucene_Document();
    }

    // store fscacheid as unique id to lookup by when deleting
    $doc->addField(\Zend_Search_Lucene_Field::Keyword('pk', $pk));
    // Store document URL to identify it in the search results
    $doc->addField(\Zend_Search_Lucene_Field::Text('path', $path));
    $doc->addField(\Zend_Search_Lucene_Field::unIndexed('size', $data['size']));
    $doc->addField(\Zend_Search_Lucene_Field::unIndexed('mimetype', $mimetype));

    self::extractMetadata($doc, $path, $view, $mimetype);
    Lucene::updateFile($doc, $path, $user);

    return true;
}
示例7: indexWebsite
/**
 * Rebuild the Lucene index of a website from the content of all its pages.
 *
 * When the website has pages, an existing index is emptied (or a new one is
 * created), every page is added as an HTML document carrying an md5 of its
 * content and its page id, and the index is committed and optimized. When
 * the website has no pages the index is left untouched.
 *
 * @param string $websiteId
 * @return string location of the website's index as given by getIndexFileForWebsite()
 * @throws CmsException with code '602' when the website does not exist
 */
public function indexWebsite($websiteId)
{
    $websiteService = new Website('Website');
    if (!$websiteService->existsWebsiteAlready($websiteId)) {
        throw new CmsException('602', __METHOD__, __LINE__);
    }

    // The business layer has to be used for rendering
    $renderBusiness = new BusinessRender('Render');
    $modulService = new Modul('Modul');
    $pageService = new Page('Page');

    $allPageIds = $pageService->getIdsByWebsiteId($websiteId);
    $indexFileOfWebsite = $this->getIndexFileForWebsite($websiteId);

    if (!is_array($allPageIds) || count($allPageIds) <= 0) {
        return $indexFileOfWebsite;
    }

    if (!file_exists($indexFileOfWebsite)) {
        $index = \Zend_Search_Lucene::create($indexFileOfWebsite);
    } else {
        $index = \Zend_Search_Lucene::open($indexFileOfWebsite);
        // Empty the existing index before re-adding every page.
        $documentCount = $index->numDocs();
        $docId = 0;
        while ($docId < $documentCount) {
            if (!$index->isDeleted($docId)) {
                $index->delete($docId);
            }
            ++$docId;
        }
    }

    foreach ($allPageIds as $pageId) {
        $pageContent = $this->getPageContent($websiteId, $pageId);
        $storeContent = $this->isStoreContentEnabled() ? true : false;

        $document = \Zend_Search_Lucene_Document_Html::loadHTML($pageContent, $storeContent, 'UTF-8');
        $document->addField(\Zend_Search_Lucene_Field::unIndexed('md5', md5($pageContent)));
        $document->addField(\Zend_Search_Lucene_Field::unIndexed('pageId', $pageId));
        $index->addDocument($document);
    }

    $index->commit();
    $index->optimize();
    unset($index);

    return $indexFileOfWebsite;
}
示例8: getPlainTextFromHtml
/**
 * Reduce an HTML string to plain text.
 *
 * The markup is first round-tripped through Zend_Search_Lucene_Document_Html,
 * then script blocks, style blocks and "<! ... -->" sections are removed,
 * the remaining tags are stripped and whitespace runs are collapsed into a
 * single space.
 *
 * @param $html
 * @return mixed|string
 */
protected function getPlainTextFromHtml($html)
{
    $normalizedHtml = Zend_Search_Lucene_Document_Html::loadHTML($html, false, "utf-8")->getHTML();

    // scripts, styles and comment-like "<! ... -->" sections
    $noisePatterns = array(
        '@(<script[^>]*?>.*?</script>)@si',
        '@<style[^>]*?>.*?</style>@siU',
        '@<![\\s\\S]*?--[ \\t\\n\\r]*>@',
    );
    $withoutTags = strip_tags(preg_replace($noisePatterns, "", $normalizedHtml));

    // collapse runs of whitespace
    return preg_replace('@[ \\t\\n\\r\\f]+@', " ", $withoutTags);
}
示例9: indexCatalog
public function indexCatalog($catalogGuid)
{
$index = $this->_index;
$tblCatalog = new Kutu_Core_Orm_Table_Catalog();
$rowsetCatalog = $tblCatalog->find($catalogGuid);
if (count($rowsetCatalog)) {
//check if guid exist in index, then delete
$term = new Zend_Search_Lucene_Index_Term($catalogGuid, 'guid');
$docIds = $index->termDocs($term);
foreach ($docIds as $id) {
$doc = $index->getDocument($id);
$index->delete($id);
}
$rowCatalog = $rowsetCatalog->current();
$doc = new Zend_Search_Lucene_Document();
$doc->addField(Zend_Search_Lucene_Field::Keyword('guid', $rowCatalog->guid));
//fill parentGuid with catalogGuid if it's kutu_doc
if ($rowCatalog->profileGuid == 'kutu_doc') {
$tblRelatedItem = new Kutu_Core_Orm_Table_RelatedItem();
$rowset = $tblRelatedItem->fetchAll("itemGuid='{$rowCatalog->guid}' AND relateAs='RELATED_FILE'");
if (count($rowset)) {
$row = $rowset->current();
$parentCatalogGuid = $row->relatedGuid;
$doc->addField(Zend_Search_Lucene_Field::Keyword('parentGuid', $parentCatalogGuid));
}
} else {
$doc->addField(Zend_Search_Lucene_Field::Keyword('parentGuid', $rowCatalog->guid));
}
$doc->addField(Zend_Search_Lucene_Field::Text('profile', $rowCatalog->profileGuid));
$doc->addField(Zend_Search_Lucene_Field::Keyword('publishedDate', $this->_filterDateTime($rowCatalog->publishedDate)));
$doc->addField(Zend_Search_Lucene_Field::Keyword('expiredDate', $this->_filterDateTime($rowCatalog->expiredDate)));
$doc->addField(Zend_Search_Lucene_Field::Keyword('createdBy', $rowCatalog->createdBy));
$doc->addField(Zend_Search_Lucene_Field::Keyword('modifiedBy', $rowCatalog->modifiedBy));
$doc->addField(Zend_Search_Lucene_Field::Keyword('createdDate', $this->_filterDateTime($rowCatalog->createdDate)));
$doc->addField(Zend_Search_Lucene_Field::Keyword('modifiedDate', $this->_filterDateTime($rowCatalog->modifiedDate)));
$doc->addField(Zend_Search_Lucene_Field::Keyword('status', $rowCatalog->status));
if ($rowCatalog->profileGuid == 'kutu_doc') {
$doc->addField(Zend_Search_Lucene_Field::Keyword('objectType', 'file'));
} else {
$doc->addField(Zend_Search_Lucene_Field::Keyword('objectType', 'catalog'));
}
$rowsetCatalogAttribute = $rowCatalog->findDependentRowsetCatalogAttribute();
if (count($rowsetCatalogAttribute)) {
foreach ($rowsetCatalogAttribute as $rowCatalogAttribute) {
switch ($rowCatalogAttribute->attributeGuid) {
case 'fixedTitle':
case 'title':
$doc->addField(Zend_Search_Lucene_Field::Text('title', $rowCatalogAttribute->value));
break;
case 'fixedSubTitle':
case 'subTitle':
$doc->addField(Zend_Search_Lucene_Field::Text('subtitle', $rowCatalogAttribute->value));
break;
case 'fixedContent':
case 'content':
$docHtml = Zend_Search_Lucene_Document_Html::loadHTML($rowCatalogAttribute->value);
$cleanedText = $docHtml->getFieldValue('body');
$doc->addField(Zend_Search_Lucene_Field::UnStored('content', $cleanedText));
break;
case 'fixedKeywords':
case 'keywords':
$doc->addField(Zend_Search_Lucene_Field::UnStored('keywords', $rowCatalogAttribute->value));
break;
case 'fixedDescription':
case 'description':
$doc->addField(Zend_Search_Lucene_Field::Text('description', $rowCatalogAttribute->value));
break;
case 'ptsKetua':
$doc->addField(Zend_Search_Lucene_Field::Text('judge', $rowCatalogAttribute->value));
break;
case 'prtNomor':
case 'fixedNomor':
case 'fixedNumber':
case 'nomor':
case 'ptsNomor':
$doc->addField(Zend_Search_Lucene_Field::UnStored('number', $rowCatalogAttribute->value));
break;
case 'prtTahun':
case 'fixedTahun':
case 'fixedYear':
case 'tahun':
case 'ptsTahun':
$doc->addField(Zend_Search_Lucene_Field::UnStored('year', $rowCatalogAttribute->value));
break;
default:
//check if attribute is a datetime field
$tblAttribute = new Kutu_Core_Orm_Table_Attribute();
$rowAttribute = $tblAttribute->find($rowCatalogAttribute->attributeGuid)->current();
if ($rowAttribute->type == 4) {
$doc->addField(Zend_Search_Lucene_Field::UnStored(strtolower($rowCatalogAttribute->attributeGuid), $this->_filterDateTime($rowCatalogAttribute->value)));
} else {
if ($rowAttribute->type == 2) {
$docHtml = Zend_Search_Lucene_Document_Html::loadHTML($rowCatalogAttribute->value);
$cleanedText = $docHtml->getFieldValue('body');
$doc->addField(Zend_Search_Lucene_Field::UnStored(strtolower($rowCatalogAttribute->attributeGuid), $cleanedText));
} else {
$doc->addField(Zend_Search_Lucene_Field::UnStored(strtolower($rowCatalogAttribute->attributeGuid), $rowCatalogAttribute->value));
}
}
break;
//.........这里部分代码省略.........
示例10: _spider
/**
 * Breadth-first crawl starting at $url, indexing every page that can be loaded.
 *
 * Each URL is sanitized and visited at most once. A page is read from the
 * file cache when a cached copy exists, otherwise it is requested through
 * the HTTP client. Links of indexed pages are queued until the number of
 * visited URLs reaches $this->_maxLinks.
 *
 * @param string $url URL the crawl starts from
 * @return void
 */
protected function _spider($url)
{
    $queue = array($url);
    $visited = array();

    while (!empty($queue)) {
        $url = $this->_sanitizeUrl(array_shift($queue));
        if (!$url || in_array($url, $visited)) {
            continue;
        }
        $visited[] = $url;
        Bbx_Log::write('Spidering url ' . $url, null, Bbx_Search::LOG);

        $doc = null;
        $cachePath = APPLICATION_PATH . '/../www/cached' . $url . '.html';
        if (file_exists($cachePath)) {
            Bbx_Log::write('Found file in cache', null, Bbx_Search::LOG);
            try {
                $doc = Zend_Search_Lucene_Document_Html::loadHTMLFile($cachePath, false, 'utf-8');
            } catch (Exception $e) {
                Bbx_Log::write('Unable to open file: ' . $cachePath, null, Bbx_Search::LOG);
            }
        } else {
            $this->_client->setUri($this->_getAbsoluteUrl($url));
            try {
                $response = $this->_client->request();
                $status = $response->getStatus();
                Bbx_Log::write('Client response code ' . $status, null, Bbx_Search::LOG);
                if ($status == '200') {
                    $doc = Zend_Search_Lucene_Document_Html::loadHTML($response->getBody(), false, 'utf-8');
                }
            } catch (Exception $e) {
                Bbx_Log::write('Request failed: ' . $e->getMessage(), null, Bbx_Search::LOG);
            }
        }

        if ($doc === null) {
            continue;
        }

        $this->_search()->indexDoc($doc, $url);
        $this->_indexed++;

        // The visited list of this crawl lives in the local $visited, so new
        // links are filtered against it.
        // NOTE(review): $this->_visited is never written in this method; it is
        // still honoured when it holds an array so that an instance-level
        // list keeps filtering links — confirm whether the property is needed.
        $alreadyKnown = $visited;
        if (isset($this->_visited) && is_array($this->_visited)) {
            $alreadyKnown = array_merge($alreadyKnown, $this->_visited);
        }
        $links = array_diff($doc->getLinks(), $alreadyKnown);

        if (count($visited) < $this->_maxLinks) {
            $queue = array_merge($queue, $links);
        } else {
            Bbx_Log::write('Reached max number of links (' . $this->_maxLinks . '), exiting', null, Bbx_Search::LOG);
            // NOTE(review): exit terminates the whole PHP process, not just the
            // crawl; kept because callers may rely on it.
            exit;
        }
    }
}
示例11: build
/**
 * Fetch every page in $this->pageList over HTTP, index the successful
 * responses and optimize the index afterwards.
 *
 * Exceptions raised while fetching or indexing a page are reported through
 * msg() and do not stop the loop.
 *
 * @return void
 * @access public
 */
public function build()
{
    foreach ($this->pageList as $page) {
        $uri = translateURL("page/{$page['id']}");

        try {
            /**
             * A customised "<node_controller>_indexable" template takes
             * precedence when it exists. This is a DEPRECATED way of
             * customising indexable content, use getExcludes() instead;
             * remember that the template also needs a controller.
             */
            $customTemplate = ONXSHOP_PROJECT_DIR . "templates/node/page/{$page['node_controller']}_indexable.html";
            $toFetch = file_exists($customTemplate)
                ? "request/sys/html5.node/page/{$page['node_controller']}_indexable~id={$page['id']}~"
                : "request/sys/html5.node~id={$page['id']}~";

            msg("Fetching page {$page['id']}: {$uri} using {$toFetch}");
            $this->client->setUri($this->profile['uri'] . $toFetch);
            $response = $this->client->request();

            $usable = $response->isSuccessful() && !$response->isRedirect() && !$response->isError();
            if ($usable) {
                $filteredBody = $this->filterHtmlDocument($response->getBody());
                $htmlDocument = Zend_Search_Lucene_Document_Html::loadHTML($filteredBody, true);
                $this->index($uri, $htmlDocument);
            }
        } catch (Exception $e) {
            msg("HTTP fetch exception: " . $e->getMessage());
        }
    }

    // Optimize index.
    $this->indexOptimize();
}
示例12: splitWordsFromCatalog
/**
 * Return a plain-text excerpt of a catalog, limited to $iLimit words.
 *
 * The 'fixedDescription' attribute is preferred; 'fixedContent' is used only
 * when the description yields no text. Each attribute is fetched and parsed
 * only when it is actually needed.
 *
 * @param string $catalogGuid catalog to take the text from
 * @param int    $iLimit      maximum number of words
 * @return string the whole body text when it has fewer than $iLimit words,
 *                otherwise the result of Kutu_Core_Util::getNumberOfWords();
 *                '' when neither attribute yields text
 */
public static function splitWordsFromCatalog($catalogGuid, $iLimit)
{
    foreach (array('fixedDescription', 'fixedContent') as $attributeGuid) {
        $rawHtml = Kutu_Core_Util::getCatalogAttributeValue($catalogGuid, $attributeGuid);
        $text = Zend_Search_Lucene_Document_Html::loadHTML($rawHtml)->getFieldValue('body');

        if (empty($text)) {
            continue;
        }
        if ($iLimit > str_word_count($text)) {
            return $text;
        }
        return Kutu_Core_Util::getNumberOfWords($text, $iLimit);
    }

    return '';
}
示例13: indexContent
/**
 * Index an HTML string with Zend_Lucene under the given URI.
 *
 * Documents previously stored for the same URI are deleted first. Nothing is
 * indexed when the index directory does not exist or the index can neither
 * be opened nor created.
 *
 * @param string $uri        value of the 'uri' keyword field
 * @param string $htmlString HTML to index
 * @return void
 */
function indexContent($uri, $htmlString)
{
    require_once 'Zend/Search/Lucene.php';
    require_once 'Zend/Search/Lucene/Index/Term.php';

    $index_location = ONXSHOP_PROJECT_DIR . 'var/index';

    // Defined up front so the check below never reads an undefined variable
    // when the directory is missing.
    $index = null;
    if (is_dir($index_location)) {
        try {
            // Open existing index
            $index = Zend_Search_Lucene::open($index_location);
        } catch (Exception $e) {
            try {
                // Create index
                $index = Zend_Search_Lucene::create($index_location);
            } catch (Exception $e) {
                $index = null;
            }
        }
    }
    if (!$index) {
        return;
    }

    // find and remove pages with the same URI; 'uri' is a Keyword field, so
    // it is looked up as one exact term rather than through the query parser.
    $uriTerm = new Zend_Search_Lucene_Index_Term($uri, 'uri');
    foreach ($index->termDocs($uriTerm) as $docId) {
        $index->delete($docId);
    }

    $doc = Zend_Search_Lucene_Document_Html::loadHTML($htmlString, true);
    $doc->addField(Zend_Search_Lucene_Field::Keyword('uri', $uri));
    $index->addDocument($doc);
    $index->commit();
}
示例14: onIndexContent
/**
 * Index an article.
 *
 * When the article's fulltext (or, failing that, its introtext) starts with
 * an XML declaration, the XML is simplified with the jucene.xsl stylesheet
 * and every AssociationRule element of the result is passed to the
 * 'onIndexPmml' content plugins. Otherwise the fulltext is indexed as a
 * single HTML document.
 *
 * @param object $article article with id, fulltext and introtext properties
 * @param bool   $isNew   when false, entries stored under the article's pk
 *                        are removed from the index first
 * @return void
 */
function onIndexContent($article, $isNew = false)
{
    //FIXME move the content type tests and following transformations to the helper
    $pk = $article->id;
    if (!$isNew) {
        JuceneHelper::removeFromIndex('pk:' . $pk);
    }
    $index = JuceneHelper::getIndex();

    $xml_field = substr($article->fulltext, 0, 5) != '<?xml' ? $article->introtext : $article->fulltext;

    if (substr($xml_field, 0, 5) != '<?xml') {
        // Plain HTML article. The encoding must be the string 'UTF-8'; the
        // unquoted form is a constant lookup minus 8.
        $zendDoc = Zend_Search_Lucene_Document_Html::loadHTML($article->fulltext, false, 'UTF-8');
        $index->addDocument($zendDoc);
        return;
    }

    $error = false;

    //load xslt stylesheet
    $xslt = new DOMDocument();
    if (!@$xslt->load(JPATH_SITE . DS . 'administrator' . DS . 'components' . DS . 'com_jucene' . DS . 'xslt/jucene.xsl')) {
        $error = true;
        $this->raiseMessage("XSLTLOADERROR", 'error');
    }
    $proc = new XSLTProcessor();
    if (!$proc->importStylesheet($xslt)) {
        $error = true;
        $this->raiseMessage("XSLTIMPORTERROR", 'error');
    }

    $dom = new DOMDocument();
    if (!$dom->loadXML($xml_field) || $error) {
        return;
    }

    //simplify the document - prepare it for the indexation process
    $xslOutput = $proc->transformToXml($dom);

    //create new DOM document to preserve output and transform the XML to the indexable one
    $transXml = new DOMDocument();
    // Property names are case-sensitive: the DOMDocument switch is
    // preserveWhiteSpace, and it must be set before loading.
    $transXml->preserveWhiteSpace = false;
    @$transXml->loadXML($xslOutput);

    //index every assoc rule as document with same credentials
    $rules = $transXml->getElementsByTagName("AssociationRule");
    if ($rules->length == 0) {
        $this->raiseMessage('XMLDOCUMENTNORULES', 'error');
        return;
    }

    // Loop-invariant: load the plugins and resolve the dispatcher once.
    JPluginHelper::importPlugin('content');
    $dispatcher =& JDispatcher::getInstance();

    $rule_doc_position = 0;
    foreach ($rules as $rule) {
        $additional = array('rating' => 0, 'position' => $rule_doc_position);
        $dispatcher->trigger('onIndexPmml', array($rule, $additional));
        $rule_doc_position++;
    }
}
示例15: indexFile
/**
* index a file
*
* @author Jörn Dreyer <jfd@butonic.de>
*
* @param string $path the path of the file
*
* @return bool
*/
public static function indexFile($path = '', $user = null)
{
if (!Filesystem::isValidPath($path)) {
return;
}
if ($path === '') {
//ignore the empty path element
return false;
}
if (is_null($user)) {
$view = Filesystem::getView();
$user = \OCP\User::getUser();
} else {
$view = new \OC\Files\View('/' . $user . '/files');
}
if (!$view) {
Util::writeLog('search_lucene', 'could not resolve filesystem view', Util::WARN);
return false;
}
if (!$view->file_exists($path)) {
Util::writeLog('search_lucene', 'file vanished, ignoring', Util::DEBUG);
return true;
}
$root = $view->getRoot();
$pk = md5($root . $path);
// the cache already knows mime and other basic stuff
$data = $view->getFileInfo($path);
if (isset($data['mimetype'])) {
$mimeType = $data['mimetype'];
// initialize plain lucene document
$doc = new \Zend_Search_Lucene_Document();
// index content for local files only
$localFile = $view->getLocalFile($path);
if ($localFile) {
//try to use special lucene document types
if ('text/plain' === $mimeType) {
$body = $view->file_get_contents($path);
if ($body != '') {
$doc->addField(\Zend_Search_Lucene_Field::UnStored('body', $body));
}
} else {
if ('text/html' === $mimeType) {
//TODO could be indexed, even if not local
$doc = \Zend_Search_Lucene_Document_Html::loadHTML($view->file_get_contents($path));
} else {
if ('application/pdf' === $mimeType) {
$doc = Pdf::loadPdf($view->file_get_contents($path));
// commented the mimetype checks, as the zend classes only understand docx and not doc files.
// FIXME distinguish doc and docx, xls and xlsx, ppt and pptx, in oc core mimetype helper ...
//} else if ('application/msword' === $mimeType) {
} else {
if (strtolower(substr($data['name'], -5)) === '.docx') {
$doc = \Zend_Search_Lucene_Document_Docx::loadDocxFile($localFile);
//} else if ('application/msexcel' === $mimeType) {
} else {
if (strtolower(substr($data['name'], -5)) === '.xlsx') {
$doc = \Zend_Search_Lucene_Document_Xlsx::loadXlsxFile($localFile);
//} else if ('application/mspowerpoint' === $mimeType) {
} else {
if (strtolower(substr($data['name'], -5)) === '.pptx') {
$doc = \Zend_Search_Lucene_Document_Pptx::loadPptxFile($localFile);
} else {
if (strtolower(substr($data['name'], -4)) === '.odt') {
$doc = Odt::loadOdtFile($localFile);
} else {
if (strtolower(substr($data['name'], -4)) === '.ods') {
$doc = Ods::loadOdsFile($localFile);
}
}
}
}
}
}
}
}
}
// Store filecache id as unique id to lookup by when deleting
$doc->addField(\Zend_Search_Lucene_Field::Keyword('pk', $pk));
// Store filename
$doc->addField(\Zend_Search_Lucene_Field::Text('filename', $data['name'], 'UTF-8'));
// Store document path to identify it in the search results
$doc->addField(\Zend_Search_Lucene_Field::Text('path', $path, 'UTF-8'));
$doc->addField(\Zend_Search_Lucene_Field::unIndexed('size', $data['size']));
$doc->addField(\Zend_Search_Lucene_Field::unIndexed('mimetype', $mimeType));
//self::extractMetadata($doc, $path, $view, $mimeType);
Lucene::updateFile($doc, $path, $user);
return true;
} else {
Util::writeLog('search_lucene', 'need mimetype for content extraction', Util::ERROR);
return false;
}
//.........这里部分代码省略.........