本文整理汇总了PHP中Zend_Search_Lucene_Document_Html类的典型用法代码示例。如果您正苦于以下问题：PHP Zend_Search_Lucene_Document_Html类的具体用法？PHP Zend_Search_Lucene_Document_Html怎么用？PHP Zend_Search_Lucene_Document_Html使用的例子？那么，这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了Zend_Search_Lucene_Document_Html类的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的PHP代码示例。
示例1: _indexate
/**
 * Recursively crawls a page of this site and adds it to the search index.
 *
 * The URL is prefixed with HTTP_HOST when it does not contain "http://" and a
 * single trailing slash is removed. Pages that were already visited, or whose
 * URL does not contain HTTP_HOST, are ignored. Every link found in the page
 * that has no extension or ends in php/html/txt/htm is indexed the same way.
 *
 * @param string $url Absolute URL, or a path that is appended to HTTP_HOST.
 * @return void
 */
protected function _indexate($url)
{
    if (!stristr($url, 'http://')) {
        $url = HTTP_HOST . $url;
    }
    $url = substr($url, -1) == '/' ? substr($url, 0, -1) : $url;

    // Nothing to do for pages already visited or located outside of this host.
    if (in_array($url, $this->_indexedUrl, true) || !stristr($url, HTTP_HOST)) {
        return;
    }

    // Registered before fetching, so a page that fails to load is not fetched
    // again every time another page links to it.
    array_push($this->_indexedUrl, $url);

    $html = file_get_contents($url);
    if ($html === false) {
        // Without this check the boolean false would be parsed as a document.
        Zend_Registry::get('Logger')->warn('Search index skipped, page could not be fetched: ' . $url);
        return;
    }

    // Silence libxml warnings for malformed markup, then put the caller's
    // setting back instead of unconditionally switching it off.
    $previousLibxmlSetting = libxml_use_internal_errors(true);
    $doc = Zend_Search_Lucene_Document_Html::loadHTML($html);
    libxml_use_internal_errors($previousLibxmlSetting);

    // Only the part between the <!--index--> markers is indexed when present.
    if (preg_match('/<\\!--index-->(.*)<\\!--\\/index-->/isu', $html, $matches)) {
        $html = $matches[1];
    }
    $html = preg_replace('#<script(.*?)>(.*?)</script>#is', '', $html);
    $html = strip_tags($html);

    $doc->addField(Zend_Search_Lucene_Field::Text('content', $html, 'utf-8'));
    $doc->addField(Zend_Search_Lucene_Field::UnIndexed('body', '', 'utf-8'));
    $doc->addField(Zend_Search_Lucene_Field::Text('url', $url, 'utf-8'));
    $this->_indexHandle->addDocument($doc);
    Zend_Registry::get('Logger')->info('Search index is created: ' . $url, Zend_Log::INFO);

    foreach ($doc->getLinks() as $link) {
        $temp = explode('.', $link);
        $ext = end($temp);
        // $link == $ext means the link contains no dot, i.e. has no extension.
        if ($link == $ext || in_array($ext, array('php', 'html', 'txt', 'htm'))) {
            $this->_indexate($link);
        }
    }
}
示例2: insert
/**
 * Inserts the provided action into the index.
 *
 * Returns without a value when shouldIndex() reports that nothing has to be
 * indexed. In every other case the method currently throws, because the
 * statements following the throw can never run.
 *
 * @throws sfException Always, once the shouldIndex() check has passed.
 */
public function insert()
{
if (!$this->shouldIndex()) {
return;
}
// The feature is explicitly disabled: this throw is unconditional.
throw new sfException(__CLASS__ . ' not implemented');
// NOTE(review): everything below is unreachable because of the throw above;
// it looks like a draft of the intended implementation.
// extract() imports the action properties as local variables; $params is
// presumably one of them - TODO confirm against getActionProperties().
extract($this->getActionProperties());
$output = $this->executeAction($params);
$content = $output->getContent();
$doc = Zend_Search_Lucene_Document_Html::loadHtml($content);
$doc->addField('sfl_title', $output->getLastTitle(), 2);
$doc->addField('sfl_uri', $this->getUri($params));
$doc->addField('sfl_description', $content);
$doc->addField('sfl_type', 'action');
$categories = $this->getActionCategories();
// The category field is only written when at least one category exists.
if (count($categories)) {
foreach ($categories as $category) {
$this->addCategory($category);
}
$doc->addField('sfl_category', implode(', ', $categories));
}
// The serialized category list is stored even when it is empty.
$doc->addField('sfl_categories_cache', serialize($categories));
$guid = $this->getGuid($params);
$this->addDocument($doc, $guid, 'action');
$this->getSearch()->getEventDispatcher()->notify(new sfEvent($this, 'indexer.log', array('Inserted action "%s" of module "%s" to index', $this->getAction(), $this->getModule())));
return $this;
}
示例3: highlightMatches
/**
 * Highlights the matches of this query in an HTML string.
 *
 * @param string $inputHTML HTML to parse and highlight.
 * @return string The HTML produced by the parsed document after highlighting.
 */
public function highlightMatches($inputHTML)
{
    // Highlighting starts with the first colour; the index is handed over as
    // a variable because highlightMatchesDOM() receives it by name.
    $colorIndex = 0;
    $htmlDocument = Zend_Search_Lucene_Document_Html::loadHTML($inputHTML);

    $this->highlightMatchesDOM($htmlDocument, $colorIndex);

    return $htmlDocument->getHTML();
}
示例4: testHtml
/**
 * Checks highlighting of a parsed HTML string and link extraction from an
 * HTML file.
 */
public function testHtml()
{
    $doc = Zend_Search_Lucene_Document_Html::loadHTML('<HTML><HEAD><TITLE>Page title</TITLE></HEAD><BODY>Document body.</BODY></HTML>');
    $this->assertTrue($doc instanceof Zend_Search_Lucene_Document_Html);

    $doc->highlight('document', '#66ffff');
    // The expected value goes first, so a failure labels both sides correctly.
    $this->assertEquals(
        "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\" \"http://www.w3.org/TR/REC-html40/loose.dtd\">\n<html>\n<head><title>Page title</title></head>\n<body><p><b style=\"color:black;background-color:#66ffff\">Document</b> body.</p></body>\n</html>\n",
        $doc->getHTML()
    );

    $doc = Zend_Search_Lucene_Document_Html::loadHTMLFile(dirname(__FILE__) . '/_files/_indexSource/contributing.documentation.html', true);
    $this->assertTrue($doc instanceof Zend_Search_Lucene_Document_Html);

    // assertEquals reports the differing elements, assertTrue(a == b) cannot.
    $this->assertEquals(
        array('index.html', 'contributing.html', 'contributing.bugs.html', 'contributing.wishlist.html'),
        array_values($doc->getHeaderLinks())
    );
    $this->assertEquals(
        array('contributing.bugs.html', 'contributing.wishlist.html', 'developers.documentation.html', 'faq.translators-revision-tracking.html', 'index.html', 'contributing.html'),
        array_values($doc->getLinks())
    );
}
示例5: testHtml
/**
 * Checks that highlighting wraps the matched term and that header and body
 * links are extracted from an HTML file.
 */
public function testHtml()
{
    $markup = '<HTML><HEAD><TITLE>Page title</TITLE></HEAD><BODY>Document body.</BODY></HTML>';
    $doc = Zend_Search_Lucene_Document_Html::loadHTML($markup);
    $this->assertTrue($doc instanceof Zend_Search_Lucene_Document_Html);

    $doc->highlight('document', '#66ffff');
    $highlightedBody = "<b style=\"color:black;background-color:#66ffff\">Document</b> body.";
    $this->assertTrue(false !== strpos($doc->getHTML(), $highlightedBody));

    $fixture = dirname(__FILE__) . '/_indexSource/_files/contributing.documentation.html';
    $doc = Zend_Search_Lucene_Document_Html::loadHTMLFile($fixture, true);
    $this->assertTrue($doc instanceof Zend_Search_Lucene_Document_Html);

    $expectedHeaderLinks = array('index.html', 'contributing.html', 'contributing.bugs.html', 'contributing.wishlist.html');
    $headerLinks = array_values($doc->getHeaderLinks());
    $this->assertTrue($headerLinks == $expectedHeaderLinks);

    $expectedLinks = array('contributing.bugs.html', 'contributing.wishlist.html', 'developers.documentation.html', 'faq.translators-revision-tracking.html', 'index.html', 'contributing.html');
    $links = array_values($doc->getLinks());
    $this->assertTrue($links == $expectedLinks);
}
示例6: extract
/**
 * Extracts the body text of an HTML file.
 *
 * An empty string is returned when the file does not exist or when loading
 * it raises an exception.
 *
 * @param String $filename Full filesystem path to the file to process.
 * @return String Value of the "body" field of the loaded document.
 */
public static function extract($filename)
{
    $htmlDocument = null;

    if (file_exists($filename)) {
        try {
            $htmlDocument = Zend_Search_Lucene_Document_Html::loadHTMLFile($filename, true);
        } catch (Exception $loadFailure) {
            // Best effort: a file that cannot be loaded yields no text.
            $htmlDocument = null;
        }
    }

    // The field is read outside of the try block on purpose, so only loading
    // errors are swallowed.
    return $htmlDocument === null ? '' : $htmlDocument->body;
}
示例7: addurl
/**
 * Demo action: adds the bundled Kohana home page to the search index and
 * renders the index view with a confirmation message.
 */
public function addurl()
{
    // The Search class loads the Zend libraries. Zend_Search_Lucene_Document_Html
    // is needed before the index itself is touched, so the libraries have to
    // be loaded explicitly first.
    Search::instance()->load_search_libs();

    // A local file is used for the purpose of the demo.
    $examplesDirectory = MODPATH . "kosearch" . DIRECTORY_SEPARATOR . "examples";
    $filename = $examplesDirectory . DIRECTORY_SEPARATOR . "kohana_home.html";

    $htmlDocument = Zend_Search_Lucene_Document_Html::loadHTMLFile($filename, true, "utf-8");
    Search::instance()->addDocument($htmlDocument);

    $message = 'Kohana page successfully added ↓ <a href="#form2" title="scroll down">scroll down</a> ↓';
    $this->index($message);
}
示例8: testHtmlNoFollowLinks
/**
 * Checks that links marked rel="nofollow" are returned by getLinks() only
 * while the "exclude nofollow links" option is switched off.
 *
 * The option is static, so its original value is put back when the test
 * ends, whether it passes or fails.
 */
public function testHtmlNoFollowLinks()
{
    $html = '<HTML>' . '<HEAD><TITLE>Page title</TITLE></HEAD>' . '<BODY>' . 'Document body.' . '<a href="link1.html">Link 1</a>.' . '<a href="link2.html" rel="nofollow">Link 1</a>.' . '</BODY>' . '</HTML>';
    $oldNoFollowValue = Zend_Search_Lucene_Document_Html::getExcludeNoFollowLinks();

    try {
        Zend_Search_Lucene_Document_Html::setExcludeNoFollowLinks(false);
        $doc1 = Zend_Search_Lucene_Document_Html::loadHTML($html);
        $this->assertTrue($doc1 instanceof Zend_Search_Lucene_Document_Html);
        $this->assertTrue(array_values($doc1->getLinks()) == array('link1.html', 'link2.html'));

        Zend_Search_Lucene_Document_Html::setExcludeNoFollowLinks(true);
        $doc2 = Zend_Search_Lucene_Document_Html::loadHTML($html);
        $this->assertTrue($doc2 instanceof Zend_Search_Lucene_Document_Html);
        $this->assertTrue(array_values($doc2->getLinks()) == array('link1.html'));
    } catch (Exception $e) {
        // catch-and-rethrow instead of "finally" keeps this runnable on old PHP versions.
        Zend_Search_Lucene_Document_Html::setExcludeNoFollowLinks($oldNoFollowValue);
        throw $e;
    }

    // Previously the saved value was never used, leaking the option into later tests.
    Zend_Search_Lucene_Document_Html::setExcludeNoFollowLinks($oldNoFollowValue);
}
示例9: indexFile
/**
 * Indexes a single file.
 *
 * HTML files are parsed with the Lucene HTML document class, every other
 * mimetype gets a plain document. The document receives the fields pk, path,
 * size and mimetype before it is handed to Lucene::updateFile().
 *
 * @author Jörn Dreyer <jfd@butonic.de>
 *
 * @param string $path the path of the file
 * @param string|null $user user whose files view is used; when null the
 *                          current filesystem view and user are used
 *
 * @return bool true when the document was handed to the index, false when
 *              the path is invalid or empty, no view is available or the
 *              file info carries no mimetype
 */
public static function indexFile($path = '', $user = null)
{
    if (!Filesystem::isValidPath($path)) {
        // Explicit false instead of a bare return, to honour the bool contract.
        return false;
    }
    if ($path === '') {
        //ignore the empty path element
        return false;
    }

    if (is_null($user)) {
        $view = Filesystem::getView();
        $user = \OCP\User::getUser();
    } else {
        $view = new \OC\Files\View('/' . $user . '/files');
    }
    if (!$view) {
        Util::writeLog('search_lucene', 'could not resolve filesystem view', Util::WARN);
        return false;
    }

    $root = $view->getRoot();
    $pk = md5($root . $path);

    // the cache already knows mime and other basic stuff
    $data = $view->getFileInfo($path);
    if (!isset($data['mimetype'])) {
        Util::writeLog('search_lucene', 'need mimetype for content extraction', Util::ERROR);
        return false;
    }
    $mimetype = $data['mimetype'];

    if ('text/html' === $mimetype) {
        $doc = \Zend_Search_Lucene_Document_Html::loadHTML($view->file_get_contents($path));
    } else {
        // FIXME 'application/msword' gets no special treatment yet: the Docx
        // loader uses ZipArchive and is not compatible with
        // OC\Files\Filesystem, so such files are indexed like any other type.
        $doc = new \Zend_Search_Lucene_Document();
    }

    // store fscacheid as unique id to lookup by when deleting
    $doc->addField(\Zend_Search_Lucene_Field::Keyword('pk', $pk));
    // Store document URL to identify it in the search results
    $doc->addField(\Zend_Search_Lucene_Field::Text('path', $path));
    $doc->addField(\Zend_Search_Lucene_Field::unIndexed('size', $data['size']));
    $doc->addField(\Zend_Search_Lucene_Field::unIndexed('mimetype', $mimetype));

    self::extractMetadata($doc, $path, $view, $mimetype);
    Lucene::updateFile($doc, $path, $user);

    return true;
}
示例10: indexWebsite
/**
 * Rebuilds the search index of a website from the content of all its pages.
 *
 * An existing index is emptied first, otherwise a new one is created. Each
 * page is added as an HTML document with the additional unindexed fields
 * "md5" (hash of the page content) and "pageId". Nothing is indexed when the
 * website has no pages.
 *
 * @param string $websiteId
 * @return string Location of the index of the website.
 * @throws CmsException With code 602 when the website does not exist.
 */
public function indexWebsite($websiteId)
{
    $websiteService = new Website('Website');
    if (!$websiteService->existsWebsiteAlready($websiteId)) {
        throw new CmsException('602', __METHOD__, __LINE__);
    }

    // The business layer has to be used for rendering.
    // NOTE(review): $renderBusiness and $modulService are not used below; they
    // are kept in case their constructors are relied upon - TODO confirm.
    $renderBusiness = new BusinessRender('Render');
    $modulService = new Modul('Modul');
    $pageService = new Page('Page');

    $allPageIds = $pageService->getIdsByWebsiteId($websiteId);
    $indexFileOfWebsite = $this->getIndexFileForWebsite($websiteId);

    if (is_array($allPageIds) && count($allPageIds) > 0) {
        if (file_exists($indexFileOfWebsite)) {
            $index = \Zend_Search_Lucene::open($indexFileOfWebsite);
            // Document ids run up to maxDoc(), which also counts deleted
            // documents. numDocs() only counts live ones, so using it as the
            // bound would miss the highest ids whenever deleted ids exist.
            $highestDocumentId = $index->maxDoc();
            for ($id = 0; $id < $highestDocumentId; ++$id) {
                if (!$index->isDeleted($id)) {
                    $index->delete($id);
                }
            }
        } else {
            $index = \Zend_Search_Lucene::create($indexFileOfWebsite);
        }

        foreach ($allPageIds as $pageId) {
            $pageContent = $this->getPageContent($websiteId, $pageId);
            // The second argument decides whether the page body is stored in
            // the index or only indexed.
            $storeContent = (bool) $this->isStoreContentEnabled();
            $document = \Zend_Search_Lucene_Document_Html::loadHTML($pageContent, $storeContent, 'UTF-8');
            $document->addField(\Zend_Search_Lucene_Field::unIndexed('md5', md5($pageContent)));
            $document->addField(\Zend_Search_Lucene_Field::unIndexed('pageId', $pageId));
            $index->addDocument($document);
        }

        $index->commit();
        $index->optimize();
        unset($index);
    }

    return $indexFileOfWebsite;
}
示例11: highlightMatchesDOM
/**
 * Highlights the words of the document body that match the wildcard pattern.
 *
 * @todo implementation
 *
 * @param Zend_Search_Lucene_Document_Html $doc
 * @param integer &$colorIndex
 */
public function highlightMatchesDOM(Zend_Search_Lucene_Document_Html $doc, &$colorIndex)
{
    // Turn the wildcard pattern into an anchored regular expression: after
    // quoting, "?" stands for one character and "*" for any run of characters.
    $quotedPattern = preg_quote($this->_pattern->text, '/');
    $wildcardRegexp = str_replace(array('\\?', '\\*'), array('.', '.*'), $quotedPattern);
    $matchExpression = '/^' . $wildcardRegexp . '$/';

    // Add the Unicode modifier only when PCRE unicode support is turned on.
    $pcreHasUnicode = @preg_match('/\\pL/u', 'a') == 1;
    if ($pcreHasUnicode) {
        $matchExpression .= 'u';
    }

    $bodyText = $doc->getFieldUtf8Value('body');
    $tokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($bodyText, 'UTF-8');

    $words = array();
    foreach ($tokens as $token) {
        $termText = $token->getTermText();
        if (preg_match($matchExpression, $termText) !== 1) {
            continue;
        }
        $words[] = $termText;
    }

    $doc->highlight($words, $this->_getHighlightColor($colorIndex));
}
示例12: testLoadHtmlWithAttributesInTagHTML
/**
 * The title has to be read even when the HTML tag carries attributes.
 *
 * @group ZF-10686
 */
public function testLoadHtmlWithAttributesInTagHTML()
{
    $markup = '<HTML lang="en_US"><HEAD><TITLE>Page title</TITLE></HEAD><BODY>Document body.</BODY></HTML>';

    $doc = Zend_Search_Lucene_Document_Html::loadHTML($markup);

    $this->assertEquals('Page title ', $doc->title);
}
示例13: indexFile
/**
* index a file
*
* @author Jörn Dreyer <jfd@butonic.de>
*
* @param string $path the path of the file
*
* @return bool
*/
public static function indexFile($path = '', $user = null)
{
if (!Filesystem::isValidPath($path)) {
return;
}
if ($path === '') {
//ignore the empty path element
return false;
}
if (is_null($user)) {
$view = Filesystem::getView();
$user = \OCP\User::getUser();
} else {
$view = new \OC\Files\View('/' . $user . '/files');
}
if (!$view) {
Util::writeLog('search_lucene', 'could not resolve filesystem view', Util::WARN);
return false;
}
if (!$view->file_exists($path)) {
Util::writeLog('search_lucene', 'file vanished, ignoring', Util::DEBUG);
return true;
}
$root = $view->getRoot();
$pk = md5($root . $path);
// the cache already knows mime and other basic stuff
$data = $view->getFileInfo($path);
if (isset($data['mimetype'])) {
$mimeType = $data['mimetype'];
// initialize plain lucene document
$doc = new \Zend_Search_Lucene_Document();
// index content for local files only
$localFile = $view->getLocalFile($path);
if ($localFile) {
//try to use special lucene document types
if ('text/plain' === $mimeType) {
$body = $view->file_get_contents($path);
if ($body != '') {
$doc->addField(\Zend_Search_Lucene_Field::UnStored('body', $body));
}
} else {
if ('text/html' === $mimeType) {
//TODO could be indexed, even if not local
$doc = \Zend_Search_Lucene_Document_Html::loadHTML($view->file_get_contents($path));
} else {
if ('application/pdf' === $mimeType) {
$doc = Pdf::loadPdf($view->file_get_contents($path));
// commented the mimetype checks, as the zend classes only understand docx and not doc files.
// FIXME distinguish doc and docx, xls and xlsx, ppt and pptx, in oc core mimetype helper ...
//} else if ('application/msword' === $mimeType) {
} else {
if (strtolower(substr($data['name'], -5)) === '.docx') {
$doc = \Zend_Search_Lucene_Document_Docx::loadDocxFile($localFile);
//} else if ('application/msexcel' === $mimeType) {
} else {
if (strtolower(substr($data['name'], -5)) === '.xlsx') {
$doc = \Zend_Search_Lucene_Document_Xlsx::loadXlsxFile($localFile);
//} else if ('application/mspowerpoint' === $mimeType) {
} else {
if (strtolower(substr($data['name'], -5)) === '.pptx') {
$doc = \Zend_Search_Lucene_Document_Pptx::loadPptxFile($localFile);
} else {
if (strtolower(substr($data['name'], -4)) === '.odt') {
$doc = Odt::loadOdtFile($localFile);
} else {
if (strtolower(substr($data['name'], -4)) === '.ods') {
$doc = Ods::loadOdsFile($localFile);
}
}
}
}
}
}
}
}
}
// Store filecache id as unique id to lookup by when deleting
$doc->addField(\Zend_Search_Lucene_Field::Keyword('pk', $pk));
// Store filename
$doc->addField(\Zend_Search_Lucene_Field::Text('filename', $data['name'], 'UTF-8'));
// Store document path to identify it in the search results
$doc->addField(\Zend_Search_Lucene_Field::Text('path', $path, 'UTF-8'));
$doc->addField(\Zend_Search_Lucene_Field::unIndexed('size', $data['size']));
$doc->addField(\Zend_Search_Lucene_Field::unIndexed('mimetype', $mimeType));
//self::extractMetadata($doc, $path, $view, $mimeType);
Lucene::updateFile($doc, $path, $user);
return true;
} else {
Util::writeLog('search_lucene', 'need mimetype for content extraction', Util::ERROR);
return false;
}
//.........这里部分代码省略.........
示例14: _extractText
private function _extractText($guid, $systemName, $fileName, $mimeType, $lang = 'id')
{
$query = "SELECT * FROM KutuRelatedItem where itemGuid='{$guid}' AND relateAs='RELATED_FILE'";
$results = $this->getDbHandler($lang)->query($query);
$rowset = $results->fetchAll(PDO::FETCH_OBJ);
if (count($rowset)) {
$row = $rowset[0];
$parentCatalogGuid = $row->relatedGuid;
if (!empty($systemName)) {
$fileName = $systemName;
}
$sDir1 = ROOT_DIR . DIRECTORY_SEPARATOR . 'uploads' . DIRECTORY_SEPARATOR . 'files' . DIRECTORY_SEPARATOR . $fileName;
$sDir2 = ROOT_DIR . DIRECTORY_SEPARATOR . 'uploads' . DIRECTORY_SEPARATOR . 'files' . DIRECTORY_SEPARATOR . $parentCatalogGuid . DIRECTORY_SEPARATOR . $fileName;
$sDir = '';
if (file_exists($sDir1)) {
$sDir = $sDir1;
} else {
if (file_exists($sDir2)) {
$sDir = $sDir2;
}
}
if (!empty($sDir)) {
$outpath = $sDir . '.txt';
switch ($mimeType) {
case 'application/pdf':
//$ch = curl_init('http://175.103.48.153:8983/solr/corehol/update/extract?literal.id='.$guid.'&literal.name=content&commit=true');
/*$ch = curl_init('http://175.103.48.153:8983/solr/corehol/update/extract?literal.id='.$guid.'&fmap.content=content&commit=true');
curl_setopt ($ch, CURLOPT_POSTFIELDS, array('myfile'=>'@'.$sDir));
curl_setopt ($ch, CURLOPT_POST, 1);
curl_setopt($ch, CURLOPT_BINARYTRANSFER, TRUE);
curl_setopt($ch, CURLOPT_HTTPHEADER, array('Content-type:multipart/form-data'));
$result = curl_exec ($ch);*/
/*$mapping_array = [
"literal.id" => "$guid",
"fmap.content" => "content",
"commit" => "true"
];
$ch = curl_init();
$solr_extraction_endpoint = "http://192.168.0.61:8983/solr/corehol/update/extract";
curl_setopt($ch, CURLOPT_POST, TRUE);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);
curl_setopt($ch, CURLOPT_URL, ($solr_extraction_endpoint . '?' . http_build_query($mapping_array,'','&')));
$cfile = curl_file_create($sDir);
curl_setopt($ch, CURLOPT_POSTFIELDS, array('myfile' => $cfile));
if(!curl_exec($ch) == TRUE)
{
throw new Exception('Curl Error:' . curl_error($ch));
echo "<br/>Curl Error:<br/>" . curl_error($ch);
}
curl_close($ch);
die;*/
//curl_setopt($ch, CURLOPT_HTTPHEADER, array('Content-Type:multipart/form-data'));
/*$cfile = $this->getCurlValue($sDir,'multipart/form-data',$fileName);
$data = array('file' => $cfile);
$ch = curl_init();
$options = array(CURLOPT_URL => ($solr_extraction_endpoint . '?' . http_build_query($mapping_array,'','&')),
CURLOPT_RETURNTRANSFER => true,
CURLINFO_HEADER_OUT => true, //Request header
CURLOPT_HEADER => true, //Return header
CURLOPT_SSL_VERIFYPEER => false, //Don't veryify server certificate
CURLOPT_POST => true,
CURLOPT_POSTFIELDS => $data
);
curl_setopt_array($ch, $options);
$result = curl_exec($ch);
$header_info = curl_getinfo($ch,CURLINFO_HEADER_OUT);
$header_size = curl_getinfo($ch, CURLINFO_HEADER_SIZE);
$header = substr($result, 0, $header_size);
$body = substr($result, $header_size);
curl_close($ch);*/
//system('curl "http://192.168.0.61:8983/solr/corehol/update/extract?literal.id="'.$guid.'"&fmap.content=content&commit=true" -F "myfile=@"'.$sDir);
//system('curl "'.($solr_extraction_endpoint . '?' . http_build_query($mapping_array,'','&')).'" -F "myfile=@"'.$sDir);
$pdfExtractor = $this->_pdfExtractor;
system("{$pdfExtractor} " . $sDir . ' ' . $outpath, $ret);
if ($ret == 0) {
$value = file_get_contents($outpath);
unlink($outpath);
echo 'content PDF: ' . $sDir . ' ' . strlen($value) . "\n";
if (strlen($value) > 20) {
return (new Pandamp_Utility_Posts())->sanitize_post_content($value);
} else {
echo "content file kosong\n";
return '';
}
}
if ($ret == 127) {
print "Could not find pdftotext tool.\n";
}
return '';
if ($ret == 1) {
print "Could not find pdf file.\n";
}
return '';
break;
case 'text/html':
case 'text/plain':
$docHtml = Zend_Search_Lucene_Document_Html::loadHTMLFile($sDir);
//.........这里部分代码省略.........
示例15: htmlFragmentHighlightMatches
/**
 * Highlights the matches inside an HTML fragment.
 *
 * The fragment is converted to UTF-8, wrapped into a minimal HTML page,
 * highlighted, and the body of that page is returned (without HTML header
 * and body tag).
 *
 * @param string $inputHtmlFragment
 * @param string $encoding Input HTML string encoding
 * @param Zend_Search_Lucene_Search_Highlighter_Interface|null $highlighter
 *        Highlighter to use; the default highlighter is created when null.
 * @return string
 */
public function htmlFragmentHighlightMatches($inputHtmlFragment, $encoding = 'UTF-8', $highlighter = null)
{
    if (null === $highlighter) {
        // Zend/Search/Lucene/Search/Highlighter/Default.php is expected to be
        // available already (the explicit require is disabled).
        $highlighter = new Zend_Search_Lucene_Search_Highlighter_Default();
    }

    $utf8Fragment = iconv($encoding, 'UTF-8//IGNORE', $inputHtmlFragment);
    $pageStart = '<html><head><META HTTP-EQUIV="Content-type" CONTENT="text/html; charset=UTF-8"/></head><body>';
    $pageEnd = '</body></html>';
    $inputHTML = $pageStart . $utf8Fragment . $pageEnd;

    // Zend/Search/Lucene/Document/Html.php is likewise expected to be
    // available already.
    $htmlDocument = Zend_Search_Lucene_Document_Html::loadHTML($inputHTML);
    $highlighter->setDocument($htmlDocument);

    $this->_highlightMatches($highlighter);

    return $htmlDocument->getHtmlBody();
}