本文整理汇总了PHP中SiteConfig::build方法的典型用法代码示例。如果您正苦于以下问题：PHP SiteConfig::build方法的具体用法？PHP SiteConfig::build怎么用？PHP SiteConfig::build使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类SiteConfig的用法示例。
在下文中一共展示了SiteConfig::build方法的6个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的PHP代码示例。
示例1: getSinglePage
/**
 * Try to fetch the single-page version of an article.
 *
 * Looks up the site config for the URL's host and, if it declares
 * single_page_link (evaluated against $html) or single_page_link_in_feed
 * (evaluated against the feed item's description), evaluates those XPath
 * expressions in order. The first expression that yields a URL wins; that
 * URL is resolved against $url and requested through the global $http client.
 *
 * @param object $item feed item; only get_description() is called on it
 * @param string $html HTML of the page already fetched from $url
 * @param string $url  URL the HTML was fetched from
 *
 * @return array|false the $http->get() response when its status_code is
 *                     below 300, otherwise false
 */
function getSinglePage($item, $html, $url)
{
    global $http, $extractor;

    $host = @parse_url($url, PHP_URL_HOST);
    $site_config = SiteConfig::build($host);
    if ($site_config === false) {
        // No config for this host: check for fingerprints so that whatever we
        // end up with gets cached under the host.
        if (!empty($extractor->fingerprints) && ($_fphost = $extractor->findHostUsingFingerprints($html))) {
            $site_config = SiteConfig::build($_fphost);
        }
        if ($site_config === false) {
            $site_config = new SiteConfig();
        }
        SiteConfig::add_to_cache($host, $site_config);
        // NOTE(review): the function gives up here even when a fingerprint
        // config was found; kept as in the original -- confirm this is intended.
        return false;
    }
    SiteConfig::add_to_cache($host, $site_config);

    $splink = null;
    if (!empty($site_config->single_page_link)) {
        $splink = $site_config->single_page_link;
    } elseif (!empty($site_config->single_page_link_in_feed)) {
        // single page link xpath is targeted at feed
        $splink = $site_config->single_page_link_in_feed;
        // so let's replace HTML with feed item description
        $html = $item->get_description();
    }
    if (!isset($splink)) {
        return false;
    }

    // Build DOM tree from HTML
    $readability = new Readability($html, $url);
    $xpath = new DOMXPath($readability->dom);

    // Loop through single_page_link xpath expressions; stop at the first
    // expression that produces a URL.
    $single_page_url = null;
    foreach ($splink as $pattern) {
        $elems = @$xpath->evaluate($pattern, $readability->dom);
        if (is_string($elems)) {
            $single_page_url = trim($elems);
            break;
        }
        if ($elems instanceof DOMNodeList && $elems->length > 0) {
            // $node, not $item: avoid clobbering the feed item parameter.
            foreach ($elems as $node) {
                if ($node instanceof DOMElement && $node->hasAttribute('href')) {
                    $single_page_url = $node->getAttribute('href');
                    // break 2: leave the pattern loop as well, otherwise a
                    // later pattern could overwrite the URL we just found.
                    break 2;
                }
                if ($node instanceof DOMAttr && $node->value) {
                    $single_page_url = $node->value;
                    break 2;
                }
            }
        }
    }

    // If we've got URL, resolve against $url
    if (isset($single_page_url) && ($single_page_url = makeAbsoluteStr($url, $single_page_url))) {
        // check it's not what we have already!
        if ($single_page_url != $url) {
            // it's not, so let's try to fetch it, using the single-page URL
            // as referer for this one request only.
            $_prev_ref = $http->referer;
            $http->referer = $single_page_url;
            $response = $http->get($single_page_url, true);
            $http->referer = $_prev_ref;
            if ($response && $response['status_code'] < 300) {
                return $response;
            }
        }
    }

    return false;
}
示例2: buildSiteConfig
/**
 * Build the merged site config for a URL.
 *
 * Starts from the config for the URL's host (or a default SiteConfig when
 * there is none) and, while the config's autodetect_on_failure() allows it,
 * appends the fingerprint-matched config and then the global config.
 *
 * @param string $url          URL whose host selects the site config
 * @param string $html         page HTML, passed to findHostUsingFingerprints()
 * @param bool   $add_to_cache whether the host, fingerprint, global and merged
 *                             configs are stored via SiteConfig::add_to_cache()
 *
 * @return SiteConfig the merged config (the cached "<host>.merged" entry when present)
 */
public function buildSiteConfig($url, $html = '', $add_to_cache = true)
{
    // Host name, lower-cased and without a leading "www."
    $host = strtolower(@parse_url($url, PHP_URL_HOST));
    if (strncmp($host, 'www.', 4) === 0) {
        $host = substr($host, 4);
    }

    // Short-circuit when the merged version is already cached.
    $merged_key = $host . '.merged';
    if (SiteConfig::is_cached($merged_key)) {
        $this->debug('Returning cached and merged site config for ' . $host);

        return SiteConfig::build($merged_key);
    }

    // Build from site_config/custom/ and standard/; fall back to defaults.
    $merged = SiteConfig::build($host);
    if (!$merged) {
        $merged = new SiteConfig();
    } elseif ($add_to_cache && !SiteConfig::is_cached($host)) {
        SiteConfig::add_to_cache($host, $merged);
    }

    // Fingerprint config: look for known fingerprints in the HTML.
    if ($merged->autodetect_on_failure() && !empty($this->fingerprints)) {
        $fingerprint_host = $this->findHostUsingFingerprints($html);
        if ($fingerprint_host) {
            $fingerprint_config = SiteConfig::build($fingerprint_host);
            if ($fingerprint_config) {
                $this->debug('Appending site config settings from ' . $fingerprint_host . ' (fingerprint match)');
                $merged->append($fingerprint_config);
                if ($add_to_cache && !SiteConfig::is_cached($fingerprint_host)) {
                    SiteConfig::add_to_cache($fingerprint_host, $fingerprint_config);
                }
            }
        }
    }

    // Global config. autodetect_on_failure() is asked again on purpose:
    // the append above may have changed the merged config.
    if ($merged->autodetect_on_failure()) {
        $global_config = SiteConfig::build('global', true);
        if ($global_config) {
            $this->debug('Appending site config settings from global.txt');
            $merged->append($global_config);
            if ($add_to_cache && !SiteConfig::is_cached('global')) {
                SiteConfig::add_to_cache('global', $global_config);
            }
        }
    }

    // Keep a copy of the merged result.
    if ($add_to_cache) {
        // do not store in APC if wildcard match
        $store_in_apc = $host == $merged->cache_key;
        $merged->cache_key = null;
        SiteConfig::add_to_cache($merged_key, $merged, $store_in_apc);
    }

    return $merged;
}
示例3: apc_cache_info
// Script fragment: report where the site config was saved, clear cached site
// configs from APC, then reload the saved config and list its test URLs.
// NOTE(review): $savepath is echoed without HTML escaping -- confirm it cannot contain markup.
echo '<p>Saved to <strong>' . $savepath . '</strong></p>';
// check caching
if ($options->caching) {
// NOTE(review): the message below ends with an opening <p> instead of a closing </p> -- confirm.
echo '<p>Note: caching is enabled — you may have to disable caching or delete cache files to see changes.<p>';
}
// When APC is enabled and its functions exist, delete every user-cache entry
// whose key ('info') starts with "sc.".
if ($options->apc && function_exists('apc_delete') && function_exists('apc_cache_info')) {
$_apc_data = apc_cache_info('user');
foreach ($_apc_data['cache_list'] as $_apc_item) {
if (substr($_apc_item['info'], 0, 3) == 'sc.') {
apc_delete($_apc_item['info']);
}
}
echo '<p>Cleared site config cache in APC.</p>';
}
// Point SiteConfig at the directory holding the file that was just saved.
SiteConfig::set_config_path(dirname($savepath));
// The second argument is an assignment expression: it passes true and, as a
// side effect, sets a local variable named $exact_host_match.
$sconfig = SiteConfig::build($save, $exact_host_match = true);
if ($sconfig) {
if (!empty($sconfig->test_url)) {
echo '<h4>Test URLs</h4>';
echo '<ul>';
foreach ($sconfig->test_url as $test_url) {
// Build the makefulltextfeed.php link: drop a leading "http://" (case-insensitive)
// and URL-encode the remainder as the "url" query parameter.
$ftr_test_url = $test_url;
if (strtolower(substr($ftr_test_url, 0, 7)) == 'http://') {
$ftr_test_url = substr($ftr_test_url, 7);
}
$ftr_test_url = '../makefulltextfeed.php?url=' . urlencode($ftr_test_url);
// One list item per test URL: the original URL, the Full-Text RSS result and the debug view.
echo '<li>';
echo '<a href="' . htmlspecialchars($test_url) . '" target="_blank">' . htmlspecialchars($test_url) . '</a>';
echo ' | <a href="' . $ftr_test_url . '" target="_blank">Full-Text RSS result</a>';
echo ' | <a href="' . $ftr_test_url . '&debug" target="_blank">Debug</a>';
echo '</li>';
示例4: process
/**
 * Process an HTML page using the site config for the URL's host.
 *
 * The visible part resets the extractor, loads the site config for the host
 * (or a default SiteConfig), optionally runs the HTML through Tidy, builds a
 * Readability DOM, strips unwanted elements and looks for the title. The rest
 * of the method, including its return value, is not shown in this excerpt.
 *
 * @param string $html       HTML to process
 * @param string $url        URL the HTML came from; its host selects the site config
 * @param bool   $smart_tidy Tidy is only used when this is true, the config's
 *                           tidy setting is on and tidy_parse_string() exists
 */
public function process($html, $url, $smart_tidy = true)
{
$this->reset();
// extract host name
$host = @parse_url($url, PHP_URL_HOST);
if (!($this->config = SiteConfig::build($host))) {
// no match, so use defaults
$this->config = new SiteConfig();
}
// store copy of config in our static cache array in case we need to process another URL
SiteConfig::add_to_cache($host, $this->config);
// use tidy (if it exists)?
// This fixes problems with some sites which would otherwise
// trouble DOMDocument's HTML parsing. (Although sometimes it
// makes matters worse, which is why you can override it in site config files.)
$tidied = false;
if ($this->config->tidy && function_exists('tidy_parse_string') && $smart_tidy) {
$this->debug('Using Tidy');
$tidy = tidy_parse_string($html, self::$tidy_config, 'UTF8');
if (tidy_clean_repair($tidy)) {
// keep the untidied HTML; presumably used by the omitted part of the method -- TODO confirm
$original_html = $html;
$tidied = true;
$html = $tidy->value;
}
unset($tidy);
}
// load and parse html
$this->readability = new Readability($html, $url);
// we use xpath to find elements in the given HTML document
// see http://en.wikipedia.org/wiki/XPath_1.0
$xpath = new DOMXPath($this->readability->dom);
// strip elements (using xpath expressions)
foreach ($this->config->strip as $pattern) {
$elems = @$xpath->query($pattern, $this->readability->dom);
// check for matches
if ($elems && $elems->length > 0) {
$this->debug('Stripping ' . $elems->length . ' elements (strip)');
// remove the matched nodes, last to first
for ($i = $elems->length - 1; $i >= 0; $i--) {
$elems->item($i)->parentNode->removeChild($elems->item($i));
}
}
}
// strip elements (using id and class attribute values)
foreach ($this->config->strip_id_or_class as $string) {
// drop quote characters so the value can sit inside the quoted XPath literal below
$string = strtr($string, array("'" => '', '"' => ''));
$elems = @$xpath->query("//*[contains(@class, '{$string}') or contains(@id, '{$string}')]", $this->readability->dom);
// check for matches
if ($elems && $elems->length > 0) {
$this->debug('Stripping ' . $elems->length . ' elements (strip_id_or_class)');
for ($i = $elems->length - 1; $i >= 0; $i--) {
$elems->item($i)->parentNode->removeChild($elems->item($i));
}
}
}
// strip images (using src attribute values)
foreach ($this->config->strip_image_src as $string) {
// drop quote characters so the value can sit inside the quoted XPath literal below
$string = strtr($string, array("'" => '', '"' => ''));
$elems = @$xpath->query("//img[contains(@src, '{$string}')]", $this->readability->dom);
// check for matches
if ($elems && $elems->length > 0) {
$this->debug('Stripping ' . $elems->length . ' image elements');
for ($i = $elems->length - 1; $i >= 0; $i--) {
$elems->item($i)->parentNode->removeChild($elems->item($i));
}
}
}
// strip elements using Readability.com and Instapaper.com ignore class names
// .entry-unrelated and .instapaper_ignore
// See https://www.readability.com/publishers/guidelines/#view-plainGuidelines
// and http://blog.instapaper.com/post/730281947
$elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' entry-unrelated ') or contains(concat(' ',normalize-space(@class),' '),' instapaper_ignore ')]", $this->readability->dom);
// check for matches
if ($elems && $elems->length > 0) {
$this->debug('Stripping ' . $elems->length . ' .entry-unrelated,.instapaper_ignore elements');
for ($i = $elems->length - 1; $i >= 0; $i--) {
$elems->item($i)->parentNode->removeChild($elems->item($i));
}
}
// strip elements that contain style="display: none;"
// NOTE(review): the expression matches the substring 'display:none' only, so the
// spaced form 'display: none' mentioned above would not match -- confirm.
$elems = @$xpath->query("//*[contains(@style,'display:none')]", $this->readability->dom);
// check for matches
if ($elems && $elems->length > 0) {
$this->debug('Stripping ' . $elems->length . ' elements with inline display:none style');
for ($i = $elems->length - 1; $i >= 0; $i--) {
$elems->item($i)->parentNode->removeChild($elems->item($i));
}
}
// try to get title
// the first pattern that yields a string or a non-empty node list wins
foreach ($this->config->title as $pattern) {
$elems = @$xpath->evaluate($pattern, $this->readability->dom);
if (is_string($elems)) {
$this->debug('Title expression evaluated as string');
$this->title = trim($elems);
break;
} elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
$this->debug('Title matched');
$this->title = $elems->item(0)->textContent;
break;
}
}
//......... part of the code is omitted here .........
示例5: getSinglePage
/**
 * Try to fetch the single-page version of an article.
 *
 * Uses the site config for the URL's host: its single_page_link expressions
 * are evaluated against $html, or its single_page_link_in_feed expressions
 * against the feed item's description. The first expression that yields a
 * URL wins; that URL is resolved against $url and requested through the
 * global $http client.
 *
 * @param object $item feed item; only get_description() is called on it
 * @param string $html HTML of the page already fetched from $url
 * @param string $url  URL the HTML was fetched from
 *
 * @return array|false the $http->get() response when its status_code is
 *                     below 300, otherwise false
 */
function getSinglePage($item, $html, $url)
{
    global $http;

    $host = @parse_url($url, PHP_URL_HOST);
    $site_config = SiteConfig::build($host);
    if ($site_config === false) {
        return false;
    }

    $splink = null;
    if (!empty($site_config->single_page_link)) {
        $splink = $site_config->single_page_link;
    } elseif (!empty($site_config->single_page_link_in_feed)) {
        // single page link xpath is targeted at feed
        $splink = $site_config->single_page_link_in_feed;
        // so let's replace HTML with feed item description
        $html = $item->get_description();
    }
    if (!isset($splink)) {
        return false;
    }

    // Build DOM tree from HTML
    $readability = new Readability($html, $url);
    $xpath = new DOMXPath($readability->dom);

    // Loop through single_page_link xpath expressions; stop at the first
    // expression that produces a URL.
    $single_page_url = null;
    foreach ($splink as $pattern) {
        $elems = @$xpath->evaluate($pattern, $readability->dom);
        if (is_string($elems)) {
            $single_page_url = trim($elems);
            break;
        }
        if ($elems instanceof DOMNodeList && $elems->length > 0) {
            // $node, not $item: avoid clobbering the feed item parameter.
            foreach ($elems as $node) {
                // An XPath result may contain attribute or text nodes, which
                // have no hasAttribute() method, so check the node type first.
                if ($node instanceof DOMElement && $node->hasAttribute('href')) {
                    $single_page_url = $node->getAttribute('href');
                    // break 2: leave the pattern loop as well, otherwise a
                    // later pattern could overwrite the URL we just found.
                    break 2;
                }
                if ($node instanceof DOMAttr && $node->value) {
                    // expression selected the attribute itself (e.g. .../@href)
                    $single_page_url = $node->value;
                    break 2;
                }
            }
        }
    }

    // If we've got URL, resolve against $url
    if (isset($single_page_url) && ($single_page_url = makeAbsoluteStr($url, $single_page_url))) {
        // check it's not what we have already!
        if ($single_page_url != $url) {
            // it's not, so let's try to fetch it...
            if (($response = $http->get($single_page_url, true)) && $response['status_code'] < 300) {
                return $response;
            }
        }
    }

    return false;
}
示例6: process
/**
 * Process an HTML page using the site config for the URL's host.
 *
 * The visible part resets the extractor, loads the site config for the host
 * (falling back to a fingerprint-matched config, then to a default
 * SiteConfig), applies the config's string replacements, optionally runs the
 * HTML through Tidy, builds a Readability DOM and looks for title, author
 * and language. The rest of the method, including its return value, is not
 * shown in this excerpt.
 *
 * @param string $html       HTML to process
 * @param string $url        URL the HTML came from; its host selects the site config
 * @param bool   $smart_tidy Tidy is only used when this is true, the config's
 *                           tidy setting is on and tidy_parse_string() exists
 */
public function process($html, $url, $smart_tidy = true)
{
$this->reset();
// extract host name
$host = @parse_url($url, PHP_URL_HOST);
if (!($this->config = SiteConfig::build($host))) {
// no match, check HTML for fingerprints
if (!empty($this->fingerprints) && ($_fphost = $this->findHostUsingFingerprints($html))) {
$this->config = SiteConfig::build($_fphost);
}
unset($_fphost);
if (!$this->config) {
// no match, so use defaults
$this->config = new SiteConfig();
}
}
//echo count($this->config->body);
// store copy of config in our static cache array in case we need to process another URL
SiteConfig::add_to_cache($host, $this->config);
// do string replacements
// each entry is used as a (search, replace) pair: index 0 is replaced by index 1
foreach ($this->config->replace_string as $_repl) {
$html = str_replace($_repl[0], $_repl[1], $html);
}
unset($_repl);
// use tidy (if it exists)?
// This fixes problems with some sites which would otherwise
// trouble DOMDocument's HTML parsing. (Although sometimes it
// makes matters worse, which is why you can override it in site config files.)
$tidied = false;
if ($this->config->tidy && function_exists('tidy_parse_string') && $smart_tidy) {
$this->debug('Using Tidy');
$tidy = tidy_parse_string($html, self::$tidy_config, 'UTF8');
if (tidy_clean_repair($tidy)) {
// keep the untidied HTML; presumably used by the omitted part of the method -- TODO confirm
$original_html = $html;
$tidied = true;
$html = $tidy->value;
}
unset($tidy);
}
// load and parse html
$this->readability = new Readability($html, $url);
// we use xpath to find elements in the given HTML document
// see http://en.wikipedia.org/wiki/XPath_1.0
$xpath = new DOMXPath($this->readability->dom);
// try to get title
// the first pattern that yields a string or a non-empty node list wins
foreach ($this->config->title as $pattern) {
$elems = @$xpath->evaluate($pattern, $this->readability->dom);
if (is_string($elems)) {
$this->debug('Title expression evaluated as string');
$this->title = trim($elems);
break;
} elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
$this->debug('Title matched');
$this->title = $elems->item(0)->textContent;
// remove title from document
try {
$elems->item(0)->parentNode->removeChild($elems->item(0));
} catch (DOMException $e) {
// do nothing
}
break;
}
}
// try to get author (if it hasn't already been set)
if (empty($this->author)) {
foreach ($this->config->author as $pattern) {
$elems = @$xpath->evaluate($pattern, $this->readability->dom);
if (is_string($elems)) {
$this->debug('Author expression evaluated as string');
// an empty string result does not count; move on to the next pattern
if (trim($elems) != '') {
$this->author[] = trim($elems);
break;
}
} elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
foreach ($elems as $elem) {
// skip nodes that have no parent node
if (!isset($elem->parentNode)) {
continue;
}
$this->author[] = trim($elem->textContent);
}
// stop at the first pattern that produced at least one author
if (!empty($this->author)) {
break;
}
}
}
}
// try to get language
// fixed expressions: the lang attribute of <html>, then the DC.language meta tag
$_lang_xpath = array('//html[@lang]/@lang', '//meta[@name="DC.language"]/@content');
foreach ($_lang_xpath as $pattern) {
$elems = @$xpath->evaluate($pattern, $this->readability->dom);
if (is_string($elems)) {
if (trim($elems) != '') {
$this->language = trim($elems);
break;
}
} elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
foreach ($elems as $elem) {
// skip nodes that have no parent node
if (!isset($elem->parentNode)) {
continue;
}
//......... part of the code is omitted here .........