当前位置: 首页>>代码示例>>PHP>>正文


PHP simple_html_dom::clear方法代码示例

本文整理汇总了PHP中simple_html_dom::clear方法的典型用法代码示例。如果您正苦于以下问题:PHP simple_html_dom::clear方法的具体用法?PHP simple_html_dom::clear怎么用?PHP simple_html_dom::clear使用的例子?那么, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在simple_html_dom的用法示例。


在下文中一共展示了simple_html_dom::clear方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的PHP代码示例。

示例1: clubURL

/**
 * Scrapes a club page and saves one record per season row.
 *
 * The club name is read from the third row of the first table on the page and
 * the season rows from the third table. Only rows whose first cell is numeric
 * (the year) are stored, keyed on club + year. The club name is also appended
 * to the global `clubs` list.
 *
 * @param string $url Address of the club page to scrape.
 * @return void
 */
function clubURL($url)
{
    $html = scraperwiki::scrape($url);
    $dom = new simple_html_dom();
    $dom->load($html);
    // NOTE(review): the search string below is a plain space as it appears in this
    // file; the original may have targeted a non-breaking space / &nbsp; - confirm.
    $clubName = trim(str_replace(' ', '', $dom->find('table', 0)->find('tr', 2)->plaintext));
    $formatClubName = trim(preg_replace('/\\s+/', ' ', $clubName));
    // $GLOBALS is the real superglobal; the former `$_GLOBAL` was just an
    // ordinary local variable, so the club list was silently discarded.
    $GLOBALS['clubs'][] = $formatClubName;
    echo 'running ' . $formatClubName . "\n";
    foreach ($dom->find('table', 2)->find('tr') as $row) {
        // Rows without a numeric year in the first cell (headers, separators) are skipped.
        if (is_numeric($row->find('td', 0)->plaintext)) {
            $year = trim($row->find('td', 0)->plaintext);
            $position = trim(str_replace(' ', '', $row->find('td', 1)->plaintext));
            if (trim($position) == 'Champion') {
                $position = 1;
            }
            $leagueLevel = trim($row->find('td', 2)->plaintext);
            $overallPosition = trim($row->find('td', 3)->plaintext);
            // Dots are stripped from the attendance figures (presumably thousands separators).
            $avgAttendance = trim(str_replace('.', '', $row->find('td', 4)->plaintext));
            $totalAttendance = trim(str_replace('.', '', $row->find('td', 12)->plaintext));
            $dataset = array('club' => $formatClubName, 'year' => $year, 'finishedPosition' => $position, 'league' => $leagueLevel, 'overallPosition' => $overallPosition, 'avgAttendance' => $avgAttendance, 'totalAttendance' => $totalAttendance);
            scraperwiki::save(array('club', 'year'), $dataset);
        }
    }
    /*
     * The next two lines stop a memory leak in Simple HTML DOM as per
     * http://simplehtmldom.sourceforge.net/manual_faq.htm#memory_leak
     */
    $dom->clear();
    unset($dom);
}
开发者ID:flyeven,项目名称:scraperwiki-scraper-vault,代码行数:30,代码来源:pauls-hmhse-scraper.php

示例2: save

 /**
  * Converts an HTML fragment to a Word 2007 document and writes it to disk.
  *
  * The fragment is wrapped in dummy <html><body> elements, parsed with
  * simple_html_dom and handed to htmltodocx_insert_html(), which fills a
  * PHPWord section. The document is saved as "<dir>wordtemp/<timestamp>.docx".
  *
  * @param string $html HTML fragment to convert (without <html>/<body>).
  * @param string $dir  Base directory; "wordtemp/" is appended directly, so it
  *                     is expected to end with a directory separator.
  * @return string Path of the written .docx file.
  */
 public function save($html, $dir)
 {
     import("@.ORG.htmltodocx.documentation.support_functions");
     $phpword_object = new PHPWord();
     $section = $phpword_object->createSection();
     // HTML Dom object:
     $html_dom = new simple_html_dom();
     $html_dom->load('<html><body>' . $html . '</body></html>');
     // Note, we needed to nest the html in a couple of dummy elements.
     // Create the dom array of elements which we are going to work on:
     $html_dom_array = $html_dom->find('html', 0)->children();
     // We need this for setting base_root and base_path in the initial_state array
     // (below). We are using a function here (derived from Drupal) to create these
     // paths automatically - you may want to do something different in your
     // implementation. This function is in the included file
     // documentation/support_functions.inc.
     $paths = htmltodocx_paths();
     // Provide some initial settings:
     $initial_state = array('phpword_object' => &$phpword_object, 'base_root' => $paths['base_root'], 'base_path' => $paths['base_path'], 'current_style' => array('size' => '11'), 'parents' => array(0 => 'body'), 'list_depth' => 0, 'context' => 'section', 'pseudo_list' => TRUE, 'pseudo_list_indicator_font_name' => 'Wingdings', 'pseudo_list_indicator_font_size' => '7', 'pseudo_list_indicator_character' => 'l ', 'table_allowed' => TRUE, 'treat_div_as_paragraph' => TRUE, 'style_sheet' => htmltodocx_styles_example());
     // Convert the HTML and put it into the PHPWord object
     htmltodocx_insert_html($section, $html_dom_array[0]->nodes, $initial_state);
     // Clear the HTML dom object:
     $html_dom->clear();
     unset($html_dom);
     // Save File
     // (The former `$str = explode(".", $h2d_file_uri)` read the variable before
     // it was assigned and its result was never used, so it has been removed.)
     $h2d_file_uri = $dir . "wordtemp/" . time() . ".docx";
     // Make sure the target folder exists before writing into it.
     if (!file_exists($dir . "wordtemp/")) {
         $this->createFolders($dir . "wordtemp/");
     }
     $objWriter = PHPWord_IOFactory::createWriter($phpword_object, 'Word2007');
     $objWriter->save($h2d_file_uri);
     return $h2d_file_uri;
 }
开发者ID:tmlsoft,项目名称:main,代码行数:35,代码来源:HtmlToDocx.php

示例3: parse

 /**
  * Parses the form's print template, caches the extracted elements and
  * persists the re-rendered template through FlowFormType.
  *
  * @param bool $isUpdate When true, existing data-id attributes are stripped
  *                       from the template and the item counter starts at 0;
  *                       otherwise the stored template and counter are used.
  * @return void
  */
 public function parse($isUpdate = false)
 {
     Ibos::import("application.extensions.simple_html_dom", true);
     $template = $isUpdate
         ? preg_replace("/\\s+data-id\\s?=\\s?\"?\\d+\"?/i", "", $this->printmodel)
         : $this->printmodel;
     $counter = $isUpdate ? 0 : intval($this->itemmax);
     $parsed = array();
     $dom = new simple_html_dom();
     $dom->load($template, true, true, CHARSET);
     $controls = $dom->find("ic");
     $itemConfig = $this->getItemConfig();
     if ($controls && $itemConfig) {
         // NOTE(review): refactor() presumably updates $counter and fills
         // $parsed by reference - confirm against its signature.
         $this->refactor($controls, $itemConfig, $counter, $parsed);
     }
     $rendered = $dom->save();
     $this->_cache = $parsed;
     CacheUtil::set("form_" . $this->ID, $parsed);
     $changes = array("printmodelshort" => $rendered);
     // Only write the counter back when it differs from the stored value.
     if ($counter != $this->itemmax) {
         $changes["itemmax"] = $counter;
     }
     $dom->clear();
     FlowFormType::model()->modify($this->ID, $changes);
 }
开发者ID:AxelPanda,项目名称:ibos,代码行数:28,代码来源:SimpleHtmlParser.php

示例4: str_get_html

/**
 * Builds a simple_html_dom object from an HTML string.
 *
 * @param string $str             HTML source to parse.
 * @param bool   $lowercase       Passed to the constructor and to load().
 * @param bool   $forceTagsClosed Passed to the constructor.
 * @param string $target_charset  Passed to the constructor.
 * @param bool   $stripRN         Passed to the constructor and to load().
 * @param string $defaultBRText   Passed to the constructor.
 * @param string $defaultSpanText Passed to the constructor.
 * @return simple_html_dom|false The loaded DOM, or false when $str is empty
 *                               or longer than MAX_FILE_SIZE bytes.
 */
function str_get_html($str, $lowercase = true, $forceTagsClosed = true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN = true, $defaultBRText = DEFAULT_BR_TEXT, $defaultSpanText = DEFAULT_SPAN_TEXT)
{
    $document = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText);

    $isLoadable = !empty($str) && strlen($str) <= MAX_FILE_SIZE;
    if ($isLoadable) {
        $document->load($str, $lowercase, $stripRN);
        return $document;
    }

    // Nothing usable to parse: release the freshly created object first.
    $document->clear();
    return false;
}
开发者ID:salatproduction,项目名称:GeekTimes-MegaMozg-Parser-Standalone,代码行数:9,代码来源:htmldom.class.php

示例5: generate_docx

/**
 * Renders an HTML document into a Word 2007 file.
 *
 * @param string $html                   Full HTML document; it must contain an
 *                                       <html> element, whose first child is
 *                                       the node tree that gets converted.
 * @param string $file_path              Destination path of the .docx file.
 * @param array  $file_takeout_tmp_files Passed by reference into the converter
 *                                       state as 'download_img_tmp'.
 * @return void
 */
function generate_docx($html, $file_path, &$file_takeout_tmp_files)
{
    $wordDocument = new PHPWord();
    $wordSection = $wordDocument->createSection();

    $dom = new simple_html_dom();
    $dom->load($html);
    $topLevel = $dom->find('html', 0)->children();

    $paths = htmltodocx_paths();
    $state = array(
        'phpword_object' => &$wordDocument,
        'base_root' => $paths['base_root'],
        'base_path' => $paths['base_path'],
        'current_style' => array('size' => '11'),
        'parents' => array(0 => 'body'),
        'list_depth' => 0,
        'context' => 'section',
        'pseudo_list' => TRUE,
        'pseudo_list_indicator_font_name' => 'Wingdings',
        'pseudo_list_indicator_font_size' => '7',
        'pseudo_list_indicator_character' => 'l ',
        'table_allowed' => TRUE,
        'treat_div_as_paragraph' => FALSE,
        'style_sheet' => htmltodocx_styles(),
        'download_img_path' => elgg_get_data_path(),
        'download_img_tmp' => &$file_takeout_tmp_files
    );
    htmltodocx_insert_html($wordSection, $topLevel[0]->nodes, $state);

    // Free the parsed DOM before the document is written out.
    $dom->clear();
    unset($dom);

    // 'Word2007' is the only writer format used here.
    $writer = PHPWord_IOFactory::createWriter($wordDocument, 'Word2007');
    $writer->save($file_path);
}
开发者ID:aleph1888,项目名称:elgg_file_takeout,代码行数:16,代码来源:file_takeout.php

示例6: clean_children

 /**
  * Returns the given HTML with the markup of its top-level child elements removed.
  *
  * The fragment is wrapped in a synthetic <crawler> element, and the string
  * form of each direct child of that wrapper is cut out of the wrapper's
  * inner HTML.
  *
  * @param string $a_sHTML HTML fragment (taken by reference, not modified here).
  * @return string What is left of the fragment once the child elements are removed.
  */
 private function clean_children(&$a_sHTML)
 {
     $l_oDom = new simple_html_dom();
     $l_oDom->load('<crawler>' . $a_sHTML . '</crawler>');
     $l_oWrapper = $l_oDom->find('crawler', 0);
     $l_sRemainder = (string) $l_oWrapper->innertext;
     foreach ($l_oWrapper->children() as $l_oChild) {
         // Casting the node to string yields the markup that is stripped out.
         $l_sRemainder = str_replace((string) $l_oChild, '', $l_sRemainder);
     }
     $l_oDom->clear();
     unset($l_oWrapper, $l_oDom);
     return $l_sRemainder;
 }
开发者ID:sergrin,项目名称:crawlers-il,代码行数:16,代码来源:CExtractor.class.php

示例7: add_h_filter

 /**
  * Appends a sub-menu after every link that points at "?page_id=N".
  *
  * For each such link the referenced page is fetched with get_page(), its
  * <h1 id="..."> headings are collected, and a <ul class='submenu'> with one
  * anchor link per heading is inserted right after the link.
  *
  * @param string $foo HTML fragment to process.
  * @return string The processed markup as reported by the DOM's outertext,
  *                including the <html><body> wrapper added for parsing.
  */
 function add_h_filter($foo)
 {
     $p = new simple_html_dom();
     // The wrapper is now closed properly; it used to be "closed" with a
     // second '<html><body>', which leaked stray opening tags into the output.
     $p->load('<html><body>' . $foo . '</body></html>');
     foreach ($p->find("a") as $elm) {
         error_log($elm->href);
         $match = array();
         if (!preg_match("/\\?page_id\\=(\\d+?)\$/", $elm->href, $match)) {
             continue;
         }
         $page_id = $match[1];
         $page = get_page($page_id);
         if (!$page) {
             // The link refers to a page that could not be loaded; leave it untouched.
             continue;
         }
         $p2 = new simple_html_dom();
         $p2->load('<html><body>' . $page->post_content . '</body></html>');
         $submenu_array = array();
         foreach ($p2->find('h1[id]') as $idh1) {
             error_log($idh1->id);
             $submenu_array[] = array($idh1->id, $idh1->plaintext);
         }
         if (sizeof($submenu_array) !== 0) {
             $submenu = "<ul class='submenu'>\n";
             $blogurl = get_bloginfo('url');
             foreach ($submenu_array as $sub) {
                 // Each entry is closed with </li> (previously a mismatched </h1>).
                 $submenu .= '<li><a href="' . $blogurl . '?page_id=' . $page_id . '#' . $sub[0] . '">' . $sub[1] . '</a></li>' . "\n";
             }
             $submenu .= '</ul>';
             $elm->outertext = $elm->outertext . $submenu;
         }
         // Release the per-page DOM to avoid the simple_html_dom memory leak.
         $p2->clear();
         unset($p2);
     }
     $foo = $p->outertext;
     $p->clear();
     unset($p);
     return $foo;
 }
开发者ID:hryk,项目名称:my-wp-plugins,代码行数:41,代码来源:toc-pages-widget.php

示例8: addToTable

	/**
	 * Inserts a button cell into the share table identified by $position.
	 *
	 * The table (class "$position myApiShareTable") is created when missing:
	 * prepended to the text for 'myApiShareTop', appended otherwise. The button
	 * becomes the first cell of the table's ".myApiButtons" row, which is
	 * created when absent. Finally, if the ".myApiShareBottom" table has both a
	 * ".myApiCommentsCell" and a ".myApiButtons" row, the comments cell's
	 * colspan is set to the number of <td> elements found in that row.
	 *
	 * @param string $text     HTML to modify.
	 * @param string $position CSS class of the target table.
	 * @param string $button   Markup of the button to insert.
	 * @return string The modified HTML.
	 */
	function addToTable($text,$position,$button){
		$selector = '.'.$position;
		$dom = new simple_html_dom();
		$dom->load($text);

		if(!$dom->find($selector,0)){
			$table = '<table class="'.$position.' myApiShareTable"></table>';
			if($position == 'myApiShareTop'){
				$text = $table.$text;
			}else{
				$text = $text.$table;
			}
			$dom->load($text);
		}

		// Serialise and re-parse so that the lookups below see the current markup.
		$text = $dom->save();
		$dom->load($text);

		$container = $dom->find($selector,0);
		$rowEl = $container->find('.myApiButtons',0);
		if($rowEl){
			$firstRow = $rowEl->find('table',0)->find('tr',0);
			$firstRow->innertext = '<td>'.$button.'</td>'.$firstRow->innertext;
		}else{
			$tr = '<tr class="myApiButtons"><td><table><tr><td>'.$button.'</td></tr></table></td></tr>';
			$container->innertext = $tr.$container->innertext;
		}

		$text = $dom->save();
		$dom->load($text);

		$commentsTable = $dom->find('.myApiShareBottom',0);
		$commentsEl = $commentsTable ? $commentsTable->find('.myApiCommentsCell',0) : null;
		$buttonRow = $commentsEl ? $commentsTable->find('.myApiButtons',0) : null;
		if($buttonRow){
			$commentsEl->colspan = sizeof($buttonRow->find('td'));
			$text = $dom->save();
		}

		$dom->clear();
		unset($dom);
		return $text;
	}
开发者ID:rhumme,项目名称:myApi,代码行数:40,代码来源:myApiDom.php

示例9: _Process_Recieved_Content

 public static function _Process_Recieved_Content($_HTML_CONTENT, $_Cung1, $_Cung2, $_Summary, $_SourceUri, $_LinkId, $_ImageLink)
 {
     if ($_HTML_CONTENT != '') {
         // Create a DOM object
         require_once Kohana::find_file('classes', 'vendor/simple_html_dom');
         $html = new simple_html_dom();
         // Load HTML from a string
         $html->load($_HTML_CONTENT);
         unset($_HTML_CONTENT);
         if ($html) {
             $story = new Model_Horoscope_XungHop();
             $ktra = true;
             if ($_Cung1 == '-' || $_Cung2 == '-') {
                 $ktra = FALSE;
             }
             $story->cung_1 = $_Cung1;
             $story->cung_2 = $_Cung2;
             $story->alias = $_Cung1 . '_' . $_Cung2;
             if (self::CheckRecordByAlias($story->alias)) {
                 $story->alias = $_Cung1 . '__' . $_Cung2;
             }
             $story->tom_tat = $_Summary;
             $story->ngay_tao = date("Y-m-d");
             $story->url_nguon = $_SourceUri;
             $story->auto_get = true;
             //begin find elements
             #find date post
             $date = $html->find('div[class="datetime"]', 0);
             if ($date) {
                 $d = explode(',', $date->plaintext);
                 if (isset($d[1])) {
                     //var_dump($d);
                     //exit;
                     $d1 = explode(' ', trim($d[1]));
                     list($ngay, $thang, $nam) = explode('/', $d1[0]);
                     $story->source_date = date("Y-m-d h:i:s", strtotime($nam . '-' . $thang . '-' . $ngay . ' ' . $d1[1] . ':00'));
                 } else {
                     $story->source_date = date("Y-m-d h:i:s");
                 }
             } else {
                 $story->source_date = date("Y-m-d h:i:s");
             }
             //find content
             $content = $html->find('div[id="content_document"]', 0);
             if ($content) {
                 $string = $content->innertext;
                 # remove white space
                 $string = str_replace(array("\r\n", "\r", "\n", "\t"), '', $string);
                 $string = preg_replace('/(<!--.+?-->)/s', '', $string);
                 $string = preg_replace('@<a[^>]*>(.*)</a>@ismUx', '$1', $string);
                 $string = preg_replace('/<p[ ]class="pAuthor">.*<\\/p>/ismxU', '', $string);
                 $string = preg_replace('/<p[ ]class="pSource">.*<\\/p>/ismxU', '', $string);
                 $story->noi_dung = $string;
                 $story->kiem_tra = $ktra;
                 $story->save();
                 if ($story->identifier()) {
                     if ($ktra) {
                         //get image thumb => save to disk => update record in db
                         $path = 'assets/horoscope/xung-hop/' . $story->alias . '/';
                         $img = Vendor_Crawler::get_file_from_url_by_curl($_ImageLink, $save_to_path = $path, $file_name_to_set = $story->alias . '-thumb');
                         if ($img) {
                             //check file size, if = 0 -> mean file can't get
                             if (filesize($img) == 0) {
                                 @copy('assets/horoscope/thumb_140.jpg', $img);
                             }
                             $story->hinh_anh = '/' . $img;
                         } else {
                             $story->hinh_anh = $_ImageLink;
                         }
                     } else {
                         $story->hinh_anh = $_ImageLink;
                     }
                     if ($ktra != FALSE) {
                         //print_r($img);
                         $html2 = new simple_html_dom();
                         $html2->load($story->noi_dung);
                         $images = $html2->find('img');
                         if (count($images) > 0) {
                             for ($i = 0; $i < count($images); $i++) {
                                 unset($images[$i]->onclick);
                                 $file_name = 'anh_' . $i + 1;
                                 $get_file = Vendor_Crawler::get_file_from_url_by_curl($images[$i]->src, $save_to_path = $path, $file_name_to_set = $file_name);
                                 if (filesize(ltrim($get_file, '/')) == 0) {
                                     unset($images[$i]);
                                 } else {
                                     $images[$i]->src = '/' . $get_file;
                                 }
                             }
                         }
                         $story->noi_dung = $html2->save();
                         $html2->clear();
                         unset($html2);
                     } else {
                         $story->hinh_anh = $_ImageLink;
                     }
                     $story->save();
                     //insert done => update from tmp table
                     Model_Horoscope_XungHopLinkBLL::UpdateRecordStatus($_LinkId);
                     self::_print_to_console('Done: ' . $_SourceUri);
                 } else {
//.........这里部分代码省略.........
开发者ID:abdul-baten,项目名称:hbcms,代码行数:101,代码来源:xunghopbll.php

示例10: scrapeHTML

/**
 * Scrapes one flight listing page and stores a record per table row.
 *
 * @param string $param Value of the "type" query parameter appended to BASE_URL.
 * @param string $type  Label stored as "flight_type" on every saved record.
 * @return void
 */
function scrapeHTML($param, $type)
{
    $page = scraperWiki::scrape(BASE_URL . "?type={$param}");
    $dom = new simple_html_dom();
    $dom->load($page);

    // Every matching table row is treated as one flight.
    foreach ($dom->find("TR[@HEIGHT='25']") as $row) {
        $cells = $row->find("td");
        $airline = removeSpaces($cells[0]->plaintext);
        $flightNum = removeSpaces($cells[1]->plaintext);
        $destination = removeSpaces($cells[2]->plaintext);
        $time = removeSpaces($cells[3]->plaintext);
        $gate = removeSpaces($cells[4]->plaintext);
        $remarks = removeSpaces($cells[5]->plaintext);

        // The header row is recognised by its literal "Airline" caption.
        if ($airline != "Airline") {
            $record = array(
                "date" => date("m.d.y"),
                "airline" => $airline,
                "flight_type" => $type,
                "flight_num" => $flightNum,
                "destination" => $destination,
                "time" => $time,
                "gate" => $gate,
                "remarks" => $remarks
            );
            saveData(array("date", "airline", "flight_type", "flight_num"), $record);
        }
    }

    $dom->clear();
}
开发者ID:flyeven,项目名称:scraperwiki-scraper-vault,代码行数:29,代码来源:php.php

示例11: lightboxPlusReplace


//.........这里部分代码省略.........
                         }
                         break;
                 }
             }
             break;
         default:
             /**
              *  find all links with image only else if (do not autolightbox textlinks) then
              */
             foreach ($html->find('a[href*=jpg$] img, a[href*=gif$] img, a[href*=png$] img, a[href*=jpeg$] img, a[href*=bmp$] img') as $e) {
                 /**
                  * Generate HTML5 yes/no
                  */
                 switch ($lightboxPlusOptions['output_htmlv']) {
                     case 1:
                         $htmlv_prop = 'data-' . $lightboxPlusOptions['data_name'];
                         switch ($lightboxPlusOptions['use_class_method']) {
                             /**
                              * Use Class Method is selected - yes/no
                              */
                             case 1:
                                 if ($e->parent()->class && $e->parent()->class != $lightboxPlusOptions['class_name']) {
                                     $e->parent()->class .= ' ' . $lightboxPlusOptions['class_name'];
                                     if (!$e->parent()->{$htmlv_prop}) {
                                         $e->parent()->{$htmlv_prop} = 'lightbox[' . $postGroupID . $unq_id . ']';
                                     }
                                 } else {
                                     $e->parent()->class = $lightboxPlusOptions['class_name'];
                                     if (!$e->parent()->{$htmlv_prop}) {
                                         $e->parent()->{$htmlv_prop} = 'lightbox[' . $postGroupID . $unq_id . ']';
                                     }
                                 }
                                 break;
                             default:
                                 if (!$e->parent()->{$htmlv_prop}) {
                                     $e->parent()->{$htmlv_prop} = 'lightbox[' . $postGroupID . $unq_id . ']';
                                 }
                                 break;
                         }
                         break;
                     default:
                         switch ($lightboxPlusOptions['use_class_method']) {
                             /**
                              * Use Class Method is selected - yes/no
                              */
                             case 1:
                                 if ($e->parent()->class && $e->parent()->class != $lightboxPlusOptions['class_name']) {
                                     $e->parent()->class .= ' ' . $lightboxPlusOptions['class_name'];
                                     if (!$e->parent()->rel) {
                                         $e->parent()->rel = 'lightbox[' . $postGroupID . $unq_id . ']';
                                     }
                                 } else {
                                     $e->parent()->class = $lightboxPlusOptions['class_name'];
                                     if (!$e->parent()->rel) {
                                         $e->parent()->rel = 'lightbox[' . $postGroupID . $unq_id . ']';
                                     }
                                 }
                                 break;
                             default:
                                 if (!$e->parent()->rel) {
                                     $e->parent()->rel = 'lightbox[' . $postGroupID . $unq_id . ']';
                                 }
                                 break;
                         }
                         break;
                 }
                 /**
                  * Do Not Display Title is select - yes/no
                  */
                 switch ($lightboxPlusOptions['no_display_title']) {
                     case 1:
                         $e->parent()->title = null;
                         break;
                     default:
                         if (!$e->parent()->title) {
                             if ($e->title) {
                                 $e->parent()->title = $e->title;
                             } else {
                                 $e->parent()->title = $postGroupTitle;
                             }
                         }
                         if ($lightboxPlusOptions['use_caption_title']) {
                             //if ($e->parent()->next_sibling()->innertext) { $e->parent()->title = $e->parent()->next_sibling()->innertext; }
                             //if ($e->parent()->next_sibling()->innertext) { $e->title = $e->parent()->next_sibling()->innertext; }
                             if ($e->find('img[src*=jpg$], img[src*=gif$], img[src*=png$], img[src*=jpeg$], img[src*=bmp$]') && ($e->next_sibling()->class = 'wp-caption-text')) {
                                 $e->title = $e->next_sibling()->innertext;
                             } elseif ($e->find('img[src*=jpg$], img[src*=gif$], img[src*=png$], img[src*=jpeg$], img[src*=bmp$]') && ($e->parent()->next_sibling()->class = 'gallery-caption')) {
                                 $e->title = $e->parent()->next_sibling()->innertext;
                             }
                         }
                         break;
                 }
             }
             break;
     }
     $content = $html->save();
     $html->clear();
     unset($html);
     return $content;
 }
开发者ID:Paulf-999,项目名称:HollyFry.com,代码行数:101,代码来源:filters.class.php

示例12: action_sua

 /**
  * Edit action for a "xung - hop" story.
  *
  * On a valid POST the story fields are updated and saved. When the alias
  * changed, the old image folder is removed (best effort), every <img> in the
  * content is downloaded again into the folder of the new alias and its src is
  * rewritten; images that could not be downloaded are dropped from the content.
  * The request is then redirected to the index. On validation errors, or on a
  * non-POST request, the edit view is rendered.
  *
  * @param mixed $story_id Identifier of the story to edit.
  * @return void
  */
 public function action_sua($story_id)
 {
     $this->template->title = __('Sửa bài viết: xung - hợp cung');
     $this->template->section_title = __('Sửa bài viết: xung - hợp cung');
     $data = array();
     $story = Model_Horoscope_XungHopBLL::getInstance()->find($story_id);
     if ($story) {
         if (Request::$method == 'POST') {
             $post = $story->validate_update($_POST);
             if ($post->check()) {
                 $post_values = $post->as_array();
                 $old_alias = $story->alias;
                 $story->hinh_anh = $post_values['hinh_anh'];
                 $story->alias = $post_values['alias'];
                 $story->cung_1 = $post_values['cung_1'];
                 $story->cung_2 = $post_values['cung_2'];
                 $story->tom_tat = trim($post_values['tom_tat']);
                 $story->noi_dung = $post_values['noi_dung'];
                 $story->kiem_tra = true;
                 $story->save();
                 // Alias changed => image directory changed => the images
                 // referenced in the content have to be fetched again.
                 if ($old_alias != $post_values['alias']) {
                     // Best effort: rmdir() only succeeds on an empty folder.
                     @rmdir('assets/horoscope/xung-hop/' . $old_alias . '/');
                     require_once Kohana::find_file('classes', 'vendor/simple_html_dom');
                     $html2 = new simple_html_dom();
                     $html2->load($story->noi_dung);
                     $images = $html2->find('img');
                     if ($images) {
                         $path = 'assets/horoscope/xung-hop/' . $story->alias . '/';
                         $i = 1;
                         foreach ($images as $image) {
                             unset($image->onclick);
                             $get_file = Vendor_Crawler::get_file_from_url_by_curl($image->src, $path, 'anh_' . $i);
                             if (!$get_file || filesize($get_file) == 0) {
                                 // Download failed or produced an empty file: remove the
                                 // element from the output. The former `unset($image)`
                                 // only dropped the loop variable and left the <img>
                                 // in the saved content.
                                 $image->outertext = '';
                             } else {
                                 $image->src = '/' . $get_file;
                             }
                             $i++;
                         }
                     }
                     $story->noi_dung = $html2->save();
                     $html2->clear();
                     unset($html2);
                     $story->save();
                 }
                 Request::instance()->redirect('admin/horoscope_xunghop/index');
             } else {
                 $_POST = $post->as_array();
                 // Keep the validation errors for display in the view.
                 $data['errors'] = $post->errors();
             }
         }
         $data['story'] = $story->toArray();
         $this->template->content = View::factory('horoscope/admin/xung-hop/sua', $data);
     } else {
         Request::instance()->redirect('admin/horoscope_xunghop/index');
     }
 }
开发者ID:abdul-baten,项目名称:hbcms,代码行数:67,代码来源:xunghop.php

示例13: collect

 public function collect()
 {
     $url = trim($this->_post('url'));
     //返回结果
     $res = array('title' => '', 'content' => '');
     //分析网页是否包含视频
     $video = $this->uVideoUpload($url);
     if ($video != '10' && $video != '11') {
         //获取标题
         $htm = file_get_html($url);
         $title = $htm->find('title', 0)->plaintext;
         $htm->clear();
         $res['title'] = $title;
         $res['content'] = $video;
         echo json_encode($res);
         exit;
     }
     //不含视频,则按文章处理
     $collect = D('collect');
     $domin = '';
     $match = "/http:\\/\\/([^\\/]*).*/i";
     if (!substr_count($url, "http")) {
         $url = "http://" . $url;
     }
     preg_match($match, $url, $out);
     $domin = $out[1];
     if (!empty($domin)) {
         //分析是不是音乐网站
         $music_websites = C('MUSIC_WEBSITES');
         if (in_array($domin, $music_websites)) {
             $htm = file_get_html($url);
             $p = preg_match('/var\\s*?_xiamitoken\\s*?=\\s*?[\'\\"](.*?)[\'\\"]/i', $htm, $out);
             $token = $out[1];
             //onclick="playalbum(682938274, '', '时间的歌', '');
             $xid = '';
             if (preg_match('/playalbum\\((\\d+),\\s*?\'*?\',\\s*?\'(.*?)\',\\s*?\'*?\'\\)/i', $htm, $out)) {
                 //xid
                 $xid = $out[1];
                 //title
                 $title = $out[2];
             } else {
                 if (preg_match('/\\/album\\/(\\d{1,})/', $htm, $out)) {
                     $xid = $out[1];
                     $title = $htm->find('div#title', 0)->plaintext;
                 } else {
                     if (preg_match('/var\\s*?cid\\s*?=\\s*?[\'\\"](.*?)[\'\\"]/i', $htm, $out)) {
                         #var cid = '22454617';
                         $xid = $out[1];
                         $title = $htm->find('title', 0)->plaintext;
                     }
                 }
             }
             if ($xid) {
                 //http://www.xiami.com/ajax/getquote/type/2/id/682938274?_xiamitoken=0802020a13ba3df687e7ca4ef45cf1a8
                 $zurl = "http://www.xiami.com/ajax/getquote/type/2/id/{$xid}?_xiamitoken={$token}";
                 $htm = file_get_html($zurl);
                 $content = $htm->find('textarea.tarea', 1)->innertext;
                 $res['title'] = trim($title);
                 $res['content'] = $content;
                 //清除内存消耗
                 $htm->clear();
             } else {
                 $res['title'] = '';
                 $res['content'] = '没有找到音乐';
             }
             echo json_encode($res);
             exit;
         }
         //查看数据库中是否已经有该域名的记录
         $c = $collect->where('domain="' . $domin . '"')->find();
         if (!$c) {
             //没有数据库记录,则title为页面title,content为body正文
             $collect->data(array('alias' => $domin, 'domain' => $domin, 'match' => '123'))->add();
             //查找body
             $htm = file_get_html($url);
             $title = $htm->find('title', 0)->plaintext;
             $content = $htm->find('body', 0)->innertext;
             //title取正文的10个左右字符
             $res['title'] = $title;
             $res['content'] = $content;
         } else {
             //找到了匹配规则
             //新浪博客URL特殊处理,去掉结尾的 ?tj=...
             if ($domin == 'blog.sina.com.cn') {
                 $url = preg_replace('/\\?tj=.*/i', '', $url);
             }
             $htm = file_get_html($url);
             //获取title
             $matchlist = $this->collect_match->get_matchlist_by_collect_type($c['id'], self::TYPE_TITLE);
             if (!empty($matchlist)) {
                 $exec = '$htm';
                 foreach ($matchlist as $match) {
                     $exec .= "->find( '{$match['match']}', {$match['pos']} )";
                 }
                 $exec = $exec . '->plaintext;';
                 eval("\$str = {$exec};");
                 $res['title'] = $str;
             } else {
                 $title = $htm->find('title', 0)->plaintext;
                 $res['title'] = $title;
//.........这里部分代码省略.........
开发者ID:ArronYR,项目名称:collect,代码行数:101,代码来源:CollectAction.class.php

示例14: convertImpl


//.........这里部分代码省略.........
        } else {
            $credit = "<h2>Реквизиты переводчиков</h2>";
            if ($this->command) {
                $credit .= "<p>Перевод команды {$this->command}</p>";
            }
            foreach ($this->workers as $activity => $workers) {
                $credit .= '<p>' . $activity . ': <b>' . implode('</b>, <b>', $workers) . "</b></p>\n";
            }
            $credit .= '<p>Версия от ' . date('d.m.Y', $this->touched) . '</p>
						  <p><b>Любое коммерческое использование данного текста или его фрагментов запрещено</b></p>';
        }
        if ($this->height == 0) {
            $text = preg_replace('/(<p[^>]*>)?<img[^>]*>(<\\/p>)?/u', '', $text);
        } else {
            for ($i = 1; $i < count($this->covers); ++$i) {
                $image = $this->images[$this->covers[$i]];
                $text = "<img src=\"" . $image['thumbnail'] . "\" width=\"" . $image['convert_width'] . "\" height=\"" . $image['convert_height'] . "\" />" . $text;
            }
            $text = preg_replace_callback('/(<a[^>]*>)?<img[^>]*data-resource-id="(-?\\d*)"[^>]*>(<\\/a>)?/u', function ($match) use(&$images) {
                if ($match[2] < 0) {
                    return '';
                }
                $image = $this->images[$match[2]];
                /* Width and height are unimportant. Actual resizing is done not in this class. We must save aspect ratio though. */
                return "<img src=\"" . $image['thumbnail'] . "\" width=\"" . $image['convert_width'] . "\" height=\"" . $image['convert_height'] . "\" />";
            }, $text);
        }
        $footnotes = array();
        $footnotes_temp = explode(',;,', $this->footnotes);
        for ($i = 0; $i < sizeof($footnotes_temp); $i++) {
            if (is_numeric($footnotes_temp[$i])) {
                $footnotes[$footnotes_temp[$i]] = $footnotes_temp[$i + 1];
                $i++;
            }
        }
        $text = trim($text);
        $epubText = "<html>\n\t<body>\n\t\t{$descr['coverpage']}\n\t\t{$descr['author']}\n\t\t{$descr['sequence']}\n\t    {$descr['annotation']}\n\t\t{$credit}\n\t\t{$text}\n\t</body>\n\t</html>";
        $epubText = preg_replace_callback('@(<span[^>]*><a href="#cite_note-(\\d*)"[^>]*>.{0,15}</span>)@', function ($match) use(&$footnotes) {
            $footnote = $footnotes[$match[2]];
            $footnote = preg_replace('@</p>\\s*<p[^>]*>@', '<br/>', $footnote);
            if ($footnote) {
                return '<footnote>' . $footnote . '</footnote>';
            } else {
                return $match[1];
            }
        }, $epubText);
        //preg_replace('@cite_note-(\d*)@',"<footnote></footnote>", $epubText);
        //echo '<xmp>'.$epubText;
        //echo $footnotes[137603266];
        //exit;
        //echo '<xmp>'.$epubText;
        //exit;
        $epubText = preg_replace('@section@', "div", $epubText);
        /* Delete extra <br/> tag before images */
        $epubText = preg_replace('@<div>(.){0,20}<br\\/>(.){0,20}<img src@', '<div><img src', $epubText);
        /* Eliminate caret return before <h1> (Each div starts with caret return in h2d_htmlconverter.php) */
        $epubText = preg_replace('@\\s*<div>(.{0,40})(<h1>.*?<\\/h1>)@', '\\1\\2<div>', $epubText);
        /* NGNL Specific names */
        //$text=str_replace('<span style="position: relative; text-indent: 0;"><span style="display: inline-block; font-style: normal">&#12302;&#12288;&#12288;&#12288;&#12303;</span><span style="position: absolute; font-size: .7em; top: -11px; left: 50%"><span style="position: relative; left: -50%;">','&#12302;<sup>',$text);
        //$text=str_replace('</span></span></span>','</sup>&#12303;',$text);
        // Styles of elements in which footnote is nested should not count. Thus close them
        $epubText = preg_replace('@pb@', "br", $epubText);
        //echo '<xmp>'.$epubText;
        //exit;
        //PHPWord doesn't support tags nested in link element. Unnest images from them
        $epubText = preg_replace('@<a[^>]*>(<img[^>]*>)<\\/a>@', "\\1", $epubText);
        // Delete extra page breaks related to images.
        $epubText = preg_replace('@<div[^>]*>(.){0,20}(<img[^>]*>)(.){0,20}<\\/div>@', "\\1\\2\\3", $epubText);
        $epubText = preg_replace('@<p[^>]*>(.){0,20}(<img[^>]*>)(.){0,20}<\\/p>@', "\\1\\2\\3", $epubText);
        /* Swap h2 and img tags if img follows h2. (It gave a bad look in docx). */
        $epubText = preg_replace('@(<h2>.{0,100}<\\/h2>)(<img[^>]*>)@', '\\2\\1', $epubText);
        /* After swap we often needs to further lift img tag in previous <div> or <p> tag */
        $epubText = preg_replace('@<\\/div>(<img[^>]*>)<h2@', '\\1</div><h2', $epubText);
        $epubText = preg_replace('@<\\/p>(<img[^>]*>)<h2@', '\\1</p><h2', $epubText);
        //echo '<xmp>'.$epubText;
        //exit;
        $phpword_object = new \PhpOffice\PhpWord\PhpWord();
        \PhpOffice\PhpWord\Settings::setCompatibility(false);
        $html_dom = new \simple_html_dom();
        $html_dom->load($epubText);
        $html_dom_array = $html_dom->find('html', 0)->children();
        $paths = htmltodocx_paths();
        $initial_state = ['phpword_object' => &$phpword_object, 'base_root' => $paths['base_root'], 'base_path' => $paths['base_path'], 'current_style' => ['size' => '11'], 'parents' => [0 => 'body'], 'list_depth' => 0, 'context' => 'section', 'pseudo_list' => true, 'pseudo_list_indicator_font_name' => 'Wingdings', 'pseudo_list_indicator_font_size' => '7', 'pseudo_list_indicator_character' => 'l ', 'table_allowed' => true, 'treat_div_as_paragraph' => true, 'structure_headings' => true, 'structure_document' => true, 'style_sheet' => htmltodocx_styles_example()];
        htmltodocx_insert_html($phpword_object, $html_dom_array[0]->nodes, $initial_state);
        //var_dump($html_dom_array[0]->nodes);
        //		exit;
        $html_dom->clear();
        unset($html_dom);
        $h2d_file_uri = tempnam(sys_get_temp_dir(), 'htd');
        /*if ($h2d_file_uri === false) {
              var_dump(sys_get_temp_dir());
          }*/
        $objWriter = \PhpOffice\PhpWord\IOFactory::createWriter($phpword_object, 'Word2007');
        $objWriter->save($h2d_file_uri);
        $bin = file_get_contents($h2d_file_uri);
        unlink($h2d_file_uri);
        //echo 'sdfjnsdlkvjn';
        //exit;
        return $bin;
    }
开发者ID:samogot,项目名称:rura-convertors,代码行数:101,代码来源:DocxConverter.php

示例15: process_page

/**
 * Extracts planning applications from a results-listing page.
 *
 * Every row of the page's "AppDetailsTable" whose markup mentions one of the
 * known status keywords is treated as an application. For each such row the
 * application's detail page and its site-location page are fetched through
 * scraperwiki::scrape(), and the Irish grid easting/northing is converted to
 * WGS84 with ConversionsLatLong. Applications whose location page reports
 * "No Site Location Details Found" are left out of the result.
 *
 * Reads the global $authority_code, which prefixes each application reference.
 *
 * @param string $html Raw HTML of the listing page.
 * @return array Map of application reference => array with the keys url,
 *               appref, date, applicant, address, details, signif, size,
 *               category, protected, latitude and longitude.
 */
function process_page($html)
{
    global $authority_code;
    # Not read inside this function; declaration retained from the original.
    global $nearby_api_key;

    $rowselector = "table[class='AppDetailsTable'] tr";
    $statuses = array('FINALISED', 'CONDITIONAL', 'APPEALED', 'WITHDRAWN', 'NEW<', 'APPROVED', 'REFUSED');
    # mobile,council,gov etc
    $businesswords = array("retail", "Hotel", "Ltd", "Limited", " shop");

    $dom = new simple_html_dom();
    $dom->load($html);
    $apps = array();
    foreach ($dom->find($rowselector) as $row) {
        # Match on the row's raw markup rather than on individual cells, in
        # case 'td' shows up in the plaintext of a value.
        $rowhtml = (string) $row;
        $is_application = false;
        foreach ($statuses as $status) {
            if (stripos($rowhtml, $status) !== false) {
                $is_application = true;
                break;
            }
        }
        if (!$is_application) {
            continue;
        }

        $appnum = $row->children[0]->plaintext;
        $appref = $authority_code . substr($appnum, 0, 2) . "/" . substr($appnum, 2);
        $rawappref = trim($appnum);
        $url = "http://planning.corkcity.ie/InternetEnquiry/rpt_ViewApplicDetails.asp?validFileNum=1&app_num_file=" . $rawappref;
        # The listing shows dd?mm?yyyy in the first ten characters; reorder to yyyy-mm-dd.
        $rawdate = substr($row->children[4]->plaintext, 0, 10);
        $date = substr($rawdate, -4) . "-" . substr($rawdate, 3, 2) . "-" . substr($rawdate, 0, 2);
        $applicant = trim($row->children[5]->plaintext);
        $address = str_replace(array("<BR>", "<br>"), ",", $row->children[6]->innertext);

        # Part one: the detail page. It carries both the full description
        # (row 15) and the significance/size cells (row 23), so it is fetched
        # and parsed once instead of twice.
        $detailhtml = scraperwiki::scrape($url);
        $detaildom = new simple_html_dom();
        $detaildom->load($detailhtml);
        unset($detailhtml);
        $fullappdetails = $detaildom->find($rowselector, 15)->children(1)->plaintext;
        $sizerow = $detaildom->find($rowselector, 23);
        $signifdetail = $sizerow->children(1)->plaintext;
        $sizedetail = $sizerow->children(4)->plaintext;
        unset($sizerow);
        # Release the parser's circular references; without this every row
        # leaks a whole parsed document.
        $detaildom->clear();
        unset($detaildom);

        $protected = strpos($fullappdetails, "Protected Structure") !== false ? "Protected Structure" : "";

        # Business keywords win over the "dwelling" check.
        $category = "";
        foreach ($businesswords as $businessword) {
            # Compare against false explicitly: offset 0 is a valid match.
            if (strpos($rowhtml, $businessword) !== false) {
                $category = "Business";
                break;
            }
        }
        if ($category === "" && strpos($fullappdetails, "dwelling") !== false) {
            $category = "residential";
        }

        # Part two, location of application
        $lochtml = scraperwiki::scrape('http://planning.corkcity.ie/InternetEnquiry/rpt_ViewSiteLocDetails.asp?page_num=0&file_number=' . $rawappref);
        if (stristr($lochtml, "No Site Location Details Found")) {
            # No coordinates available: the application is not recorded.
            continue;
        }
        $locdom = new simple_html_dom();
        $locdom->load($lochtml);
        unset($lochtml);
        $locnorthing = round(floatval($locdom->find($rowselector, 1)->children(1)->plaintext));
        $loceasting = round(floatval($locdom->find($rowselector, 1)->children(4)->plaintext));
        $locdom->clear();
        unset($locdom);

        # Part three, convert E&N to WGS84 using geograph class
        $converter = new ConversionsLatLong();
        $latlong = $converter->irish_to_wgs84($loceasting, $locnorthing);

        $apps[$appref] = array(
            'url' => $url,
            'appref' => $appref,
            'date' => $date,
            'applicant' => $applicant,
            'address' => $address,
            'details' => $fullappdetails,
            'signif' => $signifdetail,
            'size' => $sizedetail,
            'category' => $category,
            'protected' => $protected,
            'latitude' => $latlong[0],
            'longitude' => $latlong[1],
        );
    }
    $dom->clear();
    unset($dom);
    return $apps;
}
开发者ID:flyeven,项目名称:scraperwiki-scraper-vault,代码行数:83,代码来源:ie_planningalerts_corkcitytype.php


注:本文中的simple_html_dom::clear方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。