当前位置: 首页>>代码示例>>PHP>>正文

PHP scraperwiki::scrape方法代码示例

本文整理汇总了PHP中scraperwiki::scrape方法的典型用法代码示例。如果您正苦于以下问题:PHP scraperwiki::scrape方法的具体用法?PHP scraperwiki::scrape怎么用?PHP scraperwiki::scrape使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在scraperwiki的用法示例。


示例1: parseModelsPage

 /**
  * Scrape one brand listing page and store every phone model found on it.
  *
  * Each model tile (an <a> inside div.makers) is written to the "cell_model"
  * table keyed by its numeric id. If the pager's last child is a link titled
  * "Next page", the method calls itself on that page.
  *
  * @param mixed  $brandId   Brand identifier, stored as brand_id on every model.
  * @param string $brandName Brand name, prefixed to each model name.
  * @param string $page      URL of the listing page to scrape.
  * @return void
  */
 function parseModelsPage($brandId, $brandName, $page)
 {
     $html_content = scraperwiki::scrape($page);
     $this->html = str_get_html($html_content);
     if (!$this->html) {
         // Nothing parseable came back; there is no DOM to walk.
         return;
     }
     foreach ($this->html->find("div.makers a") as $el) {
         $img = $el->find('img', 0);
         $title = $el->find('strong', 0);
         $temp = explode('-', $el->href);
         if (!$img || !$title || !isset($temp[1])) {
             // Markup does not look like a model tile; skip it instead of
             // dereferencing null.
             continue;
         }
         // Start from an empty record so no field leaks over from the previous tile.
         $m = array();
         $m['name'] = $brandName . ' ' . $title->innertext;
         $m['img'] = $img->src;
         $m['link'] = 'http://www.gsmarena.com/' . $el->href;
         $m['desc'] = $img->title;
         // The id is the second dash-separated part of the href minus its last
         // four characters (presumably a ".php" suffix - confirm against the site).
         $m['id'] = (int) substr($temp[1], 0, -4);
         $m['brand_id'] = $brandId;
         // Unique keys are a list of column names, not a name => value map.
         scraperwiki::save_sqlite(array("id"), $m, "cell_model");
     }
     $pagination = $this->html->find("div.nav-pages", 0);
     if ($pagination) {
         $nextPageLink = $pagination->lastChild();
         if ($nextPageLink && $nextPageLink->title == "Next page") {
             $this->parseModelsPage($brandId, $brandName, 'http://www.gsmarena.com/' . $nextPageLink->href);
         }
     }
 }

示例2: clubURL

/**
 * Scrape one club page and store one record per season row.
 *
 * The club name is read from the third row of the first table; season data is
 * read from every row of the third table whose first cell is numeric (the year).
 * Records are saved with the unique key (club, year). The club name is also
 * appended to the global "clubs" list.
 *
 * @param string $url URL of the club page.
 * @return void
 */
function clubURL($url)
{
    $html = scraperwiki::scrape($url);
    $dom = new simple_html_dom();
    // Parse the fetched markup; without this every find() below has no document to search.
    $dom->load($html);
    $clubName = trim(str_replace(' ', '', $dom->find('table', 0)->find('tr', 2)->plaintext));
    $formatClubName = trim(preg_replace('/\\s+/', ' ', $clubName));
    // $GLOBALS is the real superglobal; "$_GLOBAL" only created a throwaway local array.
    $GLOBALS['clubs'][] = $formatClubName;
    echo 'running ' . $formatClubName . "\n";
    foreach ($dom->find('table', 2)->find('tr') as $row) {
        $yearCell = $row->find('td', 0);
        if (!$yearCell || !is_numeric($yearCell->plaintext)) {
            // Header or filler row: no numeric year in the first cell.
            continue;
        }
        $year = trim($yearCell->plaintext);
        $position = trim(str_replace(' ', '', $row->find('td', 1)->plaintext));
        if (trim($position) == 'Champion') {
            // Store the title winner as numeric position 1.
            $position = 1;
        }
        $leagueLevel = trim($row->find('td', 2)->plaintext);
        $overallPosition = trim($row->find('td', 3)->plaintext);
        // Dots are stripped from the attendance figures (presumably thousands separators).
        $avgAttendance = trim(str_replace('.', '', $row->find('td', 4)->plaintext));
        $totalAttendance = trim(str_replace('.', '', $row->find('td', 12)->plaintext));
        $dataset = array('club' => $formatClubName, 'year' => $year, 'finishedPosition' => $position, 'league' => $leagueLevel, 'overallPosition' => $overallPosition, 'avgAttendance' => $avgAttendance, 'totalAttendance' => $totalAttendance);
        scraperwiki::save(array('club', 'year'), $dataset);
    }
    /*
     * The next two lines stop a memory leak in Simple HTML DOM as per
     * http://simplehtmldom.sourceforge.net/manual_faq.htm#memory_leak
     */
    $dom->clear();
    unset($dom);
}

示例3: getCategories

/**
 * Scrape the sub-category list of a category page and store each entry.
 *
 * For every child of the refinements container the function records the
 * category name, the breadcrumb path of the current page and the absolute
 * URL. Rows go to the CSV handle $f when the global $local flag is truthy,
 * otherwise to the ScraperWiki datastore keyed by URL.
 *
 * Globals read: $baseurl (prefix for relative links), $f (CSV file handle),
 * $local (presumably a "write CSV instead of datastore" switch - confirm).
 *
 * @param string $u URL of the category page.
 * @return void
 */
function getCategories($u)
{
    // $local was read below without being imported, so it was always unset here.
    global $baseurl, $f, $local;
    $path = "";
    $d = new simple_html_dom();
    // Fetch and parse the page; the DOM object is empty until load() is called.
    $d->load(scraperwiki::scrape($u));
    echo "Loaded URL: " . $u . "\n";
    if ($d->find('div[id=ctl00_cphContent_gsaCatFacetContainer]')) {
        $breadcrumb = $d->find('div[id=breadcrumb]', 0);
        if (!is_null($breadcrumb)) {
            foreach ($breadcrumb->children() as $crumb) {
                $path .= trim($crumb->innertext) . "/";
            }
            // The current page is the text after the last ">" in the breadcrumb.
            $path .= trim(strrchr($breadcrumb->innertext, ">"), "> ");
        }
        foreach ($d->find('div[id=ctl00_cphContent_gsaCatFacetContainer]', 0)->find('div[class=S2refinementsContainer]', 0)->children() as $div) {
            // Keep only the part of the link text before the first "(".
            $name = trim(strstr($div->children(0)->innertext, "(", true));
            $url = $baseurl . $div->children(0)->href;
            $data = array("Name" => $name, "Path" => $path, "URL" => $url);
            echo $path . "/" . $name . "\n";
            if (!empty($local)) {
                fputcsv($f, array($name, $path, $url));
            } else {
                scraperwiki::save_sqlite(array("URL"), $data);
            }
        }
    }
}

示例4: getIngredients

/**
 * Follow the second "callout" link of a search-result page to the recipe
 * page and store every ingredient listed there.
 *
 * @param string $html Markup of the page containing the a.callout links.
 * @return void
 */
function getIngredients($html)
{
    $dom = new simple_html_dom();
    // Parse the markup passed in; find() has nothing to search before load().
    $dom->load($html);
    $link = $dom->find('a[class=callout]', 1);
    if (!$link) {
        // No second callout link, so there is no recipe page to follow.
        return;
    }
    // Dropping "reviews/" turns the link into the recipe URL.
    $res = str_replace("reviews/", "", $link->href);
    echo "http://www.foodnetwork.com" . $res;
    $html1 = scraperwiki::scrape("http://www.foodnetwork.com" . $res);
    $h = str_get_html($html1);
    echo "\n\n";
    if (!$h) {
        return;
    }
    foreach ($h->find('li[class=ingredient]') as $data) {
        $ingredient = $data->plaintext;
        // Always define $href so the save below never reads an undefined variable.
        // NOTE(review): $h is the whole document, so this check looks like it can
        // never succeed; presumably the ingredient's own link was meant - confirm.
        $href = isset($h->href) ? $h->href : null;
        scraperwiki::save(array('ing'), array('ing' => $ingredient, 'href' => $href));
    }
}

示例5: getProducts

/**
 * Scrape one product-grid page, write each product to the CSV handle $o and
 * recurse into the "next" page when one is linked.
 *
 * Each CSV row is: product name, product type (1 when the tile has a
 * p.minimal-price element, otherwise 0), category, product URL.
 *
 * @param string $u   URL of the grid page.
 * @param mixed  $cat Category value written on every row.
 * @return void
 */
function getProducts($u, $cat)
{
    global $o;
    $d = new simple_html_dom();
    // Fetch and parse the page; the DOM object is empty until load() is called.
    $d->load(scraperwiki::scrape($u));
    $items = $d->find('li.grid-item');
    if (count($items) > 0) {
        foreach ($items as $p) {
            $prod = $p->find('p.product-name > a', 0);
            if (!$prod) {
                // Tile without a product link: nothing to record.
                continue;
            }
            $prodname = trim($prod->innertext);
            $prodURL = $prod->href;
            $prodtype = is_null($p->find('p.minimal-price', 0)) ? 0 : 1;
            fputcsv($o, array($prodname, $prodtype, $cat, $prodURL));
            echo $prodname . "\n";
        }
        $next = $d->find('p.next', 0);
        if (!is_null($next)) {
            getProducts($next->href, $cat);
        }
    }
}

示例6: ripById

/**
 * Fetch the burial-record details page for one record id and store the
 * fields found in it.
 *
 * Each field is the bold text inside a <span> with a known id; a field whose
 * span is not found is stored as an empty string.
 *
 * @param mixed $id Record id appended to the details URL and stored as "id".
 * @return void
 */
function ripById($id)
{
    $pathToDetails = 'http://beheshtezahra.tehran.ir/Default.aspx?tabid=92&ctl=SearchDetails&mid=653&srid=' . $id;
    $output = scraperwiki::scrape($pathToDetails);
    // Stored column => suffix of the <span> id that carries the value.
    $spanIds = array(
        'firstname' => 'lblNameBound_0',
        'surname' => 'lblLastNameBound_0',
        'fathername' => 'lblFatherNameBound_0',
        'birthdate' => 'lblBirthDateBound_0',
        'deathdate' => 'lblDafnDateBound_0',
        'deathplace' => 'lblDeastTownshipTitle_0',
        'graveplace' => 'lblDafnPlace_0',
    );
    $record = array('id' => $id);
    foreach ($spanIds as $field => $spanId) {
        // U makes (.*) lazy, so the capture stops at the first closing tag.
        $pattern = '/<span id="dnn_ctr653_SearchDetails_dtlDetail_' . $spanId . '"><b>(.*)<\\//smiU';
        $record[$field] = preg_match($pattern, $output, $temp) === 1 ? $temp[1] : '';
    }
    // Key on "id": the previous key 'data' is not a column of the record.
    scraperwiki::save_sqlite(array('id'), $record);
}

示例7: do_day

/**
 * Debug helper: fetch the page for one day record and print the text nodes
 * that share a parent with the <a name="discs"> anchor.
 *
 * @param array $rec Record with a 'url' entry pointing at the page to fetch.
 * @return void
 */
function do_day($rec)
{
    $html = scraperwiki::scrape($rec['url']);
    $dom = new simple_html_dom();
    // Parse the fetched markup; find() has nothing to search before load().
    $dom->load($html);
    $cell = $dom->find('a[name=discs]');
    if (empty($cell)) {
        // Page has no "discs" anchor, so there is nothing to print.
        return;
    }
    $lines = $cell[0]->parent->find('text');
    print (isset($lines[10]) ? $lines[10] : '') . "\n";
    print count($lines) . "\n";
    # loop by number, as null lines stop a foreach
    $n = 0;
    $line_count = count($lines);
    for ($line_no = 0; $line_no < $line_count; $line_no++) {
        $line = $lines[$line_no];
        if (strlen($line) == 3) {
            # the DOM object crashes on this row, so ignore
            continue;
        }
        #if (preg_match("#^" . $n . "#", $line, $matches)) {
        print $line_no . " " . strlen($line) . "\n";
        $n = $n + 1;
        print $line . "\n";
    }
    #scraperwiki::save(array('data'), array('data' => $data->plaintext));
}

示例8: handle_products

/**
 * Scrape one product page and store its name, description, price, image and
 * category in the "products_support" table.
 *
 * The category comes from the directory part of the product URL, mapped from
 * the URL slug to a human-readable label. The global $total counter supplies
 * the record id and is incremented after each save.
 *
 * @param string $product_link Absolute product URL; empty values are ignored.
 * @return void
 */
function handle_products($product_link)
{
    global $base_url_host, $base_url_scheme, $total;
    if (empty($product_link)) {
        return;
    }
    $link_3 = $product_link;
    $cat_raw = str_replace("http://www.thule.com/en-US/US/Products/", "", $product_link);
    $cats = dirname($cat_raw);
    // Parallel lists: URL slug => display label at the same index.
    $cat_terms = array("Base-Racks/Feet", "Base-Racks/LoadAccessories", "Base-Racks/LoadBars", "Bike-Carriers/Accessories", "Bike-Carriers/Hitch", "Bike-Carriers/RearDoor", "Bike-Carriers/RoofCarriers", "Bike-Carriers/SpareTire", "Bike-Carriers/TruckBed", "Cargo-Carriers/Bags", "Cargo-Carriers/Baskets", "Cargo-Carriers/Boxes", "Cargo-Carriers/HitchCargo", "Luggage/DaypacksAndMessengers", "Luggage/LaptopAndTablet", "Luggage/LuggageAndDuffels", "Snow-Chains/SnowChains", "Snowsports/Accessories", "Snowsports/HitchSki", "Snowsports/SkiBoxes", "Snowsports/SkiCarriers", "Watersports/Accessories", "Watersports/WatersportCarriers");
    $cat_cleaned = array("Base Racks/Feet", "Base Racks/Load Accessories", "Base Racks/Load Bars", "Bike Carriers/Accessories", "Bike Carriers/Hitch", "Bike Carriers/Rear Door", "Bike Carriers/Roof Carriers", "Bike Carriers/Spare Tire", "Bike Carriers/Truck Bed", "Cargo Carriers/Bags", "Cargo Carriers/Baskets", "Cargo Carriers/Boxes", "Cargo Carriers/Hitch Cargo", "Luggage/Daypacks And Messengers", "Luggage/Laptop And Tablet", "Luggage/Luggage And Duffels", "Snow Chains/Snow Chains", "Snowsports/Accessories", "Snowsports/Hitch Ski", "Snowsports/Ski Boxes", "Snowsports/Ski Carriers", "Watersports/Accessories", "Watersports/Watersport Carriers");
    $cat = str_replace($cat_terms, $cat_cleaned, $cats);
    $html_content = scraperwiki::scrape($link_3);
    $html = str_get_html($html_content);
    if (!$html) {
        // Nothing parseable came back; skip this product.
        return;
    }
    // find() yields null when the element is missing; the string cast keeps trim() quiet.
    $name_raw = trim((string) $html->find("div[@class='column details_overview'] h2 span", 0));
    $name = !empty($name_raw) ? strip_tags($name_raw) : "";
    $desc_raw = trim((string) $html->find("div[@class='column details_overview'] h3 span", 0));
    $desc = !empty($desc_raw) ? strip_tags($desc_raw) : "";
    $price_raw = trim((string) $html->find("div[@class='pricing'] span[@id='phcontent_0_ctl00_lblPriceText']", 0));
    $price = strip_tags($price_raw);
    $price = str_replace("MSRP \$", "", $price);
    $price = trim(str_replace(" (USD)", "", $price));
    $image_node = $html->find("img[@id='imgProductBomImage_0']", 0);
    $image = $image_node ? $image_node->src : null;
    echo "{$name}: {$image}\n";
    // Add it to an array. (The 'desciption' key is the existing column name; keep it.)
    $record = array('id' => $total, 'product_name' => trim($name), 'desciption' => trim($desc), 'price' => $price, 'img' => $image, 'category' => $cat);
    // Add it to the table.
    scraperwiki::save_sqlite(array('id'), array($record), "products_support", 2);
    // Increment the 'id' counter so the next product does not overwrite this one.
    $total++;
}

示例9: scrapeDetails

/**
 * Enrich an NGO record with details scraped from its detail page.
 *
 * Every <p> on the page is inspected: paragraphs mentioning "Website" or
 * "Email" supply those fields from their first link, and each label in
 * $infosWeWant is looked up via extractInfo() until it has been found once.
 *
 * @param array $ngo Record with at least a "url" entry.
 * @return array The same record with the discovered fields added.
 */
function scrapeDetails($ngo)
{
    $html_content = scraperwiki::scrape($ngo["url"]);
    $dom = new simple_html_dom();
    // Parse the fetched markup; find() has nothing to search before load().
    $dom->load($html_content);
    $infosWeWant = array('Telefon', 'Rechtsform', 'Steuerstatus', 'Weltanschauliche Ausrichtung', 'Anzahl Mitarbeiter', 'Gesamteinnahmen:', 'Davon Sammlungseinnahmen', 'Bezugsjahr:');
    // Scrape Details from all paragraphs
    $paragraphs = $dom->find('p');
    foreach ($paragraphs as $p) {
        if (strstr($p->plaintext, "Website")) {
            $site = $p->find('a', 0);
            if ($site) {
                $ngo["website"] = $site->href;
            }
        }
        if (strstr($p->plaintext, "Email")) {
            $mail = $p->find('a', 0);
            if ($mail) {
                $ngo["email"] = $mail->plaintext;
            }
        }
        foreach ($infosWeWant as $key => $info) {
            $res = extractInfo($p, $info);
            if ($res) {
                $ngo[$info] = $res;
                //Do not search for this info again
                unset($infosWeWant[$key]);
            }
        }
    }
    return $ngo;
}

示例10: ripById

/**
 * Fetch the burial-record form for one item id and store the values of its
 * input fields.
 *
 * Each value is captured from the value="..." attribute of a known <input>;
 * a field whose input is not found is stored as an empty string.
 *
 * @param mixed $id Item id appended to the form URL and stored as "id".
 * @return void
 */
function ripById($id)
{
    $pathToDetails = 'http://www.shborujen.ir/DesktopModules/eFormViewer/eFormViewerEdit.aspx?TabID=4753&Site=DouranPortal&MId=14286&Lang=fa-IR&ItemID=1&fID=1228&keyID=itemid%7C' . $id;
    $output = scraperwiki::scrape($pathToDetails);
    // Stored column => regex whose first group is the input's value attribute.
    $patterns = array(
        'firstname' => '/<input name="eFormEditData1228\\$field1421\\$controlToValidate_Field72\\$Field72_Value".*" value="(.*)".*>/smiU',
        'surname' => '/<input name="eFormEditData1228\\$field1415\\$controlToValidate_Field73\\$Field73_Value.*" value="(.*)".*>/smiU',
        'fathername' => '/<input name="eFormEditData1228\\$field1416\\$controlToValidate_Field74\\$Field74_Value.*value="(.*)".*>/smiU',
        'deathdate' => '/<input name="eFormEditData1228\\$field1418\\$ctl00\\$txt.*" value="(.*)".*>/smiU',
        'place' => '/<input name="eFormEditData1228\\$field1413\\$controlToValidate_Field77\\$Field77_Value.*" value="(.*)".*>/smiU',
        'row' => '/<input name="eFormEditData1228\\$field1434\\$controlToValidate_Field1434\\$Field1434_Value.*" value="(.*)".*>/smiU',
        'block' => '/<input name="eFormEditData1228\\$field1414\\$controlToValidate_Field78\\$Field78_Value.*" value="(.*)".*>/smiU',
        'grave' => '/<input name="eFormEditData1228\\$field1439\\$controlToValidate_Field1439\\$Field1439_Value.*" value="(.*)".*>/smiU',
    );
    // Defaults in the stored column order. No pattern exists for a birth date, so
    // "birthdate" stays empty; it was previously an undefined variable.
    $record = array('id' => $id, 'firstname' => '', 'surname' => '', 'fathername' => '', 'birthdate' => '', 'deathdate' => '', 'place' => '', 'block' => '', 'row' => '', 'grave' => '');
    foreach ($patterns as $field => $pattern) {
        if (preg_match($pattern, $output, $temp) === 1) {
            $record[$field] = $temp[1];
        }
    }
    // Key on "id": the previous key 'data' is not a column of the record.
    scraperwiki::save_sqlite(array('id'), $record);
}

示例11: scraper

/**
 * Scrape one job search-result page and insert every job that is not
 * already in the "job" table.
 *
 * For each result table the job URL is rebuilt from the title link's onclick
 * handler, and the description and source are read from the cells following
 * the matching <th> labels. The "Next page" link is resolved but only
 * followed if the commented-out block at the end is enabled.
 *
 * Requires an open connection of the legacy mysql extension.
 *
 * @param string $url_search URL of the result page.
 * @param mixed  $country_id Country id stored with every job.
 * @return void
 */
function scraper($url_search, $country_id)
{
    $has_next = false;
    $base_url = "http://ec.europa.eu/eures/eures-searchengine/servlet";
    $html = scraperwiki::scrape($url_search);
    $dom = new simple_html_dom();
    // Parse the fetched markup; find() has nothing to search before load().
    $dom->load($html);
    foreach ($dom->find('table[class=JResult]') as $result) {
        // Reset per result so a table missing a field cannot inherit the
        // previous result's value.
        $url_job = '';
        $url_id = '';
        $description = '';
        $source_id = '';
        foreach ($result->find('td[class=JRTitle] a') as $job_page) {
            // The job path is the first single-quoted argument of the onclick handler.
            $chars = explode("'", $job_page->onclick);
            if (!isset($chars[1])) {
                continue;
            }
            $url_job = $base_url . substr($chars[1], 1);
            $url_id = strstr($url_job, 'uniqueJvId=');
            $url_id = str_replace('uniqueJvId=', "", $url_id);
            echo "JOB: " . $url_job . "<br />";
        }
        foreach ($result->find('th') as $data) {
            $text = trim($data->plaintext);
            if ($text == 'Description:') {
                $description = trim($data->next_sibling()->plaintext);
                echo "DESCRIPTION: " . $description . "<br />";
            }
            if ($text == 'Source:') {
                $source = trim($data->next_sibling()->plaintext);
                // insert_name() is defined elsewhere; keep passing it the
                // quote-escaped value it has always received.
                $source = str_replace("'", "\\'", $source);
                if ($source != '' && $source != '&nbsp;') {
                    $source_id = insert_name('source', $source);
                    echo "SOURCE: " . $source . "<br /><br />";
                }
            }
        }
        if ($url_job === '') {
            // No job link in this table; inserting would create an empty row.
            continue;
        }
        $description = str_replace("</BR>", "", $description);
        // Scraped text is untrusted: escape every value with the connection's
        // escaper instead of only replacing single quotes in some of them.
        $sql_url_job = mysql_real_escape_string($url_job);
        $sql_url_id = mysql_real_escape_string($url_id);
        $sql_description = mysql_real_escape_string($description);
        $sql_source_id = mysql_real_escape_string($source_id);
        $sql_url_search = mysql_real_escape_string($url_search);
        $sql_country_id = mysql_real_escape_string($country_id);
        $sql = mysql_query("SELECT * FROM job WHERE url = '{$sql_url_job}'");
        $cont = mysql_num_rows($sql);
        if ($cont == 0) {
            mysql_query("INSERT INTO job SET \n\t\t\t\t\turl = '{$sql_url_job}', \n\t\t\t\t\turl_id = '{$sql_url_id}', \n\t\t\t\t\tdescription = '{$sql_description}', \n\t\t\t\t\tsource_id = '{$sql_source_id}', \n\t\t\t\t\turl_search = '{$sql_url_search}', \n\t\t\t\t\tcountry_id='{$sql_country_id}',\n\t\t\t\t\turl_scraper_date = SYSDATE(),\t \n\t\t\t\t\turl_scraper_hour = SYSDATE()");
        } else {
            echo "Job URL already extracted: " . $url_job . "<br /><br />";
        }
    }
    foreach ($dom->find('div[class=prevNext] a') as $next_page) {
        $text = $next_page->plaintext;
        if ($text == "Next page") {
            $url_next = substr($next_page->href, 1);
            $url_next = $base_url . $url_next;
            $has_next = true;
            print "<br /><br />NEXT: " . $url_next . "<br /><br />";
        }
    }
    unset($html, $dom, $result, $job_page, $data, $next_page, $text, $url_id, $url_job, $description, $source, $source_id, $url_search);
    //Comment this for tests, uncomment this to get all data
    //	if ($has_next == true){
    //		sleep(1);
    //		scraper($url_next, $country_id);
    //	}
}

示例12: ripByPage

/**
 * Fetch one page of the burial search JSON and store every entry in it.
 *
 * @param mixed $page Page number appended to the search URL.
 * @return mixed The response's "PageNumber" value, or null when the response
 *               could not be decoded or carries no such field.
 *               NOTE(review): the value was previously assigned to a local and
 *               never used; returning it is an addition - confirm callers.
 */
function ripByPage($page)
{
    $pathToDetails = 'http://aramestan.e-sanandaj.ir/BurialRequest/DeadSearch?keyword=&firstName=&lastName=&fatherName=&partNo=0&rowNo=&graveNo=&deathDateFrom=&deathDateTo=&bornDateFrom=&bornDateTo=&page=' . $page;
    $output = scraperwiki::scrape($pathToDetails);
    $resultingJsonObject = json_decode($output);
    if (!is_object($resultingJsonObject) || !isset($resultingJsonObject->result) || !is_array($resultingJsonObject->result)) {
        // Not the expected JSON shape (bad response or empty body).
        return null;
    }
    // Iterate over what is actually there instead of assuming exactly ten
    // entries per page; a short page no longer dereferences missing rows.
    foreach ($resultingJsonObject->result as $row) {
        $entry = array(
            'id' => $row->Id,
            'fullname' => strval($row->DeadFullName),
            'fathername' => strval($row->DeadFatherName),
            'birthdate' => strval($row->BornDate),
            'deathdate' => strval($row->DeathDate),
            'partno' => strval($row->PartNo),
            'rowno' => strval($row->RowNo),
            'graveno' => strval($row->GraveNo),
            'gender' => strval($row->Gender),
            'identitycode' => strval($row->IdentityCode),
        );
        // Key on "id": the previous key 'data' is not a column of the record.
        scraperwiki::save_sqlite(array('id'), $entry);
    }
    return isset($resultingJsonObject->PageNumber) ? $resultingJsonObject->PageNumber : null;
}

示例13: scrape

/**
 * Fetch a page, append campaign-tracking parameters to every link in it and
 * print the rewritten markup.
 *
 * The parameters are appended as a "#utm_source=...&utm_medium=..." fragment
 * built from the global $utm* values.
 *
 * @param string $source URL of the page to fetch and rewrite.
 * @return void
 */
function scrape($source)
{
    // $source is deliberately NOT imported here: "global $source" would rebind
    // the name to the global and silently discard the argument passed in.
    global $utmSource, $utmMedium, $utmTerm, $utmContent, $utmCampaign;
    $link = scraperwiki::scrape($source);
    $html = str_get_html($link);
    if (!$html) {
        // Nothing parseable came back; there is nothing to rewrite or print.
        return;
    }
    // The suffix is the same for every link, so build it once.
    $suffix = '#utm_source=' . $utmSource . '&utm_medium=' . $utmMedium . '&utm_term=' . $utmTerm . '&utm_content=' . $utmContent . '&utm_campaign=' . $utmCampaign;
    foreach ($html->find('a[href]') as $a) {
        $a->href = $a->href . $suffix;
    }
    print $html;
}

示例14: getLangs

/**
 * Collect the language codes linked from the DBpedia mapping statistics index.
 *
 * Each code is the href of a matched link with all "/" characters removed.
 *
 * @return array List of language code strings, in page order.
 */
function getLangs()
{
    $url = "http://mappings.dbpedia.org/server/statistics/";
    $html = scraperwiki::scrape($url);
    $dom = new simple_html_dom();
    // Parse the fetched markup; find() has nothing to search before load().
    $dom->load($html);
    $langs = array();
    foreach ($dom->find('/html/body/p/a') as $result) {
        $langs[] = str_replace("/", "", trim($result->href));
    }
    return $langs;
}

示例15: scrapeIndex

/**
 * Scrape the NGO index page: every <h2> is one NGO whose name is the heading
 * text and whose URL is the heading's first link.
 *
 * Each entry is saved to the "ngos" table keyed by name and also returned.
 *
 * @param string $url URL of the index page.
 * @return array List of array("name" => ..., "url" => ...) entries.
 */
function scrapeIndex($url)
{
    $html_content = scraperwiki::scrape($url);
    $dom = new simple_html_dom();
    // Parse the fetched markup; find() has nothing to search before load().
    $dom->load($html_content);
    $ngos = array();
    foreach ($dom->find('h2') as $h2) {
        // Decode entities, then normalise a leftover en-dash entity to a hyphen.
        $name = str_replace("&#8211;", "-", html_entity_decode($h2->plaintext));
        // Use a separate variable so the $url parameter is not overwritten,
        // and tolerate headings that carry no link.
        $anchor = $h2->find('a', 0);
        $ngoUrl = $anchor ? $anchor->href : null;
        $entry = array("name" => $name, "url" => $ngoUrl);
        $ngos[] = $entry;
        scraperwiki::save_sqlite(array("name"), $entry, "ngos");
    }
    return $ngos;
}
