当前位置: 首页>>代码示例>>Java>>正文


Java Page.getParseData方法代码示例

本文整理汇总了Java中edu.uci.ics.crawler4j.crawler.Page.getParseData方法的典型用法代码示例。如果您正苦于以下问题:Java Page.getParseData方法的具体用法?Java Page.getParseData怎么用?Java Page.getParseData使用的例子?那么, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在edu.uci.ics.crawler4j.crawler.Page的用法示例。


在下文中一共展示了Page.getParseData方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: visit

import edu.uci.ics.crawler4j.crawler.Page; //导入方法依赖的package包/类
/**
 * Called by crawler4j for each fetched page: extracts valid links from
 * HTML pages and writes them to the database.
 */
@Override
public void visit(Page page) {
    String url = page.getWebURL().getURL();
    System.out.println("URL: " + url);

    if (page.getParseData() instanceof HtmlParseData) {
        HtmlParseData parseData = (HtmlParseData) page.getParseData();
        String html = parseData.getHtml();
        System.out.println("Title: " + parseData.getTitle());

        // Relative links are resolved against the page's own URL.
        Elements validLinks = PageParser.getLinks(html, url);

        // Persist the extracted links (original comment was mojibake for "write to database").
        writeContentToDB(url, validLinks);
        System.out.println("Saved updates to database.");
    }
}
 
开发者ID:wrayzheng,项目名称:webpage-update-subscribe,代码行数:20,代码来源:MyCrawler.java

示例2: store

import edu.uci.ics.crawler4j.crawler.Page; //导入方法依赖的package包/类
/**
 * Persists an HTML page (raw html, extracted text, URL, fetch time) through
 * the prepared insert statement. Non-HTML pages are silently skipped.
 *
 * @param page the crawled page to store
 * @throws RuntimeException wrapping any {@link SQLException} raised by the insert
 */
@Override
public void store(Page page) {

    if (page.getParseData() instanceof HtmlParseData) {
        try {

            HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();

            insertKeyStatement.setString(1, htmlParseData.getHtml());
            insertKeyStatement.setString(2, htmlParseData.getText());
            insertKeyStatement.setString(3, page.getWebURL().getURL());
            // Current time directly from the clock; no need for the legacy
            // java.util.Date round-trip the original used.
            insertKeyStatement.setTimestamp(4, new Timestamp(System.currentTimeMillis()));
            insertKeyStatement.executeUpdate();
        } catch (SQLException e) {
            logger.error("SQL Exception while storing webpage for url'{}'", page.getWebURL().getURL(), e);
            throw new RuntimeException(e);
        }
    }
}
 
开发者ID:rzo1,项目名称:crawler4j-postgres-sample,代码行数:20,代码来源:PostgresDBServiceImpl.java

示例3: visit

import edu.uci.ics.crawler4j.crawler.Page; //导入方法依赖的package包/类
/**
 * Logs basic statistics for every fetched HTML page and hands the page to
 * the Postgres service for storage. Storage failures are logged, not rethrown.
 */
@Override
public void visit(Page page) {
    String url = page.getWebURL().getURL();
    logger.info("URL: " + url);

    if (!(page.getParseData() instanceof HtmlParseData)) {
        return;
    }

    HtmlParseData parseData = (HtmlParseData) page.getParseData();
    Set<WebURL> outgoing = parseData.getOutgoingUrls();

    logger.info("Text length: " + parseData.getText().length());
    logger.info("Html length: " + parseData.getHtml().length());
    logger.info("Number of outgoing links: " + outgoing.size());

    try {
        postgresDBService.store(page);
    } catch (RuntimeException e) {
        logger.error("Storing failed", e);
    }
}
 
开发者ID:rzo1,项目名称:crawler4j-postgres-sample,代码行数:23,代码来源:PostgresWebCrawler.java

示例4: visit

import edu.uci.ics.crawler4j.crawler.Page; //导入方法依赖的package包/类
/**
 * Called by crawler4j when a page has been fetched and is ready to be
 * processed; prints the URL plus basic statistics for HTML pages.
 */
@Override
public void visit(Page page) {
    System.out.println("URL: " + page.getWebURL().getURL());

    if (!(page.getParseData() instanceof HtmlParseData)) {
        return;
    }

    HtmlParseData parseData = (HtmlParseData) page.getParseData();
    Set<WebURL> outgoing = parseData.getOutgoingUrls();

    System.out.println("Text length: " + parseData.getText().length());
    System.out.println("Html length: " + parseData.getHtml().length());
    System.out.println("Number of outgoing links: " + outgoing.size());
}
 
开发者ID:vjymits,项目名称:musicFinder,代码行数:21,代码来源:MyCrawler.java

示例5: visit

import edu.uci.ics.crawler4j.crawler.Page; //导入方法依赖的package包/类
/**
 * Prints the doc id, URL and parent doc id of every fetched page, plus
 * parse statistics when the page is HTML, ending with a separator line.
 */
@Override
public void visit(Page page) {
	// Hoist the WebURL instead of calling page.getWebURL() repeatedly.
	WebURL webURL = page.getWebURL();

	System.out.println("Docid: " + webURL.getDocid());
	System.out.println("URL: " + webURL.getURL());
	System.out.println("Docid of parent page: " + webURL.getParentDocid());

	if (page.getParseData() instanceof HtmlParseData) {
		HtmlParseData parseData = (HtmlParseData) page.getParseData();
		List<WebURL> outgoingLinks = parseData.getOutgoingUrls();

		System.out.println("Text length: " + parseData.getText().length());
		System.out.println("Html length: " + parseData.getHtml().length());
		System.out.println("Number of outgoing links: " + outgoingLinks.size());
	}

	System.out.println("=============");
}
 
开发者ID:Chaiavi,项目名称:Crawler4j,代码行数:24,代码来源:BasicCrawler.java

示例6: processUrl

import edu.uci.ics.crawler4j.crawler.Page; //导入方法依赖的package包/类
/**
 * Downloads a single URL and prints its title and text/html sizes when the
 * content could be fetched and parsed as HTML; otherwise reports the failure.
 */
public void processUrl(String url) {
	System.out.println("Processing: " + url);
	Page page = download(url);
	if (page == null) {
		System.out.println("Couldn't fetch the content of the page.");
	} else {
		ParseData parseData = page.getParseData();
		if (parseData == null) {
			System.out.println("Couldn't parse the content of the page.");
		} else if (parseData instanceof HtmlParseData) {
			HtmlParseData htmlData = (HtmlParseData) parseData;
			System.out.println("Title: " + htmlData.getTitle());
			System.out.println("Text length: " + htmlData.getText().length());
			System.out.println("Html length: " + htmlData.getHtml().length());
		}
	}
	System.out.println("==============");
}
 
开发者ID:Chaiavi,项目名称:Crawler4j,代码行数:21,代码来源:Downloader.java

示例7: visit

import edu.uci.ics.crawler4j.crawler.Page; //导入方法依赖的package包/类
/**
 * Counts the visited page, its outgoing links, and the UTF-8 byte size of
 * its text, dumping aggregate statistics after every 50 processed pages.
 */
@Override
public void visit(Page page) {
	System.out.println("Visited: " + page.getWebURL().getURL());
	myCrawlStat.incProcessedPages();

	if (page.getParseData() instanceof HtmlParseData) {
		HtmlParseData parseData = (HtmlParseData) page.getParseData();
		List<WebURL> links = parseData.getOutgoingUrls();
		myCrawlStat.incTotalLinks(links.size());
		// StandardCharsets.UTF_8 is guaranteed to exist, so the checked
		// UnsupportedEncodingException (previously swallowed) goes away.
		myCrawlStat.incTotalTextSize(
				parseData.getText().getBytes(java.nio.charset.StandardCharsets.UTF_8).length);
	}
	// We dump this crawler statistics after processing every 50 pages
	if (myCrawlStat.getTotalProcessedPages() % 50 == 0) {
		dumpMyData();
	}
}
 
开发者ID:Chaiavi,项目名称:Crawler4j,代码行数:21,代码来源:LocalDataCollectorCrawler.java

示例8: visit

import edu.uci.ics.crawler4j.crawler.Page; //导入方法依赖的package包/类
/**
 * This function is called when a page is fetched and ready to be processed. It is important that we don't let any
 * Graph exceptions escape this method as this would cause the calling thread to die and eventually the crawler
 * would run out of threads.
 */
@Override
public void visit(Page page) {
    WebURL webUrl = page.getWebURL();

    // Skip URLs that previously failed to fetch.
    String failure = FailedUrls.getInstance().getStatusMessage("FailedRequest", webUrl);
    if (failure != null) {
        logger.warn("Ignoring bad URL " + webUrl + " - " + failure);
        return;
    }

    int pageNumber = atomicPageCounter.incrementAndGet();

    if (graphImporter == null) {
        return;
    }

    logger.info("Importing page # " + pageNumber + ": " + webUrl + " (node count so far: "
            + graphImporter.getNumberOfPageNodes() + ")");

    if (page.getParseData() instanceof HtmlParseData) {
        visitHtmlPage(webUrl.getURL(), (HtmlParseData) page.getParseData());
    } else {
        visitNonHtmlPage(webUrl.getURL());
    }
}
 
开发者ID:fgavilondo,项目名称:neo4j-webgraph,代码行数:29,代码来源:HtmlOnlyCrawler.java

示例9: visit

import edu.uci.ics.crawler4j.crawler.Page; //导入方法依赖的package包/类
/**
 * Prints the URL, text and text length of any HTML page whose extracted
 * text contains the phrase "shipping route".
 */
@Override
public void visit(Page page) {
    // The unused 'docid' local from the original has been removed.
    String url = page.getWebURL().getURL();

    if (page.getParseData() instanceof HtmlParseData) {
        HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
        String text = htmlParseData.getText();
        if (text.contains("shipping route")) {
            out.println("\nURL: " + url);
            out.println("Text: " + text);
            out.println("Text length: " + text.length());
        }
    }
}
 
开发者ID:PacktPublishing,项目名称:Machine-Learning-End-to-Endguide-for-Java-developers,代码行数:16,代码来源:SampleCrawler.java

示例10: visit

import edu.uci.ics.crawler4j.crawler.Page; //导入方法依赖的package包/类
/**
 * Invoked by crawler4j; URL (visit) filtering is matched here. Pages whose
 * URL passes the filter and that parsed as HTML are handed to downloadURL,
 * with progress reflected in the JavaFX UI.
 */
@Override
public void visit(Page page) {
	String url = page.getWebURL().getURL();
	boolean urlAccepted = App.visitFilterPattern.matcher(url).find();
	if (urlAccepted && page.getParseData() instanceof HtmlParseData) {
		HtmlParseData parseData = (HtmlParseData) page.getParseData();
		// UI updates must happen on the JavaFX application thread.
		Platform.runLater(() -> {
			App.mainController.stautsLabel.setText("validating url: " + url);
			App.mainController.htmlContent.appendText(Values.VISITING_TIP + url + "\r\n");
		});
		downloadURL(url, parseData.getHtml());
	}
}
 
开发者ID:zhazhapan,项目名称:visual-spider,代码行数:16,代码来源:Crawler.java

示例11: visit

import edu.uci.ics.crawler4j.crawler.Page; //导入方法依赖的package包/类
/**
 * Collects URLs of image resources whose binary content is at least 10 KiB.
 */
@Override
public void visit(Page page) {
	String url = page.getWebURL().getURL();

	// We are only interested in processing images which are bigger than 10k.
	// BUG FIX: the original condition let through pages that were NOT binary
	// data, or whose content was SMALLER than 10 KiB. It now requires an
	// image URL, binary parse data, and at least 10 KiB of content
	// (matching the stated intent and the canonical crawler4j ImageCrawler).
	if (!imgPatterns.matcher(url).matches()
			|| !(page.getParseData() instanceof BinaryParseData)
			|| page.getContentData().length < 10 * 1024) {
		return;
	}

	gatheredURLs.add(url);
	System.out.println("Fetched URL : " + url);
}
 
开发者ID:yasuflatland-lf,项目名称:liferay-dummy-factory,代码行数:14,代码来源:ImageCrawler.java

示例12: visit

import edu.uci.ics.crawler4j.crawler.Page; //导入方法依赖的package包/类
/**
 * For every fetched HTML page, extracts the page text and scans it for
 * each known memo, persisting any matches.
 */
@Override
public void visit(Page page) {
    logger.info("URL: " + page.getWebURL().getURL());

    if (!(page.getParseData() instanceof HtmlParseData)) {
        return;
    }

    String pageText = getPageText(page);
    for (Memo memo : memos) {
        findAndSaveMemo(pageText, memo);
    }
}
 
开发者ID:edu-xored,项目名称:memorise,代码行数:13,代码来源:ActionsCrawler.java

示例13: visit

import edu.uci.ics.crawler4j.crawler.Page; //导入方法依赖的package包/类
/**
 * This function is called when a page is fetched and ready
 * to be processed by your program.
 *
 * Logs basic page statistics, then scans the HTML for anchors titled
 * "Full text at publisher's site" and records each publisher link in
 * localData, keyed by the numeric id taken from the last path segment
 * of the page URL.
 */
@Override
public void visit(Page page) {
    String url = page.getWebURL().getURL();
    System.out.println("URL: " + url);

    if (page.getParseData() instanceof HtmlParseData) {
        HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
        String text = htmlParseData.getText();
        String html = htmlParseData.getHtml();
        Set<WebURL> links = htmlParseData.getOutgoingUrls();

        System.out.println("Text length: " + text.length());
        System.out.println("Html length: " + html.length());
        System.out.println("Number of outgoing links: " + links.size());

        org.jsoup.nodes.Document doc = Jsoup.parseBodyFragment(html);
        Elements anchors = doc.select("a[href]");

        for (Element a : anchors) {
            if (a.attr("title").equals("Full text at publisher's site")) {
                String publisherUrl = a.attr("abs:href");
                // The numeric article id is the last path segment of the page URL.
                // (Java-style array declaration; the original used C-style String[].)
                String[] urlParts = url.split("/");
                String idPart = urlParts[urlParts.length - 1];
                System.out.println("" + idPart + ", " + publisherUrl);
                try {
                    localData.put(Integer.parseInt(idPart), publisherUrl);
                } catch (NumberFormatException e) {
                    // Guard: a non-numeric trailing segment previously threw and
                    // killed the crawler thread.
                    System.out.println("Skipping non-numeric id in URL: " + url);
                }
            }
        }
    }
}
 
开发者ID:charithwiki,项目名称:DIA-Protection,代码行数:42,代码来源:TestCrawler.java

示例14: visit

import edu.uci.ics.crawler4j.crawler.Page; //导入方法依赖的package包/类
/**
 * Stores image resources to disk under an MD5-derived unique file name.
 * Only binary image content of at least 10 KiB is kept.
 */
@Override
public void visit(Page page) {
	String url = page.getWebURL().getURL();

	// Single combined guard: must be binary data, must look like an image
	// URL, and must not be a very small image (< 10 KiB).
	boolean isBinary = page.getParseData() instanceof BinaryParseData;
	if (!isBinary
			|| !imgPatterns.matcher(url).matches()
			|| page.getContentData().length < 10 * 1024) {
		return;
	}

	// Unique storage name: MD5 of the URL plus the original extension.
	String extension = url.substring(url.lastIndexOf("."));
	String hashedName = Cryptography.MD5(url) + extension;

	IO.writeBytesToFile(page.getContentData(), storageFolder.getAbsolutePath() + "/" + hashedName);

	System.out.println("Stored: " + url);
}
 
开发者ID:Chaiavi,项目名称:Crawler4j,代码行数:28,代码来源:ImageCrawler.java

示例15: visit

import edu.uci.ics.crawler4j.crawler.Page; //导入方法依赖的package包/类
/**
 * This function is called when a page is fetched and ready to be processed
 * by your program.
 *
 * Dumps the URL breakdown, HTML parse statistics and HTTP response headers
 * of the visited page, ending with a separator line.
 */
@Override
public void visit(Page page) {
	// Hoist the WebURL instead of calling page.getWebURL() seven times.
	WebURL webURL = page.getWebURL();

	System.out.println("Docid: " + webURL.getDocid());
	System.out.println("URL: " + webURL.getURL());
	System.out.println("Domain: '" + webURL.getDomain() + "'");
	System.out.println("Sub-domain: '" + webURL.getSubDomain() + "'");
	System.out.println("Path: '" + webURL.getPath() + "'");
	System.out.println("Parent page: " + webURL.getParentUrl());
	System.out.println("Anchor text: " + webURL.getAnchor());

	if (page.getParseData() instanceof HtmlParseData) {
		HtmlParseData parseData = (HtmlParseData) page.getParseData();
		List<WebURL> outgoing = parseData.getOutgoingUrls();

		System.out.println("Text length: " + parseData.getText().length());
		System.out.println("Html length: " + parseData.getHtml().length());
		System.out.println("Number of outgoing links: " + outgoing.size());
	}

	Header[] responseHeaders = page.getFetchResponseHeaders();
	if (responseHeaders != null) {
		System.out.println("Response headers:");
		for (Header header : responseHeaders) {
			System.out.println("\t" + header.getName() + ": " + header.getValue());
		}
	}

	System.out.println("=============");
}
 
开发者ID:Chaiavi,项目名称:Crawler4j,代码行数:44,代码来源:BasicCrawler.java


注:本文中的edu.uci.ics.crawler4j.crawler.Page.getParseData方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。