当前位置: 首页>>代码示例>>Java>>正文


Java Page类代码示例

本文整理汇总了Java中edu.uci.ics.crawler4j.crawler.Page的典型用法代码示例。如果您正苦于以下问题:Java Page类的具体用法?Java Page怎么用?Java Page使用的例子?那么, 这里精选的类代码示例或许可以为您提供帮助。


Page类属于edu.uci.ics.crawler4j.crawler包,在下文中一共展示了Page类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: visit

import edu.uci.ics.crawler4j.crawler.Page; //导入依赖的package包/类
/**
 * Called by crawler4j for each fetched page. For HTML pages, extracts the
 * links with PageParser and persists them via writeContentToDB.
 */
@Override
public void visit(Page page) {
    String url = page.getWebURL().getURL();
    System.out.println("URL: " + url); 
    
 
    if (page.getParseData() instanceof HtmlParseData) {
        HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
        String html = htmlParseData.getHtml();
        String title = htmlParseData.getTitle();
        
        System.out.println("Title: "+ title);           
        // Resolve relative links against the page's own URL.
        String baseUri = url;
        Elements validLinks = PageParser.getLinks(html, baseUri);
        
        writeContentToDB(url,validLinks);   // write the extracted links to the database
    	 System.out.println("Saved updates to database.");
    }
}
 
开发者ID:wrayzheng,项目名称:webpage-update-subscribe,代码行数:20,代码来源:MyCrawler.java

示例2: store

import edu.uci.ics.crawler4j.crawler.Page; //导入依赖的package包/类
/**
 * Persists an HTML page (raw HTML, extracted text, URL and a fetch
 * timestamp) through the prepared insert statement. Non-HTML pages are
 * ignored.
 *
 * @param page the fetched page to store
 * @throws RuntimeException wrapping any {@link SQLException} from the insert
 */
@Override
public void store(Page page) {

    if (page.getParseData() instanceof HtmlParseData) {
        try {

            HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();

            insertKeyStatement.setString(1, htmlParseData.getHtml());
            insertKeyStatement.setString(2, htmlParseData.getText());
            insertKeyStatement.setString(3, page.getWebURL().getURL());
            // System.currentTimeMillis() gives the same epoch millis without
            // allocating a legacy java.util.Date.
            insertKeyStatement.setTimestamp(4, new Timestamp(System.currentTimeMillis()));
            insertKeyStatement.executeUpdate();
        } catch (SQLException e) {
            // Preserve the cause so callers can log/inspect the SQL failure.
            logger.error("SQL Exception while storing webpage for url'{}'", page.getWebURL().getURL(), e);
            throw new RuntimeException(e);
        }
    }
}
 
开发者ID:rzo1,项目名称:crawler4j-postgres-sample,代码行数:20,代码来源:PostgresDBServiceImpl.java

示例3: visit

import edu.uci.ics.crawler4j.crawler.Page; //导入依赖的package包/类
/**
 * Logs basic metrics for each fetched page and hands HTML pages to the
 * Postgres service for storage. Storage failures are logged, never rethrown.
 */
@Override
public void visit(Page page) {
    final String pageUrl = page.getWebURL().getURL();
    logger.info("URL: " + pageUrl);

    if (!(page.getParseData() instanceof HtmlParseData)) {
        return;
    }

    HtmlParseData parsed = (HtmlParseData) page.getParseData();
    Set<WebURL> outgoing = parsed.getOutgoingUrls();

    logger.info("Text length: " + parsed.getText().length());
    logger.info("Html length: " + parsed.getHtml().length());
    logger.info("Number of outgoing links: " + outgoing.size());

    try {
        postgresDBService.store(page);
    } catch (RuntimeException e) {
        logger.error("Storing failed", e);
    }
}
 
开发者ID:rzo1,项目名称:crawler4j-postgres-sample,代码行数:23,代码来源:PostgresWebCrawler.java

示例4: visit

import edu.uci.ics.crawler4j.crawler.Page; //导入依赖的package包/类
/**
 * This function is called when a page is fetched and ready
 * to be processed by your program.
 */
@Override
public void visit(Page page) {
    System.out.println("URL: " + page.getWebURL().getURL());

    if (page.getParseData() instanceof HtmlParseData) {
        HtmlParseData parsed = (HtmlParseData) page.getParseData();
        Set<WebURL> outgoing = parsed.getOutgoingUrls();

        System.out.println("Text length: " + parsed.getText().length());
        System.out.println("Html length: " + parsed.getHtml().length());
        System.out.println("Number of outgoing links: " + outgoing.size());
    }
}
 
开发者ID:vjymits,项目名称:musicFinder,代码行数:21,代码来源:MyCrawler.java

示例5: shouldVisit

import edu.uci.ics.crawler4j.crawler.Page; //导入依赖的package包/类
/**
 * You should implement this function to specify whether the given url
 * should be crawled or not (based on your crawling logic).
 */
@Override
public boolean shouldVisit(Page referringPage, WebURL url) {
	final String href = url.getURL().toLowerCase();
	// Skip image resources; otherwise stay within the target site.
	return !IMAGE_EXTENSIONS.matcher(href).matches()
			&& href.startsWith("http://www.misstamchiak.com/");
}
 
开发者ID:almightyGOSU,项目名称:CZ4045-NLP-Crawler,代码行数:20,代码来源:BasicCrawler.java

示例6: visit

import edu.uci.ics.crawler4j.crawler.Page; //导入依赖的package包/类
/**
 * Prints the document id, URL and parent id of each fetched page, plus
 * parse metrics for HTML pages.
 */
@Override
public void visit(Page page) {
	WebURL webUrl = page.getWebURL();
	System.out.println("Docid: " + webUrl.getDocid());
	System.out.println("URL: " + webUrl.getURL());
	System.out.println("Docid of parent page: " + webUrl.getParentDocid());

	if (page.getParseData() instanceof HtmlParseData) {
		HtmlParseData parsed = (HtmlParseData) page.getParseData();
		List<WebURL> outgoing = parsed.getOutgoingUrls();

		System.out.println("Text length: " + parsed.getText().length());
		System.out.println("Html length: " + parsed.getHtml().length());
		System.out.println("Number of outgoing links: " + outgoing.size());
	}

	System.out.println("=============");
}
 
开发者ID:Chaiavi,项目名称:Crawler4j,代码行数:24,代码来源:BasicCrawler.java

示例7: download

import edu.uci.ics.crawler4j.crawler.Page; //导入依赖的package包/类
/**
 * Fetches and parses a single URL outside the normal crawl loop.
 *
 * @param url absolute URL to download
 * @return the parsed {@link Page}, or {@code null} when the server did not
 *         answer HTTP 200, the fetch threw, or parsing failed
 */
private Page download(String url) {
	WebURL curURL = new WebURL();
	curURL.setURL(url);
	PageFetchResult fetchResult = null;
	try {
		// Headers first: lets us check the status code before pulling the body.
		fetchResult = pageFetcher.fetchHeader(curURL);
		if (fetchResult.getStatusCode() == HttpStatus.SC_OK) {
			try {
				Page page = new Page(curURL);
				fetchResult.fetchContent(page);
				if (parser.parse(page, curURL.getURL())) {
					return page;
				}
			} catch (Exception e) {
				// Best-effort download: report and fall through to return null.
				e.printStackTrace();
			}
		}
	} finally {
		// Always release the response entity so the underlying HTTP
		// connection can be returned to the pool.
		if (fetchResult != null)
		{
			fetchResult.discardContentIfNotConsumed();
		}			
	}
	return null;
}
 
开发者ID:Chaiavi,项目名称:Crawler4j,代码行数:26,代码来源:Downloader.java

示例8: processUrl

import edu.uci.ics.crawler4j.crawler.Page; //导入依赖的package包/类
/** Downloads the given URL and prints a short summary of the result. */
public void processUrl(String url) {
	System.out.println("Processing: " + url);
	Page page = download(url);
	if (page == null) {
		System.out.println("Couldn't fetch the content of the page.");
	} else {
		ParseData parseData = page.getParseData();
		if (parseData == null) {
			System.out.println("Couldn't parse the content of the page.");
		} else if (parseData instanceof HtmlParseData) {
			HtmlParseData htmlData = (HtmlParseData) parseData;
			System.out.println("Title: " + htmlData.getTitle());
			System.out.println("Text length: " + htmlData.getText().length());
			System.out.println("Html length: " + htmlData.getHtml().length());
		}
	}
	System.out.println("==============");
}
 
开发者ID:Chaiavi,项目名称:Crawler4j,代码行数:21,代码来源:Downloader.java

示例9: visit

import edu.uci.ics.crawler4j.crawler.Page; //导入依赖的package包/类
/**
 * Records per-page crawl statistics (processed pages, outgoing links,
 * total text bytes) and dumps them every 50 processed pages.
 *
 * @param page the fetched page
 */
@Override
public void visit(Page page) {
	System.out.println("Visited: " + page.getWebURL().getURL());
	myCrawlStat.incProcessedPages();

	if (page.getParseData() instanceof HtmlParseData) {
		HtmlParseData parseData = (HtmlParseData) page.getParseData();
		List<WebURL> links = parseData.getOutgoingUrls();
		myCrawlStat.incTotalLinks(links.size());
		// StandardCharsets.UTF_8 is always present, so unlike
		// getBytes("UTF-8") there is no checked
		// UnsupportedEncodingException to swallow.
		myCrawlStat.incTotalTextSize(
				parseData.getText().getBytes(java.nio.charset.StandardCharsets.UTF_8).length);
	}
	// We dump this crawler statistics after processing every 50 pages
	if (myCrawlStat.getTotalProcessedPages() % 50 == 0) {
		dumpMyData();
	}
}
 
开发者ID:Chaiavi,项目名称:Crawler4j,代码行数:21,代码来源:LocalDataCollectorCrawler.java

示例10: visit

import edu.uci.ics.crawler4j.crawler.Page; //导入依赖的package包/类
/**
 * This function is called when a page is fetched and ready to be processed.
 * No Graph exception may escape this method: that would kill the calling
 * thread and eventually exhaust the crawler's thread pool.
 */
@Override
public void visit(Page page) {
    WebURL webUrl = page.getWebURL();

    // Skip URLs that already failed a fetch earlier in the crawl.
    String failure = FailedUrls.getInstance().getStatusMessage("FailedRequest", webUrl);
    if (failure != null) {
        logger.warn("Ignoring bad URL " + webUrl + " - " + failure);
        return;
    }

    int visited = atomicPageCounter.incrementAndGet();

    if (graphImporter == null) {
        return;
    }

    logger.info("Importing page # " + visited + ": " + webUrl + " (node count so far: "
            + graphImporter.getNumberOfPageNodes() + ")");

    if (page.getParseData() instanceof HtmlParseData) {
        visitHtmlPage(webUrl.getURL(), (HtmlParseData) page.getParseData());
    } else {
        visitNonHtmlPage(webUrl.getURL());
    }
}
 
开发者ID:fgavilondo,项目名称:neo4j-webgraph,代码行数:29,代码来源:HtmlOnlyCrawler.java

示例11: shouldVisit

import edu.uci.ics.crawler4j.crawler.Page; //导入依赖的package包/类
/** Crawl only English Wikipedia article URLs, skipping image resources. */
@Override
public boolean shouldVisit(Page referringPage, WebURL url) {
    String candidate = url.getURL().toLowerCase();
    return !IMAGE_EXTENSIONS.matcher(candidate).matches()
            && candidate.startsWith("https://en.wikipedia.org/wiki/");
}
 
开发者ID:PacktPublishing,项目名称:Machine-Learning-End-to-Endguide-for-Java-developers,代码行数:9,代码来源:SampleCrawler.java

示例12: visit

import edu.uci.ics.crawler4j.crawler.Page; //导入依赖的package包/类
/**
 * Prints the URL, text and text length of any HTML page whose extracted
 * text contains the phrase "shipping route".
 *
 * @param page the fetched page
 */
@Override
public void visit(Page page) {
    // Note: the original also read getDocid() into an unused local; removed.
    String url = page.getWebURL().getURL();

    if (page.getParseData() instanceof HtmlParseData) {
        HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
        String text = htmlParseData.getText();
        if (text.contains("shipping route")) {
            out.println("\nURL: " + url);
            out.println("Text: " + text);
            out.println("Text length: " + text.length());
        }
    }
}
 
开发者ID:PacktPublishing,项目名称:Machine-Learning-End-to-Endguide-for-Java-developers,代码行数:16,代码来源:SampleCrawler.java

示例13: shouldVisit

import edu.uci.ics.crawler4j.crawler.Page; //导入依赖的package包/类
/**
 * Called by crawler4j; the pre-crawl filter is applied here. A URL is
 * accepted at most once, and only when it matches the crawl pattern and
 * belongs to one of the configured domains.
 */
@Override
public boolean shouldVisit(Page referringPage, WebURL url) {
	String candidate = url.getURL();
	if (!App.crawlFilterPattern.matcher(candidate).find() || App.visitUrls.contains(candidate)) {
		return false;
	}
	for (String domain : App.domains) {
		if (candidate.contains(domain)) {
			// Remember the URL so it is never scheduled twice.
			App.visitUrls.add(candidate);
			return true;
		}
	}
	return false;
}
 
开发者ID:zhazhapan,项目名称:visual-spider,代码行数:17,代码来源:Crawler.java

示例14: visit

import edu.uci.ics.crawler4j.crawler.Page; //导入依赖的package包/类
/**
 * Called by crawler4j; the visit (link) filter is applied here and
 * matching HTML pages are downloaded.
 */
@Override
public void visit(Page page) {
	final String url = page.getWebURL().getURL();
	if (!App.visitFilterPattern.matcher(url).find()
			|| !(page.getParseData() instanceof HtmlParseData)) {
		return;
	}
	HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
	// UI updates must run on the JavaFX application thread.
	Platform.runLater(() -> {
		App.mainController.stautsLabel.setText("validating url: " + url);
		App.mainController.htmlContent.appendText(Values.VISITING_TIP + url + "\r\n");
	});
	downloadURL(url, htmlParseData.getHtml());
}
 
开发者ID:zhazhapan,项目名称:visual-spider,代码行数:16,代码来源:Crawler.java

示例15: shouldVisit

import edu.uci.ics.crawler4j.crawler.Page; //导入依赖的package包/类
/**
 * Accepts image URLs matching the image pattern, plus any URL inside the
 * configured crawl domain.
 */
@Override
public boolean shouldVisit(Page referringPage, WebURL url) {
	String href = url.getURL().toLowerCase();
	return imgPatterns.matcher(href).matches() || href.startsWith(crawlDomain);
}
 
开发者ID:yasuflatland-lf,项目名称:liferay-dummy-factory,代码行数:15,代码来源:ImageCrawler.java


注:本文中的edu.uci.ics.crawler4j.crawler.Page类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。