本文整理汇总了Java中edu.uci.ics.crawler4j.url.URLCanonicalizer类的典型用法代码示例。如果您正苦于以下问题:Java URLCanonicalizer类的具体用法?Java URLCanonicalizer怎么用?Java URLCanonicalizer使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。
URLCanonicalizer类属于edu.uci.ics.crawler4j.url包,在下文中一共展示了URLCanonicalizer类的8个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: getAllUrl
import edu.uci.ics.crawler4j.url.URLCanonicalizer; //导入依赖的package包/类
/**
 * Loads every URL from the DAO and groups the URL ids by canonical URL.
 * Canonicalization maps syntactic variants of the same URL (case, default
 * ports, redundant slashes, …) onto one key so duplicates share a bucket.
 *
 * @return map from canonical URL to the list of original URL ids
 * @throws Exception propagated from the DAO layer
 */
public static Map<String, LinkedList<Integer>> getAllUrl() throws Exception {
    IUrlDAO urlDAO = UrlDAOFactory.getUrlDAOInstance();
    Map<Integer, String> tempUrlMap = urlDAO.getAllUrl();
    Map<String, LinkedList<Integer>> urlMap = new HashMap<>();
    for (Map.Entry<Integer, String> entry : tempUrlMap.entrySet()) {
        // URL regularization: variants of the same URL collapse to one key.
        // NOTE(review): getCanonicalURL may return null for malformed URLs;
        // like the original code this then uses a null map key — confirm intended.
        String canonicalUrl = URLCanonicalizer.getCanonicalURL(entry.getValue());
        // computeIfAbsent replaces the manual containsKey/get/put sequence.
        urlMap.computeIfAbsent(canonicalUrl, k -> new LinkedList<>()).add(entry.getKey());
    }
    return urlMap;
}
示例2: addSeed
import edu.uci.ics.crawler4j.url.URLCanonicalizer; //导入依赖的package包/类
/**
 * Adds a new seed URL. A seed URL is a URL that is fetched by the crawler
 * to extract new URLs in it and follow them for crawling. You can also
 * specify a specific document id to be assigned to this seed URL. This
 * document id needs to be unique. Also, note that if you add three seeds
 * with document ids 1,2, and 7. Then the next URL that is found during the
 * crawl will get a doc id of 8. Also you need to ensure to add seeds in
 * increasing order of document ids.
 * <p/>
 * Specifying doc ids is mainly useful when you have had a previous crawl
 * and have stored the results and want to start a new crawl with seeds
 * which get the same document ids as the previous crawl.
 *
 * @param pageUrl the URL of the seed
 * @param docId the document id that you want to be assigned to this seed URL.
 */
public void addSeed(String pageUrl, int docId) {
    String canonicalUrl = URLCanonicalizer.getCanonicalURL(pageUrl);
    if (canonicalUrl == null) {
        // Parameterized logging for consistency with the rest of the code base.
        logger.error("Invalid seed URL: {}", pageUrl);
        return;
    }
    if (docId < 0) {
        // Negative docId means "assign one for me".
        docId = docIdServer.getDocId(canonicalUrl);
        if (docId > 0) {
            // This URL is already seen.
            return;
        }
        docId = docIdServer.getNewDocID(canonicalUrl);
    } else {
        try {
            docIdServer.addUrlAndDocId(canonicalUrl, docId);
        } catch (Exception e) {
            // Pass the exception itself so the stack trace is preserved,
            // not just its message.
            logger.error("Could not add seed: {}", e.getMessage(), e);
            // NOTE(review): execution falls through and still schedules this
            // seed even though registering the doc id failed — confirm intended.
        }
    }
    WebURL webUrl = new WebURL();
    webUrl.setURL(canonicalUrl);
    webUrl.setDocid(docId);
    webUrl.setDepth((short) 0);
    if (!robotstxtServer.allows(webUrl)) {
        logger.info("Robots.txt does not allow this seed: {}", pageUrl);
    } else {
        frontier.schedule(webUrl);
    }
}
示例3: addSeed
import edu.uci.ics.crawler4j.url.URLCanonicalizer; //导入依赖的package包/类
/**
 * Registers a seed URL with the crawler. Seeds are fetched first and their
 * outgoing links are followed during the crawl. A caller may pin a specific
 * document id to the seed (useful for resuming a previous crawl with the
 * same id assignment); ids must be unique and added in increasing order —
 * after seeds with ids 1, 2 and 7, the next discovered URL receives id 8.
 *
 * @param pageUrl the URL of the seed
 * @param docId   the document id to assign, or a negative value to have one
 *                assigned automatically
 */
public void addSeed(String pageUrl, int docId) {
    String canonical = URLCanonicalizer.getCanonicalURL(pageUrl);
    if (canonical == null) {
        logger.error("Invalid seed URL: {}", pageUrl);
        return;
    }
    if (docId < 0) {
        // No id requested: reuse the existing one if this URL was seen before.
        int existingId = docIdServer.getDocId(canonical);
        if (existingId > 0) {
            // This URL is already seen.
            return;
        }
        docId = docIdServer.getNewDocID(canonical);
    } else {
        try {
            docIdServer.addUrlAndDocId(canonical, docId);
        } catch (Exception e) {
            logger.error("Could not add seed: {}", e.getMessage());
        }
    }
    WebURL seed = new WebURL();
    seed.setURL(canonical);
    seed.setDocid(docId);
    seed.setDepth((short) 0);
    if (robotstxtServer.allows(seed)) {
        frontier.schedule(seed);
    } else {
        logger.info("Robots.txt does not allow this seed: {}", pageUrl);
    }
}
示例4: normalizeURL
import edu.uci.ics.crawler4j.url.URLCanonicalizer; //导入依赖的package包/类
/**
 * Normalize the url. The resulting url is not meant to be fetched but rather
 * used as a mean to identify a feed and avoid duplicates.
 *
 * @param url the raw URL, may be {@code null}
 * @return the normalized comparison key, or {@code null} if {@code url} was null
 */
public static String normalizeURL(String url) {
    if (url == null) {
        return null;
    }
    String normalized = URLCanonicalizer.getCanonicalURL(url);
    if (normalized == null) {
        // Canonicalization failed; fall back to the raw URL.
        normalized = url;
    }
    // Convert to lower case; the url probably won't work in some cases after
    // that but we only compare urls to detect duplicates. Use the root locale
    // so the result does not depend on the JVM default locale (e.g. the
    // Turkish dotless-i problem with the default toLowerCase()).
    normalized = normalized.toLowerCase(java.util.Locale.ROOT);
    // Store all urls as http so the scheme does not create false distinctions.
    if (normalized.startsWith("https")) {
        normalized = "http" + normalized.substring(5);
    }
    // Remove the www. part.
    normalized = normalized.replace("//www.", "//");
    // feedproxy redirects to feedburner.
    normalized = normalized.replace("feedproxy.google.com", "feeds.feedburner.com");
    // Feedburner feeds get special treatment: unify mirror hosts and drop the query.
    if (normalized.split(ESCAPED_QUESTION_MARK)[0].contains("feedburner.com")) {
        normalized = normalized.replace("feeds2.feedburner.com", "feeds.feedburner.com");
        normalized = normalized.split(ESCAPED_QUESTION_MARK)[0];
        normalized = StringUtils.removeEnd(normalized, "/");
    }
    return normalized;
}
示例5: setUrl
import edu.uci.ics.crawler4j.url.URLCanonicalizer; //导入依赖的package包/类
/** Canonicalizes {@code url} and stores the result on the wrapped WebURL. */
private void setUrl(String url) {
    String canonical = URLCanonicalizer.getCanonicalURL(url);
    webUrl.setURL(canonical);
}
示例6: addSeenUrl
import edu.uci.ics.crawler4j.url.URLCanonicalizer; //导入依赖的package包/类
/**
 * This function can called to assign a specific document id to a url. This
 * feature is useful when you have had a previous crawl and have stored the
 * Urls and their associated document ids and want to have a new crawl which
 * is aware of the previously seen Urls and won't re-crawl them.
 * <p/>
 * Note that if you add three seen Urls with document ids 1,2, and 7. Then
 * the next URL that is found during the crawl will get a doc id of 8. Also
 * you need to ensure to add seen Urls in increasing order of document ids.
 *
 * @param url the URL of the page
 * @param docId the document id that you want to be assigned to this URL.
 */
public void addSeenUrl(String url, int docId) {
    String canonicalUrl = URLCanonicalizer.getCanonicalURL(url);
    if (canonicalUrl == null) {
        // Parameterized logging for consistency with the rest of the code base.
        logger.error("Invalid Url: {}", url);
        return;
    }
    try {
        docIdServer.addUrlAndDocId(canonicalUrl, docId);
    } catch (Exception e) {
        // Pass the exception itself so the stack trace is preserved,
        // not just its message.
        logger.error("Could not add seen url: {}", e.getMessage(), e);
    }
}
示例7: addSeenUrl
import edu.uci.ics.crawler4j.url.URLCanonicalizer; //导入依赖的package包/类
/**
 * Pre-registers a URL with a known document id so a new crawl is aware of
 * pages seen in a previous crawl and will not re-crawl them. Ids must be
 * added in increasing order; after seen ids 1, 2 and 7, the next URL found
 * during the crawl receives id 8.
 *
 * @param url   the URL of the page
 * @param docId the document id to assign to this URL
 */
public void addSeenUrl(String url, int docId) {
    String canonical = URLCanonicalizer.getCanonicalURL(url);
    if (canonical == null) {
        logger.error("Invalid Url: {}", url);
        return;
    }
    try {
        docIdServer.addUrlAndDocId(canonical, docId);
    } catch (Exception e) {
        logger.error("Could not add seen url: {}", e.getMessage());
    }
}
示例8: testCanonizalier
import edu.uci.ics.crawler4j.url.URLCanonicalizer; //导入依赖的package包/类
// Exercises URLCanonicalizer.getCanonicalURL against the expected canonical
// forms: percent-encoding of query values, host lower-casing, default-port and
// empty-query removal, path normalization, fragment stripping, query-parameter
// sorting, and relative-URL resolution against a context URL.
// (Method name keeps the original "Canonizalier" spelling — it is the public
// entry point callers/test runners reference.)
public static void testCanonizalier() {
// Reserved characters inside query values are percent-encoded ('/'->%2F, '+'->%2B).
assertEquals("http://www.example.com/display?category=foo%2Fbar%2Bbaz",
URLCanonicalizer.getCanonicalURL("http://www.example.com/display?category=foo/bar+baz"));
assertEquals("http://www.example.com/?q=a%2Bb",
URLCanonicalizer.getCanonicalURL("http://www.example.com/?q=a+b"));
// Already-encoded input is left as-is (idempotent).
assertEquals("http://www.example.com/display?category=foo%2Fbar%2Bbaz",
URLCanonicalizer.getCanonicalURL("http://www.example.com/display?category=foo%2Fbar%2Bbaz"));
assertEquals("http://somedomain.com/uploads/1/0/2/5/10259653/6199347.jpg?1325154037",
URLCanonicalizer
.getCanonicalURL("http://somedomain.com/uploads/1/0/2/5/10259653/6199347.jpg?1325154037"));
// A trailing slash is added to a bare host, and the host is lower-cased.
assertEquals("http://hostname.com/", URLCanonicalizer.getCanonicalURL("http://hostname.com"));
assertEquals("http://hostname.com/", URLCanonicalizer.getCanonicalURL("http://HOSTNAME.com"))
// Empty and dangling query strings are removed.
assertEquals("http://www.example.com/index.html",
URLCanonicalizer.getCanonicalURL("http://www.example.com/index.html?&"));
assertEquals("http://www.example.com/index.html",
URLCanonicalizer.getCanonicalURL("http://www.example.com/index.html?"));
assertEquals("http://www.example.com/", URLCanonicalizer.getCanonicalURL("http://www.example.com"));
// The default port (80 for http) is dropped.
assertEquals("http://www.example.com/bar.html",
URLCanonicalizer.getCanonicalURL("http://www.example.com:80/bar.html"));
// Fragments (#...) are stripped.
assertEquals("http://www.example.com/index.html?name=test&rame=base",
URLCanonicalizer.getCanonicalURL("http://www.example.com/index.html?name=test&rame=base#123"));
// Unnecessarily encoded characters are decoded (%7E -> ~).
assertEquals("http://www.example.com/~username/",
URLCanonicalizer.getCanonicalURL("http://www.example.com/%7Eusername/"));
// Duplicate slashes in the path are collapsed.
assertEquals("http://www.example.com/A/B/index.html",
URLCanonicalizer.getCanonicalURL("http://www.example.com//A//B/index.html"));
assertEquals("http://www.example.com/index.html?x=y",
URLCanonicalizer.getCanonicalURL("http://www.example.com/index.html?&x=y"));
// "." and ".." path segments are resolved; leading ".." cannot climb above root.
assertEquals("http://www.example.com/a.html",
URLCanonicalizer.getCanonicalURL("http://www.example.com/../../a.html"));
assertEquals("http://www.example.com/a/c/d.html",
URLCanonicalizer.getCanonicalURL("http://www.example.com/../a/b/../c/./d.html"))
assertEquals("http://foo.bar.com/?baz=1", URLCanonicalizer.getCanonicalURL("http://foo.bar.com?baz=1"));
// Query parameters are sorted alphabetically by key.
assertEquals("http://www.example.com/index.html?a=b&c=d&e=f",
URLCanonicalizer.getCanonicalURL("http://www.example.com/index.html?&c=d&e=f&a=b"));
// A literal space in a query value becomes %20.
assertEquals("http://www.example.com/index.html?q=a%20b",
URLCanonicalizer.getCanonicalURL("http://www.example.com/index.html?q=a b"));
assertEquals("http://www.example.com/search?height=100%&width=100%",
URLCanonicalizer.getCanonicalURL("http://www.example.com/search?width=100%&height=100%"));
// Two-argument overload resolves a relative URL against a context URL.
assertEquals("http://foo.bar/mydir/myfile?page=2",
URLCanonicalizer.getCanonicalURL("?page=2", "http://foo.bar/mydir/myfile"));
}