本文整理汇总了Java中edu.uci.ics.crawler4j.url.URLCanonicalizer类的典型用法代码示例。如果您正苦于以下问题:Java URLCanonicalizer类的具体用法?Java URLCanonicalizer怎么用?Java URLCanonicalizer使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。
URLCanonicalizer类属于edu.uci.ics.crawler4j.url包,在下文中一共展示了URLCanonicalizer类的8个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: getAllUrl
import edu.uci.ics.crawler4j.url.URLCanonicalizer; //导入依赖的package包/类
/**
 * Loads every URL from the DAO and groups the URL ids by canonical URL.
 * Canonicalization maps syntactic variants of the same URL (case, default
 * ports, redundant slashes, …) onto one key so duplicates share a bucket.
 *
 * @return map from canonical URL to the list of original URL ids
 * @throws Exception propagated from the DAO layer
 */
public static Map<String, LinkedList<Integer>> getAllUrl() throws Exception {
    IUrlDAO urlDAO = UrlDAOFactory.getUrlDAOInstance();
    Map<Integer, String> tempUrlMap = urlDAO.getAllUrl();
    Map<String, LinkedList<Integer>> urlMap = new HashMap<>();
    for (Map.Entry<Integer, String> entry : tempUrlMap.entrySet()) {
        // URL regularization: variants of the same URL collapse to one key.
        // NOTE(review): getCanonicalURL may return null for malformed URLs;
        // like the original code this then uses a null map key — confirm intended.
        String canonicalUrl = URLCanonicalizer.getCanonicalURL(entry.getValue());
        // computeIfAbsent replaces the manual containsKey/get/put sequence.
        urlMap.computeIfAbsent(canonicalUrl, k -> new LinkedList<>()).add(entry.getKey());
    }
    return urlMap;
}
示例2: addSeed
import edu.uci.ics.crawler4j.url.URLCanonicalizer; //导入依赖的package包/类
/**
 * Adds a new seed URL. A seed URL is a URL that is fetched by the crawler
 * to extract new URLs in it and follow them for crawling. You can also
 * specify a specific document id to be assigned to this seed URL. This
 * document id needs to be unique. Also, note that if you add three seeds
 * with document ids 1,2, and 7. Then the next URL that is found during the
 * crawl will get a doc id of 8. Also you need to ensure to add seeds in
 * increasing order of document ids.
 * <p/>
 * Specifying doc ids is mainly useful when you have had a previous crawl
 * and have stored the results and want to start a new crawl with seeds
 * which get the same document ids as the previous crawl.
 *
 * @param pageUrl the URL of the seed
 * @param docId the document id that you want to be assigned to this seed URL.
 */
public void addSeed(String pageUrl, int docId) {
    String canonicalUrl = URLCanonicalizer.getCanonicalURL(pageUrl);
    if (canonicalUrl == null) {
        // Parameterized logging for consistency with the rest of the code base.
        logger.error("Invalid seed URL: {}", pageUrl);
        return;
    }
    if (docId < 0) {
        // Negative docId means "assign one for me".
        docId = docIdServer.getDocId(canonicalUrl);
        if (docId > 0) {
            // This URL is already seen.
            return;
        }
        docId = docIdServer.getNewDocID(canonicalUrl);
    } else {
        try {
            docIdServer.addUrlAndDocId(canonicalUrl, docId);
        } catch (Exception e) {
            // Pass the exception itself so the stack trace is preserved,
            // not just its message.
            logger.error("Could not add seed: {}", e.getMessage(), e);
            // NOTE(review): execution falls through and still schedules this
            // seed even though registering the doc id failed — confirm intended.
        }
    }
    WebURL webUrl = new WebURL();
    webUrl.setURL(canonicalUrl);
    webUrl.setDocid(docId);
    webUrl.setDepth((short) 0);
    if (!robotstxtServer.allows(webUrl)) {
        logger.info("Robots.txt does not allow this seed: {}", pageUrl);
    } else {
        frontier.schedule(webUrl);
    }
}
示例3: addSeed
import edu.uci.ics.crawler4j.url.URLCanonicalizer; //导入依赖的package包/类
/**
 * Registers a seed URL with the crawler. Seeds are fetched first and their
 * outgoing links are followed during the crawl. A caller may pin a specific
 * document id to the seed (useful for resuming a previous crawl with the
 * same id assignment); ids must be unique and added in increasing order —
 * after seeds with ids 1, 2 and 7, the next discovered URL receives id 8.
 *
 * @param pageUrl the URL of the seed
 * @param docId   the document id to assign, or a negative value to have one
 *                assigned automatically
 */
public void addSeed(String pageUrl, int docId) {
    String canonical = URLCanonicalizer.getCanonicalURL(pageUrl);
    if (canonical == null) {
        logger.error("Invalid seed URL: {}", pageUrl);
        return;
    }
    if (docId < 0) {
        // No id requested: reuse the existing one if this URL was seen before.
        int existingId = docIdServer.getDocId(canonical);
        if (existingId > 0) {
            // This URL is already seen.
            return;
        }
        docId = docIdServer.getNewDocID(canonical);
    } else {
        try {
            docIdServer.addUrlAndDocId(canonical, docId);
        } catch (Exception e) {
            logger.error("Could not add seed: {}", e.getMessage());
        }
    }
    WebURL seed = new WebURL();
    seed.setURL(canonical);
    seed.setDocid(docId);
    seed.setDepth((short) 0);
    if (robotstxtServer.allows(seed)) {
        frontier.schedule(seed);
    } else {
        logger.info("Robots.txt does not allow this seed: {}", pageUrl);
    }
}
示例4: normalizeURL
import edu.uci.ics.crawler4j.url.URLCanonicalizer; //导入依赖的package包/类
/**
 * Normalize the url. The resulting url is not meant to be fetched but rather
 * used as a mean to identify a feed and avoid duplicates.
 *
 * @param url the raw URL, may be {@code null}
 * @return the normalized comparison key, or {@code null} if {@code url} was null
 */
public static String normalizeURL(String url) {
    if (url == null) {
        return null;
    }
    String normalized = URLCanonicalizer.getCanonicalURL(url);
    if (normalized == null) {
        // Canonicalization failed; fall back to the raw URL.
        normalized = url;
    }
    // Convert to lower case; the url probably won't work in some cases after
    // that but we only compare urls to detect duplicates. Use the root locale
    // so the result does not depend on the JVM default locale (e.g. the
    // Turkish dotless-i problem with the default toLowerCase()).
    normalized = normalized.toLowerCase(java.util.Locale.ROOT);
    // Store all urls as http so the scheme does not create false distinctions.
    if (normalized.startsWith("https")) {
        normalized = "http" + normalized.substring(5);
    }
    // Remove the www. part.
    normalized = normalized.replace("//www.", "//");
    // feedproxy redirects to feedburner.
    normalized = normalized.replace("feedproxy.google.com", "feeds.feedburner.com");
    // Feedburner feeds get special treatment: unify mirror hosts and drop the query.
    if (normalized.split(ESCAPED_QUESTION_MARK)[0].contains("feedburner.com")) {
        normalized = normalized.replace("feeds2.feedburner.com", "feeds.feedburner.com");
        normalized = normalized.split(ESCAPED_QUESTION_MARK)[0];
        normalized = StringUtils.removeEnd(normalized, "/");
    }
    return normalized;
}
示例5: setUrl
import edu.uci.ics.crawler4j.url.URLCanonicalizer; //导入依赖的package包/类
/** Canonicalizes {@code url} and stores the result on the wrapped WebURL. */
private void setUrl(String url) {
    String canonical = URLCanonicalizer.getCanonicalURL(url);
    webUrl.setURL(canonical);
}
示例6: addSeenUrl
import edu.uci.ics.crawler4j.url.URLCanonicalizer; //导入依赖的package包/类
/**
 * This function can called to assign a specific document id to a url. This
 * feature is useful when you have had a previous crawl and have stored the
 * Urls and their associated document ids and want to have a new crawl which
 * is aware of the previously seen Urls and won't re-crawl them.
 * <p/>
 * Note that if you add three seen Urls with document ids 1,2, and 7. Then
 * the next URL that is found during the crawl will get a doc id of 8. Also
 * you need to ensure to add seen Urls in increasing order of document ids.
 *
 * @param url the URL of the page
 * @param docId the document id that you want to be assigned to this URL.
 */
public void addSeenUrl(String url, int docId) {
    String canonicalUrl = URLCanonicalizer.getCanonicalURL(url);
    if (canonicalUrl == null) {
        // Parameterized logging for consistency with the rest of the code base.
        logger.error("Invalid Url: {}", url);
        return;
    }
    try {
        docIdServer.addUrlAndDocId(canonicalUrl, docId);
    } catch (Exception e) {
        // Pass the exception itself so the stack trace is preserved,
        // not just its message.
        logger.error("Could not add seen url: {}", e.getMessage(), e);
    }
}
示例7: addSeenUrl
import edu.uci.ics.crawler4j.url.URLCanonicalizer; //导入依赖的package包/类
/**
 * Pre-registers a URL with a known document id so a new crawl is aware of
 * pages seen in a previous crawl and will not re-crawl them. Ids must be
 * added in increasing order; after seen ids 1, 2 and 7, the next URL found
 * during the crawl receives id 8.
 *
 * @param url   the URL of the page
 * @param docId the document id to assign to this URL
 */
public void addSeenUrl(String url, int docId) {
    String canonical = URLCanonicalizer.getCanonicalURL(url);
    if (canonical == null) {
        logger.error("Invalid Url: {}", url);
        return;
    }
    try {
        docIdServer.addUrlAndDocId(canonical, docId);
    } catch (Exception e) {
        logger.error("Could not add seen url: {}", e.getMessage());
    }
}
示例8: testCanonizalier
import edu.uci.ics.crawler4j.url.URLCanonicalizer; //导入依赖的package包/类
// Exercises URLCanonicalizer.getCanonicalURL against the expected canonical
// forms: percent-encoding of query values, host lower-casing, default-port and
// empty-query removal, path normalization, fragment stripping, query-parameter
// sorting, and relative-URL resolution against a context URL.
// (Method name keeps the original "Canonizalier" spelling — it is the public
// entry point callers/test runners reference.)
public static void testCanonizalier() {
// Reserved characters inside query values are percent-encoded ('/'->%2F, '+'->%2B).
assertEquals("http://www.example.com/display?category=foo%2Fbar%2Bbaz",
URLCanonicalizer.getCanonicalURL("http://www.example.com/display?category=foo/bar+baz"));
assertEquals("http://www.example.com/?q=a%2Bb",
URLCanonicalizer.getCanonicalURL("http://www.example.com/?q=a+b"));
// Already-encoded input is left as-is (idempotent).
assertEquals("http://www.example.com/display?category=foo%2Fbar%2Bbaz",
URLCanonicalizer.getCanonicalURL("http://www.example.com/display?category=foo%2Fbar%2Bbaz"));
assertEquals("http://somedomain.com/uploads/1/0/2/5/10259653/6199347.jpg?1325154037",
URLCanonicalizer
.getCanonicalURL("http://somedomain.com/uploads/1/0/2/5/10259653/6199347.jpg?1325154037"));
// A trailing slash is added to a bare host, and the host is lower-cased.
assertEquals("http://hostname.com/", URLCanonicalizer.getCanonicalURL("http://hostname.com"));
assertEquals("http://hostname.com/", URLCanonicalizer.getCanonicalURL("http://HOSTNAME.com"))
// Empty and dangling query strings are removed.
assertEquals("http://www.example.com/index.html",
URLCanonicalizer.getCanonicalURL("http://www.example.com/index.html?&"));
assertEquals("http://www.example.com/index.html",
URLCanonicalizer.getCanonicalURL("http://www.example.com/index.html?"));
assertEquals("http://www.example.com/", URLCanonicalizer.getCanonicalURL("http://www.example.com"));
// The default port (80 for http) is dropped.
assertEquals("http://www.example.com/bar.html",
URLCanonicalizer.getCanonicalURL("http://www.example.com:80/bar.html"));
// Fragments (#...) are stripped.
assertEquals("http://www.example.com/index.html?name=test&rame=base",
URLCanonicalizer.getCanonicalURL("http://www.example.com/index.html?name=test&rame=base#123"));
// Unnecessarily encoded characters are decoded (%7E -> ~).
assertEquals("http://www.example.com/~username/",
URLCanonicalizer.getCanonicalURL("http://www.example.com/%7Eusername/"));
// Duplicate slashes in the path are collapsed.
assertEquals("http://www.example.com/A/B/index.html",
URLCanonicalizer.getCanonicalURL("http://www.example.com//A//B/index.html"));
assertEquals("http://www.example.com/index.html?x=y",
URLCanonicalizer.getCanonicalURL("http://www.example.com/index.html?&x=y"));
// "." and ".." path segments are resolved; leading ".." cannot climb above root.
assertEquals("http://www.example.com/a.html",
URLCanonicalizer.getCanonicalURL("http://www.example.com/../../a.html"));
assertEquals("http://www.example.com/a/c/d.html",
URLCanonicalizer.getCanonicalURL("http://www.example.com/../a/b/../c/./d.html"))
assertEquals("http://foo.bar.com/?baz=1", URLCanonicalizer.getCanonicalURL("http://foo.bar.com?baz=1"));
// Query parameters are sorted alphabetically by key.
assertEquals("http://www.example.com/index.html?a=b&c=d&e=f",
URLCanonicalizer.getCanonicalURL("http://www.example.com/index.html?&c=d&e=f&a=b"));
// A literal space in a query value becomes %20.
assertEquals("http://www.example.com/index.html?q=a%20b",
URLCanonicalizer.getCanonicalURL("http://www.example.com/index.html?q=a b"));
assertEquals("http://www.example.com/search?height=100%&width=100%",
URLCanonicalizer.getCanonicalURL("http://www.example.com/search?width=100%&height=100%"));
// Two-argument overload resolves a relative URL against a context URL.
assertEquals("http://foo.bar/mydir/myfile?page=2",
URLCanonicalizer.getCanonicalURL("?page=2", "http://foo.bar/mydir/myfile"));
}