本文整理汇总了Java中org.netpreserve.urlcanon.Canonicalizer类的典型用法代码示例。如果您正苦于以下问题：Java Canonicalizer类的具体用法？Java Canonicalizer怎么用？Java Canonicalizer使用的例子？那么，这里精选的类代码示例或许可以为您提供帮助。
Canonicalizer类属于org.netpreserve.urlcanon包，在下文中一共展示了Canonicalizer类的7个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: fixupUrl
import org.netpreserve.urlcanon.Canonicalizer; //导入依赖的package包/类
/**
 * Repairs a recorded URL so that the result always carries a scheme.
 *
 * <p>The raw text is parsed and run through the WHATWG canonicalizer. If the parsed URL has no
 * scheme afterwards, it is rewritten to start with {@code http://}.
 *
 * @param raw the URL text as recorded
 * @return the string form of the canonicalized (and, if needed, scheme-prefixed) URL
 */
static String fixupUrl(String raw) {
    final ParsedUrl parsed = ParsedUrl.parseUrl(raw);
    Canonicalizer.WHATWG.canonicalize(parsed);

    final boolean schemeMissing = parsed.getScheme().isEmpty();
    if (schemeMissing) {
        // Early HTTrack versions wrote the URL without any scheme, so fall back to plain http.
        parsed.setScheme(new ByteString("http"));
        parsed.setColonAfterScheme(new ByteString(":"));
        parsed.setSlashes(new ByteString("//"));
    }

    // Open question: HTTrack incorrectly sends requests that include the fragment. Should the
    // fragment be cleared here? If so, the calls would be:
    //   parsed.setHashSign(ByteString.EMPTY);
    //   parsed.setFragment(ByteString.EMPTY);
    return parsed.toString();
}
示例2: canonSsurt
import org.netpreserve.urlcanon.Canonicalizer; //导入依赖的package包/类
/**
 * Canonicalizes a URL and returns its SSURT form (the result of {@code ParsedUrl.ssurt()}).
 *
 * <p>Intended canonicalization rules, which this method delegates to
 * {@code Canonicalizer.AGGRESSIVE} (NOTE(review): confirm against the urlcanon documentation
 * that AGGRESSIVE covers every item):
 * <ul>
 *   <li>perform WHATWG canonicalization</li>
 *   <li>lowercase the path</li>
 *   <li>remove the fragment</li>
 *   <li>remove the www. prefix from the hostname</li>
 *   <li>replace the https scheme with http</li>
 * </ul>
 *
 * <p>These rules are deliberately a little aggressive to make defining rules less error-prone.
 *
 * <p>TODO: query string?
 *
 * <p>TODO: reconcile this with UrlCanonicalizer. We should probably switch over to urlcanon as
 * it's a more robust canonicalizer, but a change will require rebuilding the index. Maybe keep
 * both implementations and allow an offline upgrade to be run?
 *
 * @param url the URL to canonicalize
 * @return the SSURT string of the canonicalized URL
 */
static String canonSsurt(String url) {
    String target = url;
    if (FeatureFlags.pandoraHacks()) {
        // Strip the PANDORA prefix so that a single rule can match both PANDORA and
        // non-PANDORA content.
        Matcher pandora = PANDORA_REGEX.matcher(target);
        if (pandora.matches()) {
            target = "http://" + pandora.group(1);
        }
    }

    ParsedUrl canonical = ParsedUrl.parseUrl(target);
    Canonicalizer.AGGRESSIVE.canonicalize(canonical);
    return canonical.ssurt().toString();
}
示例3: setUrls
import org.netpreserve.urlcanon.Canonicalizer; //导入依赖的package包/类
/**
 * Populates the URL-related fields of a document from a single source URL.
 *
 * <p>If the URL matches {@code PANDORA_REGEX}, the document URL becomes {@code "http://"}
 * followed by the first capture group; otherwise it is the URL as given. The delivery URL is
 * always derived from the URL as given, after aggressive canonicalization. Host and site are
 * derived from the document URL.
 *
 * @param doc the document whose url, deliveryUrl, host and site are set
 * @param url the source URL
 * @throws TextExtractionException if the document URL cannot be parsed as a {@link URL}
 */
public static void setUrls(Document doc, String url) throws TextExtractionException {
    final String deliveryUrl = url;

    final Matcher pandora = PANDORA_REGEX.matcher(url);
    // TODO: consult url.map
    final String documentUrl = pandora.matches() ? "http://" + pandora.group(1) : url;
    doc.setUrl(documentUrl);

    final ParsedUrl delivery = ParsedUrl.parseUrl(deliveryUrl);
    Canonicalizer.AGGRESSIVE.canonicalize(delivery);
    doc.setDeliveryUrl(delivery.toString());

    try {
        final String host = new URL(documentUrl).getHost();
        doc.setHost(host);
        doc.setSite(topPrivateDomain(documentUrl));
    } catch (MalformedURLException malformed) {
        throw new TextExtractionException(malformed);
    }
}
示例4: getSearchUrl
import org.netpreserve.urlcanon.Canonicalizer; //导入依赖的package包/类
/**
 * Converts a URL into a search key that should match the way URLs are normalised for delivery.
 *
 * <p>The URL is WHATWG-canonicalized, its path is ASCII-lowercased, and anything in the host
 * matching {@code WWW_PREFIX} is removed (presumably a leading "www." -- verify against the
 * pattern's definition). The key is the host immediately followed by the path.
 *
 * @param url the URL to convert; if it parses without a scheme, {@code "http://"} is prepended
 *            and it is parsed again
 * @return the normalised host concatenated with the lowercased path
 */
public static String getSearchUrl(String url) {
    ParsedUrl candidate = ParsedUrl.parseUrl(url);
    boolean hasScheme = !candidate.getScheme().isEmpty();
    if (!hasScheme) {
        // Without a scheme the host is not detected, so default to http and parse again.
        candidate = ParsedUrl.parseUrl("http://" + url);
    }

    Canonicalizer.WHATWG.canonicalize(candidate);
    candidate.setHost(candidate.getHost().replaceAll(WWW_PREFIX, ""));
    candidate.setPath(candidate.getPath().asciiLowerCase());

    return candidate.getHost().toString() + candidate.getPath().toString();
}
示例5: canonSsurt
import org.netpreserve.urlcanon.Canonicalizer; //导入依赖的package包/类
/**
 * Canonicalizes a URL and returns its SSURT form (the result of {@code ParsedUrl.ssurt()}).
 *
 * <p>Steps applied after WHATWG canonicalization: the path is ASCII-lowercased, the fragment
 * and its hash sign are cleared, matches of {@code WWW_PREFIX} are removed from the host, and
 * an {@code https} scheme is replaced with {@code http}.
 *
 * @param url the URL to canonicalize
 * @return the SSURT string of the canonicalized URL
 */
private static String canonSsurt(String url) {
    final ParsedUrl target = ParsedUrl.parseUrl(url);
    Canonicalizer.WHATWG.canonicalize(target);

    // Treat https and http as the same resource.
    boolean secure = target.getScheme().toString().equals("https");
    if (secure) {
        target.setScheme(new ByteString("http"));
    }

    target.setHost(target.getHost().replaceAll(WWW_PREFIX, ""));
    target.setPath(target.getPath().asciiLowerCase());

    // Drop the fragment together with its leading hash sign.
    target.setHashSign(ByteString.EMPTY);
    target.setFragment(ByteString.EMPTY);

    return target.ssurt().toString();
}
示例6: canonUrl
import org.netpreserve.urlcanon.Canonicalizer; //导入依赖的package包/类
/**
 * Returns the aggressively canonicalized string form of a URL.
 *
 * @param input the URL to canonicalize
 * @return the URL after {@code Canonicalizer.AGGRESSIVE} has been applied
 */
private String canonUrl(String input) {
    final ParsedUrl canonical = ParsedUrl.parseUrl(input);
    Canonicalizer.AGGRESSIVE.canonicalize(canonical);
    final String text = canonical.toString();
    return text;
}
示例7: cleanUrl
import org.netpreserve.urlcanon.Canonicalizer; //导入依赖的package包/类
/**
 * Canonicalizes a URL with the WHATWG rules and percent-encodes any remaining space, carriage
 * return or line feed characters in the result.
 *
 * @param url the URL to clean
 * @return the canonicalized URL with {@code ' '} as {@code %20}, {@code '\r'} as {@code %0d}
 *         and {@code '\n'} as {@code %0a}
 */
static String cleanUrl(String url) {
    ParsedUrl parsedUrl = ParsedUrl.parseUrl(url);
    Canonicalizer.WHATWG.canonicalize(parsedUrl);
    // Percent-encoding uses the character's byte value: CR ('\r') is 0x0D and LF ('\n') is
    // 0x0A. The previous code had these two encodings swapped.
    return parsedUrl.toString()
            .replace(" ", "%20")
            .replace("\r", "%0d")
            .replace("\n", "%0a");
}