本文整理汇总了Java中org.netpreserve.urlcanon.ParsedUrl.parseUrl方法的典型用法代码示例。如果您正苦于以下问题：Java ParsedUrl.parseUrl方法的具体用法？Java ParsedUrl.parseUrl怎么用？Java ParsedUrl.parseUrl使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类org.netpreserve.urlcanon.ParsedUrl的用法示例。
在下文中一共展示了ParsedUrl.parseUrl方法的8个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: fixupUrl
import org.netpreserve.urlcanon.ParsedUrl; //导入方法依赖的package包/类
/**
 * Parses {@code raw}, applies WHATWG canonicalization and returns the result as a string.
 *
 * <p>When the canonicalized URL has an empty scheme, {@code http}, {@code :} and {@code //}
 * are filled in, because early versions of httrack wrote the URL without a scheme.
 *
 * @param raw the URL text to repair
 * @return the string form of the repaired URL
 */
static String fixupUrl(String raw) {
    ParsedUrl parsed = ParsedUrl.parseUrl(raw);
    Canonicalizer.WHATWG.canonicalize(parsed);

    boolean schemeMissing = parsed.getScheme().isEmpty();
    if (schemeMissing) {
        parsed.setScheme(new ByteString("http"));
        parsed.setColonAfterScheme(new ByteString(":"));
        parsed.setSlashes(new ByteString("//"));
    }

    // httrack incorrectly makes requests that include the fragment. Open question: should the
    // fragment be cleared here (hash sign and fragment both set to ByteString.EMPTY)?
    // For now it is deliberately left untouched.
    return parsed.toString();
}
示例2: canonSsurt
import org.netpreserve.urlcanon.ParsedUrl; //导入方法依赖的package包/类
/**
 * Canonicalizes a URL with {@code Canonicalizer.AGGRESSIVE} and returns its SSURT form.
 *
 * <p>According to the notes that accompany this method, the intended canonicalization is:
 * <ul>
 *   <li>perform WHATWG canonicalization</li>
 *   <li>lowercase the path</li>
 *   <li>remove the fragment</li>
 *   <li>remove the www. prefix from the hostname</li>
 *   <li>replace the https scheme with http</li>
 * </ul>
 * NOTE(review): these steps are delegated to the canonicalizer and are not visible here;
 * confirm against the urlcanon implementation.
 *
 * <p>The rules are deliberately a little aggressive to make defining rules less error prone.
 *
 * <p>TODO: query string?
 *
 * <p>TODO: reconcile this with UrlCanonicalizer. We should probably switch over to urlcanon as
 * it's a more robust canonicalizer, but a change will require rebuilding the index. Maybe keep
 * both implementations and allow an offline upgrade to be run?
 *
 * @param url the URL to canonicalize
 * @return the SSURT string of the canonicalized URL
 */
static String canonSsurt(String url) {
    String target = url;
    if (FeatureFlags.pandoraHacks()) {
        // Strip the PANDORA prefix from the URL so that a single rule can match both
        // PANDORA and non-PANDORA content.
        Matcher pandora = PANDORA_REGEX.matcher(target);
        if (pandora.matches()) {
            target = "http://" + pandora.group(1);
        }
    }

    ParsedUrl parsedUrl = ParsedUrl.parseUrl(target);
    Canonicalizer.AGGRESSIVE.canonicalize(parsedUrl);
    return parsedUrl.ssurt().toString();
}
示例3: setUrls
import org.netpreserve.urlcanon.ParsedUrl; //导入方法依赖的package包/类
/**
 * Populates the URL-related fields of {@code doc} from {@code url}.
 *
 * <p>The document URL is {@code url} with any PANDORA prefix (as matched by
 * {@code PANDORA_REGEX}) replaced by {@code http://}. The delivery URL is always the
 * AGGRESSIVE-canonicalized form of {@code url} exactly as it was passed in. Host and site
 * are derived from the document URL.
 *
 * @param doc the document whose url, delivery url, host and site are set
 * @param url the URL as supplied by the caller
 * @throws TextExtractionException if the document URL is rejected as malformed while
 *         deriving host and site
 */
public static void setUrls(Document doc, String url) throws TextExtractionException {
    final String originalUrl = url;

    String displayUrl = originalUrl;
    Matcher pandora = PANDORA_REGEX.matcher(originalUrl);
    if (pandora.matches()) {
        // TODO: consult url.map
        displayUrl = "http://" + pandora.group(1);
    }
    doc.setUrl(displayUrl);

    ParsedUrl delivery = ParsedUrl.parseUrl(originalUrl);
    Canonicalizer.AGGRESSIVE.canonicalize(delivery);
    doc.setDeliveryUrl(delivery.toString());

    try {
        String host = new URL(displayUrl).getHost();
        doc.setHost(host);
        doc.setSite(topPrivateDomain(displayUrl));
    } catch (MalformedURLException e) {
        throw new TextExtractionException(e);
    }
}
示例4: getSearchUrl
import org.netpreserve.urlcanon.ParsedUrl; //导入方法依赖的package包/类
/**
 * Converts a URL into a search URL that should match the way it is normalised for delivery.
 *
 * <p>The URL is WHATWG-canonicalized, its path is ASCII-lowercased and matches of
 * {@code WWW_PREFIX} are removed from the host. Only host and path are kept in the result.
 *
 * @param url the URL to convert; it may omit the scheme, in which case {@code http://} is
 *            prepended before parsing
 * @return the host immediately followed by the path
 */
public static String getSearchUrl(String url) {
    ParsedUrl parsed = ParsedUrl.parseUrl(url);
    boolean hasScheme = !parsed.getScheme().isEmpty();
    if (!hasScheme) {
        // Default to http: a scheme is needed to force the host to be detected.
        parsed = ParsedUrl.parseUrl("http://" + url);
    }

    Canonicalizer.WHATWG.canonicalize(parsed);
    parsed.setPath(parsed.getPath().asciiLowerCase());
    parsed.setHost(parsed.getHost().replaceAll(WWW_PREFIX, ""));

    String host = parsed.getHost().toString();
    String path = parsed.getPath().toString();
    return host + path;
}
示例5: canonSsurt
import org.netpreserve.urlcanon.ParsedUrl; //导入方法依赖的package包/类
/**
 * Canonicalizes a URL and returns its SSURT form.
 *
 * <p>Steps: WHATWG canonicalization, ASCII-lowercase the path, drop the fragment together
 * with its hash sign, remove matches of {@code WWW_PREFIX} from the host, and replace an
 * {@code https} scheme with {@code http}.
 *
 * @param url the URL to canonicalize
 * @return the SSURT string of the canonicalized URL
 */
private static String canonSsurt(String url) {
    ParsedUrl target = ParsedUrl.parseUrl(url);
    Canonicalizer.WHATWG.canonicalize(target);

    target.setPath(target.getPath().asciiLowerCase());

    // Remove the fragment and the '#' that introduces it.
    target.setHashSign(ByteString.EMPTY);
    target.setFragment(ByteString.EMPTY);

    target.setHost(target.getHost().replaceAll(WWW_PREFIX, ""));

    boolean secure = "https".equals(target.getScheme().toString());
    if (secure) {
        target.setScheme(new ByteString("http"));
    }

    return target.ssurt().toString();
}
示例6: basicMetadata
import org.netpreserve.urlcanon.ParsedUrl; //导入方法依赖的package包/类
/**
 * Copies the basic metadata of {@code document} into the Solr input document: id, display
 * and delivery URLs, an optional canonicalized PANDORA URL, filename, date, decade and year,
 * followed by domain/title metadata and the optional descriptive fields.
 *
 * @param solr the Solr document that receives the fields
 * @param document the indexer document supplying the values
 * @throws IllegalArgumentException if the delivery URL is null or empty
 */
private void basicMetadata(SolrInputDocument solr, IndexerDocument document) {
    solr.addField(SolrEnum.ID.toString(), document.getDocId());

    // The display URL is the original one provided by Bamboo.
    String displayUrl = document.getBambooDocument().getUrl();
    solr.addField(SolrEnum.DISPLAY_URL.toString(), displayUrl);

    String deliveryUrl = document.getBambooDocument().getDeliveryUrl();
    boolean hasDeliveryUrl = deliveryUrl != null && !"".equals(deliveryUrl);
    if (!hasDeliveryUrl) {
        throw new IllegalArgumentException("Delivery URL is empty for document " + document.getDocId());
    }
    solr.addField(SolrEnum.DELIVERY_URL.toString(), deliveryUrl);

    // In the vast majority of cases DELIVERY_URL == canon(DISPLAY_URL), but that is checked
    // explicitly because Pandora can throw a spanner in the works.
    ParsedUrl canonDisplay = ParsedUrl.parseUrl(displayUrl);
    CANON.canonicalize(canonDisplay);
    String canonDisplayText = canonDisplay.toString();
    if (!canonDisplayText.equals(deliveryUrl)) {
        // This is a Pandora URL. To support exact matching on both DISPLAY_URL and
        // DELIVERY_URL, a canonicalized version of DISPLAY_URL is stored as well.
        solr.addField(SolrEnum.PANDORA_URL.toString(), canonDisplayText);
    }

    String filename = FilenameFinder.getFilename(displayUrl);
    if (filename != null) {
        solr.addField(SolrEnum.FILENAME.toString(), filename);
    }

    solr.addField(SolrEnum.DATE.toString(), document.getBambooDocument().getDate());
    String year = dateYear.format(document.getBambooDocument().getDate());
    // The decade is stored as the first three characters of the formatted year.
    String decade = year.substring(0, 3);
    solr.addField(SolrEnum.DECADE.toString(), decade);
    solr.addField(SolrEnum.YEAR.toString(), year);

    domainAndTitleMetadata(solr, document);

    // Optional metadata we _might_ get from HTML.
    optionalMetadata(solr, document.getBambooDocument().getDescription());
    optionalMetadata(solr, document.getBambooDocument().getKeywords());
    optionalMetadata(solr, document.getBambooDocument().getPublisher());
    optionalMetadata(solr, document.getBambooDocument().getCreator());
    optionalMetadata(solr, document.getBambooDocument().getContributor());
    optionalMetadata(solr, document.getBambooDocument().getCoverage());
}
示例7: canonUrl
import org.netpreserve.urlcanon.ParsedUrl; //导入方法依赖的package包/类
/**
 * Returns the string form of {@code input} after canonicalization with
 * {@code Canonicalizer.AGGRESSIVE}.
 *
 * @param input the URL to canonicalize
 * @return the canonicalized URL as a string
 */
private String canonUrl(String input) {
    ParsedUrl canonical = ParsedUrl.parseUrl(input);
    Canonicalizer.AGGRESSIVE.canonicalize(canonical);
    String result = canonical.toString();
    return result;
}
示例8: cleanUrl
import org.netpreserve.urlcanon.ParsedUrl; //导入方法依赖的package包/类
/**
 * WHATWG-canonicalizes {@code url} and percent-encodes any spaces, carriage returns and
 * line feeds that remain in its string form.
 *
 * @param url the URL to clean
 * @return the canonicalized URL with space as {@code %20}, CR as {@code %0d} and LF as
 *         {@code %0a}
 */
static String cleanUrl(String url) {
    ParsedUrl parsedUrl = ParsedUrl.parseUrl(url);
    Canonicalizer.WHATWG.canonicalize(parsedUrl);
    // CR is byte 0x0D and LF is byte 0x0A; the two escapes were previously swapped.
    return parsedUrl.toString()
            .replace(" ", "%20")
            .replace("\r", "%0d")
            .replace("\n", "%0a");
}