本文整理匯總了Java中org.netpreserve.urlcanon.ParsedUrl類的典型用法代碼示例。如果您正苦於以下問題:Java ParsedUrl類的具體用法?Java ParsedUrl怎麼用?Java ParsedUrl使用的例子?那麼,這裡精選的類代碼示例或許可以為您提供幫助。
ParsedUrl類屬於org.netpreserve.urlcanon包,在下文中一共展示了ParsedUrl類的8個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Java代碼示例。
示例1: fixupUrl
import org.netpreserve.urlcanon.ParsedUrl; //導入依賴的package包/類
/**
 * Parses a raw URL, applies WHATWG canonicalization and returns the result as a string,
 * supplying an {@code http://} prefix when the parsed URL has no scheme.
 *
 * @param raw the URL text to repair
 * @return the canonicalized URL, always carrying a scheme
 */
static String fixupUrl(String raw) {
    ParsedUrl parsed = ParsedUrl.parseUrl(raw);
    Canonicalizer.WHATWG.canonicalize(parsed);

    // Early HTTrack versions recorded URLs with no scheme at all; treat those as plain http.
    boolean schemeMissing = parsed.getScheme().isEmpty();
    if (schemeMissing) {
        parsed.setScheme(new ByteString("http"));
        parsed.setColonAfterScheme(new ByteString(":"));
        parsed.setSlashes(new ByteString("//"));
    }

    // HTTrack also (incorrectly) includes the fragment in the requests it makes.
    // Open question: should the fragment be cleared here, i.e. by setting both the hash sign
    // and the fragment to ByteString.EMPTY? For now the fragment is left untouched.
    return parsed.toString();
}
示例2: canonSsurt
import org.netpreserve.urlcanon.ParsedUrl; //導入依賴的package包/類
/**
 * Canonicalizes a URL with {@code Canonicalizer.AGGRESSIVE} and returns its SSURT form.
 *
 * <p>According to the original notes, the canonicalization is expected to:
 * <ul>
 *   <li>perform WHATWG canonicalization</li>
 *   <li>lowercase the path</li>
 *   <li>remove the fragment</li>
 *   <li>remove the {@code www.} prefix from the hostname</li>
 *   <li>replace the https scheme with http</li>
 * </ul>
 * The rules are deliberately a little aggressive so that defining rules is less error prone.
 *
 * <p>TODO: query string?
 *
 * <p>TODO: reconcile this with UrlCanonicalizer. urlcanon is probably the more robust canonicalizer
 * and worth switching to, but changing it requires rebuilding the index. Maybe keep both
 * implementations and allow an offline upgrade to be run?
 *
 * @param url the URL to canonicalize
 * @return the SSURT string of the canonicalized URL
 */
static String canonSsurt(String url) {
    String target = url;
    if (FeatureFlags.pandoraHacks()) {
        // Strip the PANDORA prefix so that a single rule can match both PANDORA and
        // non-PANDORA content.
        Matcher pandora = PANDORA_REGEX.matcher(target);
        if (pandora.matches()) {
            target = "http://" + pandora.group(1);
        }
    }

    ParsedUrl parsed = ParsedUrl.parseUrl(target);
    Canonicalizer.AGGRESSIVE.canonicalize(parsed);
    return parsed.ssurt().toString();
}
示例3: setUrls
import org.netpreserve.urlcanon.ParsedUrl; //導入依賴的package包/類
/**
 * Populates the URL-derived fields of a document: url, delivery url, host and site.
 *
 * <p>The document url is the input with any PANDORA prefix replaced by {@code http://}; the
 * delivery url is the aggressively canonicalized form of the input exactly as it was passed in.
 *
 * @param doc the document to update
 * @param url the URL the document was retrieved under
 * @throws TextExtractionException if the (possibly rewritten) URL is malformed
 */
public static void setUrls(Document doc, String url) throws TextExtractionException {
    Matcher pandora = PANDORA_REGEX.matcher(url);
    // TODO: consult url.map
    String displayUrl = pandora.matches() ? "http://" + pandora.group(1) : url;
    doc.setUrl(displayUrl);

    // The delivery URL is derived from the untouched input, not from the rewritten one.
    ParsedUrl delivery = ParsedUrl.parseUrl(url);
    Canonicalizer.AGGRESSIVE.canonicalize(delivery);
    doc.setDeliveryUrl(delivery.toString());

    try {
        doc.setHost(new URL(displayUrl).getHost());
        doc.setSite(topPrivateDomain(displayUrl));
    } catch (MalformedURLException e) {
        throw new TextExtractionException(e);
    }
}
示例4: getSearchUrl
import org.netpreserve.urlcanon.ParsedUrl; //導入依賴的package包/類
/**
 * Converts a URL into a search URL that should match the way URLs are normalised for delivery.
 *
 * <p>The URL is WHATWG-canonicalized, its path is ASCII-lowercased and every match of
 * {@code WWW_PREFIX} is removed from its host.
 *
 * @param url the URL to convert; it may omit the scheme, in which case {@code http://} is assumed
 * @return the normalised host immediately followed by the normalised path
 */
public static String getSearchUrl(String url) {
    ParsedUrl firstAttempt = ParsedUrl.parseUrl(url);
    // Without a scheme the host is not detected, so re-parse with http:// prepended.
    ParsedUrl parsed = firstAttempt.getScheme().isEmpty()
            ? ParsedUrl.parseUrl("http://" + url)
            : firstAttempt;

    Canonicalizer.WHATWG.canonicalize(parsed);
    parsed.setPath(parsed.getPath().asciiLowerCase());
    parsed.setHost(parsed.getHost().replaceAll(WWW_PREFIX, ""));

    String host = parsed.getHost().toString();
    String path = parsed.getPath().toString();
    return host + path;
}
示例5: canonSsurt
import org.netpreserve.urlcanon.ParsedUrl; //導入依賴的package包/類
/**
 * Canonicalizes a URL and returns its SSURT form.
 *
 * <p>On top of WHATWG canonicalization this lowercases the path, drops the fragment together
 * with its hash sign, removes every match of {@code WWW_PREFIX} from the host and rewrites the
 * {@code https} scheme to {@code http}.
 *
 * @param url the URL to canonicalize
 * @return the SSURT string of the canonicalized URL
 */
private static String canonSsurt(String url) {
    ParsedUrl parsed = ParsedUrl.parseUrl(url);
    Canonicalizer.WHATWG.canonicalize(parsed);

    parsed.setPath(parsed.getPath().asciiLowerCase());

    // Remove the fragment and the '#' that introduces it.
    parsed.setFragment(ByteString.EMPTY);
    parsed.setHashSign(ByteString.EMPTY);

    parsed.setHost(parsed.getHost().replaceAll(WWW_PREFIX, ""));

    // Treat https and http as the same resource.
    boolean isHttps = "https".equals(parsed.getScheme().toString());
    if (isHttps) {
        parsed.setScheme(new ByteString("http"));
    }

    return parsed.ssurt().toString();
}
示例6: basicMetadata
import org.netpreserve.urlcanon.ParsedUrl; //導入依賴的package包/類
/**
* Copies the basic metadata of an indexer document into a Solr input document.
*
* Fields are added in this order: ID, DISPLAY_URL, DELIVERY_URL, PANDORA_URL (only when the
* canonicalized display URL differs from the delivery URL), FILENAME (only when one is found),
* DATE, DECADE and YEAR. Domain/title metadata and the optional descriptive fields are then
* delegated to domainAndTitleMetadata and optionalMetadata.
*
* @param solr the Solr document that receives the fields
* @param document the source document; its Bamboo document supplies the URLs, date and optional metadata
* @throws IllegalArgumentException if the delivery URL is null or empty
*/
private void basicMetadata(SolrInputDocument solr, IndexerDocument document) {
solr.addField(SolrEnum.ID.toString(), document.getDocId());
// Display URL is the original provided by Bamboo
String url = document.getBambooDocument().getUrl();
solr.addField(SolrEnum.DISPLAY_URL.toString(), url);
String deliveryUrl = document.getBambooDocument().getDeliveryUrl();
// A delivery URL is mandatory: fail fast rather than index a document without one
if (deliveryUrl == null || "".equals(deliveryUrl)) {
throw new IllegalArgumentException("Delivery URL is empty for document " + document.getDocId());
}
solr.addField(SolrEnum.DELIVERY_URL.toString(), deliveryUrl);
// In the vast majority of cases DELIVERY_URL == canon(DISPLAY_URL)
// But we test for that because Pandora can throw a spanner in the works
ParsedUrl parsedUrl = ParsedUrl.parseUrl(url);
// CANON is defined outside this block; NOTE(review): presumably the same canonicalizer
// that produced the delivery URL - confirm
CANON.canonicalize(parsedUrl);
if (!parsedUrl.toString().equals(deliveryUrl)) {
// This is a pandora URL. To support exact match on both DISPLAY_URL and DELIVERY_URL
// we need to store a canonicalized version of DISPLAY_URL
solr.addField(SolrEnum.PANDORA_URL.toString(), parsedUrl.toString());
}
// The filename is derived from the display URL and is only indexed when one is found
String filename = FilenameFinder.getFilename(url);
if (filename != null) {
solr.addField(SolrEnum.FILENAME.toString(), filename);
}
solr.addField(SolrEnum.DATE.toString(), document.getBambooDocument().getDate());
// NOTE(review): assumes dateYear formats the date as a year of at least three characters
// (e.g. "1998"); substring(0, 3) would throw on a shorter string - confirm
String year = dateYear.format(document.getBambooDocument().getDate());
// The decade is stored as the first three characters of the formatted year
solr.addField(SolrEnum.DECADE.toString(), year.substring(0, 3));
solr.addField(SolrEnum.YEAR.toString(), year);
domainAndTitleMetadata(solr, document);
// Optional metadata we _might_ get from html
optionalMetadata(solr, document.getBambooDocument().getDescription());
optionalMetadata(solr, document.getBambooDocument().getKeywords());
optionalMetadata(solr, document.getBambooDocument().getPublisher());
optionalMetadata(solr, document.getBambooDocument().getCreator());
optionalMetadata(solr, document.getBambooDocument().getContributor());
optionalMetadata(solr, document.getBambooDocument().getCoverage());
}
示例7: canonUrl
import org.netpreserve.urlcanon.ParsedUrl; //導入依賴的package包/類
/**
 * Returns the aggressively canonicalized string form of a URL.
 *
 * @param input the URL to canonicalize
 * @return the URL after {@code Canonicalizer.AGGRESSIVE} has been applied
 */
private String canonUrl(String input) {
    ParsedUrl parsed = ParsedUrl.parseUrl(input);
    Canonicalizer.AGGRESSIVE.canonicalize(parsed);
    String canonical = parsed.toString();
    return canonical;
}
示例8: cleanUrl
import org.netpreserve.urlcanon.ParsedUrl; //導入依賴的package包/類
/**
 * Canonicalizes a URL with the WHATWG rules and percent-encodes any space, carriage return or
 * line feed that is still present in the resulting string.
 *
 * @param url the URL to clean
 * @return the canonicalized URL with spaces, CR and LF percent-encoded
 */
static String cleanUrl(String url) {
    ParsedUrl parsedUrl = ParsedUrl.parseUrl(url);
    Canonicalizer.WHATWG.canonicalize(parsedUrl);
    // Percent-encoding uses the character's byte value in hex: CR is 0x0D and LF is 0x0A.
    // The escapes were previously swapped (CR -> %0a, LF -> %0d), producing URLs that decode
    // to the wrong control character.
    return parsedUrl.toString()
            .replace(" ", "%20")
            .replace("\r", "%0d")
            .replace("\n", "%0a");
}