本文整理汇总了Java中org.jsoup.safety.Cleaner.clean方法的典型用法代码示例。如果您正苦于以下问题：Java Cleaner.clean方法的具体用法？Java Cleaner.clean怎么用？Java Cleaner.clean使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类org.jsoup.safety.Cleaner的用法示例。
在下文中一共展示了Cleaner.clean方法的9个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: cleanContent
import org.jsoup.safety.Cleaner; //导入方法依赖的package包/类
/**
 * Sanitizes an HTML fragment so that only a fixed set of formatting tags survives.
 *
 * <p>The whitelist starts from {@code Whitelist.simpleText()} (b, em, i, strong, u) and is
 * extended with: br, cite, em, i, p, strong, img, li, ul, ol, sup, sub, s, plus whatever the
 * caller passes in {@code extraTags}. The {@code style} attribute is kept on {@code p}, and
 * {@code src}, {@code style} and {@code class} are kept on {@code img}. When {@code "a"} is
 * among the extra tags, its {@code href} and {@code target} attributes are kept as well.
 *
 * @param content html content to sanitize
 * @param extraTags any other tags that should be kept, e.g. "a"
 * @return the inner HTML of the sanitized body
 */
public String cleanContent(String content, String... extraTags) {
    // simpleText() only permits basic text formatting; everything else is added explicitly.
    Whitelist whitelist = Whitelist.simpleText()
            .addTags("br", "cite", "em", "i", "p", "strong", "img", "li", "ul", "ol", "sup", "sub", "s")
            .addTags(extraTags)
            // style on <p> is needed for left/right alignment
            .addAttributes("p", "style")
            .addAttributes("img", "src", "style", "class");

    boolean anchorsRequested = Arrays.asList(extraTags).contains("a");
    if (anchorsRequested) {
        whitelist.addAttributes("a", "href", "target");
    }

    Document parsed = Jsoup.parseBodyFragment(content, "");
    Document sanitized = new Cleaner(whitelist).clean(parsed);

    // xhtml escape mode: UTF-8 characters are not escaped in the output
    sanitized.outputSettings().escapeMode(EscapeMode.xhtml);
    return sanitized.body().html();
}
示例2: convert
import org.jsoup.safety.Cleaner; //导入方法依赖的package包/类
/**
 * Converts an HTML string into its sanitized form.
 *
 * @param type the requested target type (not consulted by this implementation)
 * @param value the raw HTML; cast to {@code String}
 * @return the inner HTML of the sanitized body, or {@code null} when the input is null or empty
 */
@Override
public Object convert(Class type, Object value) {
    final String rawHtml = (String) value;
    if (!Strings.isNullOrEmpty(rawHtml)) {
        final Document parsed = Jsoup.parseBodyFragment(rawHtml);
        final Document sanitized = new Cleaner(whitelist).clean(parsed);

        // Extra pass over iframes after whitelist cleaning; see cleanInvalidIframes for the rules.
        cleanInvalidIframes(sanitized);

        // Serialize using the ASCII charset.
        sanitized.outputSettings().charset("ASCII");
        return sanitized.body().html();
    }
    return null;
}
示例3: postProcess
import org.jsoup.safety.Cleaner; //导入方法依赖的package包/类
/**
 * Sanitizes the given HTML against the configured whitelist and then runs every registered
 * transformer over the cleaned document.
 *
 * @param html the HTML fragment to process
 * @return the inner HTML of the processed body
 */
@Override
public String postProcess(String html) {
    // A placeholder base URI is supplied on purpose: without one, all relative urls
    // would be stripped out during cleaning.
    Document parsed = Jsoup.parseBodyFragment(html, "http://localhost/sanitize");
    Document sanitized = new Cleaner(whiteList).clean(parsed);

    for (HtmlTransformer each : htmlTransformers) {
        each.transform(sanitized);
    }

    return sanitized.body().html();
}
示例4: main
import org.jsoup.safety.Cleaner; //导入方法依赖的package包/类
/**
 * Demo entry point: loads {@code index.html}, prints it, cleans it with an extended
 * relaxed whitelist and prints the cleaned result.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    // Load the input document; bail out if it could not be loaded.
    final Document source = loadHtmlFromFile("index.html", "utf-8");
    if (source == null) {
        LogUtils.d(CLS_NAME, "main", "document is null");
        return;
    }

    // Show the untouched markup.
    System.out.println("===BEFORE===");
    System.out.println(source.html());

    // Relaxed whitelist, additionally permitting meta/title/script/iframe,
    // the meta charset attribute, and http(s) iframe sources.
    final Whitelist permitted = Whitelist.relaxed()
            .addTags("meta", "title", "script", "iframe")
            .addAttributes("meta", "charset")
            .addAttributes("iframe", "src")
            .addProtocols("iframe", "src", "http", "https");

    final Document result = new Cleaner(permitted).clean(source);

    // Show the cleaned markup.
    System.out.println("===AFTER===");
    System.out.println(result.html());
}
示例5: htmlTextToPlainText
import org.jsoup.safety.Cleaner; //导入方法依赖的package包/类
/**
 * Converts HTML text to plain text: every tag except <code>br</code> is stripped,
 * each remaining <code>br</code> element is replaced by a CRLF newline, and entities
 * (such as {@code &quot;}) are unescaped.
 *
 * <p>The <code>br</code> replacement is performed <em>before</em> entities are unescaped.
 * Doing it the other way round would also turn literal text that was written as
 * {@code &lt;br&gt;} in the input into a newline, because after unescaping it is
 * indistinguishable from a real <code>br</code> element.
 *
 * @param htmlText the HTML to convert
 * @return the plain-text rendering of {@code htmlText}
 */
String htmlTextToPlainText(final String htmlText) {
    final Whitelist whitelist = Whitelist.none();
    whitelist.addTags("br");
    final Cleaner cleaner = new Cleaner(whitelist);
    final Document cleanedDocument = cleaner.clean(Jsoup.parse(htmlText));
    cleanedDocument
            .outputSettings()
            .prettyPrint(false)
            .escapeMode(EscapeMode.xhtml)
            .charset(StandardCharsets.UTF_8);

    // At this point the only tags left in the markup are real <br> elements; text that merely
    // looks like a tag is still entity-escaped, so the regex cannot match it.
    final String withNewlines = cleanedDocument.body().html().trim().replaceAll("<br(?: ?/)?>", "\r\n");
    return Parser.unescapeEntities(withNewlines, true);
}
示例6: stripHtml
import org.jsoup.safety.Cleaner; //导入方法依赖的package包/类
/**
 * Strips HTML tags from a given input String, allows some tags to be retained via a whitelist.
 *
 * <p>Each entry of {@code whitelistTags} is reduced to its alphabetic characters before being
 * used as a tag name, so entries such as {@code "<p>"} or {@code "p"} both whitelist
 * {@code p}. Digits are removed as well, which means a tag like {@code h1} is reduced to
 * {@code h}. For every whitelisted tag the {@code class} attribute is retained.
 *
 * @param fragment the specified String
 * @param whitelistTags the specified whitelist tags
 *
 * @return cleaned String with allowed tags
 */
public static String stripHtml(String fragment, String... whitelistTags)
{
    // Parse out html tags except those from a given list of whitelist tags
    Document dirty = Jsoup.parseBodyFragment(fragment);
    Whitelist whitelist = new Whitelist();

    for (String whitelistTag : whitelistTags)
    {
        // Get the actual tag name from the whitelist tag by dropping every non-alphabetic
        // character. This is vulnerable in general to complex tags but will suffice for our
        // simple needs.
        // The Unicode property must be written \p{IsAlphabetic}; without the 'p' the pattern
        // is a plain character class of the letters "{IsAlphabetic}" and mangles tag names.
        String tagName = StringUtils.removePattern(whitelistTag, "[^\\p{IsAlphabetic}]");

        // Add the tag to the whitelist while preserving its class attribute
        whitelist.addTags(tagName).addAttributes(tagName, "class");
    }

    Cleaner cleaner = new Cleaner(whitelist);
    Document clean = cleaner.clean(dirty);

    // Set character encoding to UTF-8 and make sure no line-breaks are added
    clean.outputSettings().escapeMode(Entities.EscapeMode.base).charset(StandardCharsets.UTF_8).prettyPrint(false);

    // return 'cleaned' html body
    return clean.body().html();
}
示例7: render
import org.jsoup.safety.Cleaner; //导入方法依赖的package包/类
/**
 * Renders Markdown to sanitized HTML.
 *
 * <p>The Markdown is converted with a fresh {@code PegDownProcessor}, the resulting HTML is
 * cleaned against {@code HTML_WHITELIST}, and fragment links are rewritten relative to
 * {@code baseUri}.
 *
 * @param markdown the Markdown source
 * @param baseUri the base URI used for parsing and for rewriting fragment links
 * @return the inner HTML of the sanitized body, or {@code null} when {@code markdown} is
 *         null or empty
 */
public static String render(String markdown, String baseUri) {
    boolean nothingToRender = markdown == null || markdown.isEmpty();
    if (nothingToRender) {
        return null;
    }

    PegDownProcessor processor = new PegDownProcessor(PEGDOWN_OPTIONS);
    String rawHtml = processor.markdownToHtml(markdown);

    Document parsed = Jsoup.parseBodyFragment(rawHtml, baseUri.toString());
    Document sanitized = new Cleaner(HTML_WHITELIST).clean(parsed);

    rewriteFragmentLinks(sanitized, baseUri);
    return sanitized.body().html();
}
示例8: render
import org.jsoup.safety.Cleaner; //导入方法依赖的package包/类
/**
 * Reads the HTML content of a part, sanitizes and transforms it, and writes the resulting
 * body markup to the given writer.
 *
 * @param part the part whose content is rendered
 * @param htmlOut the writer receiving the sanitized HTML
 * @throws MimeUIException if the part's content cannot be read
 */
@Override
public void render(final IncorporatedPart part, final PrintWriter htmlOut) throws MimeUIException {
    InputStream in = null;
    try {
        in = part.getInputStream();

        // todo Move all of this logic to a separate class with a String sanitize(String) method.
        // Cleaning normalizes and sanitizes the HTML, preventing cross site scripting attacks
        // and other issues.
        final Cleaner sanitizer = new Cleaner(new HTMLWhiteList());
        final HTMLTransformer transformer =
                new HTMLTransformer(part, this.untrustedContentUriResolver, this.contentLocationProvider);

        final String rawHtml = IOUtils.toString(in, part.getCharacterEncoding());
        final Document parsed = Jsoup.parse(rawHtml, part.getContextLocation().toString());
        final Document sanitized = sanitizer.clean(parsed);
        transformer.transform(sanitized);

        // Only the body's inner HTML is emitted; this drops the body element itself,
        // which often contains a style/class attribute.
        htmlOut.println(sanitized.body().html());
    } catch (final IOException e) {
        throw new MimeUIException("Unable to read a textual part.", e);
    } finally {
        // Close without propagating close failures.
        IOUtils.closeQuietly(in);
    }
}
示例9: getData
import org.jsoup.safety.Cleaner; //导入方法依赖的package包/类
/**
 * Make the query to google and return the data.
 *
 * <p>The response is parsed and cleaned with the basic whitelist, additionally keeping the
 * {@code class} attribute on {@code span} elements. If the server answers with HTTP 503 the
 * captcha details are recorded ({@code referer}, {@code idCaptcha}), the captcha image is
 * requested and a {@link CaptchaException} is thrown.
 *
 * @param query
 *            textfield for google
 * @return webpage in Document format, or {@code null} if an I/O error occurred while
 *         executing the request
 * @throws CaptchaException if google answered with a captcha challenge (HTTP 503)
 * @throws EmptyQueryException if the stored query is null or empty
 * @throws UnsupportedEncodingException if UTF-8 is not supported by {@code URLEncoder}
 */
private Document getData(String query) throws CaptchaException, EmptyQueryException, UnsupportedEncodingException {
    // The null test has to come first; evaluating isEmpty() first would throw a
    // NullPointerException on a null query instead of the intended EmptyQueryException.
    // NOTE(review): the guard inspects the field this.query while the request below is built
    // from the parameter query -- confirm which one is meant to be validated.
    if (this.query == null || this.query.isEmpty()) {
        throw new EmptyQueryException();
    }

    Document doc = null;
    String request = "https://www.google.com/search?q=" + URLEncoder.encode(stripXSS(query), "UTF-8");
    if (!tokenCookie.isEmpty()) {
        request = request + "&google_abuse=" + URLEncoder.encode(tokenCookie, "UTF-8");
    }

    try {
        Connection conn = Jsoup
                .connect(request)
                .method(Method.GET)
                .userAgent("Mozilla/5.0 (Windows NT 6.3; WOW64; rv:36.0) Gecko/20100101 Firefox/48.0")
                .header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
                .header("Cookie", tokenCookie)
                .header("Connection", "keep-alive")
                // Non-2xx responses must not throw: the 503 captcha page is inspected below.
                .ignoreHttpErrors(true)
                .timeout(5000);
        if (!referer.isEmpty()) {
            conn.header("Referer", referer);
        }

        Connection.Response response = conn.execute();
        if (response.statusCode() == 503) {
            // Captcha challenge: remember where we were sent and fetch the captcha image,
            // re-using the query string of the challenge URL.
            referer = response.url().toString();
            idCaptcha = getIDCaptcha(response.parse());
            getCaptcha("https://ipv4.google.com/sorry/image?id=" + idCaptcha + "&hl=es&" + referer.substring(referer.indexOf('?') + 1));
            throw new CaptchaException();
        }

        doc = Jsoup.parse(response.body());

        // Clean the response
        Whitelist wl = Whitelist.basic();
        wl.addAttributes("span", "class");
        Cleaner clean = new Cleaner(wl);
        doc = clean.clean(doc);
    } catch (IOException e) {
        // Best effort: the failure is reported and null is returned to the caller.
        e.printStackTrace();
    }
    return doc;
}