本文整理汇总了 Java 中 org.htmlcleaner.CleanerProperties.setTranslateSpecialEntities 方法的典型用法代码示例。如果您正苦于以下问题：Java CleanerProperties.setTranslateSpecialEntities 方法的具体用法？Java CleanerProperties.setTranslateSpecialEntities 怎么用？Java CleanerProperties.setTranslateSpecialEntities 使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类 org.htmlcleaner.CleanerProperties 的用法示例。
在下文中一共展示了 CleanerProperties.setTranslateSpecialEntities 方法的 14 个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的 Java 代码示例。
示例1: createHtmlCleaner
import org.htmlcleaner.CleanerProperties; //导入方法依赖的package包/类
/**
 * Creates an {@link HtmlCleaner} and adjusts the properties object it exposes.
 *
 * <p>The properties are changed in place on the cleaner's own instance, so the
 * returned cleaner already carries every setting applied below.
 *
 * @return the configured cleaner
 */
private static HtmlCleaner createHtmlCleaner() {
    final HtmlCleaner htmlCleaner = new HtmlCleaner();
    final CleanerProperties settings = htmlCleaner.getProperties();

    settings.setAdvancedXmlEscape(true);
    // Only the XML declaration is omitted; the doctype declaration is not.
    settings.setOmitXmlDeclaration(true);
    settings.setOmitDoctypeDeclaration(false);
    settings.setTranslateSpecialEntities(true);
    settings.setTransResCharsToNCR(true);
    settings.setRecognizeUnicodeChars(true);
    settings.setIgnoreQuestAndExclam(true);
    settings.setUseEmptyElementTags(false);
    // Comma-separated list of tag names handed to the cleaner's prune setting.
    settings.setPruneTags("script,title");

    return htmlCleaner;
}
示例2: toXML
import org.htmlcleaner.CleanerProperties; //导入方法依赖的package包/类
/**
 * Converts an HTML string into an XML string using HtmlCleaner.
 *
 * <p>The markup is cleaned with a freshly configured {@link HtmlCleaner} and
 * serialized with a {@link PrettyXmlSerializer}. If serialization fails with an
 * {@link IOException}, the failure is logged and the original {@code source}
 * is returned unchanged.
 *
 * @param source the HTML markup to convert
 * @return the serialized XML, or {@code source} itself when serialization failed
 */
private String toXML(String source){
    try {
        CleanerProperties props = new CleanerProperties();
        // Each property is set exactly once (the original repeated
        // setTranslateSpecialEntities(true) a second time to no effect).
        props.setTranslateSpecialEntities(true);
        props.setOmitComments(true);
        props.setPruneTags("script,style");
        // Ignore namespaces.
        props.setNamespacesAware(false);
        props.setAdvancedXmlEscape(true);
        HtmlCleaner cl = new HtmlCleaner(props);
        TagNode tagNode = cl.clean(source);
        source = new PrettyXmlSerializer(props).getXmlAsString(tagNode);
    } catch (IOException e) {
        // Best effort: keep the unconverted input, but say what went wrong.
        logger.error("Failed to convert HTML to XML; returning the input unchanged", e);
    }
    return source;
}
示例3: htmlOutputStreamViaHtmlCleaner
import org.htmlcleaner.CleanerProperties; //导入方法依赖的package包/类
/**
 * Cleans the HTML file at {@code pathOfHOCRFile} with HtmlCleaner and writes the
 * result to {@code outputFilePath} through a {@link PrettyHtmlSerializer}.
 *
 * <p>Both reading and writing use the encoding named by {@code UTF_ENCODING}.
 * A failure while writing the output file is deliberately ignored; only
 * failures while opening, reading or closing the input propagate.
 *
 * @param pathOfHOCRFile path of the input file to clean
 * @param outputFilePath path of the file the cleaned HTML is written to
 * @throws IOException if the input file cannot be opened, read or closed
 */
public static void htmlOutputStreamViaHtmlCleaner(String pathOfHOCRFile, String outputFilePath) throws IOException {
    CleanerProperties cleanerProps = new CleanerProperties();
    // set some properties to non-default values
    cleanerProps.setTransResCharsToNCR(true);
    cleanerProps.setTranslateSpecialEntities(true);
    cleanerProps.setOmitComments(true);
    cleanerProps.setOmitDoctypeDeclaration(true);
    cleanerProps.setOmitXmlDeclaration(false);
    HtmlCleaner cleaner = new HtmlCleaner(cleanerProps);

    final TagNode tagNode;
    // try-with-resources closes the stream even when clean() throws; the
    // original only closed it on the success path and leaked it otherwise.
    try (FileInputStream hOCRFileInputStream = new FileInputStream(pathOfHOCRFile)) {
        tagNode = cleaner.clean(hOCRFileInputStream, UTF_ENCODING);
    }

    try {
        new PrettyHtmlSerializer(cleanerProps).writeToFile(tagNode, outputFilePath, UTF_ENCODING);
    } catch (Exception ignored) { // NOPMD.
        // Writing the output is best-effort by design: the original code swallowed
        // every failure here, and callers may rely on this method not throwing.
        // NOTE(review): consider logging or propagating the failure - confirm with callers.
    }
}
示例4: getSerialized
import org.htmlcleaner.CleanerProperties; //导入方法依赖的package包/类
/**
 * Convenience method for xml/xhtml output: serializes the parsed page held in
 * {@code docNode}.
 *
 * <p>Returns an empty string when no document node is available. Otherwise the
 * {@code NS_XML} attribute is removed from the document node before it is
 * serialized with the serializer obtained from {@code inSerializer}.
 *
 * @param inSerializer
 *            {@link XmlSerializer} supplying the concrete serializer
 * @return String the cleaned and serialized html, or {@code ""} if there is no document
 * @throws IOException
 *             if serialization fails
 */
public String getSerialized(final XmlSerializer inSerializer)
        throws IOException {
    if (docNode != null) {
        // The properties are taken from a throw-away cleaner instance.
        final CleanerProperties serializerProps = new HtmlCleaner().getProperties();
        serializerProps.setUseCdataForScriptAndStyle(true);
        serializerProps.setRecognizeUnicodeChars(true);
        serializerProps.setUseEmptyElementTags(true);
        serializerProps.setAdvancedXmlEscape(true);
        serializerProps.setTranslateSpecialEntities(true);
        serializerProps.setBooleanAttributeValues("empty"); //$NON-NLS-1$
        serializerProps.setNamespacesAware(true);
        serializerProps.setOmitXmlDeclaration(false);
        serializerProps.setOmitDoctypeDeclaration(true);
        serializerProps.setOmitHtmlEnvelope(false);

        docNode.getAttributes().remove(NS_XML);
        return inSerializer.getSerializer(serializerProps).getXmlAsString(docNode);
    }
    return ""; //$NON-NLS-1$
}
示例5: createCleanerProperties
import org.htmlcleaner.CleanerProperties; //导入方法依赖的package包/类
/**
 * Builds the {@link CleanerProperties} instance used for cleaning.
 *
 * <p>Every flag below is switched off except the XML-declaration omission and
 * the allowance of HTML inside attributes. See
 * http://htmlcleaner.sourceforge.net/parameters.php for what each parameter means.
 *
 * @return a new, fully configured properties object
 */
private static CleanerProperties createCleanerProperties() {
    final CleanerProperties props = new CleanerProperties();

    props.setNamespacesAware(false);
    props.setAdvancedXmlEscape(false);
    props.setOmitXmlDeclaration(true);
    props.setOmitDoctypeDeclaration(false);
    props.setTranslateSpecialEntities(false);
    props.setRecognizeUnicodeChars(false);
    props.setIgnoreQuestAndExclam(false);
    props.setAllowHtmlInsideAttributes(true);

    return props;
}
示例6: getTextFromHtmlString
import org.htmlcleaner.CleanerProperties; //导入方法依赖的package包/类
/**
 * Extracts text from an HTML string by concatenating the text of every
 * {@code td} element found (recursively) under the last {@code table} element
 * of the cleaned document.
 *
 * <p>A cell whose trimmed text equals the one-character placeholder literal
 * contributes a one-character literal instead of its own text. If the document
 * has no table, or the XPath lookup fails, an empty string is returned.
 *
 * @param htmlString {@link String} the HTML markup to read
 * @return {@link String} the concatenated cell text, possibly empty
 */
public static String getTextFromHtmlString(String htmlString) {
    CleanerProperties cleanerProps = new CleanerProperties();
    // set some properties to non-default values
    cleanerProps.setTransResCharsToNCR(true);
    cleanerProps.setTranslateSpecialEntities(true);
    cleanerProps.setOmitComments(true);
    cleanerProps.setOmitDoctypeDeclaration(true);
    cleanerProps.setOmitXmlDeclaration(true);
    cleanerProps.setUseEmptyElementTags(true);
    HtmlCleaner cleaner = new HtmlCleaner(cleanerProps);
    TagNode tagNode = cleaner.clean(htmlString);

    // One accumulator for the whole loop; the original allocated a new builder
    // and re-copied all text gathered so far for every single cell.
    StringBuilder errorText = new StringBuilder();
    try {
        Object[] rootNode = tagNode.evaluateXPath("//table");
        if (null != rootNode && rootNode.length > 0) {
            // Only the last matching table is inspected.
            TagNode lastTable = (TagNode) rootNode[rootNode.length - 1];
            for (TagNode tag : lastTable.getElementsByName("td", true)) {
                if (tag == null) {
                    continue;
                }
                // Read the cell text once instead of up to three times.
                CharSequence cellText = tag.getText();
                if (cellText == null) {
                    continue;
                }
                // NOTE(review): the compared literal may be a non-breaking space in
                // the original file (a plain space cannot survive trim()) - confirm.
                if (cellText.toString().trim().equals(" ")) {
                    errorText.append(" ");
                } else {
                    errorText.append(cellText);
                }
            }
        }
    } catch (XPatherException e) {
        LOGGER.error("Error extracting table node from html." + e.getMessage());
    }
    return errorText.toString();
}
示例7: createHtmlCleaner
import org.htmlcleaner.CleanerProperties; //导入方法依赖的package包/类
/**
 * Creates an {@link HtmlCleaner} and tunes the properties object it exposes.
 *
 * <p>Special-entity translation is switched off here, while recognition of
 * unicode characters is switched on.
 *
 * @return the configured cleaner
 */
private static HtmlCleaner createHtmlCleaner() {
    final HtmlCleaner htmlCleaner = new HtmlCleaner();
    final CleanerProperties settings = htmlCleaner.getProperties();

    // Only the XML declaration is omitted; the doctype declaration is not.
    settings.setOmitXmlDeclaration(true);
    settings.setOmitDoctypeDeclaration(false);
    settings.setRecognizeUnicodeChars(true);
    settings.setTranslateSpecialEntities(false);
    settings.setIgnoreQuestAndExclam(true);
    settings.setUseEmptyElementTags(false);

    return htmlCleaner;
}
示例8: ExtractInfoWithHtmlCleaner
import org.htmlcleaner.CleanerProperties; //导入方法依赖的package包/类
private ExtractInfoWithHtmlCleaner()
{
CleanerProperties props = cleaner.getProperties();
props.setUseCdataForScriptAndStyle(true);
props.setRecognizeUnicodeChars(true);
props.setUseEmptyElementTags(true);
props.setAdvancedXmlEscape(true);
props.setTranslateSpecialEntities(true);
props.setBooleanAttributeValues("empty");
}
示例9: main
import org.htmlcleaner.CleanerProperties; //导入方法依赖的package包/类
/**
 * Ad-hoc driver: downloads a fixed page, selects nodes with a fixed XPath
 * expression, joins their text line by line, strips a few characters, passes
 * the result through {@code CommonUtil.getDateString} and prints it.
 *
 * @param args ignored
 * @throws IOException if the page cannot be fetched or read
 * @throws XPatherException if the XPath expression cannot be evaluated
 */
public static void main(String[] args) throws IOException, XPatherException {
    CleanerProperties props = cleaner.getProperties();
    props.setUseCdataForScriptAndStyle(true);
    props.setRecognizeUnicodeChars(true);
    props.setUseEmptyElementTags(true);
    props.setAdvancedXmlEscape(true);
    props.setTranslateSpecialEntities(true);
    props.setBooleanAttributeValues("empty");

    URL url = new URL("http://www.haodf.com/wenda/anzhentaohong_g_638200415.htm");
    // The page is decoded as gb2312.
    TagNode node = cleaner.clean(url, "gb2312");
    Object[] ns = node.evaluateXPath("//*[@class=\"bb_d3 bl_d3 pb20\"]/div[3]/div[2]/p[2]");

    // Collect the text of every matched node, one per line, without
    // re-copying the accumulated string on each iteration.
    StringBuilder collected = new StringBuilder();
    for (Object object : ns) {
        TagNode dd = (TagNode) object;
        collected.append(dd.getText()).append("\n");
    }

    String result = collected.toString();
    result = result.replace(" ", "").replace("\r", "").replace(";", "");
    // NOTE(review): the dots in the pattern are unescaped and match any
    // character - presumably to accept several date separators; confirm.
    result = CommonUtil.getDateString(result, ".*?([0-9]+.[0-9]+.[0-9]+).*");
    System.out.print(result);
}
示例10: html2xhtml
import org.htmlcleaner.CleanerProperties; //导入方法依赖的package包/类
/**
 * Converts an HTML fragment to XHTML using HtmlCleaner.
 *
 * <p>The fragment is wrapped in a temporary {@code <div>} so the cleaner sees a
 * single root element; after serialization the first 5 and last 6 characters
 * (the lengths of {@code <div>} and {@code </div>}) are cut off again.
 *
 * @param html the HTML fragment; an empty value (per {@code StringUtils.isEmpty}) yields {@code ""}
 * @return the serialized fragment without the temporary wrapper
 * @throws RuntimeException wrapping the {@link IOException} if serialization fails
 */
private String html2xhtml(String html) {
    if (StringUtils.isEmpty(html)) {
        return "";
    }

    // Guarantee a single root tag for HtmlCleaner.
    final String wrapped = "<div>" + html + "</div>";

    // Non-default cleaner settings.
    final CleanerProperties cleanerProps = new CleanerProperties();
    cleanerProps.setTranslateSpecialEntities(true);
    cleanerProps.setTransResCharsToNCR(true);
    cleanerProps.setOmitXmlDeclaration(true);
    cleanerProps.setOmitHtmlEnvelope(true);
    cleanerProps.setOmitComments(true);

    // Parse, then serialize to an XML string.
    final TagNode root = new HtmlCleaner(cleanerProps).clean(wrapped);
    final String serialized;
    try {
        serialized = new SimpleXmlSerializer(cleanerProps).getAsString(root);
    } catch (IOException e) {
        throw new RuntimeException(e.getMessage(), e);
    }

    // Remove the temporary root <div> by position.
    // NOTE(review): assumes the output starts with "<div>" and ends with "</div>" - confirm.
    return serialized.substring(5, serialized.length() - 6);
}
示例11: getStandardCredit
import org.htmlcleaner.CleanerProperties; //导入方法依赖的package包/类
/**
 * Fetches the credit-standard table for a year and returns the values of
 * columns 1 to 8 of the first row whose first column contains
 * {@code department} (spaces and newlines are ignored in the comparison).
 *
 * <p>The page is requested by POST and decoded as big5, patched so table cells
 * and rows are closed, then parsed with HtmlCleaner. The table is located by
 * its {@code border="1"} attribute; the first row is skipped.
 *
 * @param year value sent as the {@code year} request parameter
 * @param index index into {@code matrics} selecting the {@code matric} request parameter
 * @param department department name to look for in the first column
 * @return the eight cleaned cell values of the matching row
 * @throws Exception if the request, parsing or lookup fails for any reason;
 *         the underlying failure is attached as the cause
 */
public static ArrayList<String> getStandardCredit(String year, int index,
        String department) throws Exception {
    try {
        ArrayList<String> standard = new ArrayList<>();
        HashMap<String, String> params = new HashMap<>();
        params.put("format", "-3");
        params.put("year", year);
        params.put("matric", matrics.get(index));
        String result = Connector
                .getDataByPost(getStandardUri(lang), params, "big5");
        // Insert closing tags in front of every cell/row start so the cleaner
        // sees terminated cells.
        result = result.replace("<td", "</td><td");
        result = result.replace("<tr>", "</td><tr>");
        HtmlCleaner cleaner = new HtmlCleaner();
        CleanerProperties props = cleaner.getProperties();
        props.setUseCdataForScriptAndStyle(true);
        props.setRecognizeUnicodeChars(true);
        props.setUseEmptyElementTags(true);
        props.setAdvancedXmlEscape(true);
        props.setTranslateSpecialEntities(true);
        props.setBooleanAttributeValues("empty");
        result = new PrettyHtmlSerializer(props).getAsString(result);
        TagNode tagNode = cleaner.clean(result);
        TagNode[] tables = tagNode.getElementsByAttValue("border", "1",
                true, false);
        TagNode[] rows = tables[0].getElementsByName("tr", true);

        // Loop-invariant: normalize the requested department once, not per row.
        String wantedDepartment = department.replace(" ", "").replace("\n", "");
        // Row 0 is skipped.
        for (int i = 1; i < rows.length; i++) {
            TagNode[] cols = rows[i].getElementsByName("td", true);
            String temp = cols[0].getText().toString();
            if (temp.replace(" ", "").replace("\n", "").contains(wantedDepartment)) {
                for (int j = 1; j < 9; j++) {
                    String credit = Utility.cleanString(cols[j].getText()
                            .toString());
                    standard.add(credit);
                }
                return standard;
            }
        }
        // Give the "not found" case a diagnosable cause instead of a bare Exception.
        throw new IllegalStateException("No table row matches department: " + department);
    } catch (Exception e) {
        // Same user-facing message as before ("error while reading the graduation
        // credit standard"), but the original failure is kept as the cause
        // instead of being printed to stderr and dropped.
        throw new Exception("畢業學分標準讀取時發生錯誤", e);
    }
}
示例12: stripSignatureForHtmlMessage
import org.htmlcleaner.CleanerProperties; //导入方法依赖的package包/类
/**
 * Removes a dash-delimited signature from an HTML message body and then
 * normalizes the markup with HtmlCleaner.
 *
 * <p>If {@code DASH_SIGNATURE_HTML} matches anywhere in {@code content}, the
 * content is cut at a signature match that lies outside the blockquote spans
 * found via {@code BLOCKQUOTE_START} / {@code BLOCKQUOTE_END}. When the number
 * of start and end matches differs, nothing is stripped. Regardless of whether
 * anything was stripped, the content is always passed through HtmlCleaner and
 * re-serialized.
 *
 * @param content the HTML message body
 * @return the cleaned HTML, with the signature removed where one was found
 */
public static String stripSignatureForHtmlMessage(String content) {
Matcher dashSignatureHtml = DASH_SIGNATURE_HTML.matcher(content);
if (dashSignatureHtml.find()) {
Matcher blockquoteStart = BLOCKQUOTE_START.matcher(content);
Matcher blockquoteEnd = BLOCKQUOTE_END.matcher(content);
// Offsets at which each blockquote start / end match begins.
List<Integer> start = new ArrayList<>();
List<Integer> end = new ArrayList<>();
while (blockquoteStart.find()) {
start.add(blockquoteStart.start());
}
while (blockquoteEnd.find()) {
end.add(blockquoteEnd.start());
}
if (start.size() != end.size()) {
// Unbalanced blockquote tags: leave the content untouched.
Log.d(K9.LOG_TAG, "There are " + start.size() + " <blockquote> tags, but " +
end.size() + " </blockquote> tags. Refusing to strip.");
} else if (start.size() > 0) {
// Ignore quoted signatures in blockquotes.
// First look only in the text before the first blockquote start.
dashSignatureHtml.region(0, start.get(0));
if (dashSignatureHtml.find()) {
// before first <blockquote>.
content = content.substring(0, dashSignatureHtml.start());
} else {
// Then look in each gap between one end match and the next start match.
for (int i = 0; i < start.size() - 1; i++) {
// within blockquotes.
if (end.get(i) < start.get(i + 1)) {
dashSignatureHtml.region(end.get(i), start.get(i + 1));
if (dashSignatureHtml.find()) {
content = content.substring(0, dashSignatureHtml.start());
break;
}
}
}
// Finally look after the last end match; content.length() reflects any
// truncation already done in the loop above.
if (end.get(end.size() - 1) < content.length()) {
// after last </blockquote>.
dashSignatureHtml.region(end.get(end.size() - 1), content.length());
if (dashSignatureHtml.find()) {
content = content.substring(0, dashSignatureHtml.start());
}
}
}
} else {
// No blockquotes found.
// Relies on the match position left by the find() call at the top.
content = content.substring(0, dashSignatureHtml.start());
}
}
// Fix the stripping off of closing tags if a signature was stripped,
// as well as clean up the HTML of the quoted message.
HtmlCleaner cleaner = new HtmlCleaner();
CleanerProperties properties = cleaner.getProperties();
// see http://htmlcleaner.sourceforge.net/parameters.php for descriptions
properties.setNamespacesAware(false);
properties.setAdvancedXmlEscape(false);
properties.setOmitXmlDeclaration(true);
properties.setOmitDoctypeDeclaration(false);
properties.setTranslateSpecialEntities(false);
properties.setRecognizeUnicodeChars(false);
TagNode node = cleaner.clean(content);
SimpleHtmlSerializer htmlSerialized = new SimpleHtmlSerializer(properties);
// Serialize the cleaned tree back to a string using the "UTF8" charset name.
content = htmlSerialized.getAsString(node, "UTF8");
return content;
}
示例13: getCleanHtml
import org.htmlcleaner.CleanerProperties; //导入方法依赖的package包/类
/**
 * Cleans an HTML document and returns the resulting XML as a byte array.
 *
 * <p>The resource is looked up by {@code resID} in the resource map of
 * {@code pandaSettings}. If it has a local file path, that file is parsed;
 * otherwise the document is fetched from the resource's URL.
 *
 * @param pandaSettings settings holding the resource map
 * @param resID unique ID of the resource
 * @return clean XHTML document as {@code byte[]}
 * @throws IOException if the document cannot be read or serialized
 */
private byte[] getCleanHtml(PandaSettings pandaSettings, String resID) throws IOException {
    // Get local path to file, if null the URL field will be used to
    // retrieve resource
    ResourceInfo resInfo = pandaSettings.getResourceMap().getMap().get(resID);
    String filePath = resInfo.getFilePath();

    // properties for HTML cleaning
    CleanerProperties props = new CleanerProperties();
    // preserve namespace prefixes
    props.setNamespacesAware(true);
    // remove <?TAGNAME....> or <!TAGNAME....>
    props.setIgnoreQuestAndExclam(true);
    // do not split attributes with multiple words
    props.setAllowMultiWordAttributes(true);
    // omit DTD
    props.setOmitDoctypeDeclaration(true);
    // omit xml declaration
    props.setOmitXmlDeclaration(true);
    // omit comments
    props.setOmitComments(true);
    // omit deprecated tags like <font...>
    props.setOmitDeprecatedTags(true);
    // treat script and style tag contents as CDATA
    props.setUseCdataForScriptAndStyle(true);
    // replace html character in form &#XXXX with real unicode characters
    props.setRecognizeUnicodeChars(true);
    // replace special entities with unicode character
    props.setTranslateSpecialEntities(true);
    // if true do not escape valid xml character sequences
    props.setAdvancedXmlEscape(true);

    // get HTML document, parse HTML
    final TagNode tagNode;
    if (filePath != null) {
        tagNode = new HtmlCleaner(props).clean(new File(filePath));
    } else {
        // Get online resource; reuse the entry already looked up above.
        URL resURL = resInfo.getURL();
        // Close the stream once parsing is done - the original never closed it.
        try (InputStream htmlDoc = getOnlineResource(resURL)) {
            tagNode = new HtmlCleaner(props).clean(htmlDoc);
        }
    }

    PrettyXmlSerializer pXmlS = new PrettyXmlSerializer(props);
    // NOTE(review): getBytes() uses the platform default charset - confirm
    // whether consumers expect a fixed encoding such as UTF-8.
    return pXmlS.getAsString(tagNode).getBytes();
}
示例14: cleanHTML
import org.htmlcleaner.CleanerProperties; //导入方法依赖的package包/类
/**
 * Cleans the HTML file at {@code path} and writes the result to {@code out}.
 *
 * <p>Two global attribute transformations are registered, one per attribute-name
 * pattern ({@code class} and {@code id}, case-insensitive, optionally preceded
 * by whitespace), each with {@code null} for the remaining two arguments. The
 * {@code class} attribute is additionally removed from the root node before the
 * tree is serialized as "utf-8".
 *
 * @param path path of the HTML file to clean
 * @param out path of the output file
 * @param encoding encoding used to read the input file
 * @throws IOException if reading the input or writing the output fails
 */
public void cleanHTML(String path, String out, String encoding) throws IOException {
    // The properties are taken from a default cleaner; parsing below uses a
    // second cleaner constructed from them.
    final CleanerProperties cleanerProps = new HtmlCleaner().getProperties();

    // NOTE(review): the null arguments presumably make these rules drop the
    // matched attributes - confirm against the HtmlCleaner version in use.
    final CleanerTransformations attributeRules = new CleanerTransformations();
    for (String attributeRegex : new String[] {"^\\s*class", "^\\s*id"}) {
        attributeRules.addGlobalTransformation(new AttributeTransformationPatternImpl(
                Pattern.compile(attributeRegex, Pattern.CASE_INSENSITIVE), null, null));
    }
    cleanerProps.setCleanerTransformations(attributeRules);

    // Non-default cleaner settings.
    cleanerProps.setTranslateSpecialEntities(true);
    cleanerProps.setTransResCharsToNCR(false);
    cleanerProps.setOmitComments(true);
    cleanerProps.setPruneTags("script,style,img,form");

    // Parse the input file.
    final File input = new File(path);
    final TagNode root = new HtmlCleaner(cleanerProps).clean(input, encoding);
    root.removeAttribute("class");

    // Serialize the tree to the output file.
    final PrettyHtmlSerializer serializer = new PrettyHtmlSerializer(cleanerProps);
    serializer.writeToFile(root, out, "utf-8");
}