本文整理汇总了Java中org.htmlcleaner.CleanerProperties.setTransResCharsToNCR方法的典型用法代码示例。如果您正苦于以下问题：Java CleanerProperties.setTransResCharsToNCR方法的具体用法？Java CleanerProperties.setTransResCharsToNCR怎么用？Java CleanerProperties.setTransResCharsToNCR使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类org.htmlcleaner.CleanerProperties的用法示例。
在下文中一共展示了CleanerProperties.setTransResCharsToNCR方法的5个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: createHtmlCleaner
import org.htmlcleaner.CleanerProperties; //导入方法依赖的package包/类
/**
 * Creates an {@link HtmlCleaner} and adjusts the properties of that instance.
 *
 * <p>The flags are set on the cleaner's own {@link CleanerProperties} object, so the
 * returned cleaner is ready to use without further configuration.
 *
 * @return the configured cleaner
 */
private static HtmlCleaner createHtmlCleaner() {
    final HtmlCleaner cleaner = new HtmlCleaner();
    final CleanerProperties props = cleaner.getProperties();

    // Flags switched on.
    props.setAdvancedXmlEscape(true);
    props.setOmitXmlDeclaration(true);
    props.setTranslateSpecialEntities(true);
    props.setTransResCharsToNCR(true);
    props.setRecognizeUnicodeChars(true);
    props.setIgnoreQuestAndExclam(true);

    // Flags switched off.
    props.setOmitDoctypeDeclaration(false);
    props.setUseEmptyElementTags(false);

    // Comma-separated tag names handed to the cleaner for pruning.
    props.setPruneTags("script,title");

    return cleaner;
}
示例2: htmlOutputStreamViaHtmlCleaner
import org.htmlcleaner.CleanerProperties; //导入方法依赖的package包/类
/**
 * Parses the file at {@code pathOfHOCRFile} with HtmlCleaner and writes the result to
 * {@code outputFilePath} using {@link PrettyHtmlSerializer}.
 *
 * <p>Both reading and writing use {@code UTF_ENCODING}. Writing the output is best-effort:
 * any exception raised by the serializer is deliberately suppressed, so callers cannot tell
 * from this method alone whether the output file was produced.
 *
 * @param pathOfHOCRFile path of the input file to clean
 * @param outputFilePath path of the file the cleaned markup is written to
 * @throws IOException if the input file cannot be opened, parsed or closed
 */
public static void htmlOutputStreamViaHtmlCleaner(String pathOfHOCRFile, String outputFilePath) throws IOException {
    CleanerProperties cleanerProps = new CleanerProperties();
    // set some properties to non-default values
    cleanerProps.setTransResCharsToNCR(true);
    cleanerProps.setTranslateSpecialEntities(true);
    cleanerProps.setOmitComments(true);
    cleanerProps.setOmitDoctypeDeclaration(true);
    cleanerProps.setOmitXmlDeclaration(false);
    HtmlCleaner cleaner = new HtmlCleaner(cleanerProps);

    TagNode tagNode;
    FileInputStream hOCRFileInputStream = new FileInputStream(pathOfHOCRFile);
    try {
        tagNode = cleaner.clean(hOCRFileInputStream, UTF_ENCODING);
    } finally {
        // Close in finally so the file handle is released even when parsing fails.
        hOCRFileInputStream.close();
    }

    try {
        new PrettyHtmlSerializer(cleanerProps).writeToFile(tagNode, outputFilePath, UTF_ENCODING);
    } catch (Exception e) { // NOPMD.
        // Intentionally ignored: the original contract treats the write as best-effort.
        // NOTE(review): consider logging or propagating this failure once a logger is available here.
    }
}
示例3: getTextFromHtmlString
import org.htmlcleaner.CleanerProperties; //导入方法依赖的package包/类
/**
 * Extracts the concatenated text of all {@code td} cells of the last {@code table} element
 * found in the given HTML string.
 *
 * <p>A cell whose trimmed text is a single non-breaking space (U+00A0) contributes one
 * regular space instead of its own text. If the document contains no table, or the XPath
 * lookup fails, an empty string is returned (the failure is logged).
 *
 * @param htmlString {@link String} HTML markup to parse
 * @return {@link String} the collected cell text, never {@code null}
 */
public static String getTextFromHtmlString(String htmlString) {
    CleanerProperties cleanerProps = new CleanerProperties();
    // set some properties to non-default values
    cleanerProps.setTransResCharsToNCR(true);
    cleanerProps.setTranslateSpecialEntities(true);
    cleanerProps.setOmitComments(true);
    cleanerProps.setOmitDoctypeDeclaration(true);
    cleanerProps.setOmitXmlDeclaration(true);
    cleanerProps.setUseEmptyElementTags(true);
    HtmlCleaner cleaner = new HtmlCleaner(cleanerProps);
    TagNode tagNode = cleaner.clean(htmlString);

    // One builder for the whole traversal; re-creating it per cell re-copied all prior text.
    StringBuilder errorText = new StringBuilder();
    try {
        Object[] tables = tagNode.evaluateXPath("//table");
        if (null != tables && tables.length > 0) {
            // Only the last table in document order is inspected.
            TagNode lastTable = (TagNode) tables[tables.length - 1];
            for (TagNode cell : lastTable.getElementsByName("td", true)) {
                if (cell == null) {
                    continue;
                }
                // Read the cell text once; each getText() call may rebuild the text.
                CharSequence rawText = cell.getText();
                if (rawText == null) {
                    continue;
                }
                String cellText = rawText.toString();
                // String.trim() strips U+0020, so a comparison against a plain space could
                // never match. A cell holding only a non-breaking space survives trim() and
                // is normalised to a regular space here.
                // NOTE(review): presumably such cells come from "&nbsp;" in the markup - confirm.
                if ("\u00A0".equals(cellText.trim())) {
                    errorText.append(" ");
                } else {
                    errorText.append(cellText);
                }
            }
        }
    } catch (XPatherException e) {
        // Pass the exception along so the stack trace is not lost.
        LOGGER.error("Error extracting table node from html." + e.getMessage(), e);
    }
    return errorText.toString();
}
示例4: html2xhtml
import org.htmlcleaner.CleanerProperties; //导入方法依赖的package包/类
/**
 * Converts an HTML fragment to well-formed XHTML markup using HtmlCleaner.
 *
 * <p>The fragment is wrapped in a temporary {@code <div>} so the cleaner sees a single root
 * element; the wrapper is stripped again from the serialized result.
 *
 * @param html the HTML fragment; may be {@code null} or empty
 * @return the cleaned fragment, or an empty string when {@code html} is empty or the
 *         cleaned fragment has no content
 * @throws RuntimeException wrapping the {@link IOException} if serialization fails
 */
private String html2xhtml(String html) {
    if (StringUtils.isEmpty(html)) {
        return "";
    }

    final String openTag = "<div>";
    final String closeTag = "</div>";

    // Guarantees a single root tag for HtmlCleaner.
    html = openTag + html + closeTag;

    CleanerProperties props = new CleanerProperties();
    // set some properties to non-default values
    props.setTranslateSpecialEntities(true);
    props.setTransResCharsToNCR(true);
    props.setOmitXmlDeclaration(true);
    props.setOmitHtmlEnvelope(true);
    props.setOmitComments(true);

    // do parsing
    TagNode tagNode = new HtmlCleaner(props).clean(html);

    // serialize to an XML string
    String ret;
    try {
        ret = new SimpleXmlSerializer(props).getAsString(tagNode);
    } catch (IOException e) {
        throw new RuntimeException(e.getMessage(), e);
    }

    // Remove the root <div> wrapper, but only when it is really there. Cutting fixed
    // offsets blindly threw StringIndexOutOfBoundsException on short output and
    // silently corrupted output that was not wrapped exactly as expected.
    if (ret.length() >= openTag.length() + closeTag.length()
            && ret.startsWith(openTag) && ret.endsWith(closeTag)) {
        return ret.substring(openTag.length(), ret.length() - closeTag.length());
    }

    // A wrapper left without children (e.g. the input held only comments) may be
    // serialized as a self-closing element; that represents an empty fragment.
    String compact = ret.trim();
    if ("<div />".equals(compact) || "<div/>".equals(compact)) {
        return "";
    }

    // No recognisable wrapper: return the serialized markup untouched.
    return ret;
}
示例5: cleanHTML
import org.htmlcleaner.CleanerProperties; //导入方法依赖的package包/类
/**
 * Cleans the HTML file at {@code path} and writes the pretty-printed result to {@code out}.
 *
 * <p>Two global attribute transformations are registered, for attribute names matching
 * {@code ^\s*class} and {@code ^\s*id} (case-insensitive), each with {@code null} templates.
 * NOTE(review): presumably a null template drops the matching attribute - confirm against
 * the HtmlCleaner documentation.
 *
 * @param path     path of the HTML file to read
 * @param out      path of the file to write
 * @param encoding character encoding used to read the input; the output is always written
 *                 as {@code "utf-8"}
 * @throws IOException if reading the input or writing the output fails
 */
public void cleanHTML(String path, String out, String encoding) throws IOException {
    // The properties come from a default cleaner and are reused by the cleaner built below.
    final CleanerProperties props = new HtmlCleaner().getProperties();

    // Register one global attribute transformation per attribute-name pattern.
    final CleanerTransformations attributeRules = new CleanerTransformations();
    final String[] attributeNameRegexes = {"^\\s*class", "^\\s*id"};
    for (String regex : attributeNameRegexes) {
        Pattern namePattern = Pattern.compile(regex, Pattern.CASE_INSENSITIVE);
        attributeRules.addGlobalTransformation(
                new AttributeTransformationPatternImpl(namePattern, null, null));
    }
    props.setCleanerTransformations(attributeRules);

    // Non-default cleaner settings.
    props.setTranslateSpecialEntities(true);
    props.setTransResCharsToNCR(false);
    props.setOmitComments(true);
    props.setPruneTags("script,style,img,form");

    // Parse the input file with the requested encoding.
    final File input = new File(path);
    final TagNode root = new HtmlCleaner(props).clean(input, encoding);
    root.removeAttribute("class");

    // Serialize the cleaned tree to the output file.
    final PrettyHtmlSerializer serializer = new PrettyHtmlSerializer(props);
    serializer.writeToFile(root, out, "utf-8");
}