本文整理汇总了Java中org.htmlcleaner.CleanerProperties类的典型用法代码示例。如果您正苦于以下问题：Java CleanerProperties类的具体用法？Java CleanerProperties怎么用？Java CleanerProperties使用的例子？那么，这里精选的类代码示例或许可以为您提供帮助。
CleanerProperties类属于org.htmlcleaner包,在下文中一共展示了CleanerProperties类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: createHtmlCleaner
import org.htmlcleaner.CleanerProperties; //导入依赖的package包/类
/**
 * Creates an {@link HtmlCleaner} and adjusts the properties object it exposes
 * through {@code getProperties()} before handing the cleaner back.
 *
 * @return the newly created cleaner, with its properties already configured
 */
private static HtmlCleaner createHtmlCleaner() {
    final HtmlCleaner cleaner = new HtmlCleaner();
    final CleanerProperties props = cleaner.getProperties();

    // Declarations: drop the XML declaration, keep the doctype.
    props.setOmitXmlDeclaration(true);
    props.setOmitDoctypeDeclaration(false);

    // Escaping and character handling.
    props.setAdvancedXmlEscape(true);
    props.setTranslateSpecialEntities(true);
    props.setTransResCharsToNCR(true);
    props.setRecognizeUnicodeChars(true);

    // Tag handling.
    props.setIgnoreQuestAndExclam(true);
    props.setUseEmptyElementTags(false);
    props.setPruneTags("script,title");

    return cleaner;
}
示例2: toXML
import org.htmlcleaner.CleanerProperties; //导入依赖的package包/类
/**
 * Converts an HTML string into an XML string using HtmlCleaner.
 *
 * <p>If cleaning or serialization fails with an {@link IOException}, the
 * failure is logged and the input is returned unchanged.
 *
 * @param source the HTML markup to convert
 * @return the pretty-printed XML, or {@code source} itself when the conversion fails
 */
private String toXML(String source) {
    CleanerProperties props = new CleanerProperties();
    props.setTranslateSpecialEntities(true);
    props.setOmitComments(true);
    props.setPruneTags("script,style");
    // Ignore namespaces.
    props.setNamespacesAware(false);
    props.setAdvancedXmlEscape(true);
    try {
        TagNode tagNode = new HtmlCleaner(props).clean(source);
        return new PrettyXmlSerializer(props).getXmlAsString(tagNode);
    } catch (IOException e) {
        // Keep the original best-effort contract: log and fall back to the raw input.
        logger.error("Failed to convert HTML to XML; returning the input unchanged", e);
        return source;
    }
}
示例3: htmlOutputStreamViaHtmlCleaner
import org.htmlcleaner.CleanerProperties; //导入依赖的package包/类
/**
 * Reads a file, cleans it with HtmlCleaner and writes the pretty-printed
 * HTML serialization to another file.
 *
 * <p>Failures while writing the output file are deliberately swallowed
 * (best-effort write); failures while opening, reading or closing the input
 * file are propagated.
 *
 * @param pathOfHOCRFile path of the file to read and clean
 * @param outputFilePath path of the file the serialized HTML is written to
 * @throws IOException if the input file cannot be opened, read or closed
 */
public static void htmlOutputStreamViaHtmlCleaner(String pathOfHOCRFile, String outputFilePath) throws IOException {
    CleanerProperties cleanerProps = new CleanerProperties();
    // Set some properties to non-default values.
    cleanerProps.setTransResCharsToNCR(true);
    cleanerProps.setTranslateSpecialEntities(true);
    cleanerProps.setOmitComments(true);
    cleanerProps.setOmitDoctypeDeclaration(true);
    cleanerProps.setOmitXmlDeclaration(false);
    HtmlCleaner cleaner = new HtmlCleaner(cleanerProps);

    TagNode tagNode;
    // try-with-resources guarantees the stream is closed even when clean() throws.
    try (FileInputStream hOCRFileInputStream = new FileInputStream(pathOfHOCRFile)) {
        tagNode = cleaner.clean(hOCRFileInputStream, UTF_ENCODING);
    }

    try {
        new PrettyHtmlSerializer(cleanerProps).writeToFile(tagNode, outputFilePath, UTF_ENCODING);
    } catch (Exception ignored) { // NOPMD.
        // Intentionally ignored: writing the output is best-effort and callers
        // are not notified when it fails.
    }
}
示例4: cleanFile
import org.htmlcleaner.CleanerProperties; //导入依赖的package包/类
/**
 * Cleans an HTML file with HtmlCleaner and writes the compact HTML
 * serialization to a second file in the same directory.
 *
 * <p>Any exception raised while reading, cleaning or writing is logged at
 * {@code WARNING} level (with its stack trace) and is not propagated.
 *
 * @param props       cleaner properties used for both cleaning and serializing
 * @param path        directory containing the input file and receiving the output file
 * @param nameFile    name of the file to clean, relative to {@code path}
 * @param newNameFile name of the file to write, relative to {@code path}
 */
public static void cleanFile(CleanerProperties props, String path, String nameFile, String newNameFile)
{
    final String sourcePath = path + File.separator + nameFile;
    final String targetPath = path + File.separator + newNameFile;
    try
    {
        // Parse the input file.
        TagNode tagNode = new HtmlCleaner(props).clean(new File(sourcePath), "UTF-8");
        // Serialize the cleaned tree to the output file.
        new CompactHtmlSerializer(props).writeToFile(tagNode, targetPath, "UTF-8");
        LOG.info(sourcePath + " cleaned!");
    }
    catch (Exception ex)
    {
        // Not every failure is a missing file, so report the real cause and keep the stack trace.
        LOG.log(Level.WARNING, "Failed to clean " + sourcePath + ": " + ex.getMessage(), ex);
    }
}
示例5: getHTML
import org.htmlcleaner.CleanerProperties; //导入依赖的package包/类
/**
 * Converts the workbook to HTML via {@code convert}, runs the result through
 * HtmlCleaner so that it is well formed, and returns the pretty-printed XML
 * serialization as bytes.
 *
 * @param book the workbook to render
 * @return the serialized, cleaned document
 * @throws IOException if encoding, cleaning or serialization fails
 */
public byte[] getHTML(HSSFWorkbook book) throws IOException {
    // NOTE(review): 21.0 x 29.7 looks like A4 in centimetres — confirm against convert().
    double width = 21.0;
    double height = 29.7;
    if (isLandscape()) {
        // Swap through a temporary: an add/subtract swap is not guaranteed to be exact for doubles.
        final double portraitWidth = width;
        width = height;
        height = portraitWidth;
    }
    // Encode explicitly as UTF-8 so the bytes match the charset given to the cleaner below;
    // the no-argument getBytes() would use the platform default charset.
    byte[] html = convert(book, width, height).getBytes("UTF-8");
    ByteArrayInputStream in = new ByteArrayInputStream(html);
    // Clean up the HTML to be well formed.
    HtmlCleaner cleaner = new HtmlCleaner();
    CleanerProperties props = cleaner.getProperties();
    TagNode node = cleaner.clean(in, "UTF-8");
    // Serialize into an in-memory buffer and hand back its contents.
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    new PrettyXmlSerializer(props).writeToStream(node, out);
    return out.toByteArray();
}
示例6: getSerialized
import org.htmlcleaner.CleanerProperties; //导入依赖的package包/类
/**
 * Serializes the parsed page held in {@code docNode} with the serializer
 * selected by the caller (convenience method for xml/xhtml output).
 *
 * <p>Before serializing, the {@code NS_XML} entry is removed from the map
 * returned by {@code docNode.getAttributes()}.
 *
 * @param inSerializer
 *            {@link XmlSerializer} supplying the concrete serializer
 * @return String the cleaned and serialized html, or the empty string when
 *         there is no parsed document
 * @throws IOException
 *             declared for the serialization call
 */
public String getSerialized(final XmlSerializer inSerializer)
        throws IOException {
    if (null == docNode) {
        return ""; //$NON-NLS-1$
    }

    final CleanerProperties serializerProps = new HtmlCleaner().getProperties();

    // Document envelope and declarations.
    serializerProps.setOmitXmlDeclaration(false);
    serializerProps.setOmitDoctypeDeclaration(true);
    serializerProps.setOmitHtmlEnvelope(false);
    serializerProps.setNamespacesAware(true);

    // Escaping and character handling.
    serializerProps.setAdvancedXmlEscape(true);
    serializerProps.setTranslateSpecialEntities(true);
    serializerProps.setRecognizeUnicodeChars(true);

    // Element and attribute rendering.
    serializerProps.setUseCdataForScriptAndStyle(true);
    serializerProps.setUseEmptyElementTags(true);
    serializerProps.setBooleanAttributeValues("empty"); //$NON-NLS-1$

    docNode.getAttributes().remove(NS_XML);
    return inSerializer.getSerializer(serializerProps).getXmlAsString(docNode);
}
示例7: createCleanerProperties
import org.htmlcleaner.CleanerProperties; //导入依赖的package包/类
/**
 * Builds a fresh {@link CleanerProperties} instance with the settings below.
 * See http://htmlcleaner.sourceforge.net/parameters.php for a description of
 * each parameter.
 *
 * @return the configured properties object
 */
private static CleanerProperties createCleanerProperties() {
    final CleanerProperties config = new CleanerProperties();

    // Declarations: omit the XML declaration, keep the doctype.
    config.setOmitXmlDeclaration(true);
    config.setOmitDoctypeDeclaration(false);

    // Switched off.
    config.setNamespacesAware(false);
    config.setAdvancedXmlEscape(false);
    config.setTranslateSpecialEntities(false);
    config.setRecognizeUnicodeChars(false);
    config.setIgnoreQuestAndExclam(false);

    // Switched on.
    config.setAllowHtmlInsideAttributes(true);

    return config;
}
示例8: isHealthy
import org.htmlcleaner.CleanerProperties; //导入依赖的package包/类
/**
 * Performs a health check by issuing a GET request to {@code GET_REQUEST_URL},
 * requiring an HTTP 200 response, and verifying that evaluating
 * {@code XPATH_TO_SUBMIT_ID} against the cleaned response body yields a
 * non-blank string.
 *
 * @return {@code true} only when every step succeeds; {@code false} on a
 *         non-200 status, a blank submit id, or any thrown error
 */
@Override
public boolean isHealthy() {
    HttpGet getMethod = new HttpGet(GET_REQUEST_URL);
    CloseableHttpResponse response = null;
    CloseableHttpClient httpClient = null;
    try {
        httpClient = HttpClientBuilder.create().build();
        response = httpClient.execute(getMethod);
        int statusCode = response.getStatusLine().getStatusCode();
        if (statusCode != HttpStatus.SC_OK) {
            LOG.info("Health check failed, got response code: %d", statusCode);
            return false;
        }
        String htmlContents = EntityUtils.toString(response.getEntity());
        TagNode tagNode = new HtmlCleaner().clean(htmlContents);
        Document doc = new DomSerializer(new CleanerProperties()).createDOM(tagNode);
        XPath xpath = XPathFactory.newInstance().newXPath();
        String submitId = (String) xpath.evaluate(XPATH_TO_SUBMIT_ID, doc, XPathConstants.STRING);
        if (StringUtils.isBlank(submitId)) {
            LOG.info("Health check failed, submitId token was null or empty.");
            return false;
        }
        return true;
    } catch (Throwable t) {
        // A failed check must report unhealthy; previously control fell through to "return true".
        LOG.info("Health check failed, exception thrown: %s", t.getMessage());
        return false;
    } finally {
        closeHttpObjects(response, httpClient);
    }
}
示例9: testXpathExtraction
import org.htmlcleaner.CleanerProperties; //导入依赖的package包/类
/**
 * Verifies that the value attribute of the input under the element with id
 * {@code submiturl} can be extracted from the archived test page by cleaning
 * it with HtmlCleaner, converting it to a DOM and evaluating an XPath.
 */
@Test
public void testXpathExtraction() throws IOException, ParserConfigurationException, XPathExpressionException {
    byte[] encoded = Files.readAllBytes(Paths.get("src/test/resources/raw_data/archive.is.html"));
    String htmlContents = new String(encoded, StandardCharsets.UTF_8);
    TagNode tagNode = new HtmlCleaner().clean(htmlContents);
    Document doc = new DomSerializer(new CleanerProperties()).createDOM(tagNode);
    XPath xpath = XPathFactory.newInstance().newXPath();
    String extractedValue = (String) xpath.evaluate("//*[@id=\"submiturl\"]/input/@value", doc, XPathConstants.STRING);
    String expectedValue = "YHuwL/nTgL370PMDM2G2vkuvMg3kmNqk/y/i7NRSaLyf2JSIU+/now+AYw+X0nX8";
    // assertEquals reports both values on failure, unlike assertTrue(a.equals(b)).
    Assert.assertEquals("Did not extract expected value!", expectedValue, extractedValue);
}
示例10: getTextFromHtmlString
import org.htmlcleaner.CleanerProperties; //导入依赖的package包/类
/**
 * Extracts text from an HTML string: the markup is cleaned with HtmlCleaner,
 * the last {@code table} element matched by {@code //table} is selected, and
 * the text of all of its {@code td} descendants is concatenated in order.
 *
 * @param htmlString {@link String} the HTML markup to inspect
 * @return {@link String} the concatenated cell text; empty when no table is
 *         found or when the XPath evaluation fails (the failure is logged)
 */
public static String getTextFromHtmlString(String htmlString) {
    CleanerProperties cleanerProps = new CleanerProperties();
    // Set some properties to non-default values.
    cleanerProps.setTransResCharsToNCR(true);
    cleanerProps.setTranslateSpecialEntities(true);
    cleanerProps.setOmitComments(true);
    cleanerProps.setOmitDoctypeDeclaration(true);
    cleanerProps.setOmitXmlDeclaration(true);
    cleanerProps.setUseEmptyElementTags(true);
    HtmlCleaner cleaner = new HtmlCleaner(cleanerProps);
    TagNode tagNode = cleaner.clean(htmlString);

    // One builder for the whole traversal instead of a fresh copy per cell.
    StringBuilder errorText = new StringBuilder();
    try {
        Object[] tables = tagNode.evaluateXPath("//table");
        if (null != tables && tables.length > 0) {
            TagNode lastTable = (TagNode) tables[tables.length - 1];
            for (TagNode cell : lastTable.getElementsByName("td", true)) {
                if (cell == null) {
                    continue;
                }
                CharSequence cellText = cell.getText();
                if (cellText == null) {
                    continue;
                }
                // NOTE(review): if this literal is a plain space, the comparison can never
                // succeed after trim(); presumably it was meant to detect a non-breaking
                // space or the "&nbsp;" entity — confirm against the original source.
                if (cellText.toString().trim().equals(" ")) {
                    errorText.append(" ");
                } else {
                    errorText.append(cellText);
                }
            }
        }
    } catch (XPatherException e) {
        LOGGER.error("Error extracting table node from html." + e.getMessage());
    }
    return errorText.toString();
}
示例11: htmlToWiki
import org.htmlcleaner.CleanerProperties; //导入依赖的package包/类
/**
 * Converts an HTML fragment to wiki markup: the HTML is cleaned with
 * HtmlCleaner, converted to a DOM, and the document's top-level nodes are
 * handed to {@code processChildNodes}, which writes the wiki text.
 *
 * @param html        the HTML to convert
 * @param contextPath passed through to {@code processChildNodes}
 * @param projectId   passed through to {@code processChildNodes}
 * @return the trimmed wiki text, with {@code CRLF} appended unless it already
 *         ends with it, or the empty string when nothing was produced
 * @throws Exception if cleaning, DOM creation or node processing fails
 */
public static String htmlToWiki(String html, String contextPath, int projectId) throws Exception {
    // Strip the nbsp because it gets converted to unicode. Both the entity and the
    // literal U+00A0 character are replaced; replacing a space with a space did nothing.
    html = StringUtils.replace(html, "&nbsp;", " ");
    html = StringUtils.replace(html, "\u00A0", " ");

    // Take the html and create a DOM for parsing.
    HtmlCleaner cleaner = new HtmlCleaner();
    CleanerProperties props = cleaner.getProperties();
    TagNode node = cleaner.clean(html);
    Document document = new DomSerializer(props, true).createDOM(node);
    if (LOG.isTraceEnabled()) {
        LOG.trace(html);
    }

    // Process each node and output the wiki equivalent.
    StringBuffer sb = new StringBuffer();
    ArrayList<Node> nodeList = new ArrayList<Node>();
    for (int i = 0; i < document.getChildNodes().getLength(); i++) {
        nodeList.add(document.getChildNodes().item(i));
    }
    processChildNodes(nodeList, sb, 0, true, true, false, "", contextPath, projectId);

    if (sb.length() == 0) {
        return "";
    }
    String content = sb.toString().trim();
    // Undo apostrophe entities that show up in the generated text; replacing an
    // apostrophe with itself did nothing.
    content = StringUtils.replace(content, "&apos;", "'");
    return content.endsWith(CRLF) ? content : content + CRLF;
}
示例12: createHtmlCleaner
import org.htmlcleaner.CleanerProperties; //导入依赖的package包/类
/**
 * Creates an {@link HtmlCleaner} and configures the properties object it
 * exposes through {@code getProperties()}.
 *
 * @return the newly created, configured cleaner
 */
private static HtmlCleaner createHtmlCleaner() {
    final HtmlCleaner cleaner = new HtmlCleaner();
    final CleanerProperties settings = cleaner.getProperties();

    // Enabled options.
    settings.setOmitXmlDeclaration(true);
    settings.setRecognizeUnicodeChars(true);
    settings.setIgnoreQuestAndExclam(true);

    // Disabled options.
    settings.setOmitDoctypeDeclaration(false);
    settings.setTranslateSpecialEntities(false);
    settings.setUseEmptyElementTags(false);

    return cleaner;
}
示例13: parseHhc
import org.htmlcleaner.CleanerProperties; //导入依赖的package包/类
/**
 * Parses an hhc stream into table-of-contents references: the stream is
 * cleaned with HtmlCleaner, converted to a DOM, and the node selected by the
 * XPath {@code body/ul} (relative to the document element) is passed to
 * {@code processUlNode}.
 *
 * @param hhcFile   stream containing the hhc markup
 * @param resources passed through to {@code processUlNode}
 * @return whatever {@code processUlNode} returns for the selected node
 * @throws IOException                  declared for the cleaning step
 * @throws ParserConfigurationException declared for the DOM conversion
 * @throws XPathExpressionException     if the XPath cannot be evaluated
 */
public static List<TOCReference> parseHhc(InputStream hhcFile, Resources resources) throws IOException, ParserConfigurationException, XPathExpressionException {
    final HtmlCleaner cleaner = new HtmlCleaner();
    final CleanerProperties cleanerProps = cleaner.getProperties();
    final TagNode cleanedRoot = cleaner.clean(hhcFile);
    final Document dom = new DomSerializer(cleanerProps).createDOM(cleanedRoot);

    final XPath xpathEvaluator = XPathFactory.newInstance().newXPath();
    final Object topLevelList = xpathEvaluator.evaluate("body/ul", dom.getDocumentElement(), XPathConstants.NODE);

    return processUlNode((Node) topLevelList, resources);
}
示例14: getHtmlDocumentModel
import org.htmlcleaner.CleanerProperties; //导入依赖的package包/类
/**
 * Cleans the given HTML with HtmlCleaner and converts the result to a DOM
 * {@link Document}.
 *
 * @param htmlContent the HTML markup to parse
 * @return the DOM document, or {@code null} when a
 *         {@link ParserConfigurationException} or any
 *         {@link RuntimeException} occurs
 */
public static Document getHtmlDocumentModel(String htmlContent) {
    try {
        final TagNode cleanedRoot = new HtmlCleaner().clean(htmlContent);
        return new DomSerializer(new CleanerProperties()).createDOM(cleanedRoot);
    } catch (ParserConfigurationException e) {
        // Reported to the caller as "no document", the same as runtime failures.
        return null;
    } catch (RuntimeException rte) {
        return null;
    }
}
示例15: HtmlXpathSelector
import org.htmlcleaner.CleanerProperties; //导入依赖的package包/类
/**
 * Creates a selector for the given HTML: the content is cleaned with
 * HtmlCleaner, converted to a DOM stored in {@code rootDocument}, and a new
 * XPath evaluator is stored in {@code xPath}.
 *
 * @param content the HTML markup to parse
 * @throws ParserConfigurationException declared for the DOM conversion
 * @throws SAXException                 declared by the constructor signature
 * @throws IOException                  declared by the constructor signature
 */
public HtmlXpathSelector(String content) throws ParserConfigurationException, SAXException, IOException
{
    final TagNode cleanedRoot = new HtmlCleaner().clean(content);
    final DomSerializer domSerializer = new DomSerializer(new CleanerProperties());
    rootDocument = domSerializer.createDOM(cleanedRoot);
    xPath = XPathFactory.newInstance().newXPath();
}