当前位置: 首页>>代码示例>>Java>>正文


Java CleanerProperties类代码示例

本文整理汇总了Java中org.htmlcleaner.CleanerProperties的典型用法代码示例。如果您正苦于以下问题:Java CleanerProperties类的具体用法?Java CleanerProperties怎么用?Java CleanerProperties使用的例子?那么, 这里精选的类代码示例或许可以为您提供帮助。


CleanerProperties类属于org.htmlcleaner包,在下文中一共展示了CleanerProperties类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: createHtmlCleaner

import org.htmlcleaner.CleanerProperties; //导入依赖的package包/类
/**
 * Creates an {@link HtmlCleaner} and adjusts the {@link CleanerProperties}
 * instance it exposes through {@code getProperties()}.
 *
 * @return the newly created cleaner, carrying the property values set below
 */
private static HtmlCleaner createHtmlCleaner() {
    final HtmlCleaner cleaner = new HtmlCleaner();
    final CleanerProperties props = cleaner.getProperties();

    props.setAdvancedXmlEscape(true);

    // Declarations: the XML declaration is omitted, the doctype is not.
    props.setOmitXmlDeclaration(true);
    props.setOmitDoctypeDeclaration(false);

    // Entity and character translation switches, all enabled.
    props.setTranslateSpecialEntities(true);
    props.setTransResCharsToNCR(true);
    props.setRecognizeUnicodeChars(true);

    props.setIgnoreQuestAndExclam(true);
    props.setUseEmptyElementTags(false);

    // Comma-separated list of tags to prune.
    props.setPruneTags("script,title");

    return cleaner;
}
 
开发者ID:SysdataSpA,项目名称:SDHtmlTextView,代码行数:21,代码来源:HtmlSpanner.java

示例2: toXML

import org.htmlcleaner.CleanerProperties; //导入依赖的package包/类
/**
 * Converts an HTML string into an XML string using HtmlCleaner.
 *
 * <p>The markup is cleaned with comments omitted, {@code script} and
 * {@code style} tags pruned and namespace awareness switched off, then
 * serialized with {@link PrettyXmlSerializer}.
 *
 * @param source the HTML markup to convert
 * @return the serialized XML, or {@code source} unchanged when an
 *         {@link IOException} is raised during the conversion
 */
private String toXML(String source){
	try {
		CleanerProperties props = new CleanerProperties();
		props.setTranslateSpecialEntities(true);
		props.setOmitComments(true);
		props.setPruneTags("script,style");
		// Ignore namespaces.
		props.setNamespacesAware(false);
		props.setAdvancedXmlEscape(true);
		HtmlCleaner cl = new HtmlCleaner(props);
		TagNode tagNode = cl.clean(source);
		return new PrettyXmlSerializer(props).getXmlAsString(tagNode);
	} catch (IOException e) {
		// Fall back to the raw input, but say what went wrong instead of logging an empty message.
		logger.error("Failed to convert HTML to XML; returning the input unchanged", e);
		return source;
	}
}
 
开发者ID:gncloud,项目名称:fastcatsearch3,代码行数:24,代码来源:ReadabilityExtractor.java

示例3: htmlOutputStreamViaHtmlCleaner

import org.htmlcleaner.CleanerProperties; //导入依赖的package包/类
/**
 * Cleans the file at {@code pathOfHOCRFile} with HtmlCleaner and writes the
 * result to {@code outputFilePath} through {@link PrettyHtmlSerializer}.
 *
 * <p>Both reading and writing use {@code UTF_ENCODING}. A failure while
 * writing the output is ignored (best effort), so the caller is not told
 * when the output file could not be produced.
 *
 * @param pathOfHOCRFile String path of the file to read
 * @param outputFilePath String path of the file to write
 * @throws IOException if the input file cannot be opened, read or closed
 */
public static void htmlOutputStreamViaHtmlCleaner(String pathOfHOCRFile, String outputFilePath) throws IOException {
	CleanerProperties cleanerProps = new CleanerProperties();

	// set some properties to non-default values
	cleanerProps.setTransResCharsToNCR(true);
	cleanerProps.setTranslateSpecialEntities(true);
	cleanerProps.setOmitComments(true);
	cleanerProps.setOmitDoctypeDeclaration(true);
	cleanerProps.setOmitXmlDeclaration(false);
	HtmlCleaner cleaner = new HtmlCleaner(cleanerProps);

	TagNode tagNode;
	// try-with-resources closes the stream even when clean() throws,
	// which the previous explicit close() after the call did not.
	try (FileInputStream hOCRFileInputStream = new FileInputStream(pathOfHOCRFile)) {
		tagNode = cleaner.clean(hOCRFileInputStream, UTF_ENCODING);
	}
	try {
		new PrettyHtmlSerializer(cleanerProps).writeToFile(tagNode, outputFilePath, UTF_ENCODING);
	} catch (Exception ignored) { // NOPMD.
		// Deliberately best effort: a failed write is not propagated to the caller.
	}
}
 
开发者ID:kuzavas,项目名称:ephesoft,代码行数:31,代码来源:XMLUtil.java

示例4: cleanFile

import org.htmlcleaner.CleanerProperties; //导入依赖的package包/类
/**
 * Cleans one HTML file with HtmlCleaner and writes the compact result into
 * the same directory under a new name.
 *
 * <p>Any exception raised while parsing or writing is logged at
 * {@code WARNING} level and not rethrown.
 *
 * @param props       cleaner properties used for both parsing and serializing
 * @param path        directory that holds the input file and receives the output file
 * @param nameFile    name of the file to read
 * @param newNameFile name of the file to write
 */
public static void cleanFile(CleanerProperties props, String path, String nameFile, String newNameFile)
{
    // Build each path once instead of repeating the concatenation at every use.
    String sourcePath = path + File.separator + nameFile;
    String targetPath = path + File.separator + newNameFile;
    File sourceFile = new File(sourcePath);
    // do parsing
    try
    {
        TagNode tagNode = new HtmlCleaner(props).clean(sourceFile, "utf-8");
        // serialize to the output file
        new CompactHtmlSerializer(props).writeToFile(tagNode, targetPath, "UTF-8");
        LOG.info(sourcePath + " cleaned!");
    }
    catch(Exception ex)
    {
        // Pass the exception itself so the stack trace and cause are not lost;
        // the message alone cannot distinguish a missing file from a write failure.
        LOG.log(Level.WARNING, ex.getMessage() + " " + sourcePath + " NOT FOUND!", ex);
    }
}
 
开发者ID:eduardoguzman,项目名称:sisob-data-extractor,代码行数:26,代码来源:ResearchersPagePostProcessor.java

示例5: getHTML

import org.htmlcleaner.CleanerProperties; //导入依赖的package包/类
/**
 * Renders the workbook through {@code convert(...)}, cleans the resulting
 * HTML with HtmlCleaner and returns it serialized by {@link PrettyXmlSerializer}.
 *
 * @param book the workbook to render
 * @return the bytes written by the serializer
 * @throws IOException if encoding, cleaning or serializing fails
 */
public byte[] getHTML(HSSFWorkbook book) throws IOException {
    // NOTE(review): 21.0 x 29.7 looks like A4 in centimetres - confirm against convert().
    double width = 21.0;
    double height = 29.7;
    if (isLandscape()) {
        // Swap through a temporary; the add/subtract trick is subject to
        // floating-point rounding and need not reproduce the exact values.
        double portraitWidth = width;
        width = height;
        height = portraitWidth;
    }
    // Encode explicitly as UTF-8 so the bytes match the charset the cleaner
    // is told to read; getBytes() without argument uses the platform default.
    byte[] html = convert(book, width, height).getBytes("UTF-8");
    ByteArrayInputStream in = new ByteArrayInputStream(html);

    // Clean up the HTML to be well formed
    HtmlCleaner cleaner = new HtmlCleaner();
    CleanerProperties props = cleaner.getProperties();
    TagNode node = cleaner.clean(in, "UTF-8");

    // Serialize into an in-memory buffer rather than a file or System.out
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    new PrettyXmlSerializer(props).writeToStream(node, out);

    return out.toByteArray();
}
 
开发者ID:rmage,项目名称:gnvc-ims,代码行数:26,代码来源:ReportModel.java

示例6: getSerialized

import org.htmlcleaner.CleanerProperties; //导入依赖的package包/类
/**
 * Convenience method (for xml/xhtml): serializes the parsed page held in
 * {@code docNode}.
 *
 * <p>As a side effect the {@code NS_XML} entry is removed from the
 * attributes of {@code docNode} before serializing.
 *
 * @param inSerializer
 *            {@link XmlSerializer} supplying the serializer to use
 * @return String the cleaned and serialized html, or the empty string when
 *         no page has been parsed ({@code docNode} is {@code null})
 * @throws IOException
 */
public String getSerialized(final XmlSerializer inSerializer)
        throws IOException {
	if (docNode != null) {
		final CleanerProperties serializerProps = new HtmlCleaner().getProperties();
		serializerProps.setUseCdataForScriptAndStyle(true);
		serializerProps.setRecognizeUnicodeChars(true);
		serializerProps.setUseEmptyElementTags(true);
		serializerProps.setAdvancedXmlEscape(true);
		serializerProps.setTranslateSpecialEntities(true);
		serializerProps.setBooleanAttributeValues("empty"); //$NON-NLS-1$
		serializerProps.setNamespacesAware(true);
		// Keep the XML declaration and the html envelope, drop the doctype.
		serializerProps.setOmitXmlDeclaration(false);
		serializerProps.setOmitDoctypeDeclaration(true);
		serializerProps.setOmitHtmlEnvelope(false);

		docNode.getAttributes().remove(NS_XML);

		return inSerializer.getSerializer(serializerProps).getXmlAsString(docNode);
	}
	return ""; //$NON-NLS-1$
}
 
开发者ID:aktion-hip,项目名称:relations,代码行数:31,代码来源:XPathHelper.java

示例7: createCleanerProperties

import org.htmlcleaner.CleanerProperties; //导入依赖的package包/类
/**
 * Builds the {@link CleanerProperties} used by this class.
 *
 * @return a fresh properties object with the values set below
 */
private static CleanerProperties createCleanerProperties() {
    final CleanerProperties props = new CleanerProperties();

    // See http://htmlcleaner.sourceforge.net/parameters.php for descriptions
    props.setNamespacesAware(false);
    props.setAdvancedXmlEscape(false);
    props.setOmitXmlDeclaration(true);
    props.setOmitDoctypeDeclaration(false);
    props.setTranslateSpecialEntities(false);
    props.setRecognizeUnicodeChars(false);
    props.setIgnoreQuestAndExclam(false);
    props.setAllowHtmlInsideAttributes(true);

    return props;
}
 
开发者ID:scoute-dich,项目名称:K9-MailClient,代码行数:16,代码来源:HtmlSanitizer.java

示例8: isHealthy

import org.htmlcleaner.CleanerProperties; //导入依赖的package包/类
/**
 * Health check: issues a GET against {@code GET_REQUEST_URL}, requires an
 * {@code SC_OK} status, parses the body with HtmlCleaner and requires that
 * {@code XPATH_TO_SUBMIT_ID} evaluates to a non-blank string.
 *
 * @return {@code true} only when every step above succeeds; {@code false}
 *         on a non-OK status, a blank submit id, or any thrown exception
 */
@Override
public boolean isHealthy() {
    HttpGet getMethod = new HttpGet(GET_REQUEST_URL);

    CloseableHttpResponse response = null;
    CloseableHttpClient httpClient = null;
    try {
        httpClient = HttpClientBuilder.create().build();
        response = httpClient.execute(getMethod);
        int statusCode = response.getStatusLine().getStatusCode();

        if (statusCode != HttpStatus.SC_OK) {
            LOG.info("Health check failed, got response code: %d", statusCode);
            return false;
        }

        String htmlContents = EntityUtils.toString(response.getEntity());
        TagNode tagNode = new HtmlCleaner().clean(htmlContents);
        Document doc = new DomSerializer(new CleanerProperties()).createDOM(tagNode);

        XPath xpath = XPathFactory.newInstance().newXPath();
        String submitId = (String) xpath.evaluate(XPATH_TO_SUBMIT_ID, doc, XPathConstants.STRING);

        if (StringUtils.isBlank(submitId)) {
            LOG.info("Health check failed, submitId token was null or empty.");
            return false;
        }

        return true;
    } catch (Throwable t) {
        LOG.info("Health check failed, exception thrown: %s", t.getMessage());
        // A failure must be reported as unhealthy; previously control fell
        // through to the trailing "return true".
        return false;
    } finally {
        closeHttpObjects(response, httpClient);
    }
}
 
开发者ID:sgskinner,项目名称:StashThisBot,代码行数:37,代码来源:ArchiveIsServiceImpl.java

示例9: testXpathExtraction

import org.htmlcleaner.CleanerProperties; //导入依赖的package包/类
/**
 * Reads the stored sample page, converts it to a DOM through HtmlCleaner and
 * checks that the XPath for the {@code submiturl} input yields the expected
 * token value.
 */
@Test
public void testXpathExtraction() throws IOException, ParserConfigurationException, XPathExpressionException {
    byte[] encoded = Files.readAllBytes(Paths.get("src/test/resources/raw_data/archive.is.html"));
    String htmlContents = new String(encoded, StandardCharsets.UTF_8);

    TagNode tagNode = new HtmlCleaner().clean(htmlContents);
    Document doc = new DomSerializer(new CleanerProperties()).createDOM(tagNode);

    XPath xpath = XPathFactory.newInstance().newXPath();
    String extractedValue = (String) xpath.evaluate("//*[@id=\"submiturl\"]/input/@value", doc, XPathConstants.STRING);

    String expectedValue = "YHuwL/nTgL370PMDM2G2vkuvMg3kmNqk/y/i7NRSaLyf2JSIU+/now+AYw+X0nX8";
    // assertEquals reports both values on failure and does not NPE on a null result.
    Assert.assertEquals("Did not extract expected value!", expectedValue, extractedValue);
}
 
开发者ID:sgskinner,项目名称:StashThisBot,代码行数:15,代码来源:ArchiveIsServiceTest.java

示例10: getTextFromHtmlString

import org.htmlcleaner.CleanerProperties; //导入依赖的package包/类
/**
 * Extracts the text of the {@code td} cells of the last {@code table} found
 * in an HTML string.
 *
 * <p>The cell texts are concatenated in the order {@code getElementsByName}
 * returns them. A cell whose trimmed text is just a non-breaking space
 * (either the {@code &nbsp;} entity or the U+00A0 character) contributes a
 * single plain space.
 *
 * @param htmlString {@link String} the HTML markup to parse
 * @return {@link String} the concatenated cell text; empty when no table is
 *         found or the XPath evaluation fails
 */
public static String getTextFromHtmlString(String htmlString) {
	CleanerProperties cleanerProps = new CleanerProperties();
	// set some properties to non-default values
	cleanerProps.setTransResCharsToNCR(true);
	cleanerProps.setTranslateSpecialEntities(true);
	cleanerProps.setOmitComments(true);
	cleanerProps.setOmitDoctypeDeclaration(true);
	cleanerProps.setOmitXmlDeclaration(true);
	cleanerProps.setUseEmptyElementTags(true);

	HtmlCleaner cleaner = new HtmlCleaner(cleanerProps);
	TagNode tagNode = cleaner.clean(htmlString);
	// One builder for the whole result instead of a new one, seeded with the
	// accumulated text, for every cell.
	StringBuilder errorText = new StringBuilder();
	try {
		Object[] tables = tagNode.evaluateXPath("//table");
		if (null != tables && tables.length > 0) {
			TagNode lastTable = (TagNode) tables[tables.length - 1];
			for (TagNode cell : lastTable.getElementsByName("td", true)) {
				if (cell == null || cell.getText() == null) {
					continue;
				}
				String cellText = cell.getText().toString();
				String trimmed = cellText.trim();
				// trim() strips plain spaces, so a comparison against " " could
				// never match; test for the non-breaking space explicitly.
				if ("&nbsp;".equals(trimmed) || "\u00A0".equals(trimmed)) {
					errorText.append(' ');
				} else {
					errorText.append(cellText);
				}
			}
		}
	} catch (XPatherException e) {
		// Pass the exception along so the stack trace is not lost.
		LOGGER.error("Error extracting table node from html." + e.getMessage(), e);
	}
	return errorText.toString();
}
 
开发者ID:kuzavas,项目名称:ephesoft,代码行数:43,代码来源:AbstractUploadFile.java

示例11: htmlToWiki

import org.htmlcleaner.CleanerProperties; //导入依赖的package包/类
/**
 * Converts an HTML fragment to wiki markup.
 *
 * <p>The HTML is cleaned with HtmlCleaner, turned into a DOM and handed to
 * {@code processChildNodes}, which appends the wiki text to a buffer.
 *
 * @param html        the HTML to convert
 * @param contextPath passed through to {@code processChildNodes}
 * @param projectId   passed through to {@code processChildNodes}
 * @return the trimmed wiki text terminated by {@code CRLF}, or the empty
 *         string when nothing was produced
 * @throws Exception if cleaning, DOM creation or node processing fails
 */
public static String htmlToWiki(String html, String contextPath, int projectId) throws Exception {

    // Strip the nbsp because it gets converted to unicode. Both the entity and
    // the character are spelled out: an invisible whitespace literal is
    // indistinguishable from a plain space, which would make this a no-op.
    html = StringUtils.replace(html, "&nbsp;", " ");
    html = StringUtils.replace(html, "\u00A0", " ");

    // Take the html create DOM for parsing
    HtmlCleaner cleaner = new HtmlCleaner();
    CleanerProperties props = cleaner.getProperties();
    TagNode node = cleaner.clean(html);
    Document document = new DomSerializer(props, true).createDOM(node);
    if (LOG.isTraceEnabled()) {
        LOG.trace(html);
    }

    // Process each node and output the wiki equivalent
    StringBuffer sb = new StringBuffer();
    ArrayList<Node> nodeList = new ArrayList<Node>();
    for (int i = 0; i < document.getChildNodes().getLength(); i++) {
        nodeList.add(document.getChildNodes().item(i));
    }
    processChildNodes(nodeList, sb, 0, true, true, false, "", contextPath, projectId);
    if (sb.length() == 0) {
        return "";
    }

    // Determine if this is where the &apos; is being introduced
    String content = StringUtils.replace(sb.toString().trim(), "&apos;", "'");
    return content.endsWith(CRLF) ? content : content + CRLF;
}
 
开发者ID:Concursive,项目名称:concourseconnect-community,代码行数:38,代码来源:HTMLToWikiUtils.java

示例12: createHtmlCleaner

import org.htmlcleaner.CleanerProperties; //导入依赖的package包/类
/**
 * Creates an {@link HtmlCleaner} and configures the properties object it
 * exposes through {@code getProperties()}.
 *
 * @return the configured cleaner
 */
private static HtmlCleaner createHtmlCleaner() {
    final HtmlCleaner cleaner = new HtmlCleaner();
    final CleanerProperties props = cleaner.getProperties();

    // Declarations: the XML declaration is omitted, the doctype is not.
    props.setOmitXmlDeclaration(true);
    props.setOmitDoctypeDeclaration(false);

    props.setRecognizeUnicodeChars(true);
    props.setTranslateSpecialEntities(false);
    props.setIgnoreQuestAndExclam(true);
    props.setUseEmptyElementTags(false);

    return cleaner;
}
 
开发者ID:DASAR,项目名称:epublib-android,代码行数:12,代码来源:HtmlCleanerBookProcessor.java

示例13: parseHhc

import org.htmlcleaner.CleanerProperties; //导入依赖的package包/类
/**
 * Parses an hhc stream into table-of-contents references.
 *
 * <p>The stream is cleaned with HtmlCleaner, converted to a DOM, and the node
 * selected by {@code body/ul} (relative to the document element) is handed to
 * {@code processUlNode}.
 *
 * @param hhcFile   the stream to read
 * @param resources passed through to {@code processUlNode}
 * @return whatever {@code processUlNode} returns for the selected node
 */
public static List<TOCReference> parseHhc(InputStream hhcFile, Resources resources) throws IOException, ParserConfigurationException,	XPathExpressionException {
	HtmlCleaner cleaner = new HtmlCleaner();
	CleanerProperties cleanerProps = cleaner.getProperties();
	TagNode cleanedRoot = cleaner.clean(hhcFile);
	DomSerializer domSerializer = new DomSerializer(cleanerProps);
	Document hhcDom = domSerializer.createDOM(cleanedRoot);

	XPath xpath = XPathFactory.newInstance().newXPath();
	Object selected = xpath.evaluate("body/ul", hhcDom.getDocumentElement(), XPathConstants.NODE);
	return processUlNode((Node) selected, resources);
}
 
开发者ID:DASAR,项目名称:epublib-android,代码行数:12,代码来源:HHCParser.java

示例14: getHtmlDocumentModel

import org.htmlcleaner.CleanerProperties; //导入依赖的package包/类
/**
 * Cleans an HTML string with HtmlCleaner and converts it to a DOM document.
 *
 * @param htmlContent the HTML markup to parse
 * @return the DOM document, or {@code null} when a
 *         {@link ParserConfigurationException} or any
 *         {@link RuntimeException} is raised along the way
 */
public static Document getHtmlDocumentModel(String htmlContent) {
    try {
        TagNode cleanedRoot = new HtmlCleaner().clean(htmlContent);
        DomSerializer domSerializer = new DomSerializer(new CleanerProperties());
        return domSerializer.createDOM(cleanedRoot);
    } catch (ParserConfigurationException e) {
        // Same outcome as wrapping it in a RuntimeException and catching that.
        return null;
    } catch (RuntimeException rte) {
        return null;
    }
}
 
开发者ID:trywildcard,项目名称:pair-java,代码行数:16,代码来源:HtmlParserUtil.java

示例15: HtmlXpathSelector

import org.htmlcleaner.CleanerProperties; //导入依赖的package包/类
/**
 * Creates a selector over the given HTML: the content is cleaned with
 * HtmlCleaner and converted to a DOM stored in {@code rootDocument}, and a
 * new XPath instance is stored in {@code xPath}.
 *
 * @param content the HTML markup to parse
 */
public HtmlXpathSelector(String content) throws ParserConfigurationException, SAXException, IOException
{
	TagNode cleanedRoot = new HtmlCleaner().clean(content);
	DomSerializer domSerializer = new DomSerializer(new CleanerProperties());
	rootDocument = domSerializer.createDOM(cleanedRoot);
	xPath = XPathFactory.newInstance().newXPath();
}
 
开发者ID:hxt168,项目名称:webpasser,代码行数:10,代码来源:HtmlXpathSelector.java


注:本文中的org.htmlcleaner.CleanerProperties类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。