当前位置: 首页>>代码示例>>Java>>正文


Java CleanerProperties.setTranslateSpecialEntities方法代码示例

本文整理汇总了Java中org.htmlcleaner.CleanerProperties.setTranslateSpecialEntities方法的典型用法代码示例。如果您正苦于以下问题：Java CleanerProperties.setTranslateSpecialEntities方法的具体用法？Java CleanerProperties.setTranslateSpecialEntities怎么用？Java CleanerProperties.setTranslateSpecialEntities使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类org.htmlcleaner.CleanerProperties的用法示例。


在下文中一共展示了CleanerProperties.setTranslateSpecialEntities方法的14个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: createHtmlCleaner

import org.htmlcleaner.CleanerProperties; //导入方法依赖的package包/类
/**
 * Creates an {@link HtmlCleaner} and configures the properties object it exposes.
 *
 * @return the cleaner whose properties have been adjusted below
 */
private static HtmlCleaner createHtmlCleaner() {
    final HtmlCleaner cleaner = new HtmlCleaner();
    final CleanerProperties props = cleaner.getProperties();

    // XML escaping.
    props.setAdvancedXmlEscape(true);

    // Declarations: XML declaration omitted, doctype not omitted.
    props.setOmitXmlDeclaration(true);
    props.setOmitDoctypeDeclaration(false);

    // Entity / character handling.
    props.setTranslateSpecialEntities(true);
    props.setTransResCharsToNCR(true);
    props.setRecognizeUnicodeChars(true);

    // Tag handling.
    props.setIgnoreQuestAndExclam(true);
    props.setUseEmptyElementTags(false);

    // Comma-separated tag names handed to the cleaner's prune list.
    props.setPruneTags("script,title");

    return cleaner;
}
 
开发者ID:SysdataSpA,项目名称:SDHtmlTextView,代码行数:21,代码来源:HtmlSpanner.java

示例2: toXML

import org.htmlcleaner.CleanerProperties; //导入方法依赖的package包/类
/**
 * Converts an HTML string into an XML string using HtmlCleaner.
 *
 * @param source the HTML markup to convert
 * @return the XML produced by {@link PrettyXmlSerializer}; if serialization throws an
 *         {@link IOException} the error is logged and {@code source} is returned unchanged
 */
private String toXML(String source){
	try {
		CleanerProperties props = new CleanerProperties();
		// Set once; the original repeated this call further down with the same value.
		props.setTranslateSpecialEntities(true);
		props.setOmitComments(true);
		props.setPruneTags("script,style");
		// Ignore namespaces.
		props.setNamespacesAware(false);
		props.setAdvancedXmlEscape(true);
		HtmlCleaner cl = new HtmlCleaner(props);
		TagNode tagNode = cl.clean(source);
		source = new PrettyXmlSerializer(props).getXmlAsString(tagNode);
	} catch (IOException e) {
		// Best effort: keep returning the input, but say what failed in the log.
		logger.error("Failed to convert HTML to XML", e);
	}
	return source;
}
 
开发者ID:gncloud,项目名称:fastcatsearch3,代码行数:24,代码来源:ReadabilityExtractor.java

示例3: htmlOutputStreamViaHtmlCleaner

import org.htmlcleaner.CleanerProperties; //导入方法依赖的package包/类
/**
 * Parses the file at {@code pathOfHOCRFile} with HtmlCleaner and writes the cleaned
 * document to {@code outputFilePath} through {@link PrettyHtmlSerializer}.
 *
 * @param pathOfHOCRFile String path of the input file to read
 * @param outputFilePath String path of the file to write
 * @throws IOException if the input file cannot be opened, parsed or closed; failures while
 *             writing the output are deliberately ignored (see below)
 */
public static void htmlOutputStreamViaHtmlCleaner(String pathOfHOCRFile, String outputFilePath) throws IOException {
	CleanerProperties cleanerProps = new CleanerProperties();

	// set some properties to non-default values
	cleanerProps.setTransResCharsToNCR(true);
	cleanerProps.setTranslateSpecialEntities(true);
	cleanerProps.setOmitComments(true);
	cleanerProps.setOmitDoctypeDeclaration(true);
	cleanerProps.setOmitXmlDeclaration(false);
	HtmlCleaner cleaner = new HtmlCleaner(cleanerProps);

	TagNode tagNode;
	FileInputStream hOCRFileInputStream = new FileInputStream(pathOfHOCRFile);
	try {
		tagNode = cleaner.clean(hOCRFileInputStream, UTF_ENCODING);
	} finally {
		// Release the file handle even when parsing throws; previously the stream
		// leaked in that case.
		hOCRFileInputStream.close();
	}
	try {
		new PrettyHtmlSerializer(cleanerProps).writeToFile(tagNode, outputFilePath, UTF_ENCODING);
	} catch (Exception ignored) { // NOPMD.
		// Writing the output is treated as best-effort by this method: any failure
		// here is intentionally not propagated to the caller.
	}
}
 
开发者ID:kuzavas,项目名称:ephesoft,代码行数:31,代码来源:XMLUtil.java

示例4: getSerialized

import org.htmlcleaner.CleanerProperties; //导入方法依赖的package包/类
/**
 * Convenience method (for xml/xhtml): serializes the parsed page with the
 * serializer obtained from the given argument.
 *
 * @param inSerializer
 *            {@link XmlSerializer} used to obtain the concrete serializer
 * @return String the cleaned and serialized html, or an empty string when
 *         there is no document node
 * @throws IOException
 */
public String getSerialized(final XmlSerializer inSerializer)
        throws IOException {
	if (null == docNode) {
		return ""; //$NON-NLS-1$
	}

	// Start from the properties of a fresh cleaner and adjust them for output.
	final CleanerProperties lCleanerProps = new HtmlCleaner().getProperties();
	lCleanerProps.setUseCdataForScriptAndStyle(true);
	lCleanerProps.setRecognizeUnicodeChars(true);
	lCleanerProps.setUseEmptyElementTags(true);
	lCleanerProps.setAdvancedXmlEscape(true);
	lCleanerProps.setTranslateSpecialEntities(true);
	lCleanerProps.setBooleanAttributeValues("empty"); //$NON-NLS-1$
	lCleanerProps.setNamespacesAware(true);
	lCleanerProps.setOmitXmlDeclaration(false);
	lCleanerProps.setOmitDoctypeDeclaration(true);
	lCleanerProps.setOmitHtmlEnvelope(false);

	// Remove the NS_XML entry from the root node's attributes before serializing.
	docNode.getAttributes().remove(NS_XML);

	final String lSerialized = inSerializer.getSerializer(lCleanerProps).getXmlAsString(docNode);
	return lSerialized;
}
 
开发者ID:aktion-hip,项目名称:relations,代码行数:31,代码来源:XPathHelper.java

示例5: createCleanerProperties

import org.htmlcleaner.CleanerProperties; //导入方法依赖的package包/类
/**
 * Creates the {@link CleanerProperties} instance configured with the option values below.
 *
 * @return a newly created, configured properties object
 */
private static CleanerProperties createCleanerProperties() {
    final CleanerProperties props = new CleanerProperties();

    // Option descriptions: http://htmlcleaner.sourceforge.net/parameters.php
    props.setNamespacesAware(false);
    props.setAdvancedXmlEscape(false);
    props.setOmitXmlDeclaration(true);
    props.setOmitDoctypeDeclaration(false);
    props.setTranslateSpecialEntities(false);
    props.setRecognizeUnicodeChars(false);
    props.setIgnoreQuestAndExclam(false);
    props.setAllowHtmlInsideAttributes(true);

    return props;
}
 
开发者ID:scoute-dich,项目名称:K9-MailClient,代码行数:16,代码来源:HtmlSanitizer.java

示例6: getTextFromHtmlString

import org.htmlcleaner.CleanerProperties; //导入方法依赖的package包/类
/**
 * Extracts text from an html string: the text of every {@code td} element found
 * (recursively) under the last {@code table} matched by the XPath {@code //table}
 * is concatenated in document order.
 *
 * @param htmlString {@link String} html markup to parse
 * @return {@link String} the concatenated cell text; empty when no table is found or
 *         when the XPath evaluation fails (the failure is logged)
 */
public static String getTextFromHtmlString(String htmlString) {
	CleanerProperties cleanerProps = new CleanerProperties();
	// set some properties to non-default values
	cleanerProps.setTransResCharsToNCR(true);
	cleanerProps.setTranslateSpecialEntities(true);
	cleanerProps.setOmitComments(true);
	cleanerProps.setOmitDoctypeDeclaration(true);
	cleanerProps.setOmitXmlDeclaration(true);
	cleanerProps.setUseEmptyElementTags(true);

	HtmlCleaner cleaner = new HtmlCleaner(cleanerProps);
	TagNode tagNode = cleaner.clean(htmlString);

	// A single builder for the whole result: the original allocated a new builder and
	// re-copied the accumulated text for every cell.
	StringBuilder errorText = new StringBuilder();
	try {
		Object[] rootNode = tagNode.evaluateXPath("//table");
		if (null != rootNode && rootNode.length > 0) {
			// Only the last table in the document is used.
			TagNode lastTable = (TagNode) rootNode[rootNode.length - 1];
			TagNode[] textNode = lastTable.getElementsByName("td", true);
			for (TagNode tag : textNode) {
				if (tag == null) {
					continue;
				}
				// Read the cell text once instead of once per use.
				CharSequence cellText = tag.getText();
				if (cellText == null) {
					continue;
				}
				// NOTE(review): if the literal below is a plain ASCII space this branch can
				// never match, because trim() strips it; it may have been a non-breaking
				// space in the original source - confirm.
				if (cellText.toString().trim().equals(" ")) {
					errorText.append(" ");
				} else {
					errorText.append(cellText);
				}
			}
		}
	} catch (XPatherException e) {
		LOGGER.error("Error extracting table node from html." + e.getMessage());
	}
	return errorText.toString();
}
 
开发者ID:kuzavas,项目名称:ephesoft,代码行数:43,代码来源:AbstractUploadFile.java

示例7: createHtmlCleaner

import org.htmlcleaner.CleanerProperties; //导入方法依赖的package包/类
/**
 * Creates an {@link HtmlCleaner} and applies the option values below to its properties.
 *
 * @return the configured cleaner
 */
private static HtmlCleaner createHtmlCleaner() {
    final HtmlCleaner cleaner = new HtmlCleaner();
    final CleanerProperties props = cleaner.getProperties();

    // Declarations: XML declaration omitted, doctype not omitted.
    props.setOmitXmlDeclaration(true);
    props.setOmitDoctypeDeclaration(false);

    // Character and entity handling.
    props.setRecognizeUnicodeChars(true);
    props.setTranslateSpecialEntities(false);

    // Tag handling.
    props.setIgnoreQuestAndExclam(true);
    props.setUseEmptyElementTags(false);

    return cleaner;
}
 
开发者ID:DASAR,项目名称:epublib-android,代码行数:12,代码来源:HtmlCleanerBookProcessor.java

示例8: ExtractInfoWithHtmlCleaner

import org.htmlcleaner.CleanerProperties; //导入方法依赖的package包/类
private ExtractInfoWithHtmlCleaner()
{
	CleanerProperties props = cleaner.getProperties();     
       props.setUseCdataForScriptAndStyle(true);     
       props.setRecognizeUnicodeChars(true);     
       props.setUseEmptyElementTags(true);     
       props.setAdvancedXmlEscape(true);     
       props.setTranslateSpecialEntities(true);     
       props.setBooleanAttributeValues("empty");
}
 
开发者ID:anphoenix,项目名称:data_crawler_generic,代码行数:11,代码来源:ExtractInfoWithHtmlCleaner.java

示例9: main

import org.htmlcleaner.CleanerProperties; //导入方法依赖的package包/类
/**
 * Ad-hoc driver: downloads a fixed page, collects the text of the nodes matched by a
 * fixed XPath expression, strips a few character sequences from it, passes the result
 * through {@code CommonUtil.getDateString} and prints what comes back.
 *
 * @param args unused
 * @throws IOException if the page cannot be fetched or parsed
 * @throws XPatherException if the XPath expression cannot be evaluated
 */
public static void main(String[] args) throws IOException, XPatherException {
	CleanerProperties props = cleaner.getProperties();
	props.setUseCdataForScriptAndStyle(true);
	props.setRecognizeUnicodeChars(true);
	props.setUseEmptyElementTags(true);
	props.setAdvancedXmlEscape(true);
	props.setTranslateSpecialEntities(true);
	props.setBooleanAttributeValues("empty");

	URL url = new URL("http://www.haodf.com/wenda/anzhentaohong_g_638200415.htm");
	TagNode node = cleaner.clean(url, "gb2312");
	Object[] ns = node.evaluateXPath("//*[@class=\"bb_d3 bl_d3 pb20\"]/div[3]/div[2]/p[2]");

	// Accumulate in a builder rather than re-concatenating a String per node.
	StringBuilder text = new StringBuilder();
	for (Object object : ns) {
		TagNode dd = (TagNode) object;
		text.append(dd.getText()).append("\n");
	}

	// Drop "&nbsp" fragments, carriage returns and semicolons before date extraction.
	String result = text.toString().replace("&nbsp", "").replace("\r", "").replace(";", "");
	result = CommonUtil.getDateString(result, ".*?([0-9]+.[0-9]+.[0-9]+).*");

	System.out.print(result);
}
 
开发者ID:anphoenix,项目名称:data_crawler_generic,代码行数:40,代码来源:ExtractInfoWithHtmlCleaner.java

示例10: html2xhtml

import org.htmlcleaner.CleanerProperties; //导入方法依赖的package包/类
/**
 * Converts an HTML fragment to XML text using HtmlCleaner and {@link SimpleXmlSerializer}.
 * The fragment is wrapped in a {@code <div>} before parsing and the wrapper is cut off
 * the serialized result again.
 *
 * @param html the HTML fragment to convert
 * @return the serialized fragment, or an empty string when {@code html} is empty
 *         according to {@code StringUtils.isEmpty}
 */
private String html2xhtml(String html) {
	if (StringUtils.isEmpty(html)){
		return "";
	}

	// Guarantee a single root tag for HtmlCleaner
	final String openingTag = "<div>";
	final String closingTag = "</div>";
	final String wrapped = openingTag + html + closingTag;

	// set some properties to non-default values
	final CleanerProperties cleanerProps = new CleanerProperties();
	cleanerProps.setTranslateSpecialEntities(true);
	cleanerProps.setTransResCharsToNCR(true);
	cleanerProps.setOmitXmlDeclaration(true);
	cleanerProps.setOmitHtmlEnvelope(true);
	cleanerProps.setOmitComments(true);

	// do parsing
	final TagNode root = new HtmlCleaner(cleanerProps).clean(wrapped);

	// serialize to an xml string
	final String serialized;
	try {
		serialized = new SimpleXmlSerializer(cleanerProps).getAsString(root);
	} catch (IOException e) {
		throw new RuntimeException(e.getMessage(), e);
	}

	// Remove the root <div> wrapper: skip the 5 leading and the 6 trailing characters
	return serialized.substring(openingTag.length(), serialized.length() - closingTag.length());
}
 
开发者ID:lexml,项目名称:lexml-swing-editorhtml,代码行数:34,代码来源:HTML2FOConverter.java

示例11: getStandardCredit

import org.htmlcleaner.CleanerProperties; //导入方法依赖的package包/类
/**
 * Posts a query for the credit standard page, parses the returned HTML and collects the
 * cleaned text of columns 1 to 8 of the first table row whose first column contains
 * {@code department} (both sides compared with spaces and newlines removed).
 *
 * @param year       value sent as the "year" form parameter
 * @param index      index into {@code matrics} selecting the "matric" form parameter
 * @param department department text to look for in the first column of each row
 * @return the eight cleaned column values of the matching row
 * @throws Exception with a fixed user-facing message when anything goes wrong (request,
 *                   parsing, or no matching row); the underlying failure is attached as
 *                   the cause
 */
public static ArrayList<String> getStandardCredit(String year, int index,
                                                  String department) throws Exception {
    try {
        ArrayList<String> standard = new ArrayList<>();
        HashMap<String, String> params = new HashMap<>();
        params.put("format", "-3");
        params.put("year", year);
        params.put("matric", matrics.get(index));
        String result = Connector
                .getDataByPost(getStandardUri(lang), params, "big5");
        // Insert closing </td> tags before cell and row openings prior to cleaning.
        result = result.replace("<td", "</td><td");
        result = result.replace("<tr>", "</td><tr>");
        HtmlCleaner cleaner = new HtmlCleaner();
        CleanerProperties props = cleaner.getProperties();
        props.setUseCdataForScriptAndStyle(true);
        props.setRecognizeUnicodeChars(true);
        props.setUseEmptyElementTags(true);
        props.setAdvancedXmlEscape(true);
        props.setTranslateSpecialEntities(true);
        props.setBooleanAttributeValues("empty");
        result = new PrettyHtmlSerializer(props).getAsString(result);
        TagNode tagNode = cleaner.clean(result);
        TagNode[] tables = tagNode.getElementsByAttValue("border", "1",
                true, false);
        TagNode[] rows = tables[0].getElementsByName("tr", true);
        // Loop-invariant: normalise the wanted department once instead of per row.
        String wanted = department.replace(" ", "").replace("\n", "");
        // Row 0 is skipped; the search starts at row 1.
        for (int i = 1; i < rows.length; i++) {
            TagNode[] cols = rows[i].getElementsByName("td", true);
            String temp = cols[0].getText().toString();
            if (temp.replace(" ", "").replace("\n", "").contains(wanted)) {
                for (int j = 1; j < 9; j++) {
                    String credit = Utility.cleanString(cols[j].getText()
                            .toString());
                    standard.add(credit);
                }
                return standard;
            }
        }
        throw new Exception("No table row matches department: " + department);
    } catch (Exception e) {
        e.printStackTrace();
        // Keep the original user-facing message but preserve the root cause for callers.
        throw new Exception("畢業學分標準讀取時發生錯誤", e);
    }
}
 
开发者ID:kamisakihideyoshi,项目名称:TaipeiTechRefined,代码行数:44,代码来源:CreditConnector.java

示例12: stripSignatureForHtmlMessage

import org.htmlcleaner.CleanerProperties; //导入方法依赖的package包/类
/**
 * Cuts the message content at a match of {@code DASH_SIGNATURE_HTML} that lies outside
 * the {@code BLOCKQUOTE_START}/{@code BLOCKQUOTE_END} ranges, then runs the (possibly
 * truncated) content through HtmlCleaner and returns the re-serialized HTML.
 * The three patterns are defined elsewhere; presumably they match the dash signature
 * delimiter and the opening/closing blockquote tags - confirm against their definitions.
 *
 * @param content the HTML message content
 * @return the content after optional truncation, cleaned and serialized by
 *         {@link SimpleHtmlSerializer}
 */
public static String stripSignatureForHtmlMessage(String content) {
    Matcher dashSignatureHtml = DASH_SIGNATURE_HTML.matcher(content);
    if (dashSignatureHtml.find()) {
        Matcher blockquoteStart = BLOCKQUOTE_START.matcher(content);
        Matcher blockquoteEnd = BLOCKQUOTE_END.matcher(content);
        // Offsets of every blockquote start / end match, in order of appearance.
        List<Integer> start = new ArrayList<>();
        List<Integer> end = new ArrayList<>();

        while (blockquoteStart.find()) {
            start.add(blockquoteStart.start());
        }
        while (blockquoteEnd.find()) {
            end.add(blockquoteEnd.start());
        }
        if (start.size() != end.size()) {
            // Unbalanced tags: leave the content untouched.
            Log.d(K9.LOG_TAG, "There are " + start.size() + " <blockquote> tags, but " +
                    end.size() + " </blockquote> tags. Refusing to strip.");
        } else if (start.size() > 0) {
            // Ignore quoted signatures in blockquotes.
            // First search only the text before the first blockquote start.
            dashSignatureHtml.region(0, start.get(0));
            if (dashSignatureHtml.find()) {
                // before first <blockquote>.
                content = content.substring(0, dashSignatureHtml.start());
            } else {
                for (int i = 0; i < start.size() - 1; i++) {
                    // Search the gap between the end of blockquote i and the start of
                    // blockquote i + 1 (only when that gap is non-empty).
                    if (end.get(i) < start.get(i + 1)) {
                        dashSignatureHtml.region(end.get(i), start.get(i + 1));
                        if (dashSignatureHtml.find()) {
                            content = content.substring(0, dashSignatureHtml.start());
                            break;
                        }
                    }
                }
                // This check uses the current (possibly already truncated) content length.
                if (end.get(end.size() - 1) < content.length()) {
                    // after last </blockquote>.
                    dashSignatureHtml.region(end.get(end.size() - 1), content.length());
                    if (dashSignatureHtml.find()) {
                        content = content.substring(0, dashSignatureHtml.start());
                    }
                }
            }
        } else {
            // No blockquotes found.
            // start() still refers to the first match found at the top of the method.
            content = content.substring(0, dashSignatureHtml.start());
        }
    }

    // Fix the stripping off of closing tags if a signature was stripped,
    // as well as clean up the HTML of the quoted message.
    HtmlCleaner cleaner = new HtmlCleaner();
    CleanerProperties properties = cleaner.getProperties();

    // see http://htmlcleaner.sourceforge.net/parameters.php for descriptions
    properties.setNamespacesAware(false);
    properties.setAdvancedXmlEscape(false);
    properties.setOmitXmlDeclaration(true);
    properties.setOmitDoctypeDeclaration(false);
    properties.setTranslateSpecialEntities(false);
    properties.setRecognizeUnicodeChars(false);

    // Parse the content and serialize it back to an HTML string.
    TagNode node = cleaner.clean(content);
    SimpleHtmlSerializer htmlSerialized = new SimpleHtmlSerializer(properties);
    content = htmlSerialized.getAsString(node, "UTF8");
    return content;
}
 
开发者ID:scoute-dich,项目名称:K9-MailClient,代码行数:67,代码来源:QuotedMessageHelper.java

示例13: getCleanHtml

import org.htmlcleaner.CleanerProperties; //导入方法依赖的package包/类
/**
 * Clean HTML document and return XML as byte array. The document is read from the
 * resource's local file path when one is set, otherwise from the resource's URL.
 * 
 * @param pandaSettings settings object providing the resource map
 * @param resID unique ID of resource
 * @return clean XHTML document as {@code byte[]}
 * @throws IOException if the document cannot be read, parsed or serialized
 */
private byte[] getCleanHtml(PandaSettings pandaSettings, String resID) throws IOException {
    // Get local path to file, if null the URL field will be used to
    // retrieve resource
    ResourceInfo resInfo = pandaSettings.getResourceMap().getMap().get(resID);
    String filePath = resInfo.getFilePath();

    // properties for HTML cleaning
    CleanerProperties props = new CleanerProperties();
    // preserve namespace prefixes
    props.setNamespacesAware(true);
    // remove <?TAGNAME....> or <!TAGNAME....>
    props.setIgnoreQuestAndExclam(true);
    // do not split attributes with multiple words
    props.setAllowMultiWordAttributes(true);
    // omit DTD
    props.setOmitDoctypeDeclaration(true);
    // omit xml declaration
    props.setOmitXmlDeclaration(true);
    // omit comments
    props.setOmitComments(true);
    // omit deprecated tags like <font...>
    props.setOmitDeprecatedTags(true);
    // treat script and style tag contents as CDATA
    props.setUseCdataForScriptAndStyle(true);
    // replace html character in form &#XXXX with real unicode characters
    props.setRecognizeUnicodeChars(true);
    // replace special entities with unicode character
    props.setTranslateSpecialEntities(true);
    // if true do not escape valid xml character sequences
    props.setAdvancedXmlEscape(true);

    // get HTML document, parse HTML
    TagNode tagNode;
    if (filePath != null) {
        tagNode = new HtmlCleaner(props).clean(new File(filePath));
    } else {
        // Get online resource; reuse the entry looked up above instead of
        // querying the resource map a second time.
        URL resURL = resInfo.getURL();
        InputStream htmlDoc = getOnlineResource(resURL);
        try {
            tagNode = new HtmlCleaner(props).clean(htmlDoc);
        } finally {
            // The original never closed this stream; release it once parsing is
            // done or has failed.
            if (htmlDoc != null) {
                htmlDoc.close();
            }
        }
    }

    PrettyXmlSerializer pXmlS = new PrettyXmlSerializer(props);
    // NOTE(review): getBytes() without a charset uses the platform default encoding -
    // confirm whether callers expect a specific encoding such as UTF-8.
    return pXmlS.getAsString(tagNode).getBytes();
}
 
开发者ID:chsatgithub,项目名称:PANDA-DEEPLINKING,代码行数:59,代码来源:DataHtmlResource.java

示例14: cleanHTML

import org.htmlcleaner.CleanerProperties; //导入方法依赖的package包/类
/**
 * Parses the HTML file at {@code path} with HtmlCleaner and writes the cleaned
 * document to {@code out} encoded as utf-8.
 *
 * @param path     path of the HTML file to read
 * @param out      path of the file to write
 * @param encoding character encoding used to read the input file
 * @throws IOException if reading or writing fails
 */
public void cleanHTML(String path, String out, String encoding) throws IOException {

	// Properties are taken from a fresh default cleaner and then customised.
	final CleanerProperties cleanerProps = new HtmlCleaner().getProperties();

	// Global attribute transformations for names matching "class" and "id"
	// (case-insensitive, optional leading whitespace). NOTE(review): with a null value
	// pattern and null template these presumably drop the attribute - confirm against
	// the HtmlCleaner documentation.
	final CleanerTransformations rules = new CleanerTransformations();

	final AttributeTransformationPatternImpl classAttrRule = new AttributeTransformationPatternImpl(
			Pattern.compile("^\\s*class", Pattern.CASE_INSENSITIVE), null,
			null);
	rules.addGlobalTransformation(classAttrRule);

	final AttributeTransformationPatternImpl idAttrRule = new AttributeTransformationPatternImpl(
			Pattern.compile("^\\s*id", Pattern.CASE_INSENSITIVE), null,
			null);
	rules.addGlobalTransformation(idAttrRule);

	cleanerProps.setCleanerTransformations(rules);

	// set some properties to non-default values
	cleanerProps.setTranslateSpecialEntities(true);
	cleanerProps.setTransResCharsToNCR(false);
	cleanerProps.setOmitComments(true);
	cleanerProps.setPruneTags("script,style,img,form");

	// do parsing
	final File input = new File(path);
	final TagNode root = new HtmlCleaner(cleanerProps).clean(input, encoding);

	root.removeAttribute("class");

	// serialize to the output file
	final PrettyHtmlSerializer serializer = new PrettyHtmlSerializer(cleanerProps);
	serializer.writeToFile(root, out, "utf-8");

}
 
开发者ID:fauconnier,项目名称:LaToe,代码行数:40,代码来源:HTML_Service.java


注:本文中的org.htmlcleaner.CleanerProperties.setTranslateSpecialEntities方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。