当前位置: 首页>>代码示例>>Java>>正文


Java CleanerProperties.setRecognizeUnicodeChars方法代码示例

本文整理汇总了Java中org.htmlcleaner.CleanerProperties.setRecognizeUnicodeChars方法的典型用法代码示例。如果您正苦于以下问题:Java CleanerProperties.setRecognizeUnicodeChars方法的具体用法?Java CleanerProperties.setRecognizeUnicodeChars怎么用?Java CleanerProperties.setRecognizeUnicodeChars使用的例子?那么,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类org.htmlcleaner.CleanerProperties的用法示例。


在下文中一共展示了CleanerProperties.setRecognizeUnicodeChars方法的10个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: createHtmlCleaner

import org.htmlcleaner.CleanerProperties; //导入方法依赖的package包/类
/**
 * Creates an {@link HtmlCleaner} and adjusts its {@link CleanerProperties}
 * before handing it back to the caller.
 *
 * @return a freshly created cleaner whose properties have been set as below
 */
private static HtmlCleaner createHtmlCleaner() {
    final HtmlCleaner cleaner = new HtmlCleaner();
    final CleanerProperties props = cleaner.getProperties();

    // Escaping.
    props.setAdvancedXmlEscape(true);

    // Declarations: omit the XML declaration, do not omit the DOCTYPE.
    props.setOmitXmlDeclaration(true);
    props.setOmitDoctypeDeclaration(false);

    // Entity / character handling.
    props.setTranslateSpecialEntities(true);
    props.setTransResCharsToNCR(true);
    props.setRecognizeUnicodeChars(true);

    // Tag handling.
    props.setIgnoreQuestAndExclam(true);
    props.setUseEmptyElementTags(false);

    // Comma-separated list of tags to prune.
    props.setPruneTags("script,title");

    return cleaner;
}
 
开发者ID:SysdataSpA,项目名称:SDHtmlTextView,代码行数:21,代码来源:HtmlSpanner.java

示例2: getSerialized

import org.htmlcleaner.CleanerProperties; //导入方法依赖的package包/类
/**
 * Serializes the parsed page held in {@code docNode} to a string, using a
 * serializer obtained from {@code inSerializer}.
 * <p>
 * When no page has been parsed ({@code docNode} is {@code null}) an empty
 * string is returned. Before serializing, the {@code NS_XML} entry is
 * removed from the attributes returned by {@code docNode.getAttributes()}.
 *
 * @param inSerializer
 *            {@link XmlSerializer} used to obtain the concrete serializer
 * @return String the cleaned and serialized html, or {@code ""} when there
 *         is no parsed page
 * @throws IOException
 *             if serialization fails
 */
public String getSerialized(final XmlSerializer inSerializer)
        throws IOException {
	if (docNode != null) {
		// Properties handed to the serializer.
		final CleanerProperties serializerProps = new HtmlCleaner().getProperties();
		serializerProps.setUseCdataForScriptAndStyle(true);
		serializerProps.setRecognizeUnicodeChars(true);
		serializerProps.setUseEmptyElementTags(true);
		serializerProps.setAdvancedXmlEscape(true);
		serializerProps.setTranslateSpecialEntities(true);
		serializerProps.setBooleanAttributeValues("empty"); //$NON-NLS-1$
		serializerProps.setNamespacesAware(true);
		serializerProps.setOmitXmlDeclaration(false);
		serializerProps.setOmitDoctypeDeclaration(true);
		serializerProps.setOmitHtmlEnvelope(false);

		docNode.getAttributes().remove(NS_XML);

		return inSerializer.getSerializer(serializerProps).getXmlAsString(docNode);
	}
	return ""; //$NON-NLS-1$
}
 
开发者ID:aktion-hip,项目名称:relations,代码行数:31,代码来源:XPathHelper.java

示例3: createCleanerProperties

import org.htmlcleaner.CleanerProperties; //导入方法依赖的package包/类
/**
 * Creates the {@link CleanerProperties} instance configured by this class.
 * Parameter descriptions: http://htmlcleaner.sourceforge.net/parameters.php
 *
 * @return a new, configured properties object
 */
private static CleanerProperties createCleanerProperties() {
    final CleanerProperties props = new CleanerProperties();

    props.setNamespacesAware(false);
    props.setAdvancedXmlEscape(false);

    // Omit the XML declaration, do not omit the DOCTYPE.
    props.setOmitXmlDeclaration(true);
    props.setOmitDoctypeDeclaration(false);

    // Entity translation and unicode recognition are switched off.
    props.setTranslateSpecialEntities(false);
    props.setRecognizeUnicodeChars(false);

    props.setIgnoreQuestAndExclam(false);
    props.setAllowHtmlInsideAttributes(true);

    return props;
}
 
开发者ID:scoute-dich,项目名称:K9-MailClient,代码行数:16,代码来源:HtmlSanitizer.java

示例4: createHtmlCleaner

import org.htmlcleaner.CleanerProperties; //导入方法依赖的package包/类
/**
 * Creates an {@link HtmlCleaner} and configures the properties object it
 * exposes via {@code getProperties()}.
 *
 * @return the configured cleaner
 */
private static HtmlCleaner createHtmlCleaner() {
    final HtmlCleaner cleaner = new HtmlCleaner();
    final CleanerProperties props = cleaner.getProperties();

    // Omit the XML declaration, do not omit the DOCTYPE.
    props.setOmitXmlDeclaration(true);
    props.setOmitDoctypeDeclaration(false);

    // Unicode recognition on, special-entity translation off.
    props.setRecognizeUnicodeChars(true);
    props.setTranslateSpecialEntities(false);

    props.setIgnoreQuestAndExclam(true);
    props.setUseEmptyElementTags(false);

    return cleaner;
}
 
开发者ID:DASAR,项目名称:epublib-android,代码行数:12,代码来源:HtmlCleanerBookProcessor.java

示例5: ExtractInfoWithHtmlCleaner

import org.htmlcleaner.CleanerProperties; //导入方法依赖的package包/类
private ExtractInfoWithHtmlCleaner()
{
	CleanerProperties props = cleaner.getProperties();     
       props.setUseCdataForScriptAndStyle(true);     
       props.setRecognizeUnicodeChars(true);     
       props.setUseEmptyElementTags(true);     
       props.setAdvancedXmlEscape(true);     
       props.setTranslateSpecialEntities(true);     
       props.setBooleanAttributeValues("empty");
}
 
开发者ID:anphoenix,项目名称:data_crawler_generic,代码行数:11,代码来源:ExtractInfoWithHtmlCleaner.java

示例6: main

import org.htmlcleaner.CleanerProperties; //导入方法依赖的package包/类
/**
 * Ad-hoc driver: downloads one fixed page, selects nodes with a fixed XPath
 * expression, concatenates their text, strips a few character sequences,
 * passes the result through {@code CommonUtil.getDateString} and prints it
 * to standard output.
 *
 * @param args ignored
 * @throws IOException if the page cannot be fetched or read
 * @throws XPatherException if the XPath expression cannot be evaluated
 */
public static void main(String[] args) throws IOException, XPatherException {
	CleanerProperties props = cleaner.getProperties();
	props.setUseCdataForScriptAndStyle(true);
	props.setRecognizeUnicodeChars(true);
	props.setUseEmptyElementTags(true);
	props.setAdvancedXmlEscape(true);
	props.setTranslateSpecialEntities(true);
	props.setBooleanAttributeValues("empty");

	URL url = new URL("http://www.haodf.com/wenda/anzhentaohong_g_638200415.htm");
	// The page is read with the gb2312 charset.
	TagNode node = cleaner.clean(url, "gb2312");
	Object[] ns = node.evaluateXPath("//*[@class=\"bb_d3 bl_d3 pb20\"]/div[3]/div[2]/p[2]");

	// Accumulate in a builder instead of re-allocating a String per match.
	StringBuilder collected = new StringBuilder();
	for (Object object : ns) {
		TagNode dd = (TagNode) object;
		collected.append(dd.getText()).append("\n");
	}

	// Removes every "&nbsp", every carriage return and every ';' from the text.
	String result = collected.toString().replace("&nbsp", "").replace("\r", "").replace(";", "");

	result = CommonUtil.getDateString(result, ".*?([0-9]+.[0-9]+.[0-9]+).*");

	System.out.print(result);
}
 
开发者ID:anphoenix,项目名称:data_crawler_generic,代码行数:40,代码来源:ExtractInfoWithHtmlCleaner.java

示例7: getStandardCredit

import org.htmlcleaner.CleanerProperties; //导入方法依赖的package包/类
/**
 * Posts a request for the credit standard of the given year and matric
 * entry, parses the returned HTML table and returns the eight cells that
 * follow the department cell in the first row whose first cell contains
 * {@code department} (spaces and newlines are ignored in the comparison).
 *
 * @param year       value sent as the {@code year} request parameter
 * @param index      index into {@code matrics}; the element is sent as the
 *                   {@code matric} request parameter
 * @param department department name to look for in the first column
 * @return the cleaned text of columns 1..8 of the matching row
 * @throws Exception with a fixed user-facing message if anything fails
 *                   (request, parsing, missing table/row/columns); the
 *                   underlying failure is attached as the cause
 */
public static ArrayList<String> getStandardCredit(String year, int index,
                                                  String department) throws Exception {
    try {
        HashMap<String, String> params = new HashMap<>();
        params.put("format", "-3");
        params.put("year", year);
        params.put("matric", matrics.get(index));
        String result = Connector
                .getDataByPost(getStandardUri(lang), params, "big5");

        // Insert a closing </td> before every "<td" and every "<tr>" prior to
        // cleaning (presumably the served markup leaves cells unclosed — confirm).
        result = result.replace("<td", "</td><td");
        result = result.replace("<tr>", "</td><tr>");

        HtmlCleaner cleaner = new HtmlCleaner();
        CleanerProperties props = cleaner.getProperties();
        props.setUseCdataForScriptAndStyle(true);
        props.setRecognizeUnicodeChars(true);
        props.setUseEmptyElementTags(true);
        props.setAdvancedXmlEscape(true);
        props.setTranslateSpecialEntities(true);
        props.setBooleanAttributeValues("empty");
        result = new PrettyHtmlSerializer(props).getAsString(result);
        TagNode tagNode = cleaner.clean(result);

        TagNode[] tables = tagNode.getElementsByAttValue("border", "1",
                true, false);
        if (tables.length == 0) {
            throw new IllegalStateException("no element with border=\"1\" found in response");
        }

        // Loop-invariant: normalise the requested department once.
        String wanted = department.replace(" ", "").replace("\n", "");

        TagNode[] rows = tables[0].getElementsByName("tr", true);
        // Row 0 is skipped.
        for (int i = 1; i < rows.length; i++) {
            TagNode[] cols = rows[i].getElementsByName("td", true);
            String name = cols[0].getText().toString();
            if (name.replace(" ", "").replace("\n", "").contains(wanted)) {
                if (cols.length < 9) {
                    throw new IllegalStateException(
                            "matching row has only " + cols.length + " cells, expected at least 9");
                }
                ArrayList<String> standard = new ArrayList<>();
                for (int j = 1; j < 9; j++) {
                    standard.add(Utility.cleanString(cols[j].getText()
                            .toString()));
                }
                return standard;
            }
        }
        throw new IllegalStateException("no row found for department: " + department);
    } catch (Exception e) {
        e.printStackTrace();
        // Same exception type and message as before, but keep the root cause.
        throw new Exception("畢業學分標準讀取時發生錯誤", e);
    }
}
 
开发者ID:kamisakihideyoshi,项目名称:TaipeiTechRefined,代码行数:44,代码来源:CreditConnector.java

示例8: stripSignatureForHtmlMessage

import org.htmlcleaner.CleanerProperties; //导入方法依赖的package包/类
/**
 * Removes a dash-style signature from an HTML message body and then runs the
 * result through HtmlCleaner so that the markup is well-formed again.
 * <p>
 * If {@code DASH_SIGNATURE_HTML} matches somewhere in {@code content}, the
 * content is truncated at a signature match that lies outside the spans
 * delimited by {@code BLOCKQUOTE_START} / {@code BLOCKQUOTE_END} matches.
 * When the number of opening and closing blockquote matches differs, nothing
 * is stripped. In all cases the (possibly truncated) content is cleaned and
 * re-serialized before being returned.
 *
 * @param content HTML text of the message
 * @return the cleaned HTML, with the signature removed where one was found
 */
public static String stripSignatureForHtmlMessage(String content) {
    Matcher dashSignatureHtml = DASH_SIGNATURE_HTML.matcher(content);
    if (dashSignatureHtml.find()) {
        Matcher blockquoteStart = BLOCKQUOTE_START.matcher(content);
        Matcher blockquoteEnd = BLOCKQUOTE_END.matcher(content);
        // Offsets of every opening / closing blockquote match, in document order.
        List<Integer> start = new ArrayList<>();
        List<Integer> end = new ArrayList<>();

        while (blockquoteStart.find()) {
            start.add(blockquoteStart.start());
        }
        while (blockquoteEnd.find()) {
            end.add(blockquoteEnd.start());
        }
        if (start.size() != end.size()) {
            // Unbalanced tags: leave the content untouched.
            Log.d(K9.LOG_TAG, "There are " + start.size() + " <blockquote> tags, but " +
                    end.size() + " </blockquote> tags. Refusing to strip.");
        } else if (start.size() > 0) {
            // Ignore quoted signatures in blockquotes.
            // Restrict the search to the text before the first opening tag.
            dashSignatureHtml.region(0, start.get(0));
            if (dashSignatureHtml.find()) {
                // before first <blockquote>.
                content = content.substring(0, dashSignatureHtml.start());
            } else {
                for (int i = 0; i < start.size() - 1; i++) {
                    // Gap between the i-th closing tag and the (i+1)-th opening tag,
                    // searched only when the closing tag comes first.
                    if (end.get(i) < start.get(i + 1)) {
                        dashSignatureHtml.region(end.get(i), start.get(i + 1));
                        if (dashSignatureHtml.find()) {
                            content = content.substring(0, dashSignatureHtml.start());
                            break;
                        }
                    }
                }
                // NOTE: content may already have been shortened by the loop above;
                // the matcher still operates on the original string.
                if (end.get(end.size() - 1) < content.length()) {
                    // after last </blockquote>.
                    dashSignatureHtml.region(end.get(end.size() - 1), content.length());
                    if (dashSignatureHtml.find()) {
                        content = content.substring(0, dashSignatureHtml.start());
                    }
                }
            }
        } else {
            // No blockquotes found.
            // The matcher still holds the match from the initial find() above.
            content = content.substring(0, dashSignatureHtml.start());
        }
    }

    // Fix the stripping off of closing tags if a signature was stripped,
    // as well as clean up the HTML of the quoted message.
    HtmlCleaner cleaner = new HtmlCleaner();
    CleanerProperties properties = cleaner.getProperties();

    // see http://htmlcleaner.sourceforge.net/parameters.php for descriptions
    properties.setNamespacesAware(false);
    properties.setAdvancedXmlEscape(false);
    properties.setOmitXmlDeclaration(true);
    properties.setOmitDoctypeDeclaration(false);
    properties.setTranslateSpecialEntities(false);
    properties.setRecognizeUnicodeChars(false);

    // Parse, then serialize back to an HTML string.
    TagNode node = cleaner.clean(content);
    SimpleHtmlSerializer htmlSerialized = new SimpleHtmlSerializer(properties);
    content = htmlSerialized.getAsString(node, "UTF8");
    return content;
}
 
开发者ID:scoute-dich,项目名称:K9-MailClient,代码行数:67,代码来源:QuotedMessageHelper.java

示例9: updateArtists

import org.htmlcleaner.CleanerProperties; //导入方法依赖的package包/类
/**
 * Downloads the archive.org artist listing, extracts (artist, number) pairs
 * with XPath and, if any were found, bulk-inserts them into {@code db} and
 * records today's date under the {@code artistUpdate} preference.
 * <p>
 * Failures are logged, not propagated. The HTTP client is always shut down,
 * including when an exception is thrown part-way through.
 *
 * @param db store that receives the artists and the update date
 * @return always {@code true}, regardless of success
 *         (NOTE(review): confirm callers do not expect a success flag)
 */
public static Boolean updateArtists(StaticDataStore db){
	Logging.Log(LOG_TAG, "Fetching Artists");
	ArrayList<ArrayList<String>> artists = new ArrayList<ArrayList<String>>();

	HtmlCleaner pageParser = new HtmlCleaner();
	CleanerProperties props = pageParser.getProperties();
	props.setAllowHtmlInsideAttributes(true);
	props.setAllowMultiWordAttributes(true);
	props.setRecognizeUnicodeChars(true);
	props.setOmitComments(true);

	// Declared outside the try so the finally block can release it.
	HttpClient client = null;
	try {
		String url = "http://www.archive.org/browse.php?field=/metadata/bandWithMP3s&collection=etree";

		// 15 second connect and socket timeouts.
		HttpParams params = new BasicHttpParams();
		int timeout = (int) (15 * DateUtils.SECOND_IN_MILLIS);
		HttpConnectionParams.setConnectionTimeout(params, timeout);
		HttpConnectionParams.setSoTimeout(params, timeout);
		client = new DefaultHttpClient(params);

		HttpGet request = new HttpGet(url);
		HttpResponse response = client.execute(request);
		StatusLine status = response.getStatusLine();
		if (status.getStatusCode() == HttpStatus.SC_OK) {
			ResponseHandler<String> responseHandler = new BasicResponseHandler();
			TagNode node = pageParser.clean(responseHandler.handleResponse(response));

			org.w3c.dom.Document doc = new DomSerializer(new CleanerProperties()).createDOM(node);
			XPath xpath = XPathFactory.newInstance().newXPath();
			NodeList artistNodes = (NodeList) xpath.evaluate("//div[@class='row']//div[@class='col-sm-4']/a", doc, XPathConstants.NODESET);
			NodeList numberNodes = (NodeList) xpath.evaluate("//div[@class='row']//div[@class='col-sm-4']/text()[preceding-sibling::a]", doc, XPathConstants.NODESET);
			Logging.Log(LOG_TAG, "artistNodes: " + artistNodes.getLength());
			Logging.Log(LOG_TAG, "numberNodes: " + numberNodes.getLength());

			// Pair the two lists only when they line up one-to-one.
			if(artistNodes.getLength() == numberNodes.getLength()){
				for (int i = 0; i < artistNodes.getLength(); i++) {
					ArrayList<String> artistPair = new ArrayList<String>();
					// Undo the five XML entity escapes in the artist name.
					artistPair.add(artistNodes.item(i).getTextContent().replace("&apos;", "'").replace("&gt;", ">").replace("&lt;", "<").replace("&quot;", "\"").replace("&amp;", "&"));
					artistPair.add(numberNodes.item(i).getTextContent());
					artists.add(artistPair);
				}
			}
			if (artists.size() > 0) {
				db.insertArtistBulk(artists);
				String s = DateFormat.format("yyyy-MM-dd", new GregorianCalendar().getTime()).toString();
				db.updatePref("artistUpdate", s);
				Logging.Log(LOG_TAG, "Finished Fetching Artists");
			}
			else {
				Logging.Log(LOG_TAG, "Error Fetching Artists");
			}
		}
	} catch(Exception e) {
		e.printStackTrace();
		Logging.Log(LOG_TAG, "Error Fetching Artists");
	} finally {
		// Release connections on every path, including failures.
		if (client != null) {
			client.getConnectionManager().shutdown();
		}
	}
	return true;

}
 
开发者ID:sedenardi,项目名称:vibevault,代码行数:64,代码来源:Searching.java

示例10: getCleanHtml

import org.htmlcleaner.CleanerProperties; //导入方法依赖的package包/类
/**
 * Cleans the HTML document of a resource and returns it serialized as XML.
 * <p>
 * The document is read from the resource's local file path when one is set;
 * otherwise it is fetched via {@code getOnlineResource} using the resource's
 * URL. The XML is emitted without an XML declaration, so the bytes are
 * encoded as UTF-8 (the encoding XML consumers assume when none is declared)
 * rather than in the platform default charset.
 *
 * @param pandaSettings settings whose resource map is queried for {@code resID}
 * @param resID unique ID of resource
 * @return clean XHTML document as UTF-8 encoded {@code byte[]}
 * @throws IOException if the document cannot be read or encoded
 */
private byte[] getCleanHtml(PandaSettings pandaSettings, String resID) throws IOException {
    // Get local path to file, if null the URL field will be used to
    // retrieve resource
    ResourceInfo resInfo = pandaSettings.getResourceMap().getMap().get(resID);
    String filePath = resInfo.getFilePath();

    // properties for HTML cleaning
    CleanerProperties props = new CleanerProperties();
    // preserve namespace prefixes
    props.setNamespacesAware(true);
    // remove <?TAGNAME....> or <!TAGNAME....>
    props.setIgnoreQuestAndExclam(true);
    // do not split attributes with multiple words
    props.setAllowMultiWordAttributes(true);
    // omit DTD
    props.setOmitDoctypeDeclaration(true);
    // omit xml declaration
    props.setOmitXmlDeclaration(true);
    // omit comments
    props.setOmitComments(true);
    // omit deprecated tags like <font...>
    props.setOmitDeprecatedTags(true);
    // treat script and style tag contents as CDATA
    props.setUseCdataForScriptAndStyle(true);
    // replace html character in form &#XXXX with real unicode characters
    props.setRecognizeUnicodeChars(true);
    // replace special entities with unicode character
    props.setTranslateSpecialEntities(true);
    // if true do not escape valid xml character sequences
    props.setAdvancedXmlEscape(true);

    // get HTML document, parse HTML
    HtmlCleaner htmlCleaner = new HtmlCleaner(props);
    TagNode tagNode;
    if (filePath != null) {
        tagNode = htmlCleaner.clean(new File(filePath));
    } else {
        // Get online resource; reuse the entry looked up above.
        URL resURL = resInfo.getURL();
        InputStream htmlDoc = getOnlineResource(resURL);
        try {
            tagNode = htmlCleaner.clean(htmlDoc);
        } finally {
            // Always release the stream, even if parsing fails.
            if (htmlDoc != null) {
                htmlDoc.close();
            }
        }
    }

    PrettyXmlSerializer pXmlS = new PrettyXmlSerializer(props);
    // Explicit charset: the no-arg getBytes() depends on the platform default.
    return pXmlS.getAsString(tagNode).getBytes("UTF-8");
}
 
开发者ID:chsatgithub,项目名称:PANDA-DEEPLINKING,代码行数:59,代码来源:DataHtmlResource.java


注:本文中的org.htmlcleaner.CleanerProperties.setRecognizeUnicodeChars方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。