This article collects typical usage examples of the Java method org.htmlcleaner.CleanerProperties.setOmitComments. If you are unsure how to use CleanerProperties.setOmitComments, what it does, or want to see it in context, the curated code samples below should help. You can also explore further usage examples of its declaring class, org.htmlcleaner.CleanerProperties.
The following shows 8 code examples of CleanerProperties.setOmitComments, sorted by popularity by default. You can vote for the examples you like or find useful; your feedback helps the site recommend better Java code samples.
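Before the project examples, here is a minimal, self-contained sketch of what setOmitComments(true) does. The class name OmitCommentsDemo, the sample HTML, and the expected output are illustrative only; an HtmlCleaner 2.x API is assumed.

import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.SimpleHtmlSerializer;
import org.htmlcleaner.TagNode;

public class OmitCommentsDemo {
    public static void main(String[] args) throws Exception {
        CleanerProperties props = new CleanerProperties();
        // drop <!-- ... --> comments from the cleaned tree
        props.setOmitComments(true);
        TagNode node = new HtmlCleaner(props).clean("<p>visible<!-- hidden note --></p>");
        // the serialized result contains the <p> element but no comment
        System.out.println(new SimpleHtmlSerializer(props).getAsString(node));
    }
}

HtmlCleaner keeps comments by default, so this flag is what removes them from the serialized output; everything else in the examples below is supporting configuration.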
Example 1: toXML
import org.htmlcleaner.CleanerProperties; // import required by the method below

/**
 * Converts an HTML string into an XML string using HtmlCleaner.
 * @param source the HTML input
 * @return the cleaned document as pretty-printed XML
 */
private String toXML(String source) {
    try {
        CleanerProperties props = new CleanerProperties();
        props.setTranslateSpecialEntities(true);
        props.setOmitComments(true);
        props.setPruneTags("script,style");
        // ignore namespaces
        props.setNamespacesAware(false);
        props.setAdvancedXmlEscape(true);
        HtmlCleaner cl = new HtmlCleaner(props);
        TagNode tagNode = cl.clean(source);
        source = new PrettyXmlSerializer(props).getXmlAsString(tagNode);
    } catch (IOException e) {
        logger.error("", e);
    }
    return source;
}
Example 2: htmlOutputStreamViaHtmlCleaner
import org.htmlcleaner.CleanerProperties; // import required by the method below

/**
 * Writes a cleaned, pretty-printed HTML copy of an hOCR file via HtmlCleaner.
 *
 * @param pathOfHOCRFile String path of the input hOCR file
 * @param outputFilePath String path of the cleaned output file
 * @throws IOException if the input file cannot be read
 */
public static void htmlOutputStreamViaHtmlCleaner(String pathOfHOCRFile, String outputFilePath) throws IOException {
    CleanerProperties cleanerProps = new CleanerProperties();
    // set some properties to non-default values
    cleanerProps.setTransResCharsToNCR(true);
    cleanerProps.setTranslateSpecialEntities(true);
    cleanerProps.setOmitComments(true);
    cleanerProps.setOmitDoctypeDeclaration(true);
    cleanerProps.setOmitXmlDeclaration(false);
    HtmlCleaner cleaner = new HtmlCleaner(cleanerProps);
    FileInputStream hOCRFileInputStream = new FileInputStream(pathOfHOCRFile);
    TagNode tagNode = cleaner.clean(hOCRFileInputStream, UTF_ENCODING);
    hOCRFileInputStream.close();
    try {
        new PrettyHtmlSerializer(cleanerProps).writeToFile(tagNode, outputFilePath, UTF_ENCODING);
    } catch (Exception e) { // NOPMD.
        // serialization failures are deliberately swallowed here
    }
}
Example 3: getTextFromHtmlString
import org.htmlcleaner.CleanerProperties; // import required by the method below

/**
 * Extracts the text of the last HTML table from an HTML string.
 * @param htmlString {@link String}
 * @return {@link String} the concatenated text of the table's td cells
 */
public static String getTextFromHtmlString(String htmlString) {
    String errorText = "";
    CleanerProperties cleanerProps = new CleanerProperties();
    // set some properties to non-default values
    cleanerProps.setTransResCharsToNCR(true);
    cleanerProps.setTranslateSpecialEntities(true);
    cleanerProps.setOmitComments(true);
    cleanerProps.setOmitDoctypeDeclaration(true);
    cleanerProps.setOmitXmlDeclaration(true);
    cleanerProps.setUseEmptyElementTags(true);
    HtmlCleaner cleaner = new HtmlCleaner(cleanerProps);
    TagNode tagNode = cleaner.clean(htmlString);
    Object[] rootNode = null;
    try {
        rootNode = tagNode.evaluateXPath("//table");
        if (null != rootNode && rootNode.length > 0) {
            // walk every <td> of the last table in the document
            TagNode[] textNode = ((TagNode) rootNode[rootNode.length - 1]).getElementsByName("td", true);
            for (TagNode tag : textNode) {
                if (tag != null && tag.getText() != null) {
                    StringBuilder errorTextString = new StringBuilder();
                    errorTextString.append(errorText);
                    // cells that contain only a non-breaking space entity count as a single blank
                    if (tag.getText().toString().trim().equals("&nbsp;")) {
                        errorTextString.append(" ");
                    } else {
                        errorTextString.append(tag.getText());
                    }
                    errorText = errorTextString.toString();
                }
            }
        }
    } catch (XPatherException e) {
        LOGGER.error("Error extracting table node from html." + e.getMessage());
    }
    return errorText;
}
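The same evaluateXPath / getElementsByName pattern in a minimal, self-contained form. The input HTML, the class name TableTextDemo, and the printed result are illustrative, not taken from the project above.

import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;

public class TableTextDemo {
    public static void main(String[] args) throws Exception {
        String html = "<html><body><table><tr>"
                + "<td>File not found.</td><td>Check the batch id.</td>"
                + "</tr></table></body></html>";
        TagNode root = new HtmlCleaner(new CleanerProperties()).clean(html);
        // select every <table>, then read the <td> cells of the last one
        Object[] tables = root.evaluateXPath("//table");
        StringBuilder text = new StringBuilder();
        if (tables.length > 0) {
            for (TagNode td : ((TagNode) tables[tables.length - 1]).getElementsByName("td", true)) {
                text.append(td.getText());
            }
        }
        System.out.println(text); // File not found.Check the batch id.
    }
}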
Example 4: html2xhtml
import org.htmlcleaner.CleanerProperties; // import required by the method below

private String html2xhtml(String html) {
    if (StringUtils.isEmpty(html)) {
        return "";
    }
    // guarantee a single root tag for HtmlCleaner
    html = "<div>" + html + "</div>";
    CleanerProperties props = new CleanerProperties();
    // set some properties to non-default values
    props.setTranslateSpecialEntities(true);
    props.setTransResCharsToNCR(true);
    props.setOmitXmlDeclaration(true);
    props.setOmitHtmlEnvelope(true);
    props.setOmitComments(true);
    // do the parsing
    TagNode tagNode = new HtmlCleaner(props).clean(html);
    // serialize to an XML string
    String ret;
    try {
        ret = new SimpleXmlSerializer(props).getAsString(tagNode);
    } catch (IOException e) {
        throw new RuntimeException(e.getMessage(), e);
    }
    // remove the <div> root tag added above
    ret = ret.substring(5, ret.length() - 6);
    return ret;
}
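Why substring(5, ret.length() - 6) works: the serializer returns the temporary wrapper element verbatim, so cutting 5 leading and 6 trailing characters removes exactly "<div>" and "</div>". Below is a minimal, self-contained sketch of the same wrap-and-strip trick; the class name, the sample fragment, and the commented output are illustrative only.

import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.SimpleXmlSerializer;
import org.htmlcleaner.TagNode;

public class Html2XhtmlDemo {
    public static void main(String[] args) throws Exception {
        // a fragment with no single root element and an unclosed tag
        String fragment = "First<br>Second <b>part";
        CleanerProperties props = new CleanerProperties();
        props.setOmitComments(true);
        props.setOmitXmlDeclaration(true);
        props.setOmitHtmlEnvelope(true);
        // wrap the fragment so HtmlCleaner sees exactly one root element
        TagNode tagNode = new HtmlCleaner(props).clean("<div>" + fragment + "</div>");
        String xml = new SimpleXmlSerializer(props).getAsString(tagNode);
        // drop the leading "<div>" (5 chars) and the trailing "</div>" (6 chars)
        System.out.println(xml.substring(5, xml.length() - 6)); // e.g. First<br />Second <b>part</b>
    }
}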
Example 5: updateArtists
import org.htmlcleaner.CleanerProperties; // import required by the method below

public static Boolean updateArtists(StaticDataStore db) {
    Logging.Log(LOG_TAG, "Fetching Artists");
    ArrayList<ArrayList<String>> artists = new ArrayList<ArrayList<String>>();
    HtmlCleaner pageParser = new HtmlCleaner();
    CleanerProperties props = pageParser.getProperties();
    props.setAllowHtmlInsideAttributes(true);
    props.setAllowMultiWordAttributes(true);
    props.setRecognizeUnicodeChars(true);
    props.setOmitComments(true);
    try {
        String url = "http://www.archive.org/browse.php?field=/metadata/bandWithMP3s&collection=etree";
        HttpParams params = new BasicHttpParams();
        int timeout = (int) (15 * DateUtils.SECOND_IN_MILLIS);
        HttpConnectionParams.setConnectionTimeout(params, timeout);
        HttpConnectionParams.setSoTimeout(params, timeout);
        HttpClient client = new DefaultHttpClient(params);
        HttpGet request = new HttpGet(url);
        HttpResponse response = client.execute(request);
        StatusLine status = response.getStatusLine();
        if (status.getStatusCode() == HttpStatus.SC_OK) {
            ResponseHandler<String> responseHandler = new BasicResponseHandler();
            TagNode node = pageParser.clean(responseHandler.handleResponse(response));
            client.getConnectionManager().shutdown();
            org.w3c.dom.Document doc = new DomSerializer(new CleanerProperties()).createDOM(node);
            XPath xpath = XPathFactory.newInstance().newXPath();
            NodeList artistNodes = (NodeList) xpath.evaluate("//div[@class='row']//div[@class='col-sm-4']/a", doc, XPathConstants.NODESET);
            NodeList numberNodes = (NodeList) xpath.evaluate("//div[@class='row']//div[@class='col-sm-4']/text()[preceding-sibling::a]", doc, XPathConstants.NODESET);
            Logging.Log(LOG_TAG, "artistNodes: " + artistNodes.getLength());
            Logging.Log(LOG_TAG, "numberNodes: " + numberNodes.getLength());
            if (artistNodes.getLength() == numberNodes.getLength()) {
                for (int i = 0; i < artistNodes.getLength(); i++) {
                    ArrayList<String> artistPair = new ArrayList<String>();
                    // decode HTML entities that remain in the scraped artist name
                    artistPair.add(artistNodes.item(i).getTextContent()
                            .replace("&#39;", "'")
                            .replace("&gt;", ">")
                            .replace("&lt;", "<")
                            .replace("&quot;", "\"")
                            .replace("&amp;", "&"));
                    artistPair.add(numberNodes.item(i).getTextContent());
                    artists.add(artistPair);
                }
            }
            if (artists.size() > 0) {
                db.insertArtistBulk(artists);
                String s = DateFormat.format("yyyy-MM-dd", new GregorianCalendar().getTime()).toString();
                db.updatePref("artistUpdate", s);
                Logging.Log(LOG_TAG, "Finished Fetching Artists");
            } else {
                Logging.Log(LOG_TAG, "Error Fetching Artists");
            }
        } else {
            client.getConnectionManager().shutdown();
        }
    } catch (Exception e) {
        e.printStackTrace();
        Logging.Log(LOG_TAG, "Error Fetching Artists");
    }
    return true;
}
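The DomSerializer step above is what makes the standard javax.xml.xpath API usable on HtmlCleaner's output. Here is a minimal, self-contained sketch of that bridge; the sample HTML, the class name, and the printed result are illustrative and not taken from the project above.

import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathFactory;

import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.DomSerializer;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.w3c.dom.Document;
import org.w3c.dom.NodeList;

public class DomXPathDemo {
    public static void main(String[] args) throws Exception {
        String html = "<div class='row'><div class='col-sm-4'><a href='#'>Artist A</a> 12</div></div>";
        HtmlCleaner cleaner = new HtmlCleaner();
        cleaner.getProperties().setOmitComments(true);
        TagNode node = cleaner.clean(html);
        // bridge HtmlCleaner's TagNode into a standard W3C DOM so javax.xml.xpath can run on it
        Document doc = new DomSerializer(new CleanerProperties()).createDOM(node);
        XPath xpath = XPathFactory.newInstance().newXPath();
        NodeList links = (NodeList) xpath.evaluate("//div[@class='col-sm-4']/a", doc, XPathConstants.NODESET);
        System.out.println(links.item(0).getTextContent()); // Artist A
    }
}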
Example 6: downloadResearchesPages
import org.htmlcleaner.CleanerProperties; // import required by the method below

public static void downloadResearchesPages(String destDir,
                                            String sInstitutionName,
                                            TreeMap<String, TreeMap<String, List<String>>> treeInstitution) {
    try {
        CleanerProperties props = new CleanerProperties();
        // set some properties to non-default values
        //props.setTranslateSpecialEntities(true);
        //props.setTransResCharsToNCR(true);
        props.setOmitComments(true);
        props.setOmitXmlDeclaration(true);
        props.setAdvancedXmlEscape(true);
        props.setNamespacesAware(false);
        props.setOmitDoctypeDeclaration(true);
        String sUnitOfAssessment_Description = "";
        File dirI = new File(destDir + System.getProperty("file.separator")
                + sInstitutionName.replaceAll("[^a-z^A-Z]", "") + System.getProperty("file.separator"));
        if (!dirI.mkdir()) {
            throw new Exception("Can't create " + dirI.getPath());
        }
        for (String keyAssessment_Description : treeInstitution.keySet()) {
            sUnitOfAssessment_Description = keyAssessment_Description;
            if (sUnitOfAssessment_Description.length() > 20) {
                sUnitOfAssessment_Description = sUnitOfAssessment_Description.substring(0, 20);
            }
            File dirUAD = new File(dirI.getPath() + System.getProperty("file.separator")
                    + sUnitOfAssessment_Description.replaceAll("[^a-z^A-Z]", "") + System.getProperty("file.separator"));
            if (!dirUAD.mkdir()) {
                throw new Exception("Can't create " + dirUAD.getPath());
            }
            TreeMap<String, List<String>> treeResearchers = treeInstitution.get(keyAssessment_Description);
            for (String keyResearcher : treeResearchers.keySet()) {
                String sAux = keyResearcher;
                File dirR = new File(dirUAD.getPath() + System.getProperty("file.separator") + sAux + System.getProperty("file.separator"));
                if (!dirR.exists()) {
                    if (!dirR.mkdir()) {
                        throw new Exception("Can't create " + dirR.getPath());
                    }
                } else {
                    LOG.info("Repeated: " + sAux);
                    break;
                }
                List<String> lstResearcherWebAddress = treeResearchers.get(keyResearcher);
                List<String> lstLocalResearcherWebAddress = new ArrayList<String>();
                for (String url : lstResearcherWebAddress) {
                    String ext = XMLTags.RESEARCHER_WEB_ADDRESS_ATTR_EXT_VALUE_DEFAULT_HTML;
                    String type = XMLTags.RESEARCHER_WEB_ADDRESS_ATTR_TYPE_VALUE_DEFAULT_CV;
                    String fileDownloaded = ResearchersPagePostProcessor.downloadAndClean(dirR.getAbsolutePath(), type, url, ext, true, true);
                    if (!fileDownloaded.isEmpty()) {
                        lstLocalResearcherWebAddress.add(fileDownloaded);
                    }
                }
                lstResearcherWebAddress.clear();
                lstResearcherWebAddress.addAll(lstLocalResearcherWebAddress);
            }
        }
    } catch (Exception ex) {
        LOG.log(Level.SEVERE, "ERROR: " + ex.getMessage());
    }
}
Developer: eduardoguzman, project: sisob-data-extractor, source file: DownloaderResearchersWebPagesTreeFormat.java
Example 7: getCleanHtml
import org.htmlcleaner.CleanerProperties; // import required by the method below

/**
 * Cleans an HTML document and returns the resulting XML as a byte array.
 *
 * @param pandaSettings settings holding the resource map
 * @param resID unique ID of the resource
 * @return clean XHTML document as {@code byte[]}
 * @throws IOException if the resource cannot be read
 */
private byte[] getCleanHtml(PandaSettings pandaSettings, String resID) throws IOException {
    byte[] doc = null;
    // Get the local path to the file; if it is null, the URL field is used to retrieve the resource
    ResourceInfo resInfo = pandaSettings.getResourceMap().getMap().get(resID);
    String filePath = resInfo.getFilePath();
    // properties for HTML cleaning
    CleanerProperties props = new CleanerProperties();
    // preserve namespace prefixes
    props.setNamespacesAware(true);
    // remove <?TAGNAME....> or <!TAGNAME....>
    props.setIgnoreQuestAndExclam(true);
    // do not split attributes with multiple words
    props.setAllowMultiWordAttributes(true);
    // omit the <html> envelope (left disabled here)
    // props.setOmitHtmlEnvelope(true);
    // omit the DTD
    props.setOmitDoctypeDeclaration(true);
    // omit the XML declaration
    props.setOmitXmlDeclaration(true);
    // omit comments
    props.setOmitComments(true);
    // omit deprecated tags like <font...>
    props.setOmitDeprecatedTags(true);
    // treat script and style tag contents as CDATA
    props.setUseCdataForScriptAndStyle(true);
    // replace HTML characters of the form &#XXXX with real Unicode characters
    props.setRecognizeUnicodeChars(true);
    // replace special entities with Unicode characters
    props.setTranslateSpecialEntities(true);
    // if true, do not escape valid XML character sequences
    props.setAdvancedXmlEscape(true);
    // get the HTML document and parse it
    TagNode tagNode = null;
    if (filePath != null) {
        tagNode = new HtmlCleaner(props).clean(new File(filePath));
    } else {
        // get the online resource
        URL resURL = pandaSettings.getResourceMap().getMap().get(resID).getURL();
        InputStream htmlDoc = getOnlineResource(resURL);
        tagNode = new HtmlCleaner(props).clean(htmlDoc);
    }
    PrettyXmlSerializer pXmlS = new PrettyXmlSerializer(props);
    doc = pXmlS.getAsString(tagNode).getBytes();
    return doc;
}
Example 8: cleanHTML
import org.htmlcleaner.CleanerProperties; // import required by the method below

public void cleanHTML(String path, String out, String encoding) throws IOException {
    HtmlCleaner cleaner = new HtmlCleaner();
    CleanerProperties props = cleaner.getProperties();
    // global transformations that drop every "class" and "id" attribute
    CleanerTransformations transformations = new CleanerTransformations();
    AttributeTransformationPatternImpl attPattern = new AttributeTransformationPatternImpl(
            Pattern.compile("^\\s*class", Pattern.CASE_INSENSITIVE), null, null);
    transformations.addGlobalTransformation(attPattern);
    AttributeTransformationPatternImpl attPattern2 = new AttributeTransformationPatternImpl(
            Pattern.compile("^\\s*id", Pattern.CASE_INSENSITIVE), null, null);
    transformations.addGlobalTransformation(attPattern2);
    props.setCleanerTransformations(transformations);
    // set some properties to non-default values
    props.setTranslateSpecialEntities(true);
    props.setTransResCharsToNCR(false);
    props.setOmitComments(true);
    props.setPruneTags("script,style,img,form");
    // do the parsing
    TagNode tagNode = new HtmlCleaner(props).clean(new File(path), encoding);
    tagNode.removeAttribute("class");
    // serialize to an HTML file
    new PrettyHtmlSerializer(props).writeToFile(tagNode, out, "utf-8");
}