本文整理汇总了Java中org.htmlcleaner.HtmlCleaner.clean方法的典型用法代码示例。如果您正苦于以下问题:Java HtmlCleaner.clean方法的具体用法?Java HtmlCleaner.clean怎么用?Java HtmlCleaner.clean使用的例子?那么, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类org.htmlcleaner.HtmlCleaner
的用法示例。
在下文中一共展示了HtmlCleaner.clean方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: toXML
import org.htmlcleaner.HtmlCleaner; //导入方法依赖的package包/类
/**
 * Converts an HTML string into a well-formed, pretty-printed XML string
 * using HtmlCleaner. (Original Korean doc: "htmlcleaner로 html string을
 * xml string으로 바꿔주는 메소드".)
 *
 * @param source raw HTML markup
 * @return the pretty-printed XML; if serialization fails the original
 *         input is returned unchanged (best-effort behavior)
 */
private String toXML(String source) {
    try {
        CleanerProperties props = new CleanerProperties();
        props.setTranslateSpecialEntities(true);
        props.setOmitComments(true);
        props.setPruneTags("script,style");
        // Ignore namespaces while cleaning.
        props.setNamespacesAware(false);
        props.setAdvancedXmlEscape(true);
        // NOTE: the original called setTranslateSpecialEntities(true) a second
        // time here; the redundant call has been removed.
        HtmlCleaner cl = new HtmlCleaner(props);
        TagNode tagNode = cl.clean(source);
        source = new PrettyXmlSerializer(props).getXmlAsString(tagNode);
    } catch (IOException e) {
        logger.error("", e);
    }
    return source;
}
示例2: processFollow
import org.htmlcleaner.HtmlCleaner; //导入方法依赖的package包/类
/**
 * Parses a "follow" page (following / followers) and extracts user URLs,
 * walking every paginated page after the first.
 *
 * @param followUrl URL of the first follow page
 */
public static void processFollow(String followUrl) {
    String content = PageUtil.getContent(followUrl);
    HtmlCleaner htmlCleaner = new HtmlCleaner();
    TagNode tNode = htmlCleaner.clean(content);
    extractUserUrl(content);
    try {
        Object[] pageNumObj = tNode
                .evaluateXPath("//*[@id=\"Profile-following\"]//div[@class=\"Pagination\"]/button");
        // The second-to-last pagination button carries the total page count,
        // so at least two buttons are required. The original checked
        // length > 0, which allowed an index of -1 when exactly one button
        // matched (ArrayIndexOutOfBoundsException).
        if (pageNumObj != null && pageNumObj.length > 1) {
            TagNode node = (TagNode) pageNumObj[pageNumObj.length - 2];
            int pagenum = Integer.parseInt(node.getText().toString());
            // Page 1 was already processed above; fetch pages 2..pagenum.
            for (int i = 2; i <= pagenum; i++) {
                String url = followUrl + "?page=" + i;
                content = PageUtil.getContent(url);
                extractUserUrl(content);
            }
        }
    } catch (XPatherException e) {
        // Pass the exception itself so the stack trace is preserved
        // (the original logged only the message).
        logger.error(e.getMessage(), e);
    }
}
示例3: htmlOutputStreamViaHtmlCleaner
import org.htmlcleaner.HtmlCleaner; //导入方法依赖的package包/类
/**
 * Cleans an HOCR HTML file and writes the pretty-printed result to the
 * given output path.
 *
 * @param pathOfHOCRFile path of the HOCR input file
 * @param outputFilePath path the cleaned HTML is written to
 * @throws IOException if the input file cannot be opened or read
 */
public static void htmlOutputStreamViaHtmlCleaner(String pathOfHOCRFile, String outputFilePath) throws IOException {
    CleanerProperties cleanerProps = new CleanerProperties();
    // Set some properties to non-default values.
    cleanerProps.setTransResCharsToNCR(true);
    cleanerProps.setTranslateSpecialEntities(true);
    cleanerProps.setOmitComments(true);
    cleanerProps.setOmitDoctypeDeclaration(true);
    cleanerProps.setOmitXmlDeclaration(false);
    HtmlCleaner cleaner = new HtmlCleaner(cleanerProps);
    TagNode tagNode;
    // try-with-resources guarantees the stream is closed even when clean()
    // throws; the original closed it manually and leaked it on failure.
    // (The explicit null check on a constructor result was dead code and
    // has been dropped.)
    try (FileInputStream hOCRFileInputStream = new FileInputStream(pathOfHOCRFile)) {
        tagNode = cleaner.clean(hOCRFileInputStream, UTF_ENCODING);
    }
    try {
        new PrettyHtmlSerializer(cleanerProps).writeToFile(tagNode, outputFilePath, UTF_ENCODING);
    } catch (Exception e) { // NOPMD - best-effort write; deliberately swallowed in the original.
    }
}
示例4: JoinedBefore
import org.htmlcleaner.HtmlCleaner; //导入方法依赖的package包/类
/**
 * Checks whether the given Reddit user's account was created before the
 * supplied date, by scraping the user's profile page and reading the
 * "datetime" attribute of the first &lt;time&gt; element inside the "age" node.
 *
 * NOTE(review): Calendar.Builder.setDate treats the month as 0-based
 * (January == 0) — confirm callers pass the month in that convention.
 *
 * @param mp    the player whose Reddit username is looked up
 * @param year  cutoff year
 * @param month cutoff month (Calendar convention, see note above)
 * @param day   cutoff day of month
 * @return true if the join date is strictly before the cutoff date (UTC)
 * @throws Exception on network failure, unexpected page structure, or parse errors
 */
public static boolean JoinedBefore(ChatPlayer mp, int year, int month, int day) throws Exception {
    URL url = new URL("https://www.reddit.com/u/" + mp.UserName());
    URLConnection con = url.openConnection();
    con.setRequestProperty("User-Agent", "TheButtonAutoFlair");
    TagNode node;
    // Close the response stream deterministically; the original never closed it.
    try (InputStream in = con.getInputStream()) {
        HtmlCleaner cleaner = new HtmlCleaner();
        node = cleaner.clean(in);
    }
    node = node.getElementsByAttValue("class", "age", true, true)[0];
    node = node.getElementsByName("time", false)[0];
    String joindate = node.getAttributeByName("datetime");
    SimpleDateFormat parserSDF = new SimpleDateFormat("yyyy-MM-dd");
    // Keep only the date portion of the ISO-8601 timestamp.
    joindate = joindate.split("T")[0];
    Date date = parserSDF.parse(joindate);
    return date.before(new Calendar.Builder().setTimeZone(TimeZone.getTimeZone("UTC")).setDate(year, month, day)
            .build().getTime());
}
示例5: getHTML
import org.htmlcleaner.HtmlCleaner; //导入方法依赖的package包/类
/**
 * Renders the workbook as well-formed XML/XHTML bytes: converts the sheet
 * to HTML at A4 page size, then runs it through HtmlCleaner and a pretty
 * XML serializer.
 *
 * @param book the workbook to render
 * @return the pretty-printed XML as bytes
 * @throws IOException if serialization fails
 */
public byte[] getHTML(HSSFWorkbook book) throws IOException {
    // Default page size: A4 portrait, 21.0 x 29.7 cm.
    double width = 21.0;
    double height = 29.7;
    if (isLandscape()) {
        // Plain temp-variable swap. The original used the add/subtract swap
        // trick, which is not exact for doubles (21.0 + 29.7 - 29.7 != 21.0).
        double tmp = width;
        width = height;
        height = tmp;
    }
    // Encode explicitly as UTF-8 to match the charset passed to clean()
    // below; the original used the platform default charset here.
    byte[] html = convert(book, width, height).getBytes("UTF-8");
    ByteArrayInputStream in = new ByteArrayInputStream(html);
    // Clean up the HTML to be well formed.
    HtmlCleaner cleaner = new HtmlCleaner();
    CleanerProperties props = cleaner.getProperties();
    TagNode node = cleaner.clean(in, "UTF-8");
    // Serialize into an in-memory buffer instead of System.out.
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    new PrettyXmlSerializer(props).writeToStream(node, out);
    return out.toByteArray();
}
示例6: createDocument
import org.htmlcleaner.HtmlCleaner; //导入方法依赖的package包/类
/**
 * Convenience method (for xml/xhtml): creates a <code>Document</code> from
 * the specified URL.
 *
 * <p>NOTE(review): the result of {@code lCleaner.clean(inUrl)} is discarded,
 * and the DOM is then parsed from a second, raw fetch of the same URL — the
 * resource is downloaded twice and the cleaned markup is never used. Confirm
 * whether the clean() call can be removed or whether its output was meant to
 * be what gets parsed.</p>
 *
 * @param inUrl
 * {@link URL} location of the xml/xhtml resource
 * @return {@link Document} the parsed XML document, may be
 * <code>null</code>
 * @throws ParserConfigurationException if no DocumentBuilder can be created
 * @throws IOException on connection or read failure
 * @throws SAXException if the fetched content is not well-formed XML
 */
public static Document createDocument(final URL inUrl)
throws ParserConfigurationException, IOException, SAXException {
final HtmlCleaner lCleaner = new HtmlCleaner();
// Result intentionally unused in the original; see NOTE(review) above.
lCleaner.clean(inUrl);
final DocumentBuilder lBuilder = DocumentBuilderFactory.newInstance()
.newDocumentBuilder();
// Second fetch of the same URL; the stream is closed by try-with-resources.
final URLConnection lConnection = inUrl.openConnection();
Document outDocument = null;
try (BufferedInputStream lStream = new BufferedInputStream(
lConnection.getInputStream());) {
outDocument = lBuilder.parse(lStream);
}
return outDocument;
}
示例7: getTextFromHtmlString
import org.htmlcleaner.HtmlCleaner; //导入方法依赖的package包/类
/**
 * This method extracts the text from an html string: it locates the last
 * {@code <table>} element and concatenates the text of every {@code <td>}
 * cell inside it.
 *
 * <p>NOTE(review): the {@code trim().equals(" ")} comparison can never be
 * true for an ASCII space (trim strips it); the literal was most likely
 * {@code "&nbsp;"} (or a non-breaking-space character) before this listing
 * was entity-decoded — verify against the original project source. The
 * literal is reproduced here unchanged.</p>
 *
 * @param htmlString {@link String} html markup to extract text from
 * @return {@link String} the concatenated cell text, "" if no table found
 */
public static String getTextFromHtmlString(String htmlString) {
    CleanerProperties cleanerProps = new CleanerProperties();
    // set some properties to non-default values
    cleanerProps.setTransResCharsToNCR(true);
    cleanerProps.setTranslateSpecialEntities(true);
    cleanerProps.setOmitComments(true);
    cleanerProps.setOmitDoctypeDeclaration(true);
    cleanerProps.setOmitXmlDeclaration(true);
    cleanerProps.setUseEmptyElementTags(true);
    HtmlCleaner cleaner = new HtmlCleaner(cleanerProps);
    TagNode tagNode = cleaner.clean(htmlString);
    // One shared builder. The original created a new StringBuilder per cell
    // and copied the accumulated string into it every iteration, which is
    // O(n^2) in the number of cells.
    StringBuilder errorText = new StringBuilder();
    try {
        Object[] rootNode = tagNode.evaluateXPath("//table");
        if (null != rootNode && rootNode.length > 0) {
            // Only the last matched table is inspected.
            TagNode[] textNode = ((TagNode) rootNode[rootNode.length - 1]).getElementsByName("td", true);
            for (TagNode tag : textNode) {
                if (tag != null && tag.getText() != null) {
                    if (tag.getText().toString().trim().equals(" ")) {
                        errorText.append(" ");
                    } else {
                        errorText.append(tag.getText());
                    }
                }
            }
        }
    } catch (XPatherException e) {
        LOGGER.error("Error extracting table node from html." + e.getMessage());
    }
    return errorText.toString();
}
示例8: htmlToWiki
import org.htmlcleaner.HtmlCleaner; //导入方法依赖的package包/类
/**
 * Converts an HTML fragment to wiki markup: cleans the HTML into a DOM and
 * walks its child nodes, emitting the wiki equivalent.
 *
 * <p>NOTE(review): several string literals in this method render as identical
 * pairs here (e.g. {@code replace(" ", " ")} and {@code replace("'", "'")});
 * they were most likely HTML entities such as {@code &nbsp;} / {@code &#39;}
 * before this listing was entity-decoded. Verify against the original project
 * source before reusing this code.</p>
 *
 * @param html        HTML fragment to convert
 * @param contextPath application context path, forwarded to node processing
 * @param projectId   id of the project the content belongs to
 * @return the wiki markup terminated with CRLF, or "" when nothing was produced
 * @throws Exception if DOM conversion or node processing fails
 */
public static String htmlToWiki(String html, String contextPath, int projectId) throws Exception {
// Strip the nbsp because it gets converted to unicode
html = StringUtils.replace(html, " ", " ");
// Take the html create DOM for parsing
HtmlCleaner cleaner = new HtmlCleaner();
CleanerProperties props = cleaner.getProperties();
TagNode node = cleaner.clean(html);
Document document = new DomSerializer(props, true).createDOM(node);
if (LOG.isTraceEnabled()) {
LOG.trace(html);
}
// Process each node and output the wiki equivalent
StringBuffer sb = new StringBuffer();
ArrayList<Node> nodeList = new ArrayList<Node>();
for (int i = 0; i < document.getChildNodes().getLength(); i++) {
Node n = document.getChildNodes().item(i);
nodeList.add(n);
}
processChildNodes(nodeList, sb, 0, true, true, false, "", contextPath, projectId);
if (sb.length() > 0) {
String content = sb.toString().trim();
if (content.contains("'")) {
// Determine if this is where the ' is being introduced
content = StringUtils.replace(content, "'", "'");
}
// Ensure the result is CRLF-terminated.
if (!content.endsWith(CRLF)) {
return content + CRLF;
} else {
return content;
}
} else {
return "";
}
}
示例9: parseHhc
import org.htmlcleaner.HtmlCleaner; //导入方法依赖的package包/类
/**
 * Parses a CHM .hhc table-of-contents stream: cleans the HTML into a DOM,
 * locates the top-level {@code body/ul} list, and converts it into TOC
 * references.
 *
 * @param hhcFile   stream of the .hhc file
 * @param resources book resources used to resolve TOC targets
 * @return the parsed table-of-contents sections
 */
public static List<TOCReference> parseHhc(InputStream hhcFile, Resources resources) throws IOException, ParserConfigurationException, XPathExpressionException {
    HtmlCleaner cleaner = new HtmlCleaner();
    CleanerProperties cleanerProps = cleaner.getProperties();
    TagNode cleanedRoot = cleaner.clean(hhcFile);
    Document dom = new DomSerializer(cleanerProps).createDOM(cleanedRoot);
    XPath pathEvaluator = XPathFactory.newInstance().newXPath();
    Node topLevelUl = (Node) pathEvaluator.evaluate(
            "body/ul", dom.getDocumentElement(), XPathConstants.NODE);
    return processUlNode(topLevelUl, resources);
}
示例10: HtmlXpathSelector
import org.htmlcleaner.HtmlCleaner; //导入方法依赖的package包/类
/**
 * Builds an XPath-queryable DOM from raw HTML content: the markup is first
 * normalized by HtmlCleaner, then serialized into a W3C Document.
 *
 * @param content HTML markup to parse
 */
public HtmlXpathSelector(String content) throws ParserConfigurationException, SAXException, IOException
{
    TagNode cleanedRoot = new HtmlCleaner().clean(content);
    rootDocument = new DomSerializer(new CleanerProperties()).createDOM(cleanedRoot);
    xPath = XPathFactory.newInstance().newXPath();
}
示例11: toXhtml
import org.htmlcleaner.HtmlCleaner; //导入方法依赖的package包/类
/**
 * Converts an HTML document string to XHTML. Empty input yields null.
 * Input that, after trimming and skipping any DOCTYPE declaration, does not
 * start with an &lt;html&gt; root element is returned unchanged.
 *
 * @param htmlString the HTML document text, may be null or empty
 * @return the XHTML string, the unchanged input, or null for empty input
 */
public static String toXhtml(String htmlString) {
    // Guard clause: nothing to convert.
    if (!StringUtils.isNotEmpty(htmlString)) {
        return null;
    }
    String result = XmlUtils.skipDocTypeDeclaration(htmlString.trim());
    boolean hasHtmlRoot = result.startsWith("<html>") || result.startsWith("<html ");
    if (hasHtmlRoot) {
        CleanerProperties props = new CleanerProperties();
        TagNode root = new HtmlCleaner(props).clean(result);
        result = new SimpleXmlSerializer(props).getXmlAsString(root);
    }
    return result;
}
示例12: getStandardCredit
import org.htmlcleaner.HtmlCleaner; //导入方法依赖的package包/类
/**
 * Fetches the graduation credit standard table for the given academic year
 * and matriculation index, and returns the eight credit columns of the row
 * matching the given department.
 *
 * @param year       academic year sent to the server
 * @param index      index into the matrics list identifying the matriculation type
 * @param department department name to match (whitespace-insensitively)
 * @return the eight credit values of the matching row
 * @throws Exception if the page cannot be fetched/parsed or no row matches;
 *                   the underlying cause is chained
 */
public static ArrayList<String> getStandardCredit(String year, int index,
        String department) throws Exception {
    try {
        ArrayList<String> standard = new ArrayList<>();
        HashMap<String, String> params = new HashMap<>();
        params.put("format", "-3");
        params.put("year", year);
        params.put("matric", matrics.get(index));
        String result = Connector
                .getDataByPost(getStandardUri(lang), params, "big5");
        // The source page omits closing tags; patch them in so the cleaner
        // can build a usable tree.
        result = result.replace("<td", "</td><td");
        result = result.replace("<tr>", "</td><tr>");
        HtmlCleaner cleaner = new HtmlCleaner();
        CleanerProperties props = cleaner.getProperties();
        props.setUseCdataForScriptAndStyle(true);
        props.setRecognizeUnicodeChars(true);
        props.setUseEmptyElementTags(true);
        props.setAdvancedXmlEscape(true);
        props.setTranslateSpecialEntities(true);
        props.setBooleanAttributeValues("empty");
        result = new PrettyHtmlSerializer(props).getAsString(result);
        TagNode tagNode = cleaner.clean(result);
        // The credit table is the one rendered with border="1".
        TagNode[] tables = tagNode.getElementsByAttValue("border", "1",
                true, false);
        TagNode[] rows = tables[0].getElementsByName("tr", true);
        // Skip the header row; compare department names ignoring whitespace.
        for (int i = 1; i < rows.length; i++) {
            TagNode[] cols = rows[i].getElementsByName("td", true);
            String temp = cols[0].getText().toString();
            if (temp.replace(" ", "").replace("\n", "").contains(department.replace(" ", "").replace("\n", ""))) {
                // Columns 1..8 hold the credit values.
                for (int j = 1; j < 9; j++) {
                    String credit = Utility.cleanString(cols[j].getText()
                            .toString());
                    standard.add(credit);
                }
                return standard;
            }
        }
        // No matching department row — funneled into the catch below.
        throw new Exception();
    } catch (Exception e) {
        e.printStackTrace();
        // Chain the original exception so the root cause is not lost
        // (the original rethrew without it).
        throw new Exception("畢業學分標準讀取時發生錯誤", e);
    }
}
示例13: stripSignatureForHtmlMessage
import org.htmlcleaner.HtmlCleaner; //导入方法依赖的package包/类
/**
 * Strips a "-- " dash signature from an HTML message while ignoring
 * signatures that occur inside quoted {@code <blockquote>} sections, then
 * cleans the remaining HTML (repairing any closing tags cut off by the
 * strip) with HtmlCleaner.
 *
 * <p>Strategy: collect the start offsets of every blockquote open/close tag;
 * if the counts are unbalanced, refuse to strip; otherwise search for the
 * signature marker only in the regions before the first blockquote, between
 * consecutive blockquotes, and after the last one.</p>
 *
 * @param content the HTML message body
 * @return the message with the signature removed (if found outside
 *         blockquotes) and the HTML normalized
 */
public static String stripSignatureForHtmlMessage(String content) {
Matcher dashSignatureHtml = DASH_SIGNATURE_HTML.matcher(content);
if (dashSignatureHtml.find()) {
Matcher blockquoteStart = BLOCKQUOTE_START.matcher(content);
Matcher blockquoteEnd = BLOCKQUOTE_END.matcher(content);
// Start offsets of every <blockquote> and </blockquote> tag, in order.
List<Integer> start = new ArrayList<>();
List<Integer> end = new ArrayList<>();
while (blockquoteStart.find()) {
start.add(blockquoteStart.start());
}
while (blockquoteEnd.find()) {
end.add(blockquoteEnd.start());
}
if (start.size() != end.size()) {
// Unbalanced tags: region pairing below would be wrong, so bail out.
Log.d(K9.LOG_TAG, "There are " + start.size() + " <blockquote> tags, but " +
end.size() + " </blockquote> tags. Refusing to strip.");
} else if (start.size() > 0) {
// Ignore quoted signatures in blockquotes.
dashSignatureHtml.region(0, start.get(0));
if (dashSignatureHtml.find()) {
// before first <blockquote>.
content = content.substring(0, dashSignatureHtml.start());
} else {
// Search the gaps between blockquote i's close and blockquote i+1's open.
for (int i = 0; i < start.size() - 1; i++) {
// within blockquotes.
if (end.get(i) < start.get(i + 1)) {
dashSignatureHtml.region(end.get(i), start.get(i + 1));
if (dashSignatureHtml.find()) {
content = content.substring(0, dashSignatureHtml.start());
break;
}
}
}
if (end.get(end.size() - 1) < content.length()) {
// after last </blockquote>.
dashSignatureHtml.region(end.get(end.size() - 1), content.length());
if (dashSignatureHtml.find()) {
content = content.substring(0, dashSignatureHtml.start());
}
}
}
} else {
// No blockquotes found.
content = content.substring(0, dashSignatureHtml.start());
}
}
// Fix the stripping off of closing tags if a signature was stripped,
// as well as clean up the HTML of the quoted message.
HtmlCleaner cleaner = new HtmlCleaner();
CleanerProperties properties = cleaner.getProperties();
// see http://htmlcleaner.sourceforge.net/parameters.php for descriptions
properties.setNamespacesAware(false);
properties.setAdvancedXmlEscape(false);
properties.setOmitXmlDeclaration(true);
properties.setOmitDoctypeDeclaration(false);
properties.setTranslateSpecialEntities(false);
properties.setRecognizeUnicodeChars(false);
TagNode node = cleaner.clean(content);
SimpleHtmlSerializer htmlSerialized = new SimpleHtmlSerializer(properties);
// "UTF8" is a legal Java charset alias for UTF-8; the canonical name
// "UTF-8" would be clearer.
content = htmlSerialized.getAsString(node, "UTF8");
return content;
}
示例14: updateArtists
import org.htmlcleaner.HtmlCleaner; //导入方法依赖的package包/类
/**
 * Fetches the archive.org etree artist index, scrapes artist names and show
 * counts from the browse page, and bulk-inserts them into the local store,
 * recording the update date in the "artistUpdate" preference.
 *
 * <p>NOTE(review): the chained {@code replace(...)} literals below render as
 * identical/garbled pairs here (including an unbalanced {@code """}); they
 * were almost certainly HTML entities ({@code &#39;}, {@code &gt;},
 * {@code &lt;}, {@code &quot;}, {@code &amp;}) before this listing was
 * entity-decoded. As shown, that line is not valid Java — restore the
 * entity strings from the original project source before reuse. The legacy
 * Apache HttpClient APIs used here are long deprecated.</p>
 *
 * @param db the local data store to insert artists into
 * @return always Boolean.TRUE, even on failure (errors are only logged)
 */
public static Boolean updateArtists(StaticDataStore db){
Logging.Log(LOG_TAG, "Fetching Artists");
ArrayList<ArrayList<String>> artists = new ArrayList<ArrayList<String>>();
HtmlCleaner pageParser = new HtmlCleaner();
CleanerProperties props = pageParser.getProperties();
props.setAllowHtmlInsideAttributes(true);
props.setAllowMultiWordAttributes(true);
props.setRecognizeUnicodeChars(true);
props.setOmitComments(true);
try {
String url = "http://www.archive.org/browse.php?field=/metadata/bandWithMP3s&collection=etree";
HttpParams params = new BasicHttpParams();
// 15-second connect and socket timeouts.
int timeout = (int) (15 * DateUtils.SECOND_IN_MILLIS);
HttpConnectionParams.setConnectionTimeout(params, timeout);
HttpConnectionParams.setSoTimeout(params, timeout);
HttpClient client = new DefaultHttpClient(params);
HttpGet request = new HttpGet(url);
HttpResponse response = client.execute(request);
StatusLine status = response.getStatusLine();
if (status.getStatusCode() == HttpStatus.SC_OK) {
ResponseHandler<String> responseHandler = new BasicResponseHandler();
TagNode node = pageParser.clean(responseHandler.handleResponse(response));
client.getConnectionManager().shutdown();
org.w3c.dom.Document doc = new DomSerializer(new CleanerProperties()).createDOM(node);
XPath xpath = XPathFactory.newInstance().newXPath();
// Artist links and the adjacent text nodes holding their show counts.
NodeList artistNodes = (NodeList) xpath.evaluate("//div[@class='row']//div[@class='col-sm-4']/a", doc, XPathConstants.NODESET);
NodeList numberNodes = (NodeList) xpath.evaluate("//div[@class='row']//div[@class='col-sm-4']/text()[preceding-sibling::a]", doc, XPathConstants.NODESET);
Logging.Log(LOG_TAG, "artistNodes: " + artistNodes.getLength());
Logging.Log(LOG_TAG, "numberNodes: " + numberNodes.getLength());
// Only pair names with counts when the two node lists line up 1:1.
if(artistNodes.getLength() == numberNodes.getLength()){
for (int i = 0; i < artistNodes.getLength(); i++) {
ArrayList<String> artistPair = new ArrayList<String>();
artistPair.add(artistNodes.item(i).getTextContent().replace("'", "'").replace(">", ">").replace("<", "<").replace(""", "\"").replace("&", "&"));
artistPair.add(numberNodes.item(i).getTextContent());
artists.add(artistPair);
}
}
if (artists.size() > 0) {
db.insertArtistBulk(artists);
String s = DateFormat.format("yyyy-MM-dd", new GregorianCalendar().getTime()).toString();
db.updatePref("artistUpdate", s);
Logging.Log(LOG_TAG, "Finished Fetching Artists");
}
else {
Logging.Log(LOG_TAG, "Error Fetching Artists");
}
}
else {
client.getConnectionManager().shutdown();
}
} catch(Exception e) {
e.printStackTrace();
Logging.Log(LOG_TAG, "Error Fetching Artists");
}
return true;
}
示例15: XpathOldSelector
import org.htmlcleaner.HtmlCleaner; //导入方法依赖的package包/类
/**
 * Parses the given HTML content, keeping both the cleaner instance and the
 * resulting root tag node for later XPath-style queries.
 *
 * @param content HTML markup to parse
 */
public XpathOldSelector(String content)
{
    this.htmlCleaner = new HtmlCleaner();
    this.rootTagNode = this.htmlCleaner.clean(content);
}