本文整理汇总了Java中org.htmlcleaner.HtmlCleaner.clean方法的典型用法代码示例。如果您正苦于以下问题:Java HtmlCleaner.clean方法的具体用法?Java HtmlCleaner.clean怎么用?Java HtmlCleaner.clean使用的例子?那么, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类org.htmlcleaner.HtmlCleaner
的用法示例。
在下文中一共展示了HtmlCleaner.clean方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: toXML
import org.htmlcleaner.HtmlCleaner; //导入方法依赖的package包/类
/**
 * Converts an HTML string into a well-formed, pretty-printed XML string
 * using HtmlCleaner. (Original Korean doc: "htmlcleaner로 html string을
 * xml string으로 바꿔주는 메소드".)
 *
 * @param source raw HTML markup
 * @return the pretty-printed XML; if serialization fails the original
 *         input is returned unchanged (best-effort behavior)
 */
private String toXML(String source) {
    try {
        CleanerProperties props = new CleanerProperties();
        props.setTranslateSpecialEntities(true);
        props.setOmitComments(true);
        props.setPruneTags("script,style");
        // Ignore namespaces while cleaning.
        props.setNamespacesAware(false);
        props.setAdvancedXmlEscape(true);
        // NOTE: the original called setTranslateSpecialEntities(true) a second
        // time here; the redundant call has been removed.
        HtmlCleaner cl = new HtmlCleaner(props);
        TagNode tagNode = cl.clean(source);
        source = new PrettyXmlSerializer(props).getXmlAsString(tagNode);
    } catch (IOException e) {
        logger.error("", e);
    }
    return source;
}
示例2: processFollow
import org.htmlcleaner.HtmlCleaner; //导入方法依赖的package包/类
/**
 * Parses a "follow" page (following / followers) and extracts user URLs,
 * walking every paginated page after the first.
 *
 * @param followUrl URL of the first follow page
 */
public static void processFollow(String followUrl) {
    String content = PageUtil.getContent(followUrl);
    HtmlCleaner htmlCleaner = new HtmlCleaner();
    TagNode tNode = htmlCleaner.clean(content);
    extractUserUrl(content);
    try {
        Object[] pageNumObj = tNode
                .evaluateXPath("//*[@id=\"Profile-following\"]//div[@class=\"Pagination\"]/button");
        // The second-to-last pagination button carries the total page count,
        // so at least two buttons are required. The original checked
        // length > 0, which allowed an index of -1 when exactly one button
        // matched (ArrayIndexOutOfBoundsException).
        if (pageNumObj != null && pageNumObj.length > 1) {
            TagNode node = (TagNode) pageNumObj[pageNumObj.length - 2];
            int pagenum = Integer.parseInt(node.getText().toString());
            // Page 1 was already processed above; fetch pages 2..pagenum.
            for (int i = 2; i <= pagenum; i++) {
                String url = followUrl + "?page=" + i;
                content = PageUtil.getContent(url);
                extractUserUrl(content);
            }
        }
    } catch (XPatherException e) {
        // Pass the exception itself so the stack trace is preserved
        // (the original logged only the message).
        logger.error(e.getMessage(), e);
    }
}
示例3: htmlOutputStreamViaHtmlCleaner
import org.htmlcleaner.HtmlCleaner; //导入方法依赖的package包/类
/**
 * Cleans an HOCR HTML file and writes the pretty-printed result to the
 * given output path.
 *
 * @param pathOfHOCRFile path of the HOCR input file
 * @param outputFilePath path the cleaned HTML is written to
 * @throws IOException if the input file cannot be opened or read
 */
public static void htmlOutputStreamViaHtmlCleaner(String pathOfHOCRFile, String outputFilePath) throws IOException {
    CleanerProperties cleanerProps = new CleanerProperties();
    // Set some properties to non-default values.
    cleanerProps.setTransResCharsToNCR(true);
    cleanerProps.setTranslateSpecialEntities(true);
    cleanerProps.setOmitComments(true);
    cleanerProps.setOmitDoctypeDeclaration(true);
    cleanerProps.setOmitXmlDeclaration(false);
    HtmlCleaner cleaner = new HtmlCleaner(cleanerProps);
    TagNode tagNode;
    // try-with-resources guarantees the stream is closed even when clean()
    // throws; the original closed it manually and leaked it on failure.
    // (The explicit null check on a constructor result was dead code and
    // has been dropped.)
    try (FileInputStream hOCRFileInputStream = new FileInputStream(pathOfHOCRFile)) {
        tagNode = cleaner.clean(hOCRFileInputStream, UTF_ENCODING);
    }
    try {
        new PrettyHtmlSerializer(cleanerProps).writeToFile(tagNode, outputFilePath, UTF_ENCODING);
    } catch (Exception e) { // NOPMD - best-effort write; deliberately swallowed in the original.
    }
}
示例4: JoinedBefore
import org.htmlcleaner.HtmlCleaner; //导入方法依赖的package包/类
/**
 * Checks whether the given Reddit user's account was created before the
 * supplied date, by scraping the user's profile page and reading the
 * "datetime" attribute of the first &lt;time&gt; element inside the "age" node.
 *
 * NOTE(review): Calendar.Builder.setDate treats the month as 0-based
 * (January == 0) — confirm callers pass the month in that convention.
 *
 * @param mp    the player whose Reddit username is looked up
 * @param year  cutoff year
 * @param month cutoff month (Calendar convention, see note above)
 * @param day   cutoff day of month
 * @return true if the join date is strictly before the cutoff date (UTC)
 * @throws Exception on network failure, unexpected page structure, or parse errors
 */
public static boolean JoinedBefore(ChatPlayer mp, int year, int month, int day) throws Exception {
    URL url = new URL("https://www.reddit.com/u/" + mp.UserName());
    URLConnection con = url.openConnection();
    con.setRequestProperty("User-Agent", "TheButtonAutoFlair");
    TagNode node;
    // Close the response stream deterministically; the original never closed it.
    try (InputStream in = con.getInputStream()) {
        HtmlCleaner cleaner = new HtmlCleaner();
        node = cleaner.clean(in);
    }
    node = node.getElementsByAttValue("class", "age", true, true)[0];
    node = node.getElementsByName("time", false)[0];
    String joindate = node.getAttributeByName("datetime");
    SimpleDateFormat parserSDF = new SimpleDateFormat("yyyy-MM-dd");
    // Keep only the date portion of the ISO-8601 timestamp.
    joindate = joindate.split("T")[0];
    Date date = parserSDF.parse(joindate);
    return date.before(new Calendar.Builder().setTimeZone(TimeZone.getTimeZone("UTC")).setDate(year, month, day)
            .build().getTime());
}
示例5: getHTML
import org.htmlcleaner.HtmlCleaner; //导入方法依赖的package包/类
/**
 * Renders the workbook as well-formed XML/XHTML bytes: converts the sheet
 * to HTML at A4 page size, then runs it through HtmlCleaner and a pretty
 * XML serializer.
 *
 * @param book the workbook to render
 * @return the pretty-printed XML as bytes
 * @throws IOException if serialization fails
 */
public byte[] getHTML(HSSFWorkbook book) throws IOException {
    // Default page size: A4 portrait, 21.0 x 29.7 cm.
    double width = 21.0;
    double height = 29.7;
    if (isLandscape()) {
        // Plain temp-variable swap. The original used the add/subtract swap
        // trick, which is not exact for doubles (21.0 + 29.7 - 29.7 != 21.0).
        double tmp = width;
        width = height;
        height = tmp;
    }
    // Encode explicitly as UTF-8 to match the charset passed to clean()
    // below; the original used the platform default charset here.
    byte[] html = convert(book, width, height).getBytes("UTF-8");
    ByteArrayInputStream in = new ByteArrayInputStream(html);
    // Clean up the HTML to be well formed.
    HtmlCleaner cleaner = new HtmlCleaner();
    CleanerProperties props = cleaner.getProperties();
    TagNode node = cleaner.clean(in, "UTF-8");
    // Serialize into an in-memory buffer instead of System.out.
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    new PrettyXmlSerializer(props).writeToStream(node, out);
    return out.toByteArray();
}
示例6: createDocument
import org.htmlcleaner.HtmlCleaner; //导入方法依赖的package包/类
/**
 * Convenience method (for xml/xhtml): creates a <code>Document</code> from
 * the specified URL.
 *
 * <p>NOTE(review): the result of {@code lCleaner.clean(inUrl)} is discarded,
 * and the DOM is then parsed from a second, raw fetch of the same URL — the
 * resource is downloaded twice and the cleaned markup is never used. Confirm
 * whether the clean() call can be removed or whether its output was meant to
 * be what gets parsed.</p>
 *
 * @param inUrl
 * {@link URL} location of the xml/xhtml resource
 * @return {@link Document} the parsed XML document, may be
 * <code>null</code>
 * @throws ParserConfigurationException if no DocumentBuilder can be created
 * @throws IOException on connection or read failure
 * @throws SAXException if the fetched content is not well-formed XML
 */
public static Document createDocument(final URL inUrl)
throws ParserConfigurationException, IOException, SAXException {
final HtmlCleaner lCleaner = new HtmlCleaner();
// Result intentionally unused in the original; see NOTE(review) above.
lCleaner.clean(inUrl);
final DocumentBuilder lBuilder = DocumentBuilderFactory.newInstance()
.newDocumentBuilder();
// Second fetch of the same URL; the stream is closed by try-with-resources.
final URLConnection lConnection = inUrl.openConnection();
Document outDocument = null;
try (BufferedInputStream lStream = new BufferedInputStream(
lConnection.getInputStream());) {
outDocument = lBuilder.parse(lStream);
}
return outDocument;
}
示例7: getTextFromHtmlString
import org.htmlcleaner.HtmlCleaner; //导入方法依赖的package包/类
/**
 * This method extracts the text from an html string: it locates the last
 * {@code <table>} element and concatenates the text of every {@code <td>}
 * cell inside it.
 *
 * <p>NOTE(review): the {@code trim().equals(" ")} comparison can never be
 * true for an ASCII space (trim strips it); the literal was most likely
 * {@code "&nbsp;"} (or a non-breaking-space character) before this listing
 * was entity-decoded — verify against the original project source. The
 * literal is reproduced here unchanged.</p>
 *
 * @param htmlString {@link String} html markup to extract text from
 * @return {@link String} the concatenated cell text, "" if no table found
 */
public static String getTextFromHtmlString(String htmlString) {
    CleanerProperties cleanerProps = new CleanerProperties();
    // set some properties to non-default values
    cleanerProps.setTransResCharsToNCR(true);
    cleanerProps.setTranslateSpecialEntities(true);
    cleanerProps.setOmitComments(true);
    cleanerProps.setOmitDoctypeDeclaration(true);
    cleanerProps.setOmitXmlDeclaration(true);
    cleanerProps.setUseEmptyElementTags(true);
    HtmlCleaner cleaner = new HtmlCleaner(cleanerProps);
    TagNode tagNode = cleaner.clean(htmlString);
    // One shared builder. The original created a new StringBuilder per cell
    // and copied the accumulated string into it every iteration, which is
    // O(n^2) in the number of cells.
    StringBuilder errorText = new StringBuilder();
    try {
        Object[] rootNode = tagNode.evaluateXPath("//table");
        if (null != rootNode && rootNode.length > 0) {
            // Only the last matched table is inspected.
            TagNode[] textNode = ((TagNode) rootNode[rootNode.length - 1]).getElementsByName("td", true);
            for (TagNode tag : textNode) {
                if (tag != null && tag.getText() != null) {
                    if (tag.getText().toString().trim().equals(" ")) {
                        errorText.append(" ");
                    } else {
                        errorText.append(tag.getText());
                    }
                }
            }
        }
    } catch (XPatherException e) {
        LOGGER.error("Error extracting table node from html." + e.getMessage());
    }
    return errorText.toString();
}
示例8: htmlToWiki
import org.htmlcleaner.HtmlCleaner; //导入方法依赖的package包/类
/**
 * Converts an HTML fragment to wiki markup: cleans the HTML into a DOM and
 * walks its child nodes, emitting the wiki equivalent.
 *
 * <p>NOTE(review): several string literals in this method render as identical
 * pairs here (e.g. {@code replace(" ", " ")} and {@code replace("'", "'")});
 * they were most likely HTML entities such as {@code &nbsp;} / {@code &#39;}
 * before this listing was entity-decoded. Verify against the original project
 * source before reusing this code.</p>
 *
 * @param html        HTML fragment to convert
 * @param contextPath application context path, forwarded to node processing
 * @param projectId   id of the project the content belongs to
 * @return the wiki markup terminated with CRLF, or "" when nothing was produced
 * @throws Exception if DOM conversion or node processing fails
 */
public static String htmlToWiki(String html, String contextPath, int projectId) throws Exception {
// Strip the nbsp because it gets converted to unicode
html = StringUtils.replace(html, " ", " ");
// Take the html create DOM for parsing
HtmlCleaner cleaner = new HtmlCleaner();
CleanerProperties props = cleaner.getProperties();
TagNode node = cleaner.clean(html);
Document document = new DomSerializer(props, true).createDOM(node);
if (LOG.isTraceEnabled()) {
LOG.trace(html);
}
// Process each node and output the wiki equivalent
StringBuffer sb = new StringBuffer();
ArrayList<Node> nodeList = new ArrayList<Node>();
for (int i = 0; i < document.getChildNodes().getLength(); i++) {
Node n = document.getChildNodes().item(i);
nodeList.add(n);
}
processChildNodes(nodeList, sb, 0, true, true, false, "", contextPath, projectId);
if (sb.length() > 0) {
String content = sb.toString().trim();
if (content.contains("'")) {
// Determine if this is where the ' is being introduced
content = StringUtils.replace(content, "'", "'");
}
// Ensure the result is CRLF-terminated.
if (!content.endsWith(CRLF)) {
return content + CRLF;
} else {
return content;
}
} else {
return "";
}
}
示例9: parseHhc
import org.htmlcleaner.HtmlCleaner; //导入方法依赖的package包/类
/**
 * Parses a CHM .hhc table-of-contents stream: cleans the HTML into a DOM,
 * locates the top-level {@code body/ul} list, and converts it into TOC
 * references.
 *
 * @param hhcFile   stream of the .hhc file
 * @param resources book resources used to resolve TOC targets
 * @return the parsed table-of-contents sections
 */
public static List<TOCReference> parseHhc(InputStream hhcFile, Resources resources) throws IOException, ParserConfigurationException, XPathExpressionException {
    HtmlCleaner cleaner = new HtmlCleaner();
    CleanerProperties cleanerProps = cleaner.getProperties();
    TagNode cleanedRoot = cleaner.clean(hhcFile);
    Document dom = new DomSerializer(cleanerProps).createDOM(cleanedRoot);
    XPath pathEvaluator = XPathFactory.newInstance().newXPath();
    Node topLevelUl = (Node) pathEvaluator.evaluate(
            "body/ul", dom.getDocumentElement(), XPathConstants.NODE);
    return processUlNode(topLevelUl, resources);
}
示例10: HtmlXpathSelector
import org.htmlcleaner.HtmlCleaner; //导入方法依赖的package包/类
/**
 * Builds an XPath-queryable DOM from raw HTML content: the markup is first
 * normalized by HtmlCleaner, then serialized into a W3C Document.
 *
 * @param content HTML markup to parse
 */
public HtmlXpathSelector(String content) throws ParserConfigurationException, SAXException, IOException
{
    TagNode cleanedRoot = new HtmlCleaner().clean(content);
    rootDocument = new DomSerializer(new CleanerProperties()).createDOM(cleanedRoot);
    xPath = XPathFactory.newInstance().newXPath();
}
示例11: toXhtml
import org.htmlcleaner.HtmlCleaner; //导入方法依赖的package包/类
/**
 * Converts an HTML document string to XHTML. Empty input yields null.
 * Input that, after trimming and skipping any DOCTYPE declaration, does not
 * start with an &lt;html&gt; root element is returned unchanged.
 *
 * @param htmlString the HTML document text, may be null or empty
 * @return the XHTML string, the unchanged input, or null for empty input
 */
public static String toXhtml(String htmlString) {
    // Guard clause: nothing to convert.
    if (!StringUtils.isNotEmpty(htmlString)) {
        return null;
    }
    String result = XmlUtils.skipDocTypeDeclaration(htmlString.trim());
    boolean hasHtmlRoot = result.startsWith("<html>") || result.startsWith("<html ");
    if (hasHtmlRoot) {
        CleanerProperties props = new CleanerProperties();
        TagNode root = new HtmlCleaner(props).clean(result);
        result = new SimpleXmlSerializer(props).getXmlAsString(root);
    }
    return result;
}
示例12: getStandardCredit
import org.htmlcleaner.HtmlCleaner; //导入方法依赖的package包/类
/**
 * Fetches the graduation credit standard table for the given academic year
 * and matriculation index, and returns the eight credit columns of the row
 * matching the given department.
 *
 * @param year       academic year sent to the server
 * @param index      index into the matrics list identifying the matriculation type
 * @param department department name to match (whitespace-insensitively)
 * @return the eight credit values of the matching row
 * @throws Exception if the page cannot be fetched/parsed or no row matches;
 *                   the underlying cause is chained
 */
public static ArrayList<String> getStandardCredit(String year, int index,
        String department) throws Exception {
    try {
        ArrayList<String> standard = new ArrayList<>();
        HashMap<String, String> params = new HashMap<>();
        params.put("format", "-3");
        params.put("year", year);
        params.put("matric", matrics.get(index));
        String result = Connector
                .getDataByPost(getStandardUri(lang), params, "big5");
        // The source page omits closing tags; patch them in so the cleaner
        // can build a usable tree.
        result = result.replace("<td", "</td><td");
        result = result.replace("<tr>", "</td><tr>");
        HtmlCleaner cleaner = new HtmlCleaner();
        CleanerProperties props = cleaner.getProperties();
        props.setUseCdataForScriptAndStyle(true);
        props.setRecognizeUnicodeChars(true);
        props.setUseEmptyElementTags(true);
        props.setAdvancedXmlEscape(true);
        props.setTranslateSpecialEntities(true);
        props.setBooleanAttributeValues("empty");
        result = new PrettyHtmlSerializer(props).getAsString(result);
        TagNode tagNode = cleaner.clean(result);
        // The credit table is the one rendered with border="1".
        TagNode[] tables = tagNode.getElementsByAttValue("border", "1",
                true, false);
        TagNode[] rows = tables[0].getElementsByName("tr", true);
        // Skip the header row; compare department names ignoring whitespace.
        for (int i = 1; i < rows.length; i++) {
            TagNode[] cols = rows[i].getElementsByName("td", true);
            String temp = cols[0].getText().toString();
            if (temp.replace(" ", "").replace("\n", "").contains(department.replace(" ", "").replace("\n", ""))) {
                // Columns 1..8 hold the credit values.
                for (int j = 1; j < 9; j++) {
                    String credit = Utility.cleanString(cols[j].getText()
                            .toString());
                    standard.add(credit);
                }
                return standard;
            }
        }
        // No matching department row — funneled into the catch below.
        throw new Exception();
    } catch (Exception e) {
        e.printStackTrace();
        // Chain the original exception so the root cause is not lost
        // (the original rethrew without it).
        throw new Exception("畢業學分標準讀取時發生錯誤", e);
    }
}
示例13: stripSignatureForHtmlMessage
import org.htmlcleaner.HtmlCleaner; //导入方法依赖的package包/类
/**
 * Strips a "-- " dash signature from an HTML message while ignoring
 * signatures that occur inside quoted {@code <blockquote>} sections, then
 * cleans the remaining HTML (repairing any closing tags cut off by the
 * strip) with HtmlCleaner.
 *
 * <p>Strategy: collect the start offsets of every blockquote open/close tag;
 * if the counts are unbalanced, refuse to strip; otherwise search for the
 * signature marker only in the regions before the first blockquote, between
 * consecutive blockquotes, and after the last one.</p>
 *
 * @param content the HTML message body
 * @return the message with the signature removed (if found outside
 *         blockquotes) and the HTML normalized
 */
public static String stripSignatureForHtmlMessage(String content) {
Matcher dashSignatureHtml = DASH_SIGNATURE_HTML.matcher(content);
if (dashSignatureHtml.find()) {
Matcher blockquoteStart = BLOCKQUOTE_START.matcher(content);
Matcher blockquoteEnd = BLOCKQUOTE_END.matcher(content);
// Start offsets of every <blockquote> and </blockquote> tag, in order.
List<Integer> start = new ArrayList<>();
List<Integer> end = new ArrayList<>();
while (blockquoteStart.find()) {
start.add(blockquoteStart.start());
}
while (blockquoteEnd.find()) {
end.add(blockquoteEnd.start());
}
if (start.size() != end.size()) {
// Unbalanced tags: region pairing below would be wrong, so bail out.
Log.d(K9.LOG_TAG, "There are " + start.size() + " <blockquote> tags, but " +
end.size() + " </blockquote> tags. Refusing to strip.");
} else if (start.size() > 0) {
// Ignore quoted signatures in blockquotes.
dashSignatureHtml.region(0, start.get(0));
if (dashSignatureHtml.find()) {
// before first <blockquote>.
content = content.substring(0, dashSignatureHtml.start());
} else {
// Search the gaps between blockquote i's close and blockquote i+1's open.
for (int i = 0; i < start.size() - 1; i++) {
// within blockquotes.
if (end.get(i) < start.get(i + 1)) {
dashSignatureHtml.region(end.get(i), start.get(i + 1));
if (dashSignatureHtml.find()) {
content = content.substring(0, dashSignatureHtml.start());
break;
}
}
}
if (end.get(end.size() - 1) < content.length()) {
// after last </blockquote>.
dashSignatureHtml.region(end.get(end.size() - 1), content.length());
if (dashSignatureHtml.find()) {
content = content.substring(0, dashSignatureHtml.start());
}
}
}
} else {
// No blockquotes found.
content = content.substring(0, dashSignatureHtml.start());
}
}
// Fix the stripping off of closing tags if a signature was stripped,
// as well as clean up the HTML of the quoted message.
HtmlCleaner cleaner = new HtmlCleaner();
CleanerProperties properties = cleaner.getProperties();
// see http://htmlcleaner.sourceforge.net/parameters.php for descriptions
properties.setNamespacesAware(false);
properties.setAdvancedXmlEscape(false);
properties.setOmitXmlDeclaration(true);
properties.setOmitDoctypeDeclaration(false);
properties.setTranslateSpecialEntities(false);
properties.setRecognizeUnicodeChars(false);
TagNode node = cleaner.clean(content);
SimpleHtmlSerializer htmlSerialized = new SimpleHtmlSerializer(properties);
// "UTF8" is a legal Java charset alias for UTF-8; the canonical name
// "UTF-8" would be clearer.
content = htmlSerialized.getAsString(node, "UTF8");
return content;
}
示例14: updateArtists
import org.htmlcleaner.HtmlCleaner; //导入方法依赖的package包/类
/**
 * Fetches the archive.org etree artist index, scrapes artist names and show
 * counts from the browse page, and bulk-inserts them into the local store,
 * recording the update date in the "artistUpdate" preference.
 *
 * <p>NOTE(review): the chained {@code replace(...)} literals below render as
 * identical/garbled pairs here (including an unbalanced {@code """}); they
 * were almost certainly HTML entities ({@code &#39;}, {@code &gt;},
 * {@code &lt;}, {@code &quot;}, {@code &amp;}) before this listing was
 * entity-decoded. As shown, that line is not valid Java — restore the
 * entity strings from the original project source before reuse. The legacy
 * Apache HttpClient APIs used here are long deprecated.</p>
 *
 * @param db the local data store to insert artists into
 * @return always Boolean.TRUE, even on failure (errors are only logged)
 */
public static Boolean updateArtists(StaticDataStore db){
Logging.Log(LOG_TAG, "Fetching Artists");
ArrayList<ArrayList<String>> artists = new ArrayList<ArrayList<String>>();
HtmlCleaner pageParser = new HtmlCleaner();
CleanerProperties props = pageParser.getProperties();
props.setAllowHtmlInsideAttributes(true);
props.setAllowMultiWordAttributes(true);
props.setRecognizeUnicodeChars(true);
props.setOmitComments(true);
try {
String url = "http://www.archive.org/browse.php?field=/metadata/bandWithMP3s&collection=etree";
HttpParams params = new BasicHttpParams();
// 15-second connect and socket timeouts.
int timeout = (int) (15 * DateUtils.SECOND_IN_MILLIS);
HttpConnectionParams.setConnectionTimeout(params, timeout);
HttpConnectionParams.setSoTimeout(params, timeout);
HttpClient client = new DefaultHttpClient(params);
HttpGet request = new HttpGet(url);
HttpResponse response = client.execute(request);
StatusLine status = response.getStatusLine();
if (status.getStatusCode() == HttpStatus.SC_OK) {
ResponseHandler<String> responseHandler = new BasicResponseHandler();
TagNode node = pageParser.clean(responseHandler.handleResponse(response));
client.getConnectionManager().shutdown();
org.w3c.dom.Document doc = new DomSerializer(new CleanerProperties()).createDOM(node);
XPath xpath = XPathFactory.newInstance().newXPath();
// Artist links and the adjacent text nodes holding their show counts.
NodeList artistNodes = (NodeList) xpath.evaluate("//div[@class='row']//div[@class='col-sm-4']/a", doc, XPathConstants.NODESET);
NodeList numberNodes = (NodeList) xpath.evaluate("//div[@class='row']//div[@class='col-sm-4']/text()[preceding-sibling::a]", doc, XPathConstants.NODESET);
Logging.Log(LOG_TAG, "artistNodes: " + artistNodes.getLength());
Logging.Log(LOG_TAG, "numberNodes: " + numberNodes.getLength());
// Only pair names with counts when the two node lists line up 1:1.
if(artistNodes.getLength() == numberNodes.getLength()){
for (int i = 0; i < artistNodes.getLength(); i++) {
ArrayList<String> artistPair = new ArrayList<String>();
artistPair.add(artistNodes.item(i).getTextContent().replace("'", "'").replace(">", ">").replace("<", "<").replace(""", "\"").replace("&", "&"));
artistPair.add(numberNodes.item(i).getTextContent());
artists.add(artistPair);
}
}
if (artists.size() > 0) {
db.insertArtistBulk(artists);
String s = DateFormat.format("yyyy-MM-dd", new GregorianCalendar().getTime()).toString();
db.updatePref("artistUpdate", s);
Logging.Log(LOG_TAG, "Finished Fetching Artists");
}
else {
Logging.Log(LOG_TAG, "Error Fetching Artists");
}
}
else {
client.getConnectionManager().shutdown();
}
} catch(Exception e) {
e.printStackTrace();
Logging.Log(LOG_TAG, "Error Fetching Artists");
}
return true;
}
示例15: XpathOldSelector
import org.htmlcleaner.HtmlCleaner; //导入方法依赖的package包/类
/**
 * Parses the given HTML content, keeping both the cleaner instance and the
 * resulting root tag node for later XPath-style queries.
 *
 * @param content HTML markup to parse
 */
public XpathOldSelector(String content)
{
    this.htmlCleaner = new HtmlCleaner();
    this.rootTagNode = this.htmlCleaner.clean(content);
}