本文整理汇总了Java中org.htmlcleaner.DomSerializer类的典型用法代码示例。如果您正苦于以下问题：Java DomSerializer类的具体用法？Java DomSerializer怎么用？Java DomSerializer使用的例子？那么恭喜您，这里精选的类代码示例或许可以为您提供帮助。
DomSerializer类属于org.htmlcleaner包，在下文中一共展示了DomSerializer类的8个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: isHealthy
import org.htmlcleaner.DomSerializer; //导入依赖的package包/类
/**
 * Reports whether the remote page behind {@code GET_REQUEST_URL} is reachable and still
 * exposes a submit-id token.
 *
 * <p>The check issues a GET request, requires an HTTP 200 status, cleans the response body
 * with HtmlCleaner, converts it to a DOM and evaluates {@code XPATH_TO_SUBMIT_ID} against it.
 *
 * @return {@code true} only when the request returned 200 and the XPath produced a non-blank
 *         string; {@code false} for any other status, a blank token, or any exception raised
 *         while fetching or parsing
 */
@Override
public boolean isHealthy() {
    HttpGet getMethod = new HttpGet(GET_REQUEST_URL);
    CloseableHttpResponse response = null;
    CloseableHttpClient httpClient = null;
    try {
        httpClient = HttpClientBuilder.create().build();
        response = httpClient.execute(getMethod);
        int statusCode = response.getStatusLine().getStatusCode();
        if (statusCode != HttpStatus.SC_OK) {
            LOG.info("Health check failed, got response code: %d", statusCode);
            return false;
        }
        String htmlContents = EntityUtils.toString(response.getEntity());
        TagNode tagNode = new HtmlCleaner().clean(htmlContents);
        Document doc = new DomSerializer(new CleanerProperties()).createDOM(tagNode);
        XPath xpath = XPathFactory.newInstance().newXPath();
        String submitId = (String) xpath.evaluate(XPATH_TO_SUBMIT_ID, doc, XPathConstants.STRING);
        if (StringUtils.isBlank(submitId)) {
            LOG.info("Health check failed, submitId token was null or empty.");
            return false;
        }
        return true;
    } catch (Throwable t) {
        // A health probe must never propagate a failure to its caller, hence the broad catch.
        // The failure has to be reported as unhealthy: falling through to "return true" here
        // would contradict the message being logged.
        LOG.info("Health check failed, exception thrown: %s", t.getMessage());
        return false;
    } finally {
        // Runs on every path, including the early returns above.
        closeHttpObjects(response, httpClient);
    }
}
示例2: testXpathExtraction
import org.htmlcleaner.DomSerializer; //导入依赖的package包/类
/**
 * Checks that the {@code value} attribute of the input under the element with id
 * {@code submiturl} can be extracted via XPath from the saved archive.is page after it has
 * been cleaned by HtmlCleaner and converted to a W3C DOM.
 */
@Test
public void testXpathExtraction() throws IOException, ParserConfigurationException, XPathExpressionException {
    byte[] encoded = Files.readAllBytes(Paths.get("src/test/resources/raw_data/archive.is.html"));
    String htmlContents = new String(encoded, StandardCharsets.UTF_8);
    TagNode tagNode = new HtmlCleaner().clean(htmlContents);
    Document doc = new DomSerializer(new CleanerProperties()).createDOM(tagNode);
    XPath xpath = XPathFactory.newInstance().newXPath();
    String extracted = (String) xpath.evaluate("//*[@id=\"submiturl\"]/input/@value", doc, XPathConstants.STRING);
    String expectedValue = "YHuwL/nTgL370PMDM2G2vkuvMg3kmNqk/y/i7NRSaLyf2JSIU+/now+AYw+X0nX8";
    // assertEquals reports both the expected and the extracted value when the test fails,
    // which assertTrue(a.equals(b)) cannot do.
    Assert.assertEquals("Did not extract expected value!", expectedValue, extracted);
}
示例3: htmlToWiki
import org.htmlcleaner.DomSerializer; //导入依赖的package包/类
/**
 * Converts an HTML fragment to wiki markup.
 *
 * <p>The HTML is cleaned with HtmlCleaner, turned into a W3C DOM, and the document's
 * top-level nodes are handed to {@code processChildNodes}, which appends the wiki text to a
 * buffer. The buffer content is trimmed and returned terminated by {@code CRLF}.
 *
 * @param html        the HTML to convert
 * @param contextPath passed through unchanged to {@code processChildNodes}
 * @param projectId   passed through unchanged to {@code processChildNodes}
 * @return the trimmed wiki text ending in {@code CRLF}, or an empty string when nothing was
 *         produced
 * @throws Exception if cleaning, DOM creation or node processing fails
 */
public static String htmlToWiki(String html, String contextPath, int projectId) throws Exception {
    // Strip the nbsp because it gets converted to unicode. Both the entity and the already
    // decoded U+00A0 character are mapped to a plain space; replacing " " with " " (what the
    // literal had degraded to) does nothing.
    html = StringUtils.replace(html, "&nbsp;", " ");
    html = StringUtils.replace(html, "\u00A0", " ");

    // Take the html create DOM for parsing
    HtmlCleaner cleaner = new HtmlCleaner();
    CleanerProperties props = cleaner.getProperties();
    TagNode node = cleaner.clean(html);
    Document document = new DomSerializer(props, true).createDOM(node);
    if (LOG.isTraceEnabled()) {
        LOG.trace(html);
    }

    // Process each node and output the wiki equivalent
    StringBuffer sb = new StringBuffer();
    ArrayList<Node> nodeList = new ArrayList<Node>();
    NodeList topLevelNodes = document.getChildNodes();
    for (int i = 0; i < topLevelNodes.getLength(); i++) {
        nodeList.add(topLevelNodes.item(i));
    }
    // NOTE(review): the meaning of the positional arguments (0, true, true, false, "") is
    // defined by processChildNodes, which is not visible here — confirm before changing.
    processChildNodes(nodeList, sb, 0, true, true, false, "", contextPath, projectId);

    if (sb.length() == 0) {
        return "";
    }
    String content = sb.toString().trim();
    if (content.contains("&apos;")) {
        // Determine if this is where the &apos; is being introduced
        content = StringUtils.replace(content, "&apos;", "'");
    }
    return content.endsWith(CRLF) ? content : content + CRLF;
}
示例4: parseHhc
import org.htmlcleaner.DomSerializer; //导入依赖的package包/类
/**
 * Reads an .hhc stream and converts its top-level list into table-of-contents references.
 *
 * <p>The stream is cleaned with HtmlCleaner and serialized to a DOM; the node matched by the
 * XPath {@code body/ul}, evaluated relative to the document element, is then passed to
 * {@code processUlNode} together with {@code resources}.
 *
 * @param hhcFile   the .hhc content to parse
 * @param resources forwarded unchanged to {@code processUlNode}
 * @return whatever {@code processUlNode} produces for the located {@code ul} node
 */
public static List<TOCReference> parseHhc(InputStream hhcFile, Resources resources) throws IOException, ParserConfigurationException, XPathExpressionException {
    HtmlCleaner cleaner = new HtmlCleaner();
    CleanerProperties cleanerProps = cleaner.getProperties();
    TagNode cleanedRoot = cleaner.clean(hhcFile);

    DomSerializer serializer = new DomSerializer(cleanerProps);
    Document dom = serializer.createDOM(cleanedRoot);

    XPath xpathEvaluator = XPathFactory.newInstance().newXPath();
    Object match = xpathEvaluator.evaluate("body/ul", dom.getDocumentElement(), XPathConstants.NODE);

    return processUlNode((Node) match, resources);
}
示例5: getHtmlDocumentModel
import org.htmlcleaner.DomSerializer; //导入依赖的package包/类
/**
 * Parses HTML text into a W3C DOM document using HtmlCleaner.
 *
 * @param htmlContent the HTML text to clean and convert
 * @return the resulting document, or {@code null} when DOM creation raises a
 *         {@link ParserConfigurationException} or any step raises a {@link RuntimeException}
 */
public static Document getHtmlDocumentModel(String htmlContent) {
    try {
        TagNode cleanedRoot = new HtmlCleaner().clean(htmlContent);
        DomSerializer serializer = new DomSerializer(new CleanerProperties());
        return serializer.createDOM(cleanedRoot);
    } catch (ParserConfigurationException e) {
        // Failure is reported to the caller as a null document.
        return null;
    } catch (RuntimeException e) {
        // Same contract for unchecked failures from cleaning or serialization.
        return null;
    }
}
示例6: HtmlXpathSelector
import org.htmlcleaner.DomSerializer; //导入依赖的package包/类
/**
 * Creates a selector over the given HTML text.
 *
 * <p>The text is cleaned with HtmlCleaner and serialized to a DOM that is stored in
 * {@code rootDocument}; a fresh {@link XPath} instance is stored in {@code xPath}.
 *
 * @param content the HTML text to parse
 * @throws ParserConfigurationException if the DOM cannot be created
 */
public HtmlXpathSelector(String content) throws ParserConfigurationException, SAXException, IOException {
    TagNode cleanedRoot = new HtmlCleaner().clean(content);
    DomSerializer serializer = new DomSerializer(new CleanerProperties());
    rootDocument = serializer.createDOM(cleanedRoot);
    xPath = XPathFactory.newInstance().newXPath();
}
示例7: getDomHtmlNode
import org.htmlcleaner.DomSerializer; //导入依赖的package包/类
/**
 * Builds a {@code DomHtmlNode} from {@code rootTagNode}.
 *
 * <p>The tag tree is serialized to a DOM using the cleaner's own properties. If the root tag
 * carries a {@code lang} attribute, it is copied onto the DOM's document element.
 *
 * @return a new {@code DomHtmlNode} wrapping the created document
 * @throws ParserConfigurationException if the DOM cannot be created
 */
private DomHtmlNode getDomHtmlNode() throws ParserConfigurationException {
    // NOTE(review): the second constructor argument is presumably HtmlCleaner's escapeXml
    // flag — confirm against the library version in use.
    DomSerializer serializer = new DomSerializer(cleaner.getProperties(), true);
    Document dom = serializer.createDOM(rootTagNode);

    String language = rootTagNode.getAttributeByName("lang");
    if (language != null) {
        dom.getDocumentElement().setAttribute("lang", language);
    }
    return new DomHtmlNode(dom);
}
示例8: updateArtists
import org.htmlcleaner.DomSerializer; //导入依赖的package包/类
public static Boolean updateArtists(StaticDataStore db){
Logging.Log(LOG_TAG, "Fetching Artists");
ArrayList<ArrayList<String>> artists = new ArrayList<ArrayList<String>>();
HtmlCleaner pageParser = new HtmlCleaner();
CleanerProperties props = pageParser.getProperties();
props.setAllowHtmlInsideAttributes(true);
props.setAllowMultiWordAttributes(true);
props.setRecognizeUnicodeChars(true);
props.setOmitComments(true);
try {
String url = "http://www.archive.org/browse.php?field=/metadata/bandWithMP3s&collection=etree";
HttpParams params = new BasicHttpParams();
int timeout = (int) (15 * DateUtils.SECOND_IN_MILLIS);
HttpConnectionParams.setConnectionTimeout(params, timeout);
HttpConnectionParams.setSoTimeout(params, timeout);
HttpClient client = new DefaultHttpClient(params);
HttpGet request = new HttpGet(url);
HttpResponse response = client.execute(request);
StatusLine status = response.getStatusLine();
if (status.getStatusCode() == HttpStatus.SC_OK) {
ResponseHandler<String> responseHandler = new BasicResponseHandler();
TagNode node = pageParser.clean(responseHandler.handleResponse(response));
client.getConnectionManager().shutdown();
org.w3c.dom.Document doc = new DomSerializer(new CleanerProperties()).createDOM(node);
XPath xpath = XPathFactory.newInstance().newXPath();
NodeList artistNodes = (NodeList) xpath.evaluate("//div[@class='row']//div[@class='col-sm-4']/a", doc, XPathConstants.NODESET);
NodeList numberNodes = (NodeList) xpath.evaluate("//div[@class='row']//div[@class='col-sm-4']/text()[preceding-sibling::a]", doc, XPathConstants.NODESET);
Logging.Log(LOG_TAG, "artistNodes: " + artistNodes.getLength());
Logging.Log(LOG_TAG, "numberNodes: " + numberNodes.getLength());
if(artistNodes.getLength() == numberNodes.getLength()){
for (int i = 0; i < artistNodes.getLength(); i++) {
ArrayList<String> artistPair = new ArrayList<String>();
artistPair.add(artistNodes.item(i).getTextContent().replace("'", "'").replace(">", ">").replace("<", "<").replace(""", "\"").replace("&", "&"));
artistPair.add(numberNodes.item(i).getTextContent());
artists.add(artistPair);
}
}
if (artists.size() > 0) {
db.insertArtistBulk(artists);
String s = DateFormat.format("yyyy-MM-dd", new GregorianCalendar().getTime()).toString();
db.updatePref("artistUpdate", s);
Logging.Log(LOG_TAG, "Finished Fetching Artists");
}
else {
Logging.Log(LOG_TAG, "Error Fetching Artists");
}
}
else {
client.getConnectionManager().shutdown();
}
} catch(Exception e) {
e.printStackTrace();
Logging.Log(LOG_TAG, "Error Fetching Artists");
}
return true;
}