本文整理汇总了Java中org.htmlparser.Parser类的典型用法代码示例。如果您正苦于以下问题:Java Parser类的具体用法?Java Parser怎么用?Java Parser使用的例子?那么, 这里精选的类代码示例或许可以为您提供帮助。
Parser类属于org.htmlparser包,在下文中一共展示了Parser类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: parserUrl
import org.htmlparser.Parser; //导入依赖的package包/类
/**
 * Collects every node whose raw text starts with {@code a href=}.
 *
 * @param parser the parser to query
 * @return the matching nodes, or {@code null} when the parser raised a
 *         {@link ParserException} (the stack trace is printed)
 */
@Override
public NodeList parserUrl(Parser parser) {
    // Accept a node only when its text begins with the literal "a href=".
    NodeFilter anchorFilter = new NodeFilter() {
        @Override
        public boolean accept(Node candidate) {
            return candidate.getText().startsWith("a href=");
        }
    };
    NodeList matches = null;
    try {
        matches = parser.extractAllNodesThatMatch(anchorFilter);
    } catch (ParserException e) {
        e.printStackTrace();
    }
    return matches;
}
示例2: getPlainText
import org.htmlparser.Parser; //导入依赖的package包/类
/**
 * Extracts the text of an HTML string by visiting its nodes with a
 * {@link StringBean}.
 *
 * @param htmlStr the HTML markup to convert
 * @return the value of {@code StringBean.getStrings()} after the visit, or an
 *         empty string when a {@link ParserException} occurred (the stack
 *         trace is printed)
 */
public static String getPlainText(String htmlStr) {
    Parser parser = new Parser();
    StringBean textCollector = new StringBean();
    // Do not include link information in the output.
    textCollector.setLinks(false);
    // Replace non-breaking spaces with regular spaces.
    textCollector.setReplaceNonBreakingSpaces(true);
    // Collapse runs of whitespace into a single space.
    textCollector.setCollapse(true);
    try {
        parser.setInputHTML(htmlStr);
        parser.visitAllNodesWith(textCollector);
        return textCollector.getStrings();
    } catch (ParserException e) {
        e.printStackTrace();
        return "";
    }
}
示例3: parseMessage
import org.htmlparser.Parser; //导入依赖的package包/类
/**
 * Parses the body of the message and returns a parsed representation.
 * See <a href="http://htmlparser.sourceforge.net/">htmlparser</a> for details.
 *
 * <p>The body bytes are decoded with the charset named in the
 * {@code Content-Type} header when one is present and supported by the JVM;
 * otherwise the platform default charset is used.
 *
 * @param url the url that the message resulted from (not used by this method)
 * @param message the Message to parse
 * @return a NodeList containing the various Nodes making up the page, or
 *         {@code null} when the Content-Type is missing or does not start with
 *         {@code text/html}, the content is empty, or parsing fails
 */
public Object parseMessage(HttpUrl url, Message message) {
    String contentType = message.getHeader("Content-Type");
    if (contentType == null || !contentType.matches("text/html.*")) {
        return null;
    }
    byte[] content = message.getContent();
    if (content == null || content.length == 0) {
        return null;
    }
    // Decode with the declared charset instead of always using the platform default.
    String html = new String(content, charsetFromContentType(contentType));
    Parser parser = Parser.createParser(html, null);
    try {
        // An accept-everything filter returns every node of the page.
        return parser.extractAllNodesThatMatch(new NodeFilter() {
            public boolean accept(Node node) {
                return true;
            }
        });
    } catch (ParserException pe) {
        _logger.severe(pe.toString());
        return null;
    }
}

/**
 * Resolves the charset named by the {@code charset=} parameter of a
 * Content-Type header value.
 *
 * @param contentType the non-null header value
 * @return the declared charset, or the platform default when none is declared
 *         or the declared name is illegal or unsupported
 */
private java.nio.charset.Charset charsetFromContentType(String contentType) {
    java.util.regex.Matcher declared = java.util.regex.Pattern
            .compile("charset\\s*=\\s*\"?([^\";\\s]+)", java.util.regex.Pattern.CASE_INSENSITIVE)
            .matcher(contentType);
    if (declared.find()) {
        try {
            return java.nio.charset.Charset.forName(declared.group(1));
        } catch (IllegalArgumentException ignored) {
            // Illegal or unsupported charset name: fall back to the default below.
        }
    }
    return java.nio.charset.Charset.defaultCharset();
}
示例4: getGangliaAttribute
import org.htmlparser.Parser; //导入依赖的package包/类
/**
 * Fetches the page built from {@code gangliaMetricUrl} and returns the option
 * texts of every {@code <select id="metrics-picker">} element on it.
 *
 * <p>The URL is built by replacing matches of {@code clusterPattern} in
 * {@code gangliaMetricUrl} with the cluster name. The name is inserted
 * literally, so {@code $} and {@code \} in it are not treated as regex
 * replacement syntax. Children of the select element that are not
 * {@link OptionTag}s (for example text between options) are skipped.
 *
 * @param clusterName the cluster name to substitute into the URL
 * @return the option texts in document order; empty when nothing matches
 * @throws ParserException if the page cannot be parsed
 * @throws MalformedURLException if the resulting URL is malformed
 * @throws IOException if the connection cannot be opened
 */
public List<String> getGangliaAttribute(String clusterName)
        throws ParserException, MalformedURLException, IOException {
    String url = gangliaMetricUrl.replaceAll(clusterPattern,
            java.util.regex.Matcher.quoteReplacement(clusterName));
    Parser parser = new Parser(new URL(url).openConnection());
    NodeFilter pickerFilter = new AndFilter(new TagNameFilter("select"),
            new HasAttributeFilter("id", "metrics-picker"));
    NodeList pickers = parser.extractAllNodesThatMatch(pickerFilter);
    List<String> metricList = new ArrayList<String>();
    SimpleNodeIterator pickerIterator = pickers.elements();
    while (pickerIterator.hasMoreNodes()) {
        NodeList options = pickerIterator.nextNode().getChildren();
        if (options == null) {
            // A select element without children has nothing to contribute.
            continue;
        }
        SimpleNodeIterator optionIterator = options.elements();
        while (optionIterator.hasMoreNodes()) {
            Node child = optionIterator.nextNode();
            // Only option tags carry a metric name; skip text and other nodes.
            if (child instanceof OptionTag) {
                metricList.add(((OptionTag) child).getOptionText());
            }
        }
    }
    return metricList;
}
示例5: main
import org.htmlparser.Parser; //导入依赖的package包/类
/**
 * Fetches a fixed Ganglia page and prints the option text of every option
 * inside each {@code <select id="metrics-picker">} element.
 *
 * <p>Children of the select element that are not {@link OptionTag}s (for
 * example text between options) are skipped rather than cast.
 *
 * @param args command-line arguments (not used)
 * @throws Exception if the page cannot be fetched or parsed
 */
public static void main(String[] args) throws Exception {
    Parser parser = new Parser(new URL("http://10.8.75.3/ganglia/?r=hour&cs=&ce=&s=by+name&c=Zookeeper_Cluster&tab=m&vn=&hide-hf=false").openConnection());
    NodeFilter pickerFilter = new AndFilter(new TagNameFilter("select"),
            new HasAttributeFilter("id", "metrics-picker"));
    NodeList pickers = parser.extractAllNodesThatMatch(pickerFilter);
    SimpleNodeIterator pickerIterator = pickers.elements();
    while (pickerIterator.hasMoreNodes()) {
        NodeList options = pickerIterator.nextNode().getChildren();
        if (options == null) {
            // A select element without children has nothing to print.
            continue;
        }
        SimpleNodeIterator optionIterator = options.elements();
        while (optionIterator.hasMoreNodes()) {
            Node child = optionIterator.nextNode();
            // Only option tags carry a metric name; skip text and other nodes.
            if (child instanceof OptionTag) {
                System.out.println(((OptionTag) child).getOptionText());
            }
        }
    }
}
示例6: splitHtml
import org.htmlparser.Parser; //导入依赖的package包/类
/**
 * Splits the HTML held in {@code content} into pages.
 *
 * <p>The parsed node list is handed to {@code recusiveSplitHtml}. The remaining
 * text from {@code startPosition} onward is then appended as a final page,
 * prefixed with the opening tags from {@code tagNodeList} that start before
 * {@code startPosition} and whose end tag ends at or after it.
 * NOTE(review): {@code tagNodeList} and {@code startPosition} are presumably
 * updated by {@code recusiveSplitHtml} - confirm against that method.
 *
 * @return the pages; if a {@link ParserException} occurs the stack trace is
 *         printed and whatever was collected so far is returned
 */
private List<String> splitHtml() {
    List<String> pages = new ArrayList<String>();
    try {
        NodeList parsed = Parser.createParser(content, "UTF-8").parse(null);
        pages = recusiveSplitHtml(parsed);

        // Re-open every tag that is still unclosed at the split position.
        StringBuilder tail = new StringBuilder();
        for (TagNode openTag : tagNodeList) {
            boolean startsBeforeSplit = openTag.getStartPosition() < startPosition;
            if (startsBeforeSplit && openTag.getEndTag().getEndPosition() >= startPosition) {
                tail.append("<").append(openTag.getText()).append(">");
            }
        }
        tail.append(content.substring(startPosition));

        // Re-parse the tail and serialize it again via toHtml().
        NodeList tailNodes = Parser.createParser(tail.toString(), "UTF-8").parse(null);
        pages.add(tailNodes.toHtml());
    } catch (ParserException e) {
        e.printStackTrace();
    }
    return pages;
}
示例7: html2text
import org.htmlparser.Parser; //导入依赖的package包/类
/**
 * Turns an HTML document into plain text.
 *
 * @param html HTML document
 * @return the extracted text (an empty string when the bean yields no text),
 *         or <code>null</code> if parsing failed
 */
public static synchronized String html2text(String html) {
    // Collector: no links, regular spaces instead of non-breaking ones,
    // whitespace runs collapsed.
    StringBean textBean = new StringBean();
    textBean.setLinks(false);
    textBean.setReplaceNonBreakingSpaces(true);
    textBean.setCollapse(true);

    Parser htmlParser = new Parser();
    try {
        htmlParser.setInputHTML(html);
        htmlParser.visitAllNodesWith(textBean);
    } catch (ParserException e) {
        return null;
    }

    String text = textBean.getStrings();
    return (text == null) ? "" : text;
}
示例8: file2text
import org.htmlparser.Parser; //导入依赖的package包/类
/**
 * Reads an HTML document from a file and converts it into plain text.
 *
 * <p>As in {@code html2text}, a document that yields no text produces an empty
 * string, so <code>null</code> is returned only on failure.
 *
 * @param filename name of file containing HTML documents
 * @return plain text (empty when the document has no text content), or
 *         <code>null</code> if the reading or conversion failed
 */
public static synchronized String file2text(String filename) {
    // read from file and convert HTML document
    StringBean sb = new StringBean();
    sb.setLinks(false); // no links
    sb.setReplaceNonBreakingSpaces(true); // replace non-breaking spaces
    sb.setCollapse(true); // replace sequences of whitespaces
    Parser parser = new Parser();
    try {
        parser.setResource(filename);
        parser.visitAllNodesWith(sb);
    } catch (ParserException e) {
        return null;
    }
    String docText = sb.getStrings();
    if (docText == null) {
        docText = ""; // no content: keep null reserved for failures
    }
    return docText;
}
示例9: run
import org.htmlparser.Parser; //导入依赖的package包/类
/**
 * Parses the HTML held in {@code content} and passes every record node that
 * has children to {@code toObject}.
 *
 * <p>Any exception is logged by message only. {@code store} is closed in all
 * cases.
 */
@Override
public void run() {
    try {
        parser = new Parser(content);
        logger.info(currentThread().getName() + "开始解析Post请求响应的HTML!,并存储到HBASE中!");
        NodeIterator topLevel = parser.elements();
        // Skip the first top-level node and take the children of the second one.
        topLevel.nextNode();
        NodeList records = topLevel.nextNode().getChildren();
        /*
         * Check whether the response carries real content. This triggers on
         * an error or once all data has been read: endFlag is raised so that
         * no new threads are started and the current task winds down.
         */
        if (records.size() <= 4) {
            program.endFlag = true;
        }
        /*
         * Locate the tag records and parse them; the two leading nodes are
         * dropped first.
         */
        records.remove(0);
        records.remove(0);
        for (SimpleNodeIterator it = records.elements(); it.hasMoreNodes();) {
            Node record = it.nextNode();
            if (record.getChildren() == null) {
                continue;
            }
            toObject(record);
        }
    } catch (Exception e) {
        logger.error(currentThread().getName() + "解析HTML文件出现异常!\n"+e.getMessage()+"\n");
    } finally {
        logger.info(currentThread().getName() + "HTML文件解析结束!");
        store.close();
    }
}
示例10: parsePageInfo
import org.htmlparser.Parser; //导入依赖的package包/类
/**
 * Parses the number of pages of the residential-community listing.
 *
 * <p>Looks at every {@code Div} with class {@code pagenumber}, takes the
 * children of its second child, and returns the integer following the first
 * {@code /} of the first text node where that value parses. Surrounding
 * whitespace around the number is ignored. Divs without the expected
 * structure are skipped.
 *
 * @param url the page to fetch
 * @return the page count, or {@code 0} when none could be found
 * @throws IOException if the connection cannot be opened
 * @throws ParserException if the page cannot be parsed
 */
private int parsePageInfo(final String url) throws IOException, ParserException {
    Parser parser = new Parser(CommonHttpURLConnection.getURLConnection(url));
    NodeFilter nodeFilter = new HasAttributeFilter("class", "pagenumber");
    NodeList nodeList = parser.extractAllNodesThatMatch(nodeFilter);
    for (Node node : nodeList.toNodeArray()) {
        if (!(node instanceof Div)) {
            continue;
        }
        // Guard against divs that lack the expected second child.
        NodeList divChildren = node.getChildren();
        if (divChildren == null || divChildren.size() < 2) {
            continue;
        }
        NodeList innerNodes = divChildren.elementAt(1).getChildren();
        if (innerNodes == null) {
            continue;
        }
        for (Node innerNode : innerNodes.toNodeArray()) {
            if (!(innerNode instanceof TextNode)) {
                continue;
            }
            String pageStr = innerNode.toPlainTextString();
            int slash = pageStr.indexOf('/');
            if (slash < 0) {
                continue;
            }
            try {
                return Integer.parseInt(pageStr.substring(slash + 1).trim());
            } catch (NumberFormatException ignored) {
                // Text after the slash is not a number; try the next text node.
            }
        }
    }
    return 0;
}
示例11: run
import org.htmlparser.Parser; //导入依赖的package包/类
/**
 * Crawls the most recent pre-sale permit entries from the given page.
 *
 * <p>Starting at child index 2 of the first element with class {@code sale1},
 * every {@code TableRow} is parsed into a {@code SellCreditInfo}. Processing
 * stops at the first permit that already has a stored {@code HouseInfo}.
 * Otherwise the permit is inserted and, except for the row at index 2, its
 * house information is parsed as well.
 *
 * @param url the page to fetch
 * @throws InterruptedException declared for callers; presumably raised by the
 *         helpers invoked here
 * @throws IOException if the connection cannot be opened
 * @throws ParserException if the page cannot be parsed
 */
public void run(String url) throws InterruptedException, IOException, ParserException {
    Parser parser = new Parser(CommonHttpURLConnection.getURLConnection(url));
    NodeList saleNodes = parser.extractAllNodesThatMatch(new HasAttributeFilter("class", "sale1"));
    if (saleNodes.size() == 0) {
        return;
    }
    Node[] rows = saleNodes.elementAt(0).getChildren().toNodeArray();
    for (int i = 2; i < rows.length; i++) {
        if (!(rows[i] instanceof TableRow)) {
            continue;
        }
        SellCreditInfo sellCreditInfo = parseSellParser(rows[i]);
        log.info("get sell credit info:{}", sellCreditInfo);
        // Has this pre-sale permit already been crawled?
        if (dataOP.getHouseInfoByDepartmentNameAndSellCredit(sellCreditInfo) != null) {
            log.info("already parsing sell credit:{}",sellCreditInfo);
            break;
        }
        dataOP.insertSellCreditInfo(sellCreditInfo);
        if (i != 2) {
            parseHouseInfo(sellCreditInfo);
        }
    }
}
示例12: parseDailyBriefInfo
import org.htmlparser.Parser; //导入依赖的package包/类
/**
 * Fetches {@code http://www.tmsf.com/index.jsp} and reads the daily brief
 * rows found under the element with id {@code myCont5}.
 *
 * <p>Rows are taken from child indexes 5, 7, 9, 11 and 13; within each row
 * the cells at child indexes 1, 3, 5 and 7 supply one text and three integer
 * values. Every entry is also stored through {@code dataOP} and written
 * through {@code ESOP}.
 *
 * @return the parsed entries, or an empty list when the element is missing
 * @throws IOException if the connection cannot be opened
 * @throws ParserException if the page cannot be parsed
 */
public List<DailyBriefInfo> parseDailyBriefInfo() throws IOException, ParserException {
    Parser parser = new Parser(CommonHttpURLConnection.getURLConnection("http://www.tmsf.com/index.jsp"));
    NodeFilter nodeFilter = new HasAttributeFilter("id", "myCont5");
    NodeList nodeList = parser.extractAllNodesThatMatch(nodeFilter);
    if (nodeList.size() == 0) {
        // Typed empty list instead of the raw Collections.EMPTY_LIST constant.
        return Collections.emptyList();
    }
    List<DailyBriefInfo> dailyBriefInfoList = new ArrayList<>();
    // Hours elapsed since 1970/01/01 00:00:00
    int parseHour = (int) (Clock.systemUTC().millis() / (1000 * 3600));
    // Days elapsed since 1970/01/01 00:00:00
    int parseDay = parseHour / 24;
    NodeList infoNodeList = nodeList.elementAt(0).getChildren().elementAt(1)
            .getChildren().elementAt(1).getChildren();
    for (int i = 5; i <= 13; i = i + 2) {
        // Look the row's cells up once instead of once per column.
        NodeList cells = infoNodeList.elementAt(i).getChildren();
        DailyBriefInfo dailyBriefInfo = new DailyBriefInfo(
                CharMatcher.WHITESPACE.trimFrom(cells.elementAt(1).toPlainTextString()),
                Integer.parseInt(CharMatcher.WHITESPACE.trimFrom(cells.elementAt(3).toPlainTextString())),
                Integer.parseInt(CharMatcher.WHITESPACE.trimFrom(cells.elementAt(5).toPlainTextString())),
                Integer.parseInt(CharMatcher.WHITESPACE.trimFrom(cells.elementAt(7).toPlainTextString())),
                parseDay, parseHour);
        dailyBriefInfoList.add(dailyBriefInfo);
        dataOP.insertBriefDealInfo(dailyBriefInfo);
        ESOP.writeToES("log/daily_brief_info_es", JSONObject.toJSONString(dailyBriefInfo));
    }
    return dailyBriefInfoList;
}
示例13: parsePageInfo
import org.htmlparser.Parser; //导入依赖的package包/类
/**
 * Crawls the number of pages for the current building.
 *
 * <p>Inspects the children of the first element with class
 * {@code spagenext}. For the first {@code Span} whose text parses, the page
 * count is the trimmed integer from just after the first {@code /} up to one
 * character before {@code 总数}. Spans that fail to parse are skipped.
 *
 * @param url the page to fetch
 * @param departmentInfo the building being crawled; used only in the log line
 * @return the page count, or {@code 0} when it cannot be determined
 * @throws ParserException if the page cannot be parsed
 * @throws IOException if the connection cannot be opened
 */
public int parsePageInfo(String url, DepartmentInfo departmentInfo) throws ParserException, IOException {
    Parser parser = new Parser(CommonHttpURLConnection.getURLConnection(url));
    // Parse the page count
    NodeList pagers = parser.extractAllNodesThatMatch(new HasAttributeFilter("class", "spagenext"));
    if (pagers.size() == 0) {
        return 0;
    }
    int page = 0;
    for (Node child : pagers.elementAt(0).getChildren().toNodeArray()) {
        if (!(child instanceof Span)) {
            continue;
        }
        try {
            String text = child.toPlainTextString();
            int from = text.indexOf("/") + 1;
            int to = text.indexOf("总数") - 1;
            page = Integer.parseInt(text.substring(from, to).trim());
            break;
        } catch (Exception e) {
            // This span does not hold a usable page count; try the next one.
        }
    }
    log.info("get total page [{}] for department:[{}]", page, departmentInfo.toString());
    return page;
}
示例14: PostCleaner
import org.htmlparser.Parser; //导入依赖的package包/类
/**
 * Builds the cleaned text of a post by visiting the parsed HTML with a
 * {@code PostCleanerVisitor}. If the parser raises a {@link ParserException},
 * a message is written to standard error and
 * {@code PostCleanerVisitor.simpleProc} is used instead.
 *
 * @param html the post's HTML
 * @param minCodeChars passed through to {@code PostCleanerVisitor}
 * @param excludeCode passed through to {@code PostCleanerVisitor}
 */
public PostCleaner(String html, int minCodeChars, boolean excludeCode) {
    String cleaned;
    try {
        Parser parser = Parser.createParser(html, "utf8");
        PostCleanerVisitor visitor = new PostCleanerVisitor(minCodeChars, excludeCode);
        parser.visitAllNodesWith(visitor);
        cleaned = visitor.getText();
    } catch (ParserException e) {
        System.err.println(" Parser exception: " + e + " trying simple conversion");
        // Fallback: simple conversion without the HTML parser.
        cleaned = PostCleanerVisitor.simpleProc(html);
    }
    mText = cleaned;
}
示例15: extractKeyWordText
import org.htmlparser.Parser; //导入依赖的package包/类
/**
 * Parses the page at {@code url} and passes all of its nodes, together with
 * {@code keyword}, to {@code processNodeList}. A {@link ParserException} is
 * reported by printing its stack trace.
 *
 * @param url address of the page to parse
 * @param keyword the keyword forwarded to {@code processNodeList}
 */
public static void extractKeyWordText(String url, String keyword) {
    try {
        // Create a parser with the page url as its argument.
        Parser pageParser = new Parser(url);
        // Set the page encoding.
        // NOTE(review): the original note mentions a gb2312 page, but the value set is utf-8 - confirm.
        pageParser.setEncoding("utf-8");
        // Walk every node (null means no NodeFilter) and process the resulting list.
        processNodeList(pageParser.parse(null), keyword);
    } catch (ParserException e) {
        e.printStackTrace();
    }
}