本文整理汇总了Java中org.htmlparser.Parser.extractAllNodesThatMatch方法的典型用法代码示例。如果您正苦于以下问题:Java Parser.extractAllNodesThatMatch方法的具体用法?Java Parser.extractAllNodesThatMatch怎么用?Java Parser.extractAllNodesThatMatch使用的例子?那么,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类org.htmlparser.Parser的用法示例。
在下文中一共展示了Parser.extractAllNodesThatMatch方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: parserUrl
import org.htmlparser.Parser; //导入方法依赖的package包/类
/**
 * Extracts every node whose raw text begins with {@code a href=}.
 *
 * @param parser the parser to pull nodes from
 * @return the matching nodes, or {@code null} when the parser raises a
 *         {@code ParserException} (the stack trace is printed in that case)
 */
@Override
public NodeList parserUrl(Parser parser) {
    NodeFilter anchorFilter = new NodeFilter() {
        @Override
        public boolean accept(Node node) {
            return node.getText().startsWith("a href=");
        }
    };
    NodeList matches = null;
    try {
        matches = parser.extractAllNodesThatMatch(anchorFilter);
    } catch (ParserException e) {
        e.printStackTrace();
    }
    return matches;
}
示例2: parseMessage
import org.htmlparser.Parser; //导入方法依赖的package包/类
/**
 * Parses the body of the message and returns a parsed representation.
 * See <a href="http://htmlparser.sourceforge.net/">htmlparser</a> for details.
 *
 * @param url the url that the message resulted from (not read by this method)
 * @param message the Message to parse
 * @return a NodeList containing every node of the page, or {@code null} when the
 *         Content-Type header is absent or does not match {@code text/html.*}, when the
 *         content is null or empty, or when parsing fails
 */
public Object parseMessage(HttpUrl url, Message message) {
    String contentType = message.getHeader("Content-Type");
    boolean isHtml = contentType != null && contentType.matches("text/html.*");
    if (!isHtml) {
        return null;
    }
    byte[] body = message.getContent();
    boolean hasBody = body != null && body.length != 0;
    if (!hasBody) {
        return null;
    }
    // Accept-all filter: the caller wants the complete node list.
    NodeFilter acceptEverything = new NodeFilter() {
        public boolean accept(Node node) {
            return true;
        }
    };
    // NOTE(review): new String(byte[]) decodes with the platform default charset.
    Parser parser = Parser.createParser(new String(body), null);
    try {
        return parser.extractAllNodesThatMatch(acceptEverything);
    } catch (ParserException pe) {
        _logger.severe(pe.toString());
        return null;
    }
}
示例3: getGangliaAttribute
import org.htmlparser.Parser; //导入方法依赖的package包/类
/**
 * Loads the page obtained by substituting {@code clusterName} into
 * {@code gangliaMetricUrl} and returns the option texts of every
 * {@code <select id="metrics-picker">} element on it.
 *
 * @param clusterName cluster name inserted, literally, wherever {@code clusterPattern} matches
 * @return the option texts in document order; empty when no matching select is found
 * @throws ParserException if the page cannot be parsed
 * @throws MalformedURLException if the resulting URL is not valid
 * @throws IOException if the connection cannot be opened
 */
public List<String> getGangliaAttribute(String clusterName)
        throws ParserException, MalformedURLException, IOException {
    // quoteReplacement keeps '$' and '\' in the cluster name literal instead of
    // letting replaceAll interpret them as group references / escapes.
    String url = gangliaMetricUrl.replaceAll(clusterPattern,
            java.util.regex.Matcher.quoteReplacement(clusterName));
    Parser parser = new Parser(new URL(url).openConnection());
    NodeFilter nodeFilter = new AndFilter(new TagNameFilter("select"),
            new HasAttributeFilter("id", "metrics-picker"));
    NodeList nodeList = parser.extractAllNodesThatMatch(nodeFilter);
    List<String> metricList = new ArrayList<String>();
    SimpleNodeIterator iterator = nodeList.elements();
    while (iterator.hasMoreNodes()) {
        NodeList children = iterator.nextNode().getChildren();
        if (children == null) {
            // A select without any child nodes has nothing to contribute.
            continue;
        }
        SimpleNodeIterator childIterator = children.elements();
        while (childIterator.hasMoreNodes()) {
            Node child = childIterator.nextNode();
            // Children may include non-option nodes (e.g. text between options);
            // casting those blindly would throw ClassCastException.
            if (child instanceof OptionTag) {
                metricList.add(((OptionTag) child).getOptionText());
            }
        }
    }
    return metricList;
}
示例4: main
import org.htmlparser.Parser; //导入方法依赖的package包/类
/**
 * Prints the option texts of every {@code <select id="metrics-picker">} element
 * found on a page.
 *
 * @param args optional; {@code args[0]} overrides the default page URL
 * @throws Exception if the page cannot be fetched or parsed
 */
public static void main(String[] args) throws Exception {
    String pageUrl = (args != null && args.length > 0)
            ? args[0]
            : "http://10.8.75.3/ganglia/?r=hour&cs=&ce=&s=by+name&c=Zookeeper_Cluster&tab=m&vn=&hide-hf=false";
    Parser parser = new Parser(new URL(pageUrl).openConnection());
    NodeFilter nodeFilter = new AndFilter(new TagNameFilter("select"),
            new HasAttributeFilter("id", "metrics-picker"));
    NodeList nodeList = parser.extractAllNodesThatMatch(nodeFilter);
    SimpleNodeIterator iterator = nodeList.elements();
    while (iterator.hasMoreNodes()) {
        NodeList children = iterator.nextNode().getChildren();
        if (children == null) {
            // Nothing nested inside this select.
            continue;
        }
        SimpleNodeIterator childIterator = children.elements();
        while (childIterator.hasMoreNodes()) {
            Node child = childIterator.nextNode();
            // Skip anything that is not an option so the cast cannot fail.
            if (child instanceof OptionTag) {
                System.out.println(((OptionTag) child).getOptionText());
            }
        }
    }
}
示例5: parsePageInfo
import org.htmlparser.Parser; //导入方法依赖的package包/类
/**
 * Parses the number of pages of the residential-community listing.
 *
 * <p>Looks at every {@code Div} carrying {@code class="pagenumber"}, takes its second
 * child, and returns the integer following the first {@code '/'} in the first text
 * node where that integer parses.
 *
 * @param url address of the page to load
 * @return the parsed page count, or {@code 0} when none could be found
 * @throws IOException if the connection cannot be opened
 * @throws ParserException if the page cannot be parsed
 */
private int parsePageInfo(final String url) throws IOException, ParserException {
    Parser parser = new Parser(CommonHttpURLConnection.getURLConnection(url));
    NodeFilter nodeFilter = new HasAttributeFilter("class", "pagenumber");
    NodeList nodeList = parser.extractAllNodesThatMatch(nodeFilter);
    for (Node node : nodeList.toNodeArray()) {
        if (!(node instanceof Div)) {
            continue;
        }
        // Guard the child lookups: a div with fewer than two children (or a
        // childless second child) cannot hold the pager text.
        NodeList children = node.getChildren();
        if (children == null || children.size() < 2) {
            continue;
        }
        NodeList innerNodes = children.elementAt(1).getChildren();
        if (innerNodes == null) {
            continue;
        }
        for (Node innerNode : innerNodes.toNodeArray()) {
            if (!(innerNode instanceof TextNode)) {
                continue;
            }
            String pageStr = innerNode.toPlainTextString();
            int slash = pageStr.indexOf("/");
            if (slash < 0) {
                continue;
            }
            try {
                // trim() so surrounding whitespace does not defeat parseInt.
                return Integer.parseInt(pageStr.substring(slash + 1).trim());
            } catch (NumberFormatException ignored) {
                // Best effort: this text node is not "<current>/<total>"; try the next one.
            }
        }
    }
    return 0;
}
示例6: run
import org.htmlparser.Parser; //导入方法依赖的package包/类
/**
 * Crawls the most recent pre-sale permit information from the given page.
 *
 * <p>Rows of the first {@code class="sale1"} element are walked starting at child
 * index 2. Each table row is parsed and stored; the walk stops at the first permit
 * that already has house info recorded. House details are fetched for every stored
 * row except the one at index 2.
 *
 * @param url address of the page to crawl
 * @throws InterruptedException declared by this method; presumably raised by a helper - confirm
 * @throws IOException if the connection cannot be opened
 * @throws ParserException if the page cannot be parsed
 */
public void run(String url) throws InterruptedException, IOException, ParserException {
    Parser parser = new Parser(CommonHttpURLConnection.getURLConnection(url));
    NodeList saleTables = parser.extractAllNodesThatMatch(new HasAttributeFilter("class", "sale1"));
    if (saleTables.size() == 0) {
        return;
    }
    Node[] rows = saleTables.elementAt(0).getChildren().toNodeArray();
    for (int i = 2; i < rows.length; i++) {
        if (!(rows[i] instanceof TableRow)) {
            continue;
        }
        SellCreditInfo sellCreditInfo = parseSellParser(rows[i]);
        log.info("get sell credit info:{}", sellCreditInfo);
        // Has this pre-sale permit already been crawled?
        if (dataOP.getHouseInfoByDepartmentNameAndSellCredit(sellCreditInfo) != null) {
            log.info("already parsing sell credit:{}",sellCreditInfo);
            break;
        }
        dataOP.insertSellCreditInfo(sellCreditInfo);
        // The row at index 2 is stored but its house details are not fetched.
        if (i != 2) {
            parseHouseInfo(sellCreditInfo);
        }
    }
}
示例7: parseDailyBriefInfo
import org.htmlparser.Parser; //导入方法依赖的package包/类
/**
 * Scrapes the daily brief figures from the {@code id="myCont5"} element of
 * {@code http://www.tmsf.com/index.jsp}, stores each entry through {@code dataOP}
 * and writes it to {@code ESOP}.
 *
 * @return the parsed entries; an empty list when the element is not on the page
 * @throws IOException if the connection cannot be opened
 * @throws ParserException if the page cannot be parsed
 */
public List<DailyBriefInfo> parseDailyBriefInfo() throws IOException, ParserException {
    Parser parser = new Parser(CommonHttpURLConnection.getURLConnection("http://www.tmsf.com/index.jsp"));
    NodeFilter nodeFilter = new HasAttributeFilter("id", "myCont5");
    NodeList nodeList = parser.extractAllNodesThatMatch(nodeFilter);
    if (nodeList.size() == 0) {
        // Typed factory instead of the raw EMPTY_LIST constant (no unchecked conversion).
        return Collections.<DailyBriefInfo>emptyList();
    }
    List<DailyBriefInfo> dailyBriefInfoList = new ArrayList<>();
    // Whole hours elapsed since 1970/01/01 00:00:00 UTC.
    int parseHour = (int) (Clock.systemUTC().millis() / (1000 * 3600));
    // Whole days elapsed since 1970/01/01 00:00:00 UTC.
    int parseDay = parseHour / 24;
    NodeList infoNodeList = nodeList.elementAt(0).getChildren().elementAt(1)
            .getChildren().elementAt(1).getChildren();
    // NOTE(review): only odd child indices 5..13 are read - presumably the even ones
    // are whitespace text nodes between rows; confirm against the page markup.
    for (int i = 5; i <= 13; i = i + 2) {
        NodeList cells = infoNodeList.elementAt(i).getChildren();
        DailyBriefInfo dailyBriefInfo = new DailyBriefInfo(
                CharMatcher.WHITESPACE.trimFrom(cells.elementAt(1).toPlainTextString()),
                Integer.parseInt(CharMatcher.WHITESPACE.trimFrom(cells.elementAt(3).toPlainTextString())),
                Integer.parseInt(CharMatcher.WHITESPACE.trimFrom(cells.elementAt(5).toPlainTextString())),
                Integer.parseInt(CharMatcher.WHITESPACE.trimFrom(cells.elementAt(7).toPlainTextString())),
                parseDay, parseHour);
        dailyBriefInfoList.add(dailyBriefInfo);
        dataOP.insertBriefDealInfo(dailyBriefInfo);
        ESOP.writeToES("log/daily_brief_info_es", JSONObject.toJSONString(dailyBriefInfo));
    }
    return dailyBriefInfoList;
}
示例8: parsePageInfo
import org.htmlparser.Parser; //导入方法依赖的package包/类
/**
 * Crawls the number of pages for the current building.
 *
 * <p>Reads the first {@code class="spagenext"} element and, in its first {@code Span}
 * child whose text parses, takes the integer between {@code "/"} and {@code "总数"}.
 *
 * @param url address of the page to load
 * @param departmentInfo the department being crawled; only used in the log message
 * @return the total page count, or {@code 0} when it cannot be determined
 * @throws ParserException if the page cannot be parsed
 * @throws IOException if the connection cannot be opened
 */
public int parsePageInfo(String url, DepartmentInfo departmentInfo) throws ParserException, IOException {
    Parser parser = new Parser(CommonHttpURLConnection.getURLConnection(url));
    int page = 0;
    // Locate the pager element.
    NodeFilter nodeFilter = new HasAttributeFilter("class", "spagenext");
    NodeList nodeList = parser.extractAllNodesThatMatch(nodeFilter);
    if (nodeList.size() == 0) {
        return page;
    }
    // A pager element without children yields no candidates instead of an NPE.
    NodeList pagerChildren = nodeList.elementAt(0).getChildren();
    Node[] candidates = pagerChildren == null ? new Node[0] : pagerChildren.toNodeArray();
    for (Node pageNode : candidates) {
        if (!(pageNode instanceof Span)) {
            continue;
        }
        try {
            String tmp = pageNode.toPlainTextString();
            page = Integer.parseInt(tmp.substring(tmp.indexOf("/") + 1, tmp.indexOf("总数") - 1).trim());
            break;
        } catch (NumberFormatException | IndexOutOfBoundsException ignored) {
            // Best effort: this span does not hold "<current>/<total> 总数"; try the next one.
        }
    }
    // Pass the object itself so a null departmentInfo is logged rather than dereferenced.
    log.info("get total page [{}] for department:[{}]", page, departmentInfo);
    return page;
}
示例9: extractTextByTextNode
import org.htmlparser.Parser; //导入方法依赖的package包/类
/**
 * Extracts the text nodes of an HTML document that {@code isInformativeStricter}
 * accepts.
 *
 * @param content the HTML source; may be {@code null}
 * @return one element per accepted text node; empty when {@code content} is
 *         {@code null} or extraction fails (the failure is logged, not thrown)
 */
public static List<String> extractTextByTextNode(String content) {
    List<String> doc = new ArrayList<String>(); // each element is one paragraph
    if (content == null) {
        return doc;
    }
    try {
        Parser parser = Parser.createParser(content, "utf8");
        NodeFilter textFilter = new NodeClassFilter(TextNode.class);
        NodeList nodelist = parser.extractAllNodesThatMatch(textFilter);
        HashMap<String, Integer> parentWeight = new HashMap<String, Integer>();
        for (int i = 0; i < nodelist.size(); i++) {
            Node textnode = nodelist.elementAt(i);
            if (textnode.toPlainTextString().trim().length() > 0) {
                log.debug(i+": "+" content: "+textnode.toPlainTextString());
            }
            if (isInformativeStricter(textnode, parentWeight)) {
                log.debug(i+": "+" content: "+textnode.toPlainTextString());
                doc.add(textnode.toPlainTextString());
            }
        }
    } catch (Exception e) {
        // Hand the exception to the logger so the stack trace lands in the log
        // with the message instead of going to stderr via printStackTrace().
        log.error("Text extractor has encountered a problem!! "+e.getMessage(), e);
    }
    return doc;
}
示例10: extractTextByTagP
import org.htmlparser.Parser; //导入方法依赖的package包/类
/**
 * Extracts the plain text of every {@code <p>} element that {@code isInformative}
 * accepts.
 *
 * @param content the HTML source; may be {@code null}
 * @return one element per accepted paragraph; empty when {@code content} is
 *         {@code null} or extraction fails (the failure is logged, not thrown)
 */
public static List<String> extractTextByTagP(String content) {
    List<String> doc = new ArrayList<String>(); // each element is one paragraph
    try {
        if (content == null) {
            return doc;
        }
        Parser parser = Parser.createParser(content, "utf8");
        TagNameFilter paraFilter = new TagNameFilter("p"); // get content between <p> </p>
        // TagNameFilter paraFilter2=new TagNameFilter("br");//get content between <br> </br>
        // NodeFilter filter = new OrFilter(paraFilter, paraFilter2);
        // Original author's note on the next call: "reports an error!!"
        NodeList nodelist = parser.extractAllNodesThatMatch(paraFilter);
        HashMap<String, Integer> parentWeight = new HashMap<String, Integer>();
        for (int i = 0; i < nodelist.size(); i++) {
            Node textnode = nodelist.elementAt(i);
            log.debug(i+": "+" content: "+textnode.toPlainTextString());
            if (isInformative(textnode, parentWeight)) {
                log.debug(i+": "+" content: "+textnode.toPlainTextString());
                doc.add(textnode.toPlainTextString());
            }
        }
    } catch (Exception e) {
        // Hand the exception to the logger so the stack trace lands in the log
        // with the message instead of going to stderr via printStackTrace().
        log.error("Text extractor has encountered a problem!! "+e.getMessage(), e);
    }
    return doc;
}
示例11: filterSelectNode
import org.htmlparser.Parser; //导入方法依赖的package包/类
/**
 * Returns every node of the response whose raw text starts with {@code select}.
 *
 * @param responseBody the HTML to scan, parsed with the ISO-8859-1 charset name
 * @return the matching nodes
 * @throws ParserException if the body cannot be parsed
 */
private NodeList filterSelectNode(String responseBody) throws ParserException {
    NodeFilter selectFilter = new NodeFilter() {
        @Override
        public boolean accept(Node node) {
            return node.getText().startsWith("select");
        }
    };
    Parser parser = Parser.createParser(responseBody, HTTP.ISO_8859_1);
    return parser.extractAllNodesThatMatch(selectFilter);
}
示例12: filterTable
import org.htmlparser.Parser; //导入方法依赖的package包/类
/**
 * Returns every node of the response whose raw text, upper-cased, starts with
 * {@code TABLE} or {@code H3}.
 *
 * @param responseBody the HTML to scan, parsed with the ISO-8859-1 charset name
 * @return the matching nodes
 * @throws ParserException if the body cannot be parsed
 */
private NodeList filterTable(String responseBody) throws ParserException {
    NodeFilter tableOrHeadingFilter = new NodeFilter() {
        @Override
        public boolean accept(Node node) {
            String upperText = node.getText().toUpperCase();
            return upperText.startsWith("TABLE") || upperText.startsWith("H3");
        }
    };
    Parser parser = Parser.createParser(responseBody, HTTP.ISO_8859_1);
    return parser.extractAllNodesThatMatch(tableOrHeadingFilter);
}
示例13: getLinks
import org.htmlparser.Parser; //导入方法依赖的package包/类
/**
 * Collects the link of every {@code LinkTag} found in the given resource.
 *
 * @param url the resource handed to the {@code Parser} constructor
 * @return the links in document order (duplicates are kept)
 * @throws ParserException if the resource cannot be opened or parsed
 */
public static List<String> getLinks(String url) throws ParserException {
    NodeList anchors = new Parser(url).extractAllNodesThatMatch(new NodeClassFilter(LinkTag.class));
    List<String> result = new LinkedList<String>();
    int index = 0;
    while (index < anchors.size()) {
        LinkTag anchor = (LinkTag) anchors.elementAt(index);
        result.add(anchor.getLink());
        index++;
    }
    return result;
}
示例14: processResponse
import org.htmlparser.Parser; //导入方法依赖的package包/类
/**
 * Handles one HTTP response: extracts its HTML, converts every node carrying
 * {@code action-type="feed_list_item"} into an element and appends it to {@code root}.
 *
 * @param resp the HTTP response to process
 * @param doc the document passed on to {@code extractContent}
 * @param root the element that receives one child per extracted entry
 * @return {@code true} only when the status is OK, the page is at least 500
 *         characters long, every entry was extracted and exactly 15 entries were
 *         found; {@code false} otherwise
 */
private boolean processResponse(HttpResponse resp, Document doc, Element root) {
    if (resp.getStatusLine().getStatusCode() != HttpStatus.SC_OK) {
        return false;
    }
    System.out.println("[INFO] HTTP Status OK.");
    System.out.println("[INFO] Extracting html page...");
    String html = extractHtml(resp);
    if (html == null) {
        return false;
    }
    System.out.println("[INFO] " + html.length() + "B html page extracted.");
    if (html.length() < 500) {
        System.out.println("[INFO] EOF reached, task completed.");
        return false;
    }
    System.out.println("[INFO] Parsing html page...");
    try {
        NodeList entries = new Parser(html).extractAllNodesThatMatch(
                new HasAttributeFilter("action-type", "feed_list_item"));
        System.out.println("[INFO] " + entries.size() + " entries detected.");
        for (SimpleNodeIterator it = entries.elements(); it.hasMoreNodes(); ) {
            System.out.println("[INFO] processing entry #" + (++total) + "...");
            Element entry = extractContent(it.nextNode(), doc);
            if (entry == null) {
                System.out.println("[ERROR] Data extraction failed.");
                return false;
            }
            root.appendChild(entry);
        }
        // Anything other than exactly 15 entries makes the method report false.
        return entries.size() == 15;
    } catch (ParserException e) {
        System.out.println("[ERROR] Parser failed.");
        e.printStackTrace();
        return false;
    }
}
示例15: extracLinks
import org.htmlparser.Parser; //导入方法依赖的package包/类
/**
 * Collects the link targets found on a page: the link of every {@code LinkTag}
 * plus the {@code src} value of every node whose text starts with {@code frame src=}.
 *
 * @param url the resource handed to the {@code Parser} constructor
 * @return the distinct links found; empty when parsing fails (the failure is logged)
 */
public static Set<String> extracLinks(String url) {
    Set<String> links = new HashSet<String>();
    try {
        Parser parser = new Parser(url);
        parser.setEncoding("utf-8");
        // Filter for <frame> tags, used to pick up the link held in their src attribute.
        @SuppressWarnings("serial")
        NodeFilter frameFilter = new NodeFilter() {
            public boolean accept(Node node) {
                return node.getText().startsWith("frame src=");
            }
        };
        // OrFilter accepting both <a> tags and <frame> tags.
        OrFilter linkFilter = new OrFilter(new NodeClassFilter(LinkTag.class), frameFilter);
        // All nodes that passed the filter.
        NodeList list = parser.extractAllNodesThatMatch(linkFilter);
        for (int i = 0; i < list.size(); i++) {
            Node tag = list.elementAt(i);
            if (tag instanceof LinkTag) {
                // <a> tag
                links.add(((LinkTag) tag).getLink());
            } else {
                // <frame> tag: pull the link out of src, e.g. <frame src="test.html"/>
                String frameUrl = extractFrameSrc(tag.getText());
                if (frameUrl != null) {
                    links.add(frameUrl);
                }
            }
        }
    } catch (ParserException e) {
        logger.error("", e);
    }
    return links;
}

/**
 * Returns the value following {@code src=} in a tag's text, or {@code null} when
 * there is none.
 *
 * <p>Quoted values are read up to the matching quote (or to the end of the text when
 * the quote is never closed); unquoted values are read up to the next whitespace or
 * {@code '>'}. Unlike slicing with fixed offsets, this cannot produce a negative
 * substring bound when the text ends right after the value.
 */
private static String extractFrameSrc(String tagText) {
    int marker = tagText.indexOf("src=");
    if (marker < 0) {
        return null;
    }
    String value = tagText.substring(marker + "src=".length());
    if (value.isEmpty()) {
        return null;
    }
    char first = value.charAt(0);
    if (first == '"' || first == '\'') {
        int closing = value.indexOf(first, 1);
        return closing < 0 ? value.substring(1) : value.substring(1, closing);
    }
    int end = 0;
    while (end < value.length()
            && !Character.isWhitespace(value.charAt(end))
            && value.charAt(end) != '>') {
        end++;
    }
    return value.substring(0, end);
}