当前位置: 首页>>代码示例>>Java>>正文


Java Parser.extractAllNodesThatMatch方法代码示例

本文整理汇总了Java中org.htmlparser.Parser.extractAllNodesThatMatch方法的典型用法代码示例。如果您正苦于以下问题:Java Parser.extractAllNodesThatMatch方法的具体用法?Java Parser.extractAllNodesThatMatch怎么用?Java Parser.extractAllNodesThatMatch使用的例子?那么, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在org.htmlparser.Parser的用法示例。


在下文中一共展示了Parser.extractAllNodesThatMatch方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: parserUrl

import org.htmlparser.Parser; //导入方法依赖的package包/类
@Override
public NodeList parserUrl(Parser parser) {
	// Matches nodes whose raw tag text begins with "a href=" (anchor tags).
	NodeFilter anchorFilter = new NodeFilter() {
		@Override
		public boolean accept(Node node) {
			return node.getText().startsWith("a href=");
		}
	};
	try {
		return parser.extractAllNodesThatMatch(anchorFilter);
	} catch (ParserException e) {
		// Parse failure is reported but not propagated; caller receives null.
		e.printStackTrace();
		return null;
	}
}
 
开发者ID:PerkinsZhu,项目名称:WebSprider,代码行数:20,代码来源:HtmlParser01.java

示例2: parseMessage

import org.htmlparser.Parser; //导入方法依赖的package包/类
/**
 * Parses the body of an HTTP message into an htmlparser node list.
 * See <a href="http://htmlparser.sourceforge.net/">htmlparser</a> for details.
 *
 * @param url the url that the message resulted from
 * @param message the Message to parse
 * @return a NodeList containing every node making up the page, or null when
 *         the message is not text/html or has no content, or on parse failure
 */
  public Object parseMessage(HttpUrl url, Message message) {
      String contentType = message.getHeader("Content-Type");
      if (contentType == null || !contentType.matches("text/html.*")) {
          return null;
      }
      byte[] content = message.getContent();
      if (content == null || content.length == 0) {
          return null;
      }
      // Accept-everything filter: we want the complete node list for the page.
      NodeFilter acceptAll = new NodeFilter() {
          public boolean accept(Node node) {
              return true;
          }
      };
      Parser parser = Parser.createParser(new String(content), null);
      try {
          return parser.extractAllNodesThatMatch(acceptAll);
      } catch (ParserException pe) {
          _logger.severe(pe.toString());
          return null;
      }
  }
 
开发者ID:Neraud,项目名称:PADListener,代码行数:30,代码来源:HTMLParser.java

示例3: getGangliaAttribute

import org.htmlparser.Parser; //导入方法依赖的package包/类
/**
 * Fetches the available Ganglia metric names for a cluster by scraping the
 * {@code <select id="metrics-picker">} element of the cluster's metric page.
 *
 * @param clusterName substituted into the configured metric url pattern
 * @return option texts of the metrics picker, in document order
 */
public List<String> getGangliaAttribute(String clusterName)
		throws ParserException, MalformedURLException, IOException {
	String url = gangliaMetricUrl.replaceAll(clusterPattern, clusterName);
	Parser parser = new Parser(new URL(url).openConnection());
	NodeFilter metricsPicker = new AndFilter(new TagNameFilter("select"),
			new HasAttributeFilter("id", "metrics-picker"));
	List<String> metricList = new ArrayList<String>();
	SimpleNodeIterator selects = parser.extractAllNodesThatMatch(metricsPicker).elements();
	while (selects.hasMoreNodes()) {
		// Every child of the matched <select> is expected to be an <option>.
		SimpleNodeIterator options = selects.nextNode().getChildren().elements();
		while (options.hasMoreNodes()) {
			OptionTag option = (OptionTag) options.nextNode();
			metricList.add(option.getOptionText());
		}
	}
	return metricList;
}
 
开发者ID:Ctrip-DI,项目名称:Hue-Ctrip-DI,代码行数:23,代码来源:GangliaHttpParser.java

示例4: main

import org.htmlparser.Parser; //导入方法依赖的package包/类
/**
 * Ad-hoc smoke test: prints every Ganglia metric name scraped from the
 * metrics-picker &lt;select&gt; element of a hard-coded cluster page.
 */
public static void main(String[] args) throws Exception {
	Parser parser = new Parser(new URL("http://10.8.75.3/ganglia/?r=hour&cs=&ce=&s=by+name&c=Zookeeper_Cluster&tab=m&vn=&hide-hf=false").openConnection());
	NodeFilter metricsPicker = new AndFilter(new TagNameFilter("select"),
			new HasAttributeFilter("id", "metrics-picker"));
	SimpleNodeIterator selects = parser.extractAllNodesThatMatch(metricsPicker).elements();
	while (selects.hasMoreNodes()) {
		// Every child of the matched <select> is expected to be an <option>.
		SimpleNodeIterator options = selects.nextNode().getChildren().elements();
		while (options.hasMoreNodes()) {
			OptionTag option = (OptionTag) options.nextNode();
			System.out.println(option.getOptionText());
		}
	}
}
 
开发者ID:Ctrip-DI,项目名称:Hue-Ctrip-DI,代码行数:18,代码来源:TestGangliaHttpParser.java

示例5: parsePageInfo

import org.htmlparser.Parser; //导入方法依赖的package包/类
/***
 * Parses the total number of pages of a residential-community listing.
 * Looks for a div with class "pagenumber" whose text contains "current/total"
 * and returns the number after the '/'.
 *
 * @param url listing page url
 * @return total page count, or 0 if it cannot be determined
 * @throws IOException
 * @throws ParserException
 */
private int parsePageInfo(final String url) throws IOException, ParserException {
    Parser parser = new Parser(CommonHttpURLConnection.getURLConnection(url));

    NodeFilter nodeFilter = new HasAttributeFilter("class", "pagenumber");
    NodeList nodeList = parser.extractAllNodesThatMatch(nodeFilter);
    for (Node node : nodeList.toNodeArray()) {
        if (!(node instanceof Div)) {
            continue;
        }
        for (Node innerNode : node.getChildren().elementAt(1).getChildren().toNodeArray()) {
            if (!(innerNode instanceof TextNode)) {
                continue;
            }
            String pageStr = innerNode.toPlainTextString();
            if (!pageStr.contains("/")) {
                continue;
            }
            pageStr = pageStr.substring(pageStr.indexOf("/") + 1);
            try {
                return Integer.parseInt(pageStr);
            } catch (NumberFormatException ignored) {
                // parseInt can only throw NumberFormatException here; the text
                // after '/' was not a number, so keep scanning other text nodes.
            }
        }
    }
    return 0;
}
 
开发者ID:deanjin,项目名称:houseHunter,代码行数:36,代码来源:DepartmentParser.java

示例6: run

import org.htmlparser.Parser; //导入方法依赖的package包/类
/***
 * Crawls the most recent pre-sale permit records from the "transparent
 * housing" site page at the given url (table with class "sale1").
 *
 * Stops at the first permit that is already stored — assumes rows are
 * ordered newest-first (TODO confirm against the site layout).
 *
 * @param url
 * @throws InterruptedException
 * @throws IOException
 * @throws ParserException
 */
public void run(String url) throws InterruptedException, IOException, ParserException {

    URLConnection urlConnection = CommonHttpURLConnection.getURLConnection(url);
    Parser parser = new Parser(urlConnection);
    // The permit table is the element with class "sale1".
    NodeFilter nodeFilter = new HasAttributeFilter("class", "sale1");
    NodeList nodeList = parser.extractAllNodesThatMatch(nodeFilter);

    if (nodeList.toNodeArray().length > 0) {
        Node[] sellCreditNodeArray = nodeList.elementAt(0).getChildren().toNodeArray();
        // Starts at index 2 — presumably skipping header/whitespace children (TODO confirm).
        for (int i = 2; i < sellCreditNodeArray.length; i++) {
            if (sellCreditNodeArray[i] instanceof TableRow) {
                SellCreditInfo sellCreditInfo = parseSellParser(sellCreditNodeArray[i]);
                log.info("get sell credit info:{}", sellCreditInfo);
                // Has this pre-sale permit been crawled before?
                HouseInfo houseInfo = dataOP.getHouseInfoByDepartmentNameAndSellCredit(sellCreditInfo);
                if(houseInfo != null){
                    log.info("already parsing sell credit:{}",sellCreditInfo);
                    break;
                }
                dataOP.insertSellCreditInfo(sellCreditInfo);
                // NOTE(review): the first row (i==2) is stored but its houses are
                // not parsed — looks intentional, worth confirming.
                if(i==2) continue;
                parseHouseInfo(sellCreditInfo);
            }
        }
    }
}
 
开发者ID:deanjin,项目名称:houseHunter,代码行数:34,代码来源:SellCreditParser.java

示例7: parseDailyBriefInfo

import org.htmlparser.Parser; //导入方法依赖的package包/类
/**
 * Scrapes the daily deal-summary table from the tmsf home page (element with
 * id "myCont5"), stores each row, and writes it to ES.
 *
 * @return the parsed summary rows; empty list when the container is missing
 * @throws IOException
 * @throws ParserException
 */
public List<DailyBriefInfo> parseDailyBriefInfo() throws IOException, ParserException {

    Parser parser = new Parser(CommonHttpURLConnection.getURLConnection("http://www.tmsf.com/index.jsp"));
    NodeFilter nodeFilter = new HasAttributeFilter("id", "myCont5");
    NodeList nodeList = parser.extractAllNodesThatMatch(nodeFilter);
    if (nodeList.toNodeArray().length == 0) {
        // Typed empty list instead of the raw Collections.EMPTY_LIST constant.
        return Collections.emptyList();
    }

    List<DailyBriefInfo> dailyBriefInfoList = new ArrayList<>();

    // Hours since 1970/01/01 00:00:00 UTC.
    int parseHour = (int) (Clock.systemUTC().millis() / (1000 * 3600));

    // Days since 1970/01/01 00:00:00 UTC.
    int parseDay = parseHour / 24;

    NodeList infoNodeList = nodeList.elementAt(0).getChildren().elementAt(1)
            .getChildren().elementAt(1).getChildren();

    // Data rows sit at odd child indexes 5..13; within a row the name and the
    // three numeric columns sit at child indexes 1/3/5/7 (site-layout specific —
    // TODO confirm these offsets if the page markup changes).
    for (int i = 5; i <= 13; i = i + 2) {
        DailyBriefInfo dailyBriefInfo = new DailyBriefInfo(CharMatcher.WHITESPACE.trimFrom(infoNodeList.elementAt(i).getChildren().elementAt(1).toPlainTextString()),
                Integer.parseInt(CharMatcher.WHITESPACE.trimFrom(infoNodeList.elementAt(i).getChildren().elementAt(3).toPlainTextString())),
                Integer.parseInt(CharMatcher.WHITESPACE.trimFrom(infoNodeList.elementAt(i).getChildren().elementAt(5).toPlainTextString())),
                Integer.parseInt(CharMatcher.WHITESPACE.trimFrom(infoNodeList.elementAt(i).getChildren().elementAt(7).toPlainTextString())),
                parseDay, parseHour);

        dailyBriefInfoList.add(dailyBriefInfo);
        dataOP.insertBriefDealInfo(dailyBriefInfo);

        ESOP.writeToES("log/daily_brief_info_es", JSONObject.toJSONString(dailyBriefInfo));
    }

    return dailyBriefInfoList;

}
 
开发者ID:deanjin,项目名称:houseHunter,代码行数:37,代码来源:DailyDealParser.java

示例8: parsePageInfo

import org.htmlparser.Parser; //导入方法依赖的package包/类
/**
 * Determines the total number of result pages for a building listing.
 * The pager element (class "spagenext") contains a span whose text looks
 * like "…/N 总数…"; the number between '/' and "总数" is the page count.
 *
 * @param url listing page url
 * @param departmentInfo used only for logging context
 * @return total page count, or 0 when the pager is absent or unparseable
 * @throws ParserException
 * @throws IOException
 */
public int parsePageInfo(String url, DepartmentInfo departmentInfo) throws ParserException, IOException {

    Parser parser = new Parser(CommonHttpURLConnection.getURLConnection(url));

    int page = 0;
    // Locate the pager container.
    NodeFilter pagerFilter = new HasAttributeFilter("class", "spagenext");
    NodeList pagerNodes = parser.extractAllNodesThatMatch(pagerFilter);
    if (pagerNodes.size() == 0) {
        return page;
    }

    for (Node child : pagerNodes.elementAt(0).getChildren().toNodeArray()) {
        if (!(child instanceof Span)) {
            continue;
        }
        try {
            String text = child.toPlainTextString();
            page = Integer.parseInt(text.substring(text.indexOf("/") + 1, text.indexOf("总数") - 1).trim());
            break;
        } catch (Exception ignored) {
            // Best effort: on any parse failure fall through to the next span.
        }
    }

    log.info("get total page [{}] for department:[{}]", page, departmentInfo.toString());

    return page;
}
 
开发者ID:deanjin,项目名称:houseHunter,代码行数:36,代码来源:HouseParser.java

示例9: extractTextByTextNode

import org.htmlparser.Parser; //导入方法依赖的package包/类
/**
 * Extracts informative text paragraphs from an html document by walking its
 * text nodes. Each element of the returned list is one paragraph.
 *
 * @param content raw html, parsed as utf8; null yields an empty list
 * @return paragraphs judged informative by isInformativeStricter
 */
public static List<String> extractTextByTextNode(String content){
    List<String> doc = new ArrayList<String>();
    if (content == null) {
        return doc;
    }
    try {
        Parser parser = Parser.createParser(content, "utf8");
        NodeList textNodes = parser.extractAllNodesThatMatch(new NodeClassFilter(TextNode.class));
        // Accumulates per-parent weights used by the informativeness heuristic.
        HashMap<String, Integer> parentWeight = new HashMap<String, Integer>();
        for (int i = 0; i < textNodes.size(); i++) {
            Node textNode = (Node) textNodes.elementAt(i);
            if (textNode.toPlainTextString().trim().length() > 0) {
                log.debug(i + ": " + " content: " + textNode.toPlainTextString());
            }
            if (isInformativeStricter(textNode, parentWeight)) {
                log.debug(i + ": " + " content: " + textNode.toPlainTextString());
                doc.add(textNode.toPlainTextString());
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
        log.error("Text extractor  has encountered a problem!! " + e.getMessage());
    }
    return doc;
}
 
开发者ID:YufangWoo,项目名称:news-crawler,代码行数:28,代码来源:HtmlContentExtractor.java

示例10: extractTextByTagP

import org.htmlparser.Parser; //导入方法依赖的package包/类
/**
 * Extracts informative text paragraphs from an html document by collecting
 * the contents of &lt;p&gt; tags. Each element of the returned list is one paragraph.
 *
 * @param content raw html, parsed as utf8; null yields an empty list
 * @return paragraphs judged informative by isInformative
 */
public static List<String> extractTextByTagP(String content){
    List<String> doc = new ArrayList<String>();
    try {
        if (content == null) {
            return doc;
        }
        Parser parser = Parser.createParser(content, "utf8");
        // Matches content between <p> and </p>.
        TagNameFilter paraFilter = new TagNameFilter("p");
        // NOTE(review): this extraction has been observed to throw; the broad
        // catch below keeps the extractor best-effort.
        NodeList paragraphs = parser.extractAllNodesThatMatch(paraFilter);
        // Accumulates per-parent weights used by the informativeness heuristic.
        HashMap<String, Integer> parentWeight = new HashMap<String, Integer>();
        for (int i = 0; i < paragraphs.size(); i++) {
            Node paragraph = (Node) paragraphs.elementAt(i);
            log.debug(i + ": " + " content: " + paragraph.toPlainTextString());
            if (isInformative(paragraph, parentWeight)) {
                log.debug(i + ": " + " content: " + paragraph.toPlainTextString());
                doc.add(paragraph.toPlainTextString());
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
        log.error("Text extractor  has encountered a problem!! " + e.getMessage());
    }
    return doc;
}
 
开发者ID:YufangWoo,项目名称:news-crawler,代码行数:28,代码来源:HtmlContentExtractor.java

示例11: filterSelectNode

import org.htmlparser.Parser; //导入方法依赖的package包/类
/**
 * Returns every node whose raw tag text starts with "select".
 * The response body is parsed as ISO-8859-1.
 */
private NodeList filterSelectNode(String responseBody) throws ParserException {
    Parser parser = Parser.createParser(responseBody, HTTP.ISO_8859_1);
    NodeFilter selectFilter = new NodeFilter() {
        @Override
        public boolean accept(Node node) {
            return node.getText().startsWith("select");
        }
    };
    return parser.extractAllNodesThatMatch(selectFilter);
}
 
开发者ID:emivaljr,项目名称:hojenaoapp,代码行数:13,代码来源:MyEndpoint.java

示例12: filterTable

import org.htmlparser.Parser; //导入方法依赖的package包/类
/**
 * Returns every node whose upper-cased raw tag text starts with "TABLE" or "H3".
 * The response body is parsed as ISO-8859-1.
 */
private NodeList filterTable(String responseBody) throws ParserException {
    Parser parser = Parser.createParser(responseBody, HTTP.ISO_8859_1);
    NodeFilter tableOrHeadingFilter = new NodeFilter() {
        @Override
        public boolean accept(Node node) {
            String text = node.getText().toUpperCase();
            return text.startsWith("TABLE") || text.startsWith("H3");
        }
    };
    return parser.extractAllNodesThatMatch(tableOrHeadingFilter);
}
 
开发者ID:emivaljr,项目名称:hojenaoapp,代码行数:13,代码来源:MyEndpoint.java

示例13: getLinks

import org.htmlparser.Parser; //导入方法依赖的package包/类
/**
 * Collects the href targets of all &lt;a&gt; tags on the given page.
 *
 * @param url page to fetch and parse
 * @return link urls in document order
 * @throws ParserException if the page cannot be fetched or parsed
 */
public static List<String> getLinks(String url) throws ParserException {
    Parser htmlParser = new Parser(url);
    List<String> links = new LinkedList<String>();
    NodeList anchors = htmlParser.extractAllNodesThatMatch(new NodeClassFilter(LinkTag.class));
    for (int i = 0; i < anchors.size(); i++) {
        LinkTag anchor = (LinkTag) anchors.elementAt(i);
        links.add(anchor.getLink());
    }
    return links;
}
 
开发者ID:wso2,项目名称:carbon-platform-integration-utils,代码行数:12,代码来源:DistributionValidationTestUtils.java

示例14: processResponse

import org.htmlparser.Parser; //导入方法依赖的package包/类
/**
 * Extracts weibo feed entries from an HTTP response page and appends them to
 * the given XML root element.
 *
 * @param resp http response expected to contain the feed html
 * @param doc  XML document used by extractContent to create result elements
 * @param root XML element that extracted entries are appended to
 * @return true if a full page (15 entries) was processed and more pages should
 *         be fetched; false on error, short page, or end of feed
 */
private boolean processResponse(HttpResponse resp, Document doc, Element root) {
	if(resp.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {
		System.out.println("[INFO] HTTP Status OK.");
		System.out.println("[INFO] Extracting html page...");
		String html = extractHtml(resp);
		if(html == null) return false;
		System.out.println("[INFO] " + html.length() + "B html page extracted.");
		// Pages shorter than 500 chars are treated as the end-of-feed marker.
		if(html.length() < 500) {
			System.out.println("[INFO] EOF reached, task completed.");
			return false;
		} else {
			System.out.println("[INFO] Parsing html page...");
			try {
				Parser parser = new Parser(html);
				// Each feed entry is marked with action-type="feed_list_item".
				NodeList weibo_list = parser.extractAllNodesThatMatch(
						new HasAttributeFilter("action-type", "feed_list_item"));
				System.out.println("[INFO] " + weibo_list.size() + " entries detected.");
				SimpleNodeIterator iter = weibo_list.elements();
				while(iter.hasMoreNodes()) {
					// `total` is an instance-level running count of processed entries.
					System.out.println("[INFO] processing entry #" + (++total) + "...");
					Element elem = extractContent(iter.nextNode(), doc);
					if(elem == null) {
						System.out.println("[ERROR] Data extraction failed.");
						return false;
					}
					root.appendChild(elem);
				}
				// Fewer than 15 entries means this was the last page
				// (presumably 15 entries per full page — TODO confirm).
				if(weibo_list.size() != 15) return false;
			} catch (ParserException e) {
				System.out.println("[ERROR] Parser failed.");
				e.printStackTrace();
				return false;
			}
		}
	} else {
		return false;
	}
	return true;
}
 
开发者ID:w1ndy,项目名称:weibo-fetcher,代码行数:40,代码来源:Spider.java

示例15: extracLinks

import org.htmlparser.Parser; //导入方法依赖的package包/类
/**
 * Extracts all outbound links on a web page: the href of every &lt;a&gt; tag
 * and the src of every &lt;frame&gt; tag.
 *
 * @param url page to fetch and parse (fetched with utf-8 encoding)
 * @return set of link urls found on the page; empty on parse failure
 */
public static Set<String> extracLinks(String url) {
    Set<String> links = new HashSet<String>();

    try {
        Parser parser = new Parser(url);
        parser.setEncoding("utf-8");
        // Filter for <frame> tags, used to pull out the src attribute link.
        @SuppressWarnings("serial")
        NodeFilter frameFilter = new NodeFilter() {
            public boolean accept(Node node) {
                return node.getText().startsWith("frame src=");
            }
        };
        // OrFilter matches <a> tags as well as <frame> tags.
        OrFilter linkFilter = new OrFilter(new NodeClassFilter(LinkTag.class), frameFilter);
        // All nodes that pass the combined filter.
        NodeList list = parser.extractAllNodesThatMatch(linkFilter);
        for (int i = 0; i < list.size(); i++) {
            Node tag = list.elementAt(i);
            if (tag instanceof LinkTag) {
                // <a> tag
                LinkTag link = (LinkTag) tag;
                String linkUrl = link.getLink();
                links.add(linkUrl);
            } else {
                // Extract the src attribute link from a frame tag,
                // e.g. <frame src="test.html"/>.
                String frame = tag.getText();
                int start = frame.indexOf("src=");
                if (start == -1) {
                    // Malformed frame tag without a src attribute — previously
                    // this threw StringIndexOutOfBoundsException.
                    continue;
                }
                frame = frame.substring(start);
                int end = frame.indexOf(" ");
                if (end == -1) {
                    end = frame.indexOf(">");
                }
                if (end <= 5) {
                    // No usable terminator after src=" — skip rather than crash
                    // (the surrounding catch only handles ParserException).
                    continue;
                }
                // Strip the leading `src="` (5 chars) and the trailing quote.
                String frameUrl = frame.substring(5, end - 1);
                links.add(frameUrl);
            }
        }
    } catch (ParserException e) {
        logger.error("", e);
    }
    return links;
}
 
开发者ID:xuxueli,项目名称:xxl-incubator,代码行数:52,代码来源:HtmlParserUtil.java


注:本文中的org.htmlparser.Parser.extractAllNodesThatMatch方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。