当前位置: 首页>>代码示例>>Java>>正文


Java NodeList.elementAt方法代码示例

本文整理汇总了Java中org.htmlparser.util.NodeList.elementAt方法的典型用法代码示例。如果您正苦于以下问题:Java NodeList.elementAt方法的具体用法?Java NodeList.elementAt怎么用?Java NodeList.elementAt使用的例子?那么, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在org.htmlparser.util.NodeList的用法示例。


在下文中一共展示了NodeList.elementAt方法的12个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: parseFlashEmbedTag

import org.htmlparser.util.NodeList; //导入方法依赖的package包/类
/**
 * Copies the attributes of every EMBED tag in the given node list into the
 * supplied flash object.
 * <p>
 * Nodes that are not {@code Tag} instances are skipped with a warning, as are
 * end tags that are not also empty XML tags. Every attribute of a remaining
 * tag is offered to {@code flashObjToFill.setNameValue}; a {@code false}
 * result is logged as an unknown attribute.
 *
 * @param embeds the candidate EMBED nodes; may be {@code null}, in which case
 *               nothing is processed
 * @param flashObjToFill the flash object to fill in with data
 * @return the same {@code flashObjToFill} instance that was passed in
 */
@SuppressWarnings("unchecked")
private FlashEmbeddedObject parseFlashEmbedTag( NodeList embeds, final FlashEmbeddedObject flashObjToFill ) {
	if( embeds == null ) {
		logger.debug( "The list of embed-tag nodes is null" );
		return flashObjToFill;
	}

	logger.debug( "The number of embed-tag nodes is " + embeds.size() );
	for( int index = 0; index < embeds.size(); index++ ) {
		final Node candidate = embeds.elementAt( index );
		if( !( candidate instanceof Tag ) ) {
			logger.warn( "Encountered a EMBED node: " + candidate + " that is not an EMBED tag!" );
			continue;
		}

		final Tag embedTag = (Tag) candidate;
		// End tags are skipped unless they are also empty XML tags (e.g. <TAG />)
		if( embedTag.isEndTag() && !embedTag.isEmptyXmlTag() ) {
			logger.warn( "Encountered an EMBED node: " + embedTag + " that is an end tag!" );
			continue;
		}

		logger.debug("Processing embed node's '" + embedTag + "' attributes");
		final Vector<Attribute> attributes = (Vector<Attribute>) embedTag.getAttributesEx();
		if( attributes == null ) {
			continue;
		}
		for( Attribute attribute : attributes ) {
			final String attrName = attribute.getName();
			final String attrValue = attribute.getValue();
			if( flashObjToFill.setNameValue( attrName, attrValue ) ) {
				logger.debug("Set the EMBED attribute, name='" + attrName + "' value='" + attrValue + "'");
			} else {
				logger.warn("An unknown EMBED attribute, name='" + attrName + "' value='" + attrValue + "'" );
			}
		}
	}
	return flashObjToFill;
}
 
开发者ID:ivan-zapreev,项目名称:x-cure-chat,代码行数:44,代码来源:FlashEmbeddedParser.java

示例2: extractTextByTextNode

import org.htmlparser.util.NodeList; //导入方法依赖的package包/类
/**
 * Extracts the informative text nodes of an HTML document.
 * <p>
 * Every text node of the parsed document is tested with
 * {@code isInformativeStricter}; the plain text of each accepted node becomes
 * one element of the result.
 *
 * @param content the HTML source to parse; may be {@code null}
 * @return the accepted texts in document order; empty when {@code content} is
 *         {@code null}. If an exception occurs it is logged and the texts
 *         collected so far are returned.
 */
public static List<String> extractTextByTextNode(String content){
    List<String> doc = new ArrayList<String>(); // one element per paragraph
    if (content == null) {
        return doc;
    }
    try {
        Parser parser = Parser.createParser(content, "utf8");
        NodeFilter textFilter = new NodeClassFilter(TextNode.class);
        NodeList nodelist = parser.extractAllNodesThatMatch(textFilter);
        HashMap<String,Integer> parentWeight = new HashMap<String,Integer>();
        for (int i = 0; i < nodelist.size(); i++) {
            Node textnode = nodelist.elementAt(i);
            String text = textnode.toPlainTextString();
            // Trace every non-blank text node, accepted or not.
            if (text.trim().length() > 0) {
                log.debug(i+": "+" content: "+text);
            }
            if (isInformativeStricter(textnode, parentWeight)) {
                log.debug(i+": "+" content: "+text);
                doc.add(text);
            }
        }
    } catch (Exception e) {
        // Hand the exception to the logger so the stack trace lands in the log
        // instead of on stderr.
        log.error("Text extractor  has encountered a problem!! "+e.getMessage(), e);
    }
    return doc;
}
 
开发者ID:YufangWoo,项目名称:news-crawler,代码行数:28,代码来源:HtmlContentExtractor.java

示例3: extractTextByTagP

import org.htmlparser.util.NodeList; //导入方法依赖的package包/类
/**
 * Extracts the informative {@code <p>} elements of an HTML document.
 * <p>
 * Every {@code <p>} node of the parsed document is tested with
 * {@code isInformative}; the plain text of each accepted node becomes one
 * element of the result.
 *
 * @param content the HTML source to parse; may be {@code null}
 * @return the accepted paragraph texts in document order; empty when
 *         {@code content} is {@code null}. If an exception occurs it is logged
 *         and the texts collected so far are returned.
 */
public static List<String> extractTextByTagP(String content){
	List<String> doc = new ArrayList<String>(); // one element per paragraph
	if (content == null) {
		return doc;
	}
	try {
		Parser parser = Parser.createParser(content, "utf8");
		// Only <p> elements are matched.
		TagNameFilter paraFilter = new TagNameFilter("p");
		// Original author's note: this call has been seen to raise an error.
		NodeList nodelist = parser.extractAllNodesThatMatch(paraFilter);
		HashMap<String,Integer> parentWeight = new HashMap<String,Integer>();
		for (int i = 0; i < nodelist.size(); i++) {
			Node textnode = nodelist.elementAt(i);
			String text = textnode.toPlainTextString();
			log.debug(i+": "+" content: "+text);

			if (isInformative(textnode, parentWeight)) {
				log.debug(i+": "+" content: "+text);
				doc.add(text);
			}
		}
	} catch (Exception e) {
		// Hand the exception to the logger so the stack trace lands in the log
		// instead of on stderr.
		log.error("Text extractor  has encountered a problem!! "+e.getMessage(), e);
	}
	return doc;
}
 
开发者ID:YufangWoo,项目名称:news-crawler,代码行数:28,代码来源:HtmlContentExtractor.java

示例4: listarCidades

import org.htmlparser.util.NodeList; //导入方法依赖的package包/类
/**
 * Lists the city names known for the given state.
 * <p>
 * The cities of a state are fetched and parsed on first request and then kept
 * in {@code mapaCidades} (city name to city id), keyed by state.
 *
 * @param state the state name, used as key into {@code mapaEstados}
 * @return a {@code ListaEstadosCidades} holding the city names of the state
 * @throws Exception if fetching or parsing the remote data fails; in that case
 *         nothing is cached, so a later call retries
 */
@ApiMethod(name = "listarCidades")
public ListaEstadosCidades listarCidades(@Named("state") String state) throws Exception{
    inicializaMapaEstados();
    if (mapaCidades == null) {
        mapaCidades = new HashMap<String,Map<String,String>>();
    }
    if (!mapaCidades.containsKey(state)) {
        Map<String,String> mapa = new HashMap<String, String>();
        String responseBody = recuperarDados(mapaEstados.get(state), null);
        NodeList nodeList = filterSelectNode(responseBody);
        // Index 2 is presumably the city <select> of the page - verify against filterSelectNode.
        Node cidadeNode = nodeList.elementAt(2);
        SimpleNodeIterator iteratorCidade = cidadeNode.getChildren().elements();
        while (iteratorCidade.hasMoreNodes()) {
            OptionTag node = (OptionTag) iteratorCidade.nextNode();
            String cidadeId = node.getValue();
            String cidadeNome = node.getChildren().elements().nextNode().getText();
            // Skip the placeholder option ("Selecione ...").
            if (!cidadeNome.contains("Selecione")) {
                mapa.put(cidadeNome, cidadeId);
            }
        }
        // Cache only once the map is fully populated: caching it up front would
        // leave an empty entry behind whenever the fetch or the parsing throws,
        // and every later call would then return an empty city list.
        mapaCidades.put(state, mapa);
    }
    ListaEstadosCidades listaEstados = new ListaEstadosCidades();
    listaEstados.setLista(new ArrayList<String>(mapaCidades.get(state).keySet()));
    return listaEstados;
}
 
开发者ID:emivaljr,项目名称:hojenaoapp,代码行数:29,代码来源:MyEndpoint.java

示例5: preencheMapaEstados

import org.htmlparser.util.NodeList; //导入方法依赖的package包/类
/**
 * Fills {@code mapaEstados} (state name to state id) from the option tags
 * found under the node at index 1 of the list returned by
 * {@code filterSelectNode}.
 *
 * @throws IOException propagated from the data retrieval
 * @throws ParserException propagated from the HTML parsing
 */
private void preencheMapaEstados() throws IOException, ParserException {
    final NodeList selects = filterSelectNode(recuperarDados(null, null));
    final Node stateSelect = selects.elementAt(1);
    for (SimpleNodeIterator options = stateSelect.getChildren().elements(); options.hasMoreNodes();) {
        final OptionTag option = (OptionTag) options.nextNode();
        final String id = option.getValue();
        // The option's first child node carries the displayed state name.
        final String name = option.getChildren().elements().nextNode().getText();
        mapaEstados.put(name, id);
    }
}
 
开发者ID:emivaljr,项目名称:hojenaoapp,代码行数:15,代码来源:MyEndpoint.java

示例6: getLinks

import org.htmlparser.util.NodeList; //导入方法依赖的package包/类
/**
 * Returns the link target of every {@code LinkTag} found in the given resource.
 *
 * @param url the resource handed to the {@code Parser} constructor
 * @return the links in the order the parser reported them
 * @throws ParserException if the resource cannot be opened or parsed
 */
public static List<String> getLinks(String url) throws ParserException {
    final Parser pageParser = new Parser(url);
    final NodeClassFilter anchorFilter = new NodeClassFilter(LinkTag.class);
    final NodeList anchors = pageParser.extractAllNodesThatMatch(anchorFilter);

    final List<String> result = new LinkedList<String>();
    for (int idx = 0; idx < anchors.size(); idx++) {
        final LinkTag anchor = (LinkTag) anchors.elementAt(idx);
        result.add(anchor.getLink());
    }
    return result;
}
 
开发者ID:wso2,项目名称:carbon-platform-integration-utils,代码行数:12,代码来源:DistributionValidationTestUtils.java

示例7: extracLinks

import org.htmlparser.util.NodeList; //导入方法依赖的package包/类
/**
 * Collects the link targets found on a web page.
 * <p>
 * Two kinds of nodes are considered: {@code LinkTag} nodes, whose target is
 * read with {@code getLink()}, and nodes whose text starts with
 * {@code frame src=}, whose {@code src} value is cut out of the raw tag text.
 *
 * @param url the address of the page to parse
 * @return the extracted links; if a {@code ParserException} occurs it is
 *         logged and the links collected so far (possibly none) are returned
 */
public static Set<String> extracLinks(String url) {
    Set<String> links = new HashSet<String>();

    try {
        Parser parser = new Parser(url);
        parser.setEncoding("utf-8");
        // Accepts frame tags so that the link in their src attribute can be extracted.
        @SuppressWarnings("serial")
        NodeFilter frameFilter = new NodeFilter() {
            public boolean accept(Node node) {
                return node.getText().startsWith("frame src=");
            }
        };
        // Match <a> tags or <frame> tags.
        OrFilter linkFilter = new OrFilter(new NodeClassFilter(LinkTag.class), frameFilter);
        NodeList list = parser.extractAllNodesThatMatch(linkFilter);
        for (int i = 0; i < list.size(); i++) {
            Node tag = list.elementAt(i);
            if (tag instanceof LinkTag) {
                // <a> tag
                LinkTag link = (LinkTag) tag;
                links.add(link.getLink());
            } else {
                // <frame> tag, e.g. <frame src="test.html"/>.
                // The value may be the last thing in the tag text (no space or
                // '>' after it), so its end is located via the closing quote
                // rather than by index arithmetic on a separator.
                String frame = tag.getText();
                String src = frame.substring(frame.indexOf("src=") + 4);
                String frameUrl;
                if (src.length() > 0 && (src.charAt(0) == '"' || src.charAt(0) == '\'')) {
                    int close = src.indexOf(src.charAt(0), 1);
                    frameUrl = close == -1 ? src.substring(1) : src.substring(1, close);
                } else {
                    // Unquoted value: ends at the first space or at the end of the text.
                    int end = src.indexOf(' ');
                    frameUrl = end == -1 ? src : src.substring(0, end);
                }
                links.add(frameUrl);
            }
        }
    } catch (ParserException e) {
        logger.error("Failed to extract links from " + url, e);
    }
    return links;
}
 
开发者ID:xuxueli,项目名称:xxl-incubator,代码行数:52,代码来源:HtmlParserUtil.java

示例8: extracLinks

import org.htmlparser.util.NodeList; //导入方法依赖的package包/类
/**
 * Collects the link targets of a web page that pass the given filter.
 * <p>
 * {@code LinkTag} nodes contribute their {@code getLink()} value; nodes whose
 * text starts with {@code frame src=} contribute their {@code src} value. The
 * resulting set is also printed to standard output.
 *
 * @param url the address of the page to parse (read as gb2312)
 * @param filter decides which extracted links are kept
 * @return the accepted links; if a {@code ParserException} occurs its stack
 *         trace is printed and the links collected so far are returned
 */
public static Set<String> extracLinks(String url, LinkFilter filter) {
	Set<String> links = new HashSet<String>();
	try{
		Parser parser = new Parser(url);
		parser.setEncoding("gb2312");
		// Accepts <frame> tags.
		@SuppressWarnings("serial")
		NodeFilter frameFilter = new NodeFilter(){
			public boolean accept(Node node){
				return node.getText().startsWith("frame src=");
			}
		};
		// Match <a> tags or <frame> tags.
		OrFilter linkFilter = new OrFilter(new NodeClassFilter(
				LinkTag.class), frameFilter);
		NodeList list = parser.extractAllNodesThatMatch(linkFilter);
		for (int i = 0; i < list.size(); i++) {
			Node tag = list.elementAt(i);
			if (tag instanceof LinkTag){
				// <a> tag
				LinkTag link = (LinkTag) tag;
				String linkUrl = link.getLink();
				if (filter.accept(linkUrl)) {
					links.add(linkUrl);
				}
			}
			else{
				// <frame> tag. The value may be the last thing in the tag text
				// (no space or '>' after it), so its end is located via the
				// closing quote rather than by index arithmetic on a separator.
				String frame = tag.getText();
				String src = frame.substring(frame.indexOf("src=") + 4);
				String frameUrl;
				if (src.length() > 0 && (src.charAt(0) == '"' || src.charAt(0) == '\'')) {
					int close = src.indexOf(src.charAt(0), 1);
					frameUrl = close == -1 ? src.substring(1) : src.substring(1, close);
				} else {
					// Unquoted value: ends at the first space or at the end of the text.
					int end = src.indexOf(' ');
					frameUrl = end == -1 ? src : src.substring(0, end);
				}
				if (filter.accept(frameUrl)) {
					links.add(frameUrl);
				}
			}
		}
		System.out.println(links);
	} catch (ParserException e){
		e.printStackTrace();
	}
	return links;
}
 
开发者ID:MelissaChen15,项目名称:Crawler2015,代码行数:48,代码来源:HtmlParserTool.java

示例9: extracLinks

import org.htmlparser.util.NodeList; //导入方法依赖的package包/类
/**
 * Prints the links, images and frame sources of a web page to standard output.
 * <p>
 * For each {@code LinkTag} the link and its text are printed, for each
 * {@code ImageTag} the image URL and the tag text, and for each node whose
 * text starts with {@code frame src=} the value of its {@code src} attribute.
 * A {@code ParserException} is reported by printing its stack trace.
 *
 * @param url the address of the page to parse (read as utf-8)
 */
public static void extracLinks(String url) {
	try {
		Parser parser = new Parser(url);
		parser.setEncoding("utf-8");// gb2312
		// Accepts <frame> tags so that the link in their src attribute can be extracted.
		NodeFilter frameFilter = new NodeFilter() {
			public boolean accept(Node node) {
				return node.getText().startsWith("frame src=");
			}
		};
		// Match <a>, <img> or <frame> tags; the three filters are OR-ed together.
		OrFilter orFilter = new OrFilter(
				new NodeClassFilter(LinkTag.class), new NodeClassFilter(
						ImageTag.class));
		OrFilter linkFilter = new OrFilter(orFilter, frameFilter);
		NodeList list = parser.extractAllNodesThatMatch(linkFilter);
		for (int i = 0; i < list.size(); i++) {
			Node tag = list.elementAt(i);
			if (tag instanceof LinkTag) {
				// <a> tag
				LinkTag link = (LinkTag) tag;
				String linkUrl = link.getLink();// url
				String text = link.getLinkText();// link text
				System.out.println(linkUrl + "**********" + text);
			} else if (tag instanceof ImageTag) {
				// <img> tag
				ImageTag image = (ImageTag) tag;
				System.out.print(image.getImageURL() + "********");// image address
				System.out.println(image.getText());// image tag text
			} else {
				// <frame> tag, e.g. <frame src="test.html"/>.
				// The value may be the last thing in the tag text (no space or
				// '>' after it), so its end is located via the closing quote
				// rather than by index arithmetic on a separator.
				String frame = tag.getText();
				String src = frame.substring(frame.indexOf("src=") + 4);
				if (src.length() > 0 && (src.charAt(0) == '"' || src.charAt(0) == '\'')) {
					int close = src.indexOf(src.charAt(0), 1);
					frame = close == -1 ? src.substring(1) : src.substring(1, close);
				} else {
					// Unquoted value: ends at the first space or at the end of the text.
					int end = src.indexOf(' ');
					frame = end == -1 ? src : src.substring(0, end);
				}
				System.out.println(frame);
			}
		}
	} catch (ParserException e) {
		e.printStackTrace();
	}
}
 
开发者ID:YufangWoo,项目名称:news-crawler,代码行数:52,代码来源:HtmlParserTest.java

示例10: extractLink

import org.htmlparser.util.NodeList; //导入方法依赖的package包/类
/**
 * Queues the links of an HTML document whose link text matches the keyword.
 * <p>
 * A link is treated as relevant when {@code containKeyword} accepts its text.
 * The keyword may be a group of words or a phrase, so a match may cover only
 * part of it. Relevant links are expected to sit close together in the
 * document: a matching link more than {@code disThre} positions after the last
 * accepted one is treated as noise (e.g. an advertisement) and skipped. Only
 * links starting with {@code http} are passed to
 * {@code LinkDb.addUnvisitedUrl}.
 * <p>
 * Any exception is logged and ends the extraction.
 *
 * @param content the HTML source to parse
 * @param keyword the search keyword(s) the link text is tested against
 */
public static void extractLink(String content, String keyword) {
	try {
		Parser parser = Parser.createParser(content, "utf8");
		NodeFilter linkFilter = new NodeClassFilter(LinkTag.class);
		NodeList nodelist = parser.extractAllNodesThatMatch(linkFilter);
		// Index of the last accepted link; -1 means "none yet". 0 cannot be the
		// sentinel because the node at index 0 may itself be an accepted link.
		int lastNodeID = -1;
		// Maximum index distance between two consecutive accepted links.
		int disThre = 8;
		for (int i = 0; i < nodelist.size(); i++) {
			LinkTag link = (LinkTag) nodelist.elementAt(i);
			String linkUrl = link.getLink();// url
			String text = link.getLinkText();// link text
			if (!containKeyword(text, keyword)) {
				continue;
			}
			if (lastNodeID >= 0 && i - lastNodeID > disThre) {
				log.debug("Noisy link!!!");
				continue;
			}
			if (!linkUrl.startsWith("http")) {
				continue;
			}
			log.debug(i+":"+linkUrl+", "+text);
			lastNodeID = i;
			LinkDb.addUnvisitedUrl(linkUrl);
		}
	} catch (Exception e) {
		// Hand the exception to the logger so the stack trace lands in the log
		// instead of on stderr.
		log.error("Link extractor  has encountered a problem!! "+e.getMessage(), e);
	}
}
 
开发者ID:YufangWoo,项目名称:news-crawler,代码行数:47,代码来源:HtmlContentExtractor.java

示例11: extracLinks

import org.htmlparser.util.NodeList; //导入方法依赖的package包/类
/**
 * Extracts the links of a web page and adds them to the queue of unvisited URLs.
 * <p>
 * Absolute links (starting with {@code http}) are kept when the filter accepts
 * them together with {@code Main.keyWord}. Links starting with {@code /} are
 * prefixed with {@code Main.baseUrl} before the same check. Frame {@code src}
 * values are kept when the single-argument filter check accepts them.
 *
 * @param content the resource handed to the {@code Parser} constructor
 * @param filter used to filter the links
 * @return the accepted links; if a {@code ParserException} occurs its stack
 *         trace is printed and the links collected so far are queued and returned
 * @author cxn 2015-11-05
 */
public static Set<String> extracLinks(String content, LinkFilter filter) {
	Set<String> links = new HashSet<String>();
	try {
		Parser parser = new Parser(content);
		// parser.setEncoding("utf-8");
		// Accepts <frame> tags so that the link in their src attribute can be extracted.
		NodeFilter frameFilter = new NodeFilter() {
			public boolean accept(Node node) {
				return node.getText().startsWith("frame src=");
			}
		};
		// Match <a> tags or <frame> tags.
		OrFilter linkFilter = new OrFilter(new NodeClassFilter(
				LinkTag.class), frameFilter);
		NodeList list = parser.extractAllNodesThatMatch(linkFilter);
		for (int i = 0; i < list.size(); i++) {
			Node tag = list.elementAt(i);
			if (tag instanceof LinkTag){
				LinkTag link = (LinkTag) tag;
				String linkUrl = link.getLink();
				if(linkUrl.startsWith("http") && filter.accept(linkUrl, Main.keyWord)){
					links.add(linkUrl);
				}else if(linkUrl.startsWith("/") && filter.accept(Main.baseUrl+linkUrl, Main.keyWord)){
					links.add(Main.baseUrl+linkUrl);
				}
			}else{
				// <frame> tag, e.g. <frame src="test.html"/>.
				// The value may be the last thing in the tag text (no space or
				// '>' after it), so its end is located via the closing quote
				// rather than by index arithmetic on a separator.
				String frame = tag.getText();
				String src = frame.substring(frame.indexOf("src=") + 4);
				String frameUrl;
				if (src.length() > 0 && (src.charAt(0) == '"' || src.charAt(0) == '\'')) {
					int close = src.indexOf(src.charAt(0), 1);
					frameUrl = close == -1 ? src.substring(1) : src.substring(1, close);
				} else {
					// Unquoted value: ends at the first space or at the end of the text.
					int end = src.indexOf(' ');
					frameUrl = end == -1 ? src : src.substring(0, end);
				}
				if (filter.accept(frameUrl)){
					links.add(frameUrl);
				}
			}
		}
	} catch (ParserException e) {
		e.printStackTrace();
	}
	LinkQueue.addUnvisitedUrl(links);
	return links;
}
 
开发者ID:cxn945,项目名称:my_crawler,代码行数:59,代码来源:PageParser.java

示例12: extracLinks

import org.htmlparser.util.NodeList; //导入方法依赖的package包/类
/**
 * Collects the link targets of a fetched page that pass the given filter.
 * <p>
 * {@code LinkTag} nodes contribute their {@code getLink()} value; nodes whose
 * text starts with {@code frame src=} contribute their {@code src} value.
 *
 * @param pageResult supplies the page content and the character set to parse it with
 * @param filter decides which extracted links are kept
 * @return the accepted links; if a {@code ParserException} occurs its stack
 *         trace is printed and the links collected so far are returned
 */
public Set<String> extracLinks(PageResult pageResult, LinkFilter filter) {
	Set<String> links = new HashSet<String>();
	try {
		Parser parser = new Parser(pageResult.getContent());

		parser.setEncoding(pageResult.getCharSet());

		// Accepts <frame> tags so that the link in their src attribute can be extracted.
		NodeFilter frameFilter = new NodeFilter() {
			public boolean accept(Node node) {
				return node.getText().startsWith("frame src=");
			}
		};
		// Match <a> tags or <frame> tags.
		OrFilter linkFilter = new OrFilter(new NodeClassFilter(
				LinkTag.class), frameFilter);
		NodeList list = parser.extractAllNodesThatMatch(linkFilter);
		for (int i = 0; i < list.size(); i++) {
			Node tag = list.elementAt(i);
			if (tag instanceof LinkTag) {
				// <a> tag
				LinkTag link = (LinkTag) tag;
				String linkUrl = link.getLink();// url
				if (filter.accept(linkUrl)) {
					links.add(linkUrl);
				}
			} else {
				// <frame> tag, e.g. <frame src="test.html"/>.
				// The value may be the last thing in the tag text (no space or
				// '>' after it), so its end is located via the closing quote
				// rather than by index arithmetic on a separator.
				String frame = tag.getText();
				String src = frame.substring(frame.indexOf("src=") + 4);
				String frameUrl;
				if (src.length() > 0 && (src.charAt(0) == '"' || src.charAt(0) == '\'')) {
					int close = src.indexOf(src.charAt(0), 1);
					frameUrl = close == -1 ? src.substring(1) : src.substring(1, close);
				} else {
					// Unquoted value: ends at the first space or at the end of the text.
					int end = src.indexOf(' ');
					frameUrl = end == -1 ? src : src.substring(0, end);
				}
				if (filter.accept(frameUrl)) {
					links.add(frameUrl);
				}
			}
		}
	} catch (ParserException e) {
		e.printStackTrace();
	}
	return links;
}
 
开发者ID:hxt168,项目名称:webpasser,代码行数:59,代码来源:DefaultHtmlParser.java


注:本文中的org.htmlparser.util.NodeList.elementAt方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。