当前位置: 首页>>代码示例>>Java>>正文


Java LinkTag类代码示例

本文整理汇总了Java中org.htmlparser.tags.LinkTag的典型用法代码示例。如果您正苦于以下问题:Java LinkTag类的具体用法?Java LinkTag怎么用?Java LinkTag使用的例子?那么, 这里精选的类代码示例或许可以为您提供帮助。


LinkTag类属于org.htmlparser.tags包,在下文中一共展示了LinkTag类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: parseDetailInfo

import org.htmlparser.tags.LinkTag; //导入依赖的package包/类
/**
 * Builds a map from detail id to display text for the link tags found directly under the
 * first node of {@code nodeList}.
 *
 * <p>Only children that are {@code LinkTag}s with a non-blank {@code id} attribute are used,
 * and ids containing {@code "all"} are skipped. The key is the part of the id after its first
 * underscore (the whole id when it has no underscore); the value is the tag's plain text.
 *
 * @param nodeList nodes whose first element holds the link tags; may be {@code null} or empty
 * @return the collected id-to-text map, empty when there is nothing to parse
 */
private Map<String, String> parseDetailInfo(NodeList nodeList) {
    Map<String, String> infoMap = Maps.newHashMap();
    if (nodeList == null || nodeList.size() == 0) {
        return infoMap;
    }
    // A node without children reports null rather than an empty list; check it here because
    // the loop header sits outside the per-node try/catch below.
    NodeList children = nodeList.elementAt(0).getChildren();
    if (children == null) {
        return infoMap;
    }
    for (Node pageNode : children.toNodeArray()) {
        if (!(pageNode instanceof LinkTag)) {
            continue;
        }
        try {
            String rawId = ((LinkTag) pageNode).getAttribute("id");
            if (StringUtils.isBlank(rawId) || rawId.contains("all")) {
                continue;
            }
            // indexOf returns -1 when there is no underscore, which keeps the whole id.
            String id = rawId.substring(rawId.indexOf("_") + 1);
            infoMap.put(id, pageNode.toPlainTextString());
        } catch (Exception e) {
            // Best effort: one malformed tag must not abort the remaining tags.
            log.error("parse parseDetailInfo catch Exception:", e);
        }
    }
    return infoMap;
}
 
开发者ID:deanjin,项目名称:houseHunter,代码行数:26,代码来源:HouseParser.java

示例2: parseLinkTag

import org.htmlparser.tags.LinkTag; //导入依赖的package包/类
/**
 * Parses the number of buildings: yields the plain text of the first link tag in the list.
 *
 * @param nodeList nodes to scan in order
 * @return plain text of the first {@code LinkTag}, or the empty string when there is none
 */
private String parseLinkTag(NodeList nodeList) {
    String text = StringUtils.EMPTY;
    Node[] candidates = nodeList.toNodeArray();
    for (int i = 0; i < candidates.length; i++) {
        if (candidates[i] instanceof LinkTag) {
            // Only the first link tag counts.
            text = candidates[i].toPlainTextString();
            break;
        }
    }
    return text;
}
 
开发者ID:deanjin,项目名称:houseHunter,代码行数:15,代码来源:HouseParser.java

示例3: readTextAndLinkAndTitle

import org.htmlparser.tags.LinkTag; //导入依赖的package包/类
/**
 * Prints the text of every text node, the URL of every link tag and the title of every title
 * tag found in an HTML string, one value per line, skipping values that
 * {@code isTrimEmpty} rejects.
 *
 * @param result the content of the web page
 * @throws Exception propagated from the parser
 */
public static void readTextAndLinkAndTitle(String result) throws Exception {
    Parser parser = Parser.createParser(result, "utf8");

    // Accept a node when it is a text node, a link tag or a title tag.
    OrFilter lastFilter = new OrFilter();
    lastFilter.setPredicates(new NodeFilter[] {
        new NodeClassFilter(TextNode.class),
        new NodeClassFilter(LinkTag.class),
        new NodeClassFilter(TitleTag.class)
    });

    // Kept outside the loop: a node matching none of the branches re-uses the previous value.
    String line = "";
    for (Node node : parser.parse(lastFilter).toNodeArray()) {
        if (node instanceof TextNode) {
            line = ((TextNode) node).getText();
        } else if (node instanceof LinkTag) {
            line = ((LinkTag) node).getLink();
        } else if (node instanceof TitleTag) {
            line = ((TitleTag) node).getTitle();
        }

        if (!isTrimEmpty(line)) {
            System.out.println(line);
        }
    }
}
 
开发者ID:YufangWoo,项目名称:news-crawler,代码行数:38,代码来源:HtmlParserTest.java

示例4: getLinks

import org.htmlparser.tags.LinkTag; //导入依赖的package包/类
/**
 * Returns the link URL of every link tag found at the given location, in document order.
 *
 * @param url location handed to the HTML parser
 * @return the extracted link URLs
 * @throws ParserException if the parser cannot read or parse the resource
 */
public static List<String> getLinks(String url) throws ParserException {
    final NodeList anchors =
            new Parser(url).extractAllNodesThatMatch(new NodeClassFilter(LinkTag.class));
    final List<String> collected = new LinkedList<String>();
    final int total = anchors.size();
    int index = 0;
    while (index < total) {
        collected.add(((LinkTag) anchors.elementAt(index)).getLink());
        index++;
    }
    return collected;
}
 
开发者ID:wso2,项目名称:carbon-platform-integration-utils,代码行数:12,代码来源:DistributionValidationTestUtils.java

示例5: extracLinks

import org.htmlparser.tags.LinkTag; //导入依赖的package包/类
/**
 * Collects the links found on a page: the URL of every {@code <a>} tag and the {@code src}
 * value of every tag whose text starts with {@code frame src=}.
 *
 * @param url location of the page to parse
 * @return the extracted links; empty when parsing fails, in which case the error is logged
 */
public static Set<String> extracLinks(String url) {
    Set<String> links = new HashSet<String>();

    try {
        Parser parser = new Parser(url);
        parser.setEncoding("utf-8");
        // Filter for <frame> tags, used to pick up the link held in their src attribute.
        @SuppressWarnings("serial")
        NodeFilter frameFilter = new NodeFilter() {
            public boolean accept(Node node) {
                return node.getText().startsWith("frame src=");
            }
        };
        // Accept <a> tags as well as <frame> tags.
        OrFilter linkFilter = new OrFilter(new NodeClassFilter(LinkTag.class), frameFilter);
        NodeList list = parser.extractAllNodesThatMatch(linkFilter);
        for (int i = 0; i < list.size(); i++) {
            Node tag = list.elementAt(i);
            if (tag instanceof LinkTag) {
                links.add(((LinkTag) tag).getLink());
                continue;
            }
            // Frame tag, e.g. <frame src="test.html"/>. The filter matched text starting with
            // "frame src=", i.e. the tag text carries no angle brackets, so a src that is the
            // last attribute has no trailing space or '>' to stop at. Parse the value itself
            // instead of slicing at fixed offsets, which threw when no terminator was found.
            String frameText = tag.getText();
            String srcValue = frameText.substring(frameText.indexOf("src=") + 4);
            String frameUrl;
            if (srcValue.startsWith("\"") || srcValue.startsWith("'")) {
                // Quoted value: everything up to the matching closing quote.
                int closing = srcValue.indexOf(srcValue.charAt(0), 1);
                frameUrl = closing == -1 ? srcValue.substring(1) : srcValue.substring(1, closing);
            } else {
                // Unquoted value: ends at the first space or at the end of the tag text.
                int end = srcValue.indexOf(' ');
                frameUrl = end == -1 ? srcValue : srcValue.substring(0, end);
            }
            links.add(frameUrl);
        }
    } catch (ParserException e) {
        logger.error("", e);
    }
    return links;
}
 
开发者ID:xuxueli,项目名称:xxl-incubator,代码行数:52,代码来源:HtmlParserUtil.java

示例6: parserNode

import org.htmlparser.tags.LinkTag; //导入依赖的package包/类
/**
 * Performs lexical analysis on a node: text is segmented into words, link tags contribute
 * their URL to {@code urlInfo}, and every tag node is handed on to {@code dealTag}.
 * The {@code depth} field is incremented on entry and decremented on normal exit.
 *
 * @param node the node to analyse
 */
private void parserNode(Node node) {
	final String blankPattern = "[ \b\t\n\f\r]*";
	depth++;
	if (node instanceof TextNode) {
		// Text node: segment the page text, but only when this call is at depth 1.
		if (depth == 1) {
			System.out.println("TextNode!");
			Parser pageParser = new Parser(new Lexer(node.getPage()), Parser.STDOUT);
			//TODO filter script & style
			NotFilter notScript = new NotFilter(new TagNameFilter("script "));
			NotFilter notStyle = new NotFilter(new TagNameFilter("style "));
			OrFilter nodeFilter = new OrFilter(notScript, notStyle);

			try {
				NodeList matched = pageParser.extractAllNodesThatMatch(nodeFilter);
				for (NodeIterator cursor = matched.elements(); cursor.hasMoreNodes(); ) {
					Node current = cursor.nextNode();
					if (!(current instanceof TextNode)) {
						continue;
					}
					// Segment only text that is not made up purely of blank characters.
					String text = current.getText();
					if (!text.matches(blankPattern)) {
						segment(text);
					}
				}
			} catch (ParserException exc) {
				System.out.println("ParserException");
			}
		}
	} else if (node instanceof TagNode) {
		// Tag node: a link tag additionally contributes its target as an extended URL.
		if (node instanceof LinkTag) {
			String target = ((LinkTag) node).getLink();
			if (!target.matches(blankPattern)) {
				urlInfo.addExtendedURL(target);
			}
		}
		dealTag(node);
	}
	depth--;
}
 
开发者ID:uraplutonium,项目名称:hadoop-distributed-crawler,代码行数:45,代码来源:URLAnalyzer.java

示例7: extracLinks

import org.htmlparser.tags.LinkTag; //导入依赖的package包/类
/**
 * Collects the links on a page that the given filter accepts: the URL of every {@code <a>}
 * tag and the {@code src} value of every tag whose text starts with {@code frame src=}.
 * The resulting set is also printed to standard output.
 *
 * @param url    location of the page to parse
 * @param filter decides which extracted links are kept
 * @return the accepted links; empty when parsing fails (the stack trace is printed)
 */
public static Set<String> extracLinks(String url, LinkFilter filter) {
	Set<String> links = new HashSet<String>();
	try{
		Parser parser = new Parser(url);
		parser.setEncoding("gb2312");
		// Filter for <frame> tags.
		@SuppressWarnings("serial")
		NodeFilter frameFilter = new NodeFilter(){
			public boolean accept(Node node){
				return node.getText().startsWith("frame src=");
			}
		};
		// Accept <a> tags as well as <frame> tags.
		OrFilter linkFilter = new OrFilter(new NodeClassFilter(
				LinkTag.class), frameFilter);
		NodeList list = parser.extractAllNodesThatMatch(linkFilter);
		for (int i = 0; i < list.size(); i++) {
			Node tag = list.elementAt(i);
			if (tag instanceof LinkTag){
				String linkUrl = ((LinkTag) tag).getLink();
				if (filter.accept(linkUrl)) {
					links.add(linkUrl);
				}
				continue;
			}
			// Frame tag. The filter matched text starting with "frame src=", i.e. the tag
			// text carries no angle brackets, so a src that is the last attribute has no
			// trailing space or '>' to stop at. Parse the value itself instead of slicing
			// at fixed offsets, which threw when no terminator was found.
			String frameText = tag.getText();
			String srcValue = frameText.substring(frameText.indexOf("src=") + 4);
			String frameUrl;
			if (srcValue.startsWith("\"") || srcValue.startsWith("'")) {
				// Quoted value: everything up to the matching closing quote.
				int closing = srcValue.indexOf(srcValue.charAt(0), 1);
				frameUrl = closing == -1 ? srcValue.substring(1) : srcValue.substring(1, closing);
			} else {
				// Unquoted value: ends at the first space or at the end of the tag text.
				int end = srcValue.indexOf(' ');
				frameUrl = end == -1 ? srcValue : srcValue.substring(0, end);
			}
			if (filter.accept(frameUrl)) {
				links.add(frameUrl);
			}
		}
		System.out.println(links);
	} catch (ParserException e){
		e.printStackTrace();
	}
	return links;
}
 
开发者ID:MelissaChen15,项目名称:Crawler2015,代码行数:48,代码来源:HtmlParserTool.java

示例8: getGames

import org.htmlparser.tags.LinkTag; //导入依赖的package包/类
/**
 * Scrapes the football games listed on http://livescores.com/.
 *
 * <p>Parses all {@code table} tags, narrows them to those with {@code bgcolor="#666666"} and
 * {@code width="331"}, and walks the children of the first match. Children whose
 * {@code bgcolor} contains "3333" update the current country/league; children whose
 * {@code bgcolor} contains "f" are read as one game each. The cell positions used below are
 * tied to the page layout the author scraped (NOTE(review): layout not visible here).
 *
 * @return the games collected so far; any exception is printed and ends the scrape early
 */
public ArrayList<FootballGame> getGames() {
    Parser parser = new Parser();
    ArrayList<FootballGame> games = new ArrayList<FootballGame>();
    try {
        NodeFilter tagNameFilter = new TagNameFilter("table");
        HasAttributeFilter attrFilter = new HasAttributeFilter("bgcolor", "#666666");
        parser.setResource("http://livescores.com/");
        NodeList nl = parser.parse(tagNameFilter);
        nl = nl.extractAllNodesThatMatch(attrFilter);
        attrFilter = new HasAttributeFilter("width", "331");
        nl = nl.extractAllNodesThatMatch(attrFilter);
        // Take the first remaining table; an empty list fails here and lands in the catch.
        Node node = nl.remove(0);
        nl = node.getChildren();
        Node[] nodes = nl.toNodeArray();
        Tag tag;
        // country and league are declared outside the loop so that the values read from a
        // title row stay in effect for the game rows that follow it.
        String country = "";
        String league = "";
        String hometeam = "";
        String awayteam = "";
        String gametime = "";
        String link = "";
        String result = "";
        for (int i = 0; i < nodes.length; i++) {
            if (nodes[i] instanceof Tag) {
                tag = (Tag) nodes[i];
                String str = tag.getAttribute("bgcolor");
                if (str != null) {
                    //if(str.contains("11111"))
                    //NEW LEAGUE!
                    //    ;
                    if (str.contains("3333")) {
                        // Header row: its first child carries country and league when its
                        // class contains "title" (children at index 2 and 4).
                        tag = (Tag) tag.getFirstChild();
                        str = tag.getAttribute("class");
                        if (str != null && str.contains("title")) {
                            country = tag.getChildren().toNodeArray()[2].getText();
                            league = tag.getChildren().toNodeArray()[4].getText();
                        }
                    } else if (str.contains("f")) {
                        // Game row: child 0 = time, 1 = home team, 2 = result, 3 = away team.
                        Node[] tempnodes = tag.getChildren().toNodeArray();
                        String[] t = tempnodes[0].getFirstChild().getText().split(";");
                        // Time is the part after ';' when present, otherwise the text of the
                        // node two siblings further on.
                        if (t.length > 1)
                            gametime = t[1];
                        else
                            gametime = tempnodes[0].getFirstChild().getNextSibling().getNextSibling().getText();
                        hometeam = tempnodes[1].getFirstChild().getText();
                        awayteam = tempnodes[3].getFirstChild().getText();
                        // Result
                        if (tempnodes[2].getFirstChild().getFirstChild() != null) {
                            // With link: the result text sits inside a link tag.
                            result = tempnodes[2].getFirstChild().getFirstChild().getText();
                            link = ((LinkTag) (tempnodes[2].getFirstChild())).extractLink();
                        } else {
                            // Without link: the result is plain text.
                            result = tempnodes[2].getFirstChild().getText();
                            link = null;
                        }
                        // Events are fetched through getScorers only when the result has a link.
                        ArrayList<FootballEvent> ev = new ArrayList<FootballEvent>();
                        if (link != null) {
                            ev = getScorers(link);
                        }
                        games.add(new FootballGame(country, league, hometeam, awayteam, gametime, ev, result));
                    }
                }
            }
        }
    } catch (Exception e) {
        // Any failure (network, unexpected layout, bad cast) stops the scrape; games found
        // before the failure are still returned.
        e.printStackTrace();
    }
    return games;
}
 
开发者ID:reggna,项目名称:silvertrout,代码行数:71,代码来源:LiveScoreParser.java

示例9: extracLinks

import org.htmlparser.tags.LinkTag; //导入依赖的package包/类
/**
 * Prints what can be extracted from the {@code <a>}, {@code <img>} and frame tags of a page:
 * link URL and link text, image URL and tag text, and the {@code src} value of every tag
 * whose text starts with {@code frame src=}.
 *
 * @param url location of the page to parse; a parse failure prints the stack trace
 */
public static void extracLinks(String url) {
	try {
		Parser parser = new Parser(url);
		parser.setEncoding("utf-8");// gb2312
		// Filter for <frame> tags, used to pick up the link held in their src attribute.
		NodeFilter frameFilter = new NodeFilter() {
			public boolean accept(Node node) {
				return node.getText().startsWith("frame src=");
			}
		};
		// Accept <a> tags, <img> tags and <frame> tags; the three filters are OR-ed together.
		OrFilter orFilter = new OrFilter(
				new NodeClassFilter(LinkTag.class), new NodeClassFilter(
						ImageTag.class));
		OrFilter linkFilter = new OrFilter(orFilter, frameFilter);
		NodeList list = parser.extractAllNodesThatMatch(linkFilter);
		for (int i = 0; i < list.size(); i++) {
			Node tag = list.elementAt(i);
			if (tag instanceof LinkTag) {
				// <a> tag: URL followed by the link text.
				LinkTag link = (LinkTag) tag;
				String linkUrl = link.getLink();
				String text = link.getLinkText();
				System.out.println(linkUrl + "**********" + text);
			} else if (tag instanceof ImageTag) {
				// <img> tag: image URL followed by the tag text.
				ImageTag image = (ImageTag) tag;
				System.out.print(image.getImageURL() + "********");
				System.out.println(image.getText());
			} else {
				// Frame tag, e.g. <frame src="test.html"/>. The filter matched text starting
				// with "frame src=", i.e. the tag text carries no angle brackets, so a src
				// that is the last attribute has no trailing space or '>' to stop at. Parse
				// the value itself instead of slicing at fixed offsets, which threw when no
				// terminator was found.
				String frameText = tag.getText();
				String srcValue = frameText.substring(frameText.indexOf("src=") + 4);
				String frame;
				if (srcValue.startsWith("\"") || srcValue.startsWith("'")) {
					// Quoted value: everything up to the matching closing quote.
					int closing = srcValue.indexOf(srcValue.charAt(0), 1);
					frame = closing == -1 ? srcValue.substring(1) : srcValue.substring(1, closing);
				} else {
					// Unquoted value: ends at the first space or at the end of the tag text.
					int end = srcValue.indexOf(' ');
					frame = end == -1 ? srcValue : srcValue.substring(0, end);
				}
				System.out.println(frame);
			}
		}
	} catch (ParserException e) {
		e.printStackTrace();
	}
}
 
开发者ID:YufangWoo,项目名称:news-crawler,代码行数:52,代码来源:HtmlParserTest.java

示例10: extractLink

import org.htmlparser.tags.LinkTag; //导入依赖的package包/类
/**
 * Queues the links of a page whose link text matches the keyword.
 *
 * <p>A link counts as valid when {@code containKeyword} accepts its text. The keyword may be
 * a group of words or a phrase, so a match may cover only part of it. Valid links are
 * expected to appear close together: once one has been accepted, a matching link more than
 * {@code disThre} positions after the last accepted one is treated as noise (for example an
 * advertisement) and skipped. Only URLs starting with "http" are handed to {@code LinkDb}.
 *
 * @param content HTML of the page to scan
 * @param keyword keyword (or phrase) the link text is checked against
 */
public static void extractLink(String content, String keyword) {
	try {
		Parser parser = Parser.createParser(content, "utf8");
		NodeFilter linkFilter = new NodeClassFilter(LinkTag.class);
		NodeList nodelist = parser.extractAllNodesThatMatch(linkFilter);
		// Index of the last link accepted as valid. -1 means "none yet"; using 0 for that
		// made a valid link at index 0 look as if nothing had been accepted.
		int lastNodeID = -1;
		// Maximum index gap between two valid links before the later one counts as noise.
		int disThre = 8;
		for (int i = 0; i < nodelist.size(); i++) {
			LinkTag link = (LinkTag) nodelist.elementAt(i);
			String linkUrl = link.getLink();
			String text = link.getLinkText();
			if (!containKeyword(text, keyword)) {
				continue;
			}
			if (lastNodeID >= 0 && i - lastNodeID > disThre) {
				log.debug("Noisy link!!!");
				continue;
			}
			if (!linkUrl.startsWith("http")) {
				continue;
			}
			log.debug(i + ":" + linkUrl + ", " + text);
			lastNodeID = i;
			LinkDb.addUnvisitedUrl(linkUrl);
		}
	} catch (Exception e) {
		e.printStackTrace();
		log.error("Link extractor  has encountered a problem!! " + e.getMessage());
	}
}
 
开发者ID:YufangWoo,项目名称:news-crawler,代码行数:47,代码来源:HtmlContentExtractor.java

示例11: extracLinks

import org.htmlparser.tags.LinkTag; //导入依赖的package包/类
/**
 * Extracts the links of a page and adds them to the queue of unvisited URLs.
 *
 * <p>{@code <a>} links starting with "http" are kept when the filter accepts them together
 * with {@code Main.keyWord}; links starting with "/" are prefixed with {@code Main.baseUrl}
 * first. The {@code src} value of tags whose text starts with {@code frame src=} is kept
 * when the filter accepts it.
 *
 * @param content page content handed to the parser
 * @param filter used to filter the links
 * @return the accepted links; empty when parsing fails (the stack trace is printed)
 * @author cxn 2015-11-05
 */
public static Set<String> extracLinks(String content, LinkFilter filter) {
	Set<String> links = new HashSet<String>();
	try {
		Parser parser = new Parser(content);
		// Filter for <frame> tags, used to pick up the link held in their src attribute.
		NodeFilter frameFilter = new NodeFilter() {
			public boolean accept(Node node) {
				return node.getText().startsWith("frame src=");
			}
		};
		// Accept <a> tags as well as <frame> tags.
		OrFilter linkFilter = new OrFilter(new NodeClassFilter(
				LinkTag.class), frameFilter);
		NodeList list = parser.extractAllNodesThatMatch(linkFilter);
		for (int i = 0; i < list.size(); i++) {
			Node tag = list.elementAt(i);
			if (tag instanceof LinkTag){
				LinkTag link = (LinkTag) tag;
				String linkUrl = link.getLink();
				if(linkUrl.startsWith("http") && filter.accept(linkUrl, Main.keyWord)){
					links.add(linkUrl);
				}else if(linkUrl.startsWith("/") && filter.accept(Main.baseUrl+linkUrl, Main.keyWord)){
					links.add(Main.baseUrl+linkUrl);
				}
			}else{
				// Frame tag, e.g. <frame src="test.html"/>. The filter matched text starting
				// with "frame src=", i.e. the tag text carries no angle brackets, so a src
				// that is the last attribute has no trailing space or '>' to stop at. Parse
				// the value itself instead of slicing at fixed offsets, which threw when no
				// terminator was found.
				String frameText = tag.getText();
				String srcValue = frameText.substring(frameText.indexOf("src=") + 4);
				String frameUrl;
				if (srcValue.startsWith("\"") || srcValue.startsWith("'")) {
					// Quoted value: everything up to the matching closing quote.
					int closing = srcValue.indexOf(srcValue.charAt(0), 1);
					frameUrl = closing == -1 ? srcValue.substring(1) : srcValue.substring(1, closing);
				} else {
					// Unquoted value: ends at the first space or at the end of the tag text.
					int end = srcValue.indexOf(' ');
					frameUrl = end == -1 ? srcValue : srcValue.substring(0, end);
				}
				if (filter.accept(frameUrl)){
					links.add(frameUrl);
				}
			}
		}
	} catch (ParserException e) {
		e.printStackTrace();
	}
	LinkQueue.addUnvisitedUrl(links);
	return links;
}
 
开发者ID:cxn945,项目名称:my_crawler,代码行数:59,代码来源:PageParser.java

示例12: visit

import org.htmlparser.tags.LinkTag; //导入依赖的package包/类
/**
	 * Fetches the page behind this object's URL and returns its cleaned, non-empty lines.
	 * (Original description: update the latest department postings.)
	 *
	 * <p>An absolute URL (starting with "http:" or "https:") is used as is; anything else is
	 * appended to {@code base_url} with exactly one "/" between the two parts. Each line of
	 * the fetched text is trimmed and stripped of the entities {@code &nbsp; &lt; &gt; &quot;}
	 * and of the literal substrings "td" and "tr".
	 *
	 * @param isByCookie passed through to {@code move2UrlsHtml}
	 * @return the cleaned lines that {@code ErrorCode.isEmpty} does not reject, in page order
	 * @throws Exception propagated from fetching the page
	 */
	public List visit(boolean isByCookie) throws Exception{
		log4.info("======访问网站============cookie="+isByCookie);

		String url = this.getUrl();
		String newUrls;
		if(url.startsWith("http:") || url.startsWith("https:")){
			newUrls = url;
		}
		else{
			// Join base and relative part with exactly one slash. The original doubled the
			// slash when the base already ended with one and omitted it when neither part
			// had one.
			boolean baseHasSlash = this.base_url.endsWith("/");
			boolean urlHasSlash = url.startsWith("/");
			if(baseHasSlash && urlHasSlash){
				newUrls = this.base_url + url.substring(1);
			}
			else if(baseHasSlash || urlHasSlash){
				newUrls = this.base_url + url;
			}
			else{
				newUrls = this.base_url + "/" + url;
			}
		}
		log4.info("newUrls="+newUrls);

		String str = this.move2UrlsHtml(this.getHttpURLConnection(), newUrls, isByCookie);

		List<String> htmls = new LinkedList<String>();
		for(String msg: str.split("\n")){
			msg = msg.trim();
			// Plain-text removals; none of these patterns needs regular-expression matching.
			msg = msg.replace("&nbsp;", "");
			msg = msg.replace("&lt;", "");
			msg = msg.replace("&gt;", "");
			msg = msg.replace("&quot;", "");
			// NOTE(review): this also removes "td"/"tr" inside ordinary words — confirm intent.
			msg = msg.replace("td", "");
			msg = msg.replace("tr", "");
			if(!ErrorCode.isEmpty(msg)){
				htmls.add(msg);
			}
		}

		return htmls;
	}
 
开发者ID:jview,项目名称:jtools,代码行数:62,代码来源:WebUtil.java

示例13: list

import org.htmlparser.tags.LinkTag; //导入依赖的package包/类
/**
 * Struts action "newslist": fills {@code list} with the news entries of the current
 * {@code listid}/{@code page}, served from the "News" cache when present.
 *
 * <p>On a cache miss the page is fetched and every tag with {@code class="date"} yields one
 * {@code News}: the tag's plain text becomes the publication date, and the next sibling that
 * is a link tag (if any) supplies id ({@code href}) and title. The list is cached only when
 * parsing completes without a {@code ParserException}.
 *
 * @return {@code SUCCESS} in every case
 */
@SuppressWarnings({ "rawtypes", "unchecked" })
@Action(value = "newslist", results = { @Result(type = "json", params = {
		"root", "list" }) })
public String list() {
	Cache newsCache = CacheManager.getInstance().getCache("News");
	String cacheKey = "newslist" + listid + page;
	Element cached = newsCache.get(cacheKey);
	if (!CommonUtil.isEmpty(cached)) {
		// Cache hit: reuse the stored list as is.
		list = (List) cached.getObjectValue();
		return SUCCESS;
	}

	StringBuffer html = fetch(RD + "/news/" + listid + "/" + page + ".html");
	Parser parser = Parser.createParser(html.toString(), "utf-8");
	list = new ArrayList<News>();
	try {
		NodeList dateNodes = parser.extractAllNodesThatMatch(
				new HasAttributeFilter("class", "date"));
		for (SimpleNodeIterator cursor = dateNodes.elements(); cursor.hasMoreNodes();) {
			Node dateNode = cursor.nextNode();
			if (!(dateNode instanceof TagNode)) {
				continue;
			}
			News news = new News();
			news.setPubdate(((TagNode) dateNode).toPlainTextString());
			// Walk forward to the first sibling that is a link tag, if there is one.
			Node sibling = dateNode.getNextSibling();
			while (sibling != null && !(sibling instanceof LinkTag)) {
				sibling = sibling.getNextSibling();
			}
			if (sibling != null) {
				LinkTag link = (LinkTag) sibling;
				news.setId(link.getAttribute("href"));
				news.setTitle(link.getAttribute("title"));
			}
			list.add(news);
		}
		newsCache.put(new Element(cacheKey, list));
	} catch (ParserException e) {
		e.printStackTrace();
	}
	return SUCCESS;
}
 
开发者ID:BaixiangLiu,项目名称:fudanweixin,代码行数:46,代码来源:CampusNewsAction.java

示例14: list

import org.htmlparser.tags.LinkTag; //导入依赖的package包/类
/**
 * Struts action "eventlist": on a cache hit loads {@code list} from the "News" cache;
 * otherwise fetches the calendar page and writes a rewritten HTML fragment straight to the
 * servlet response.
 *
 * <p>The fragment is the part of the fetched text between the two tags with
 * {@code class="clear"} (processed only when exactly two exist). Inside it every link tag
 * has its children replaced by a text node holding its {@code title} attribute, and the
 * attribute is removed. Nothing in this method stores into the cache.
 *
 * @return {@code NONE} in every case
 * @throws IOException if the response writer cannot be obtained
 */
@SuppressWarnings("rawtypes")
@Action(value = "eventlist")
public String list() throws IOException {
	Cache newsCache = CacheManager.getInstance().getCache("News");
	String cacheKey = "eventlist" + page;
	Element cached = newsCache.get(cacheKey);
	if (!CommonUtil.isEmpty(cached)) {
		list = (List) cached.getObjectValue();
		return NONE;
	}

	StringBuffer html = fetch(RD + "/calendar/?a=list&&m=recent&range=30&_="
			+ System.currentTimeMillis() + "&type=0&place=0&type=" + page);
	Parser parser = Parser.createParser(html.toString(), "utf-8");
	list = new ArrayList<News>();
	try {
		NodeList markers = parser.extractAllNodesThatMatch(
				new HasAttributeFilter("class", "clear"));
		if (markers.size() == 2) {
			// Fragment boundaries: 6 characters past the end of the first marker up to the
			// start of the second one.
			int from = markers.elementAt(0).getEndPosition() + 6;
			int to = markers.elementAt(1).getStartPosition();
			ServletActionContext.getResponse().setCharacterEncoding("utf-8");
			parser = Parser.createParser(html.substring(from, to), "utf-8");
			NodeList fragment = parser.parse(null);
			NodeList links = fragment.extractAllNodesThatMatch(
					new NodeClassFilter(LinkTag.class), true);
			for (SimpleNodeIterator cursor = links.elements(); cursor.hasMoreNodes();) {
				LinkTag link = (LinkTag) cursor.nextNode();
				// Show the title as the link's only content, then drop the attribute.
				NodeList replacement = new NodeList();
				replacement.add(new TextNode(link.getAttribute("title")));
				link.setChildren(replacement);
				link.removeAttribute("title");
			}

			ServletActionContext.getResponse().getWriter().print(fragment.toHtml());
		}
	} catch (ParserException e) {
		e.printStackTrace();
	}
	return NONE;
}
 
开发者ID:BaixiangLiu,项目名称:fudanweixin,代码行数:45,代码来源:CampusEventAction.java

示例15: extracLinks

import org.htmlparser.tags.LinkTag; //导入依赖的package包/类
/**
 * Collects the links of a fetched page that the given filter accepts: the URL of every
 * {@code <a>} tag and the {@code src} value of every tag whose text starts with
 * {@code frame src=}.
 *
 * @param pageResult supplies the page content and the character set used for parsing
 * @param filter     decides which extracted links are kept
 * @return the accepted links; empty when parsing fails (the stack trace is printed)
 */
public Set<String> extracLinks(PageResult pageResult, LinkFilter filter) {
	Set<String> links = new HashSet<String>();
	try {
		Parser parser = new Parser(pageResult.getContent());

		parser.setEncoding(pageResult.getCharSet());

		// Filter for <frame> tags, used to pick up the link held in their src attribute.
		NodeFilter frameFilter = new NodeFilter() {
			public boolean accept(Node node) {
				return node.getText().startsWith("frame src=");
			}
		};
		// Accept <a> tags as well as <frame> tags.
		OrFilter linkFilter = new OrFilter(new NodeClassFilter(
				LinkTag.class), frameFilter);
		NodeList list = parser.extractAllNodesThatMatch(linkFilter);
		for (int i = 0; i < list.size(); i++) {
			Node tag = list.elementAt(i);
			if (tag instanceof LinkTag) {
				// <a> tag
				String linkUrl = ((LinkTag) tag).getLink();
				if (filter.accept(linkUrl)) {
					links.add(linkUrl);
				}
			} else {
				// Frame tag, e.g. <frame src="test.html"/>. The filter matched text starting
				// with "frame src=", i.e. the tag text carries no angle brackets, so a src
				// that is the last attribute has no trailing space or '>' to stop at. Parse
				// the value itself instead of slicing at fixed offsets, which threw when no
				// terminator was found.
				String frameText = tag.getText();
				String srcValue = frameText.substring(frameText.indexOf("src=") + 4);
				String frameUrl;
				if (srcValue.startsWith("\"") || srcValue.startsWith("'")) {
					// Quoted value: everything up to the matching closing quote.
					int closing = srcValue.indexOf(srcValue.charAt(0), 1);
					frameUrl = closing == -1 ? srcValue.substring(1) : srcValue.substring(1, closing);
				} else {
					// Unquoted value: ends at the first space or at the end of the tag text.
					int end = srcValue.indexOf(' ');
					frameUrl = end == -1 ? srcValue : srcValue.substring(0, end);
				}
				if (filter.accept(frameUrl)) {
					links.add(frameUrl);
				}
			}
		}
	} catch (ParserException e) {
		e.printStackTrace();
	}
	return links;
}
 
开发者ID:hxt168,项目名称:webpasser,代码行数:59,代码来源:DefaultHtmlParser.java


注:本文中的org.htmlparser.tags.LinkTag类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。