当前位置: 首页>>代码示例>>Java>>正文


Java Parser.createParser方法代码示例

本文整理汇总了Java中org.htmlparser.Parser.createParser方法的典型用法代码示例。如果您正苦于以下问题:Java Parser.createParser方法的具体用法是什么?Java Parser.createParser怎么用?有哪些Java Parser.createParser的使用示例?那么,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类org.htmlparser.Parser的用法示例。


在下文中一共展示了Parser.createParser方法的15个代码示例,这些示例默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: parseMessage

import org.htmlparser.Parser; //导入方法依赖的package包/类
/**
   * Parses the body of an HTML message and returns its parsed representation.
   * See <a href="http://htmlparser.sourceforge.net/">HTML Parser</a> for details.
   *
   * <p>The body bytes are decoded with the charset named in the
   * {@code Content-Type} header when one is declared and supported by the JVM;
   * otherwise the platform default charset is used.
   *
   * @param url the url that the message resulted from (not read by this method)
   * @param message the Message to parse
   * @return a NodeList containing every Node of the page, or {@code null} when the
   *         Content-Type is missing or does not start with {@code text/html}, the
   *         body is empty, or the parser reports an error
   */
  public Object parseMessage(HttpUrl url, Message message) {
      String contentType = message.getHeader("Content-Type");
      if (contentType == null || !contentType.matches("text/html.*")) {
          return null;
      }
      byte[] content = message.getContent();
      if (content == null || content.length == 0) {
          return null;
      }

      // Honour a "charset=..." parameter on the Content-Type header instead of
      // always decoding the bytes with the platform default charset.
      java.nio.charset.Charset charset = java.nio.charset.Charset.defaultCharset();
      java.util.regex.Matcher declared = java.util.regex.Pattern
              .compile("charset\\s*=\\s*\"?([^\\s;\"]+)", java.util.regex.Pattern.CASE_INSENSITIVE)
              .matcher(contentType);
      if (declared.find()) {
          try {
              charset = java.nio.charset.Charset.forName(declared.group(1));
          } catch (IllegalArgumentException ignored) {
              // Malformed or unsupported charset name: keep the platform default.
          }
      }

      Parser parser = Parser.createParser(new String(content, charset), null);
      try {
          // The filter accepts everything, so the result holds every node of the page.
          NodeList nodelist = parser.extractAllNodesThatMatch(new NodeFilter() {
              public boolean accept(Node node) {
                  return true;
              }
          });
          return nodelist;
      } catch (ParserException pe) {
          _logger.severe(pe.toString());
          return null;
      }
  }
 
开发者ID:Neraud,项目名称:PADListener,代码行数:30,代码来源:HTMLParser.java

示例2: splitHtml

import org.htmlparser.Parser; //导入方法依赖的package包/类
/**
 * Splits {@code content} into a list of HTML page fragments.
 *
 * <p>The parsed content is handed to {@code recusiveSplitHtml}; afterwards the
 * remainder of the content from {@code startPosition} onwards is appended as a
 * final page. Every tag in {@code tagNodeList} that starts before
 * {@code startPosition} and whose end tag ends at or after it is re-opened at
 * the beginning of that final page.
 * NOTE(review): {@code tagNodeList} and {@code startPosition} are presumably
 * updated by {@code recusiveSplitHtml} - confirm against that method.
 *
 * @return the page fragments collected so far; if parsing fails the stack trace
 *         is printed and whatever was collected before the failure is returned
 */
private List<String> splitHtml() {
	List<String> resultList = new ArrayList<String>();
	try {
		Parser parser = Parser.createParser(content, "UTF-8");
		NodeList nodeList = parser.parse(null);
		resultList = recusiveSplitHtml(nodeList);
		// The buffer never leaves this method, so an unsynchronized builder suffices.
		StringBuilder lastPageContent = new StringBuilder();
		for (TagNode tagNode : tagNodeList) {
			// A tag without an end tag cannot span the split position; skipping it
			// avoids a NullPointerException on getEndTag().
			if (tagNode.getStartPosition() < startPosition
					&& tagNode.getEndTag() != null
					&& tagNode.getEndTag().getEndPosition() >= startPosition) {
				lastPageContent.append("<");
				lastPageContent.append(tagNode.getText());
				lastPageContent.append(">");
			}
		}
		lastPageContent.append(content.substring(startPosition));
		// Re-parse the last page and serialise it again via toHtml().
		Parser lastPageContentParser = Parser.createParser(lastPageContent.toString(), "UTF-8");
		NodeList pageContentNodeList = lastPageContentParser.parse(null);
		resultList.add(pageContentNodeList.toHtml());
	} catch (ParserException e) {
		e.printStackTrace();
	}
	return resultList;
}
 
开发者ID:wangko27,项目名称:SelfSoftShop,代码行数:24,代码来源:Article.java

示例3: PostCleaner

import org.htmlparser.Parser; //导入方法依赖的package包/类
/**
 * Extracts the text of an HTML post and stores it in {@code mText}.
 *
 * <p>The HTML is walked with a {@code PostCleanerVisitor}; if the parser fails,
 * a message is written to standard error and
 * {@code PostCleanerVisitor.simpleProc} is used on the raw HTML instead.
 *
 * @param html         the HTML of the post
 * @param minCodeChars forwarded to the {@code PostCleanerVisitor} constructor
 * @param excludeCode  forwarded to the {@code PostCleanerVisitor} constructor
 */
public PostCleaner(String html, int minCodeChars, boolean excludeCode) {
  String cleaned;
  try {
    Parser parser = Parser.createParser(html, "utf8");
    PostCleanerVisitor visitor = new PostCleanerVisitor(minCodeChars, excludeCode);
    parser.visitAllNodesWith(visitor);
    cleaned = visitor.getText();
  } catch (ParserException e) {
    System.err.println(" Parser exception: " + e + " trying simple conversion");
    // Fallback path when the HTML cannot be parsed.
    cleaned = PostCleanerVisitor.simpleProc(html);
  }
  mText = cleaned;
}
 
开发者ID:oaqa,项目名称:knn4qa,代码行数:14,代码来源:ConvertStackOverflow.java

示例4: readByHtml

import org.htmlparser.Parser; //导入方法依赖的package包/类
/**
 * Processes the input as a whole page: parses a standard HTML page and prints
 * its title (followed by a newline) and then the trimmed text of its body.
 *
 * @param content the content of the web page
 * @throws Exception if parsing or visiting the page fails
 */
public static void readByHtml(String content) throws Exception {
    Parser pageParser = Parser.createParser(content, "utf8");
    HtmlPage page = new HtmlPage(pageParser);
    pageParser.visitAllNodesWith(page);

    // Title first, on its own line.
    System.out.println(page.getTitle());

    // Then the body text, without a trailing newline.
    NodeList body = page.getBody();
    System.out.print(body.asString().trim());
}
 
开发者ID:YufangWoo,项目名称:news-crawler,代码行数:19,代码来源:HtmlParserTest.java

示例5: readTextAndLinkAndTitle

import org.htmlparser.Parser; //导入方法依赖的package包/类
/**
 * Reads plain text, links and the title separately and prints each on its own
 * line: the text of every text node, the URL of every link tag and the title
 * of every title tag. Entries for which {@code isTrimEmpty} is true are skipped.
 *
 * @param result the content of the web page
 * @throws Exception if parsing fails
 */
public static void readTextAndLinkAndTitle(String result) throws Exception {
    Parser htmlParser = Parser.createParser(result, "utf8");

    // Accept a node if it is a text node, a link tag or a title tag.
    OrFilter anyOfThree = new OrFilter();
    anyOfThree.setPredicates(new NodeFilter[] {
            new NodeClassFilter(TextNode.class),
            new NodeClassFilter(LinkTag.class),
            new NodeClassFilter(TitleTag.class) });

    // Declared outside the loop so it keeps its previous value if no branch matches.
    String line = "";
    for (Node current : htmlParser.parse(anyOfThree).toNodeArray()) {
        if (current instanceof TextNode) {
            line = ((TextNode) current).getText();
        } else if (current instanceof LinkTag) {
            line = ((LinkTag) current).getLink();
        } else if (current instanceof TitleTag) {
            line = ((TitleTag) current).getTitle();
        }

        if (!isTrimEmpty(line)) {
            System.out.println(line);
        }
    }
}
 
开发者ID:YufangWoo,项目名称:news-crawler,代码行数:38,代码来源:HtmlParserTest.java

示例6: extractTextByTextNode

import org.htmlparser.Parser; //导入方法依赖的package包/类
/**
 * Collects the plain text of every text node of the page that
 * {@code isInformativeStricter} accepts.
 *
 * @param content the HTML to analyse; may be {@code null}
 * @return one list element per accepted text node (one paragraph each); empty
 *         when {@code content} is {@code null}, and possibly partial when an
 *         exception occurs, since exceptions are logged and not rethrown
 */
public static List<String> extractTextByTextNode(String content){
	// Each element is one paragraph.
	List<String> paragraphs = new ArrayList<String>();
	if (content == null) {
		return paragraphs;
	}
	try {
		NodeList textNodes = Parser.createParser(content, "utf8")
				.extractAllNodesThatMatch(new NodeClassFilter(TextNode.class));
		HashMap<String,Integer> weights = new HashMap<String,Integer>();
		for (int idx = 0; idx < textNodes.size(); idx++) {
			Node current = textNodes.elementAt(idx);
			// Only non-blank nodes are traced here.
			boolean hasVisibleText = current.toPlainTextString().trim().length() > 0;
			if (hasVisibleText) {
				log.debug(idx+": "+" content: "+current.toPlainTextString());
			}
			if (!isInformativeStricter(current, weights)) {
				continue;
			}
			log.debug(idx+": "+" content: "+current.toPlainTextString());
			paragraphs.add(current.toPlainTextString());
		}
	} catch (Exception e) {
		e.printStackTrace();
		log.error("Text extractor  has encountered a problem!! "+e.getMessage());
	}
	return paragraphs;
}
 
开发者ID:YufangWoo,项目名称:news-crawler,代码行数:28,代码来源:HtmlContentExtractor.java

示例7: extractTextByTagP

import org.htmlparser.Parser; //导入方法依赖的package包/类
/**
 * Collects the plain text of every {@code <p>} element of the page that
 * {@code isInformative} accepts.
 *
 * @param content the HTML to analyse; may be {@code null}
 * @return one list element per accepted paragraph; empty when {@code content}
 *         is {@code null}, and possibly partial when an exception occurs, since
 *         exceptions are logged and not rethrown
 */
public static List<String> extractTextByTagP(String content){
	// Each element is one paragraph.
	List<String> paragraphs = new ArrayList<String>();
	if (content == null) {
		return paragraphs;
	}
	try {
		Parser htmlParser = Parser.createParser(content, "utf8");
		// Selects the content between <p> and </p>.
		TagNameFilter paragraphFilter = new TagNameFilter("p");
//		TagNameFilter paraFilter2=new TagNameFilter("br");//get content between <br> </br>
//		NodeFilter filter = new OrFilter(paraFilter, paraFilter2);
		// NOTE: the original author marked this call as "reports an error!!".
		NodeList paragraphNodes = htmlParser.extractAllNodesThatMatch(paragraphFilter);
		HashMap<String,Integer> weights = new HashMap<String,Integer>();
		for (int idx = 0; idx < paragraphNodes.size(); idx++) {
			Node current = paragraphNodes.elementAt(idx);
			log.debug(idx+": "+" content: "+current.toPlainTextString());

			if (!isInformative(current, weights)) {
				continue;
			}
			log.debug(idx+": "+" content: "+current.toPlainTextString());
			paragraphs.add(current.toPlainTextString());
		}
	} catch (Exception e) {
		e.printStackTrace();
		log.error("Text extractor  has encountered a problem!! "+e.getMessage());
	}
	return paragraphs;
}
 
开发者ID:YufangWoo,项目名称:news-crawler,代码行数:28,代码来源:HtmlContentExtractor.java

示例8: filterSelectNode

import org.htmlparser.Parser; //导入方法依赖的package包/类
/**
 * Returns every node of the response whose text starts with {@code "select"}
 * (case-sensitive comparison).
 *
 * @param responseBody the HTML to parse
 * @return the matching nodes
 * @throws ParserException if the HTML cannot be parsed
 */
private NodeList filterSelectNode(String responseBody) throws ParserException {
    NodeFilter startsWithSelect = new NodeFilter() {
        @Override
        public boolean accept(Node node) {
            return node.getText().startsWith("select");
        }
    };
    return Parser.createParser(responseBody, HTTP.ISO_8859_1)
            .extractAllNodesThatMatch(startsWithSelect);
}
 
开发者ID:emivaljr,项目名称:hojenaoapp,代码行数:13,代码来源:MyEndpoint.java

示例9: filterTable

import org.htmlparser.Parser; //导入方法依赖的package包/类
/**
 * Returns every node of the response whose upper-cased text starts with
 * {@code "TABLE"} or {@code "H3"}.
 *
 * @param responseBody the HTML to parse
 * @return the matching nodes
 * @throws ParserException if the HTML cannot be parsed
 */
private NodeList filterTable(String responseBody) throws ParserException {
    NodeFilter tableOrHeading = new NodeFilter() {
        @Override
        public boolean accept(Node node) {
            String upper = node.getText().toUpperCase();
            return upper.startsWith("TABLE") || upper.startsWith("H3");
        }
    };
    return Parser.createParser(responseBody, HTTP.ISO_8859_1)
            .extractAllNodesThatMatch(tableOrHeading);
}
 
开发者ID:emivaljr,项目名称:hojenaoapp,代码行数:13,代码来源:MyEndpoint.java

示例10: getContentText

import org.htmlparser.Parser; //导入方法依赖的package包/类
/**
 * Returns the text extracted from the HTML held in {@code content}.
 *
 * @return the extracted text, or {@code null} if the parser fails (the stack
 *         trace is printed in that case)
 */
@Transient
public String getContentText() {
	String extracted = null;
	try {
		Parser htmlParser = Parser.createParser(content, "UTF-8");
		TextExtractingVisitor collector = new TextExtractingVisitor();
		htmlParser.visitAllNodesWith(collector);
		extracted = collector.getExtractedText();
	} catch (ParserException e) {
		e.printStackTrace();
	}
	return extracted;
}
 
开发者ID:wangko27,项目名称:SelfSoftShop,代码行数:13,代码来源:Article.java

示例11: extractLink

import org.htmlparser.Parser; //导入方法依赖的package包/类
/**
 * Extracts the links of a page whose link text matches {@code keyword} and
 * queues them through {@code LinkDb.addUnvisitedUrl}.
 *
 * <p>A link counts as valid when {@code containKeyword} accepts its text.
 * Note: {@code keyword} may be a group of words or a phrase, so a hit may match
 * only some of its words. Valid links usually have consecutive indexes, so a
 * matching link more than {@code disThre} positions after the last accepted one
 * is treated as noise (for example an advertisement) and skipped. Only URLs
 * starting with {@code "http"} are queued.
 *
 * <p>Exceptions are logged and not rethrown.
 *
 * @param content the HTML of the page
 * @param keyword the keyword, word group or phrase to look for in link texts
 */
public static void extractLink(String content, String keyword) {
		try {
			Parser parser = Parser.createParser(content, "utf8");
			NodeFilter linkFilter = new NodeClassFilter(LinkTag.class);
			NodeList nodelist = parser.extractAllNodesThatMatch(linkFilter);
			// Index of the last link accepted as valid; -1 means "none yet".
			// Using 0 as the sentinel would make an accepted link at index 0
			// indistinguishable from "none yet" and skip the distance check.
			int lastNodeID = -1;
			// Maximum index gap between two consecutive valid links.
			int disThre = 8;
			for (int i = 0; i < nodelist.size(); i++) {
				LinkTag link = (LinkTag) nodelist.elementAt(i);
				String linkUrl = link.getLink();     // url
				String text = link.getLinkText();    // link text
				if (!containKeyword(text, keyword)) {
					continue;
				}
				if (lastNodeID >= 0 && i - lastNodeID > disThre) {
					log.debug("Noisy link!!!");
					continue;
				}
				if (!linkUrl.startsWith("http")) {
					continue;
				}
				log.debug(i+":"+linkUrl+", "+text);
				lastNodeID = i;
				LinkDb.addUnvisitedUrl(linkUrl);
			}
		} catch (Exception e) {
			e.printStackTrace();
			log.error("Link extractor  has encountered a problem!! "+e.getMessage());
		}
}
 
开发者ID:YufangWoo,项目名称:news-crawler,代码行数:47,代码来源:HtmlContentExtractor.java

示例12: filter

import org.htmlparser.Parser; //导入方法依赖的package包/类
/**
 * Builds an {@code AX2JClassTranslator} for {@code type} from the {@code <tr>}
 * rows of a documentation page whose {@code class} attribute is either
 * {@code "alt-color api apilevel-"} or {@code " api apilevel-"}.
 *
 * <p>For each row, the plain text of child 1 (with newlines removed) is added
 * as the attribute and the plain text of child 3 as the method.
 *
 * @param content the HTML of the documentation page
 * @return the populated translator
 * @throws AndroidDocException with {@code ATM_FORMAT_ERROR} when a row does not
 *         have exactly 7 children, or with {@code AXML_FORMAT_ERROR} when the
 *         parser fails
 */
private AX2JClassTranslator filter(String content) {
    try {
        Parser docParser = Parser.createParser(content, Config.ENCODE);
        AndFilter altColorRows = new AndFilter(
                new TagNameFilter("tr"),
                new HasAttributeFilter("class", "alt-color api apilevel-"));
        // Careful: this class value really does begin with a space.
        AndFilter plainRows = new AndFilter(
                new TagNameFilter("tr"),
                new HasAttributeFilter("class", " api apilevel-"));
        NodeList rowList = docParser.parse(new OrFilter(altColorRows, plainRows));
        NodeIterator rows = rowList.elements();

        AX2JClassTranslator translator = new AX2JClassTranslator(type);
        while (rows.hasMoreNodes()) {
            NodeList cells = rows.nextNode().getChildren();
            /*
             * ***** row children example *****
             *    Txt (268[6,37],269[7,0]): \nTag (269[7,0],292[7,23]): td class="jd-linkcol"
             *      Tag (292[7,23],381[7,112]): a href="../../../reference/android/view/View.html...
             *        Txt (381[7,112],412[7,143]): android:accessibilityLiveRegion
             *        End (412[7,143],416[7,147]): /a
             *      End (416[7,147],421[7,152]): /td
             *    Txt (421[7,152],422[8,0]): \nTag (422[8,0],445[8,23]): td class="jd-linkcol"
             *      Txt (445[8,23],446[9,0]): \n
             *      Tag (446[9,0],530[9,84]): a href="../../../reference/android/view/View.html#s...
             *        Txt (530[9,84],561[9,115]): setAccessibilityLiveRegion(int)
             *        End (561[9,115],565[9,119]): /a
             *      Txt (565[9,119],566[10,0]): \n
             *      End (566[10,0],571[10,5]): /td
             *    Txt (571[10,5],572[11,0]): \nTag (572[11,0],609[11,37]): td class="jd-descrcol" width="100%"
             *      Txt (609[11,37],712[14,0]): \nIndicates to accessibility services whether the...
             *      End (712[14,0],717[14,5]): /td
             *    Txt (717[14,5],718[15,0]): \n
             * ***** row children example *****
             */
            if (cells.size() != 7) {
                throw new AndroidDocException(AndroidDocException.ATM_FORMAT_ERROR);
            }

            String attr = cells.elementAt(1).toPlainTextString().replace("\n", "");
            String method = cells.elementAt(3).toPlainTextString();
            translator.add(attr, method);
        }
        return translator;
    } catch (ParserException e) {
        throw new AndroidDocException(AndroidDocException.AXML_FORMAT_ERROR);
    }
}
 
开发者ID:sickworm,项目名称:AndroidXMLToJava,代码行数:51,代码来源:Filter2014.java

示例13: list

import org.htmlparser.Parser; //导入方法依赖的package包/类
/**
 * Loads the news list for the current {@code domain}, {@code listid} and
 * {@code page} into {@code list} and passes it to {@code jsonp}.
 *
 * <p>The list is taken from the "News" cache when an entry exists. Otherwise
 * the list page is fetched, every tag whose {@code href} matches
 * {@code .*&#47;page\.htm} becomes a {@code News} item (id = href, title = alt,
 * publication date = text of the next {@code TableColumn} sibling of the tag's
 * parent, if any), and the result is cached. A parser failure is printed and
 * leaves the list as built so far, uncached.
 *
 * @return {@code NONE}
 */
@SuppressWarnings({ "rawtypes", "unchecked" })
@Action(value = "sdlist", results = { @Result(type = "json", params = {
		"root", "list" }) })
public String list() {
	Cache newsCache = CacheManager.getInstance().getCache("News");
	String cacheKey = domain+listid + page;
	Element cached = newsCache.get(cacheKey);
	if (CommonUtil.isEmpty(cached)) {
		StringBuffer html = fetch(domain+"/"+listid+"/list"
				+ page+".htm");
		Parser parser = Parser.createParser(html.toString(), "utf-8");
		list = new ArrayList<News>();
		try {
			NodeList anchors = parser.extractAllNodesThatMatch(
					new AttributeRegexFilter("href", ".*/page\\.htm"));
			for (SimpleNodeIterator it = anchors.elements(); it.hasMoreNodes();) {
				Node candidate = it.nextNode();
				if (!(candidate instanceof TagNode)) {
					continue;
				}
				TagNode anchor = (TagNode) candidate;
				News item = new News();
				String href = anchor.getAttribute("href");
				item.setId(href);
				item.setTitle(anchor.getAttribute("alt"));
				// Walk the siblings after the anchor's parent until a table column is found.
				Node sibling = anchor.getParent().getNextSibling();
				while (sibling != null && !(sibling instanceof TableColumn)) {
					sibling = sibling.getNextSibling();
				}
				if (sibling != null) {
					item.setPubdate(sibling.toPlainTextString());
				}
				list.add(item);
			}
			newsCache.put(new Element(cacheKey, list));
		} catch (ParserException e) {
			e.printStackTrace();
		}
	} else {
		list = (List) cached.getObjectValue();
	}
	jsonp(list);
	return NONE;
}
 
开发者ID:BaixiangLiu,项目名称:fudanweixin,代码行数:46,代码来源:SudyPageAction.java

示例14: list

import org.htmlparser.Parser; //导入方法依赖的package包/类
/**
 * Loads the campus news list for the current {@code listid} and {@code page}
 * into {@code list}.
 *
 * <p>The list is taken from the "News" cache when an entry exists. Otherwise
 * the news page is fetched and every tag with {@code class="date"} becomes a
 * {@code News} item: its text is the publication date, and the next
 * {@code LinkTag} sibling (if any) supplies the id ({@code href}) and title
 * ({@code title}). The result is then cached. A parser failure is printed and
 * leaves the list as built so far, uncached.
 *
 * @return {@code SUCCESS}
 */
@SuppressWarnings({ "rawtypes", "unchecked" })
@Action(value = "newslist", results = { @Result(type = "json", params = {
		"root", "list" }) })
public String list() {
	Cache newsCache = CacheManager.getInstance().getCache("News");
	String cacheKey = "newslist"+listid + page;
	Element cached = newsCache.get(cacheKey);
	if (CommonUtil.isEmpty(cached)) {
		StringBuffer html = fetch(RD+"/news/"+listid+"/"+page+".html");
		Parser parser = Parser.createParser(html.toString(), "utf-8");
		list = new ArrayList<News>();
		try {
			NodeList dateNodes = parser.extractAllNodesThatMatch(
					new HasAttributeFilter("class","date"));
			for (SimpleNodeIterator it = dateNodes.elements(); it.hasMoreNodes();) {
				Node candidate = it.nextNode();
				if (!(candidate instanceof TagNode)) {
					continue;
				}
				TagNode dateTag = (TagNode) candidate;
				News item = new News();
				item.setPubdate(dateTag.toPlainTextString());
				// Look for the first link among the following siblings.
				Node sibling = dateTag.getNextSibling();
				while (sibling != null && !(sibling instanceof LinkTag)) {
					sibling = sibling.getNextSibling();
				}
				if (sibling != null) {
					LinkTag link = (LinkTag) sibling;
					item.setId(link.getAttribute("href"));
					item.setTitle(link.getAttribute("title"));
				}
				list.add(item);
			}
			newsCache.put(new Element(cacheKey, list));
		} catch (ParserException e) {
			e.printStackTrace();
		}
	} else {
		list = (List) cached.getObjectValue();
	}

	return SUCCESS;
}
 
开发者ID:BaixiangLiu,项目名称:fudanweixin,代码行数:46,代码来源:CampusNewsAction.java

示例15: list

import org.htmlparser.Parser; //导入方法依赖的package包/类
/**
 * Writes the recent-events HTML fragment for the current {@code page} to the
 * servlet response.
 *
 * <p>When the "News" cache holds an entry for this page, it is only assigned to
 * {@code list} and nothing is written. Otherwise the calendar page is fetched;
 * if it contains exactly two nodes with {@code class="clear"}, the markup
 * between them is re-parsed, the children of every link are replaced by a text
 * node holding the link's {@code title} attribute (which is then removed), and
 * the resulting HTML is printed to the response. A parser failure is printed.
 *
 * @return {@code NONE}
 * @throws IOException if the response writer cannot be obtained
 */
@SuppressWarnings("rawtypes")
@Action(value = "eventlist")
public String list() throws IOException {
	Cache newsCache = CacheManager.getInstance().getCache("News");
	String cacheKey = "eventlist"+page ;
	Element cached = newsCache.get(cacheKey);
	if (!CommonUtil.isEmpty(cached)) {
		list = (List) cached.getObjectValue();
		return NONE;
	}

	StringBuffer html = fetch(RD+"/calendar/?a=list&&m=recent&range=30&_="+System.currentTimeMillis()+"&type=0&place=0&type="+page	);
	Parser pageParser = Parser.createParser(html.toString(), "utf-8");
	list = new ArrayList<News>();
	try {
		NodeList markers = pageParser
				.extractAllNodesThatMatch(new HasAttributeFilter("class","clear"));
		if (markers.size() == 2) {
			int fragmentStart = markers.elementAt(0).getEndPosition();
			int fragmentEnd = markers.elementAt(1).getStartPosition();
			ServletActionContext.getResponse().setCharacterEncoding("utf-8");
			// NOTE(review): the +6 presumably skips the closing tag that follows
			// the first marker - confirm against the fetched markup.
			Parser fragmentParser = Parser.createParser(
					html.substring(fragmentStart+6, fragmentEnd), "utf-8");
			NodeList fragment = fragmentParser.parse(null);
			NodeList links = fragment.extractAllNodesThatMatch(
					new NodeClassFilter(LinkTag.class), true);
			for (SimpleNodeIterator it = links.elements(); it.hasMoreNodes();) {
				LinkTag link = (LinkTag) it.nextNode();
				// Replace the link's children with its title text, then drop the attribute.
				NodeList titleOnly = new NodeList();
				titleOnly.add(new TextNode(link.getAttribute("title")));
				link.setChildren(titleOnly);
				link.removeAttribute("title");
			}

			ServletActionContext.getResponse().getWriter().print(fragment.toHtml());
		}
	} catch (ParserException e) {
		e.printStackTrace();
	}

	return NONE;
}
 
开发者ID:BaixiangLiu,项目名称:fudanweixin,代码行数:45,代码来源:CampusEventAction.java


注:本文中的org.htmlparser.Parser.createParser方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。