本文整理汇总了Java中org.htmlparser.Parser.createParser方法的典型用法代码示例。如果您正苦于以下问题:Java Parser.createParser方法的具体用法?Java Parser.createParser怎么用?Java Parser.createParser使用的例子?那么,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类org.htmlparser.Parser的用法示例。
在下文中一共展示了Parser.createParser方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: parseMessage
import org.htmlparser.Parser; //导入方法依赖的package包/类
/**
 * Parses the body of an HTML message and returns the parsed representation.
 *
 * <p>Only messages whose {@code Content-Type} header starts with {@code text/html} and whose
 * body is non-empty are parsed. The body bytes are decoded with the {@code charset} parameter
 * of the {@code Content-Type} header when one is present and supported by the JVM; otherwise
 * the platform default charset is used.
 *
 * <p>See <a href="http://htmlparser.sourceforge.net/">HTML Parser</a> for details.
 *
 * @param url the url that the message resulted from (not read by this method)
 * @param message the Message to parse
 * @return a NodeList containing the nodes making up the page, or {@code null} when the message
 *     is not HTML, has no content, or parsing fails
 */
public Object parseMessage(HttpUrl url, Message message) {
    String contentType = message.getHeader("Content-Type");
    if (contentType == null || !contentType.matches("text/html.*")) {
        return null;
    }
    byte[] content = message.getContent();
    if (content == null || content.length == 0) {
        return null;
    }
    // Decode with the charset the sender declared instead of silently assuming the
    // platform default, which garbles non-ASCII pages on hosts with a different default.
    String html = new String(content, charsetFromContentType(contentType));
    Parser parser = Parser.createParser(html, null);
    try {
        // The filter accepts every node it is offered.
        NodeList nodelist = parser.extractAllNodesThatMatch(new NodeFilter() {
            public boolean accept(Node node) {
                return true;
            }
        });
        return nodelist;
    } catch (ParserException pe) {
        _logger.severe(pe.toString());
        return null;
    }
}

/**
 * Resolves the charset named by the {@code charset=} parameter of a Content-Type value.
 *
 * @param contentType the raw Content-Type header value, not null
 * @return the declared charset, or the platform default charset when no charset parameter is
 *     present or the named charset is malformed or unsupported
 */
private static java.nio.charset.Charset charsetFromContentType(String contentType) {
    java.util.regex.Matcher declared = java.util.regex.Pattern
            .compile("charset\\s*=\\s*[\"']?([^\\s;\"']+)", java.util.regex.Pattern.CASE_INSENSITIVE)
            .matcher(contentType);
    if (declared.find()) {
        try {
            return java.nio.charset.Charset.forName(declared.group(1));
        } catch (IllegalArgumentException ignored) {
            // Illegal or unsupported charset name: fall through to the default below.
        }
    }
    return java.nio.charset.Charset.defaultCharset();
}
示例2: splitHtml
import org.htmlparser.Parser; //导入方法依赖的package包/类
/**
 * Splits the HTML held in {@code content} into pages.
 *
 * <p>The parsed document is handed to {@code recusiveSplitHtml} to obtain the leading pages.
 * The last page is then built from the text of {@code content} starting at
 * {@code startPosition}, prefixed with the opening tag of every entry in {@code tagNodeList}
 * that starts before {@code startPosition} and whose end tag ends at or after it. That text is
 * re-parsed and its HTML appended to the result.
 *
 * @return the pages collected so far; if a {@link ParserException} occurs the stack trace is
 *     printed and whatever had been collected up to that point is returned
 */
private List<String> splitHtml() {
    List<String> pages = new ArrayList<String>();
    try {
        NodeList document = Parser.createParser(content, "UTF-8").parse(null);
        pages = recusiveSplitHtml(document);

        // NOTE(review): tagNodeList and startPosition are read only after the call above;
        // presumably recusiveSplitHtml maintains them - confirm.
        StringBuilder lastPage = new StringBuilder();
        for (TagNode candidate : tagNodeList) {
            boolean startsBeforeSplit = candidate.getStartPosition() < startPosition;
            if (startsBeforeSplit && candidate.getEndTag().getEndPosition() >= startPosition) {
                lastPage.append("<").append(candidate.getText()).append(">");
            }
        }
        lastPage.append(content.substring(startPosition));

        NodeList lastPageNodes = Parser.createParser(lastPage.toString(), "UTF-8").parse(null);
        pages.add(lastPageNodes.toHtml());
    } catch (ParserException e) {
        e.printStackTrace();
    }
    return pages;
}
示例3: PostCleaner
import org.htmlparser.Parser; //导入方法依赖的package包/类
/**
 * Extracts the text of an HTML post and stores it in {@code mText}.
 *
 * <p>The HTML is walked with a {@code PostCleanerVisitor}; if the parser fails, the text is
 * produced by {@code PostCleanerVisitor.simpleProc} instead.
 *
 * @param html the HTML to clean
 * @param minCodeChars forwarded to the {@code PostCleanerVisitor} constructor
 * @param excludeCode forwarded to the {@code PostCleanerVisitor} constructor
 */
public PostCleaner(String html, int minCodeChars, boolean excludeCode) {
    try {
        final Parser parser = Parser.createParser(html, "utf8");
        final PostCleanerVisitor textCollector = new PostCleanerVisitor(minCodeChars, excludeCode);
        parser.visitAllNodesWith(textCollector);
        mText = textCollector.getText();
    } catch (ParserException parseFailure) {
        // Fallback path: report the failure and use the simple conversion.
        System.err.println(" Parser exception: " + parseFailure + " trying simple conversion");
        mText = PostCleanerVisitor.simpleProc(html);
    }
}
示例4: readByHtml
import org.htmlparser.Parser; //导入方法依赖的package包/类
/**
 * Treats the content as a complete HTML page: prints the page title on its own line, then
 * prints the trimmed text of the page body.
 *
 * @param content the content of the web page
 * @throws Exception if parsing the page fails
 */
public static void readByHtml(String content) throws Exception {
    Parser pageParser = Parser.createParser(content, "utf8");
    HtmlPage page = new HtmlPage(pageParser);
    pageParser.visitAllNodesWith(page);

    String title = page.getTitle();
    System.out.println(title);

    NodeList bodyNodes = page.getBody();
    String bodyText = bodyNodes.asString().trim();
    System.out.print(bodyText);
}
示例5: readTextAndLinkAndTitle
import org.htmlparser.Parser; //导入方法依赖的package包/类
/**
 * Reads plain text, links and the title separately and prints each non-blank value on its own
 * line: the text of a text node, the URL of a link tag, or the title of a title tag.
 *
 * @param result the content of the web page
 * @throws Exception if parsing the page fails
 */
public static void readTextAndLinkAndTitle(String result) throws Exception {
    Parser parser = Parser.createParser(result, "utf8");

    // A node is kept when it matches any one of the three class filters.
    OrFilter textLinkOrTitle = new OrFilter();
    textLinkOrTitle.setPredicates(new NodeFilter[] {
        new NodeClassFilter(TextNode.class),
        new NodeClassFilter(LinkTag.class),
        new NodeClassFilter(TitleTag.class)
    });

    Node[] matches = parser.parse(textLinkOrTitle).toNodeArray();

    // Declared outside the loop on purpose: the value carries over between iterations,
    // exactly as in the original implementation.
    String line = "";
    for (Node current : matches) {
        if (current instanceof TextNode) {
            line = ((TextNode) current).getText();
        } else if (current instanceof LinkTag) {
            line = ((LinkTag) current).getLink();
        } else if (current instanceof TitleTag) {
            line = ((TitleTag) current).getTitle();
        }
        if (!isTrimEmpty(line)) {
            System.out.println(line);
        }
    }
}
示例6: extractTextByTextNode
import org.htmlparser.Parser; //导入方法依赖的package包/类
/**
 * Collects the plain text of every text node in the content that
 * {@code isInformativeStricter} accepts.
 *
 * @param content the HTML to scan; may be null
 * @return one list element per accepted text node (each element is one paragraph); empty when
 *     {@code content} is null. If any exception occurs it is printed and logged, and the
 *     elements gathered so far are returned.
 */
public static List<String> extractTextByTextNode(String content) {
    List<String> paragraphs = new ArrayList<String>(); // each element is one paragraph
    if (content == null) {
        return paragraphs;
    }
    try {
        Parser parser = Parser.createParser(content, "utf8");
        NodeList textNodes = parser.extractAllNodesThatMatch(new NodeClassFilter(TextNode.class));
        HashMap<String,Integer> parentWeight = new HashMap<String,Integer>();
        for (int idx = 0; idx < textNodes.size(); idx++) {
            Node current = textNodes.elementAt(idx);
            // Only non-blank nodes get this first debug line.
            if (!current.toPlainTextString().trim().isEmpty()) {
                log.debug(idx + ": " + " content: " + current.toPlainTextString());
            }
            if (isInformativeStricter(current, parentWeight)) {
                log.debug(idx + ": " + " content: " + current.toPlainTextString());
                paragraphs.add(current.toPlainTextString());
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
        log.error("Text extractor has encountered a problem!! " + e.getMessage());
    }
    return paragraphs;
}
示例7: extractTextByTagP
import org.htmlparser.Parser; //导入方法依赖的package包/类
/**
 * Collects the plain text of every {@code <p>} element in the content that
 * {@code isInformative} accepts.
 *
 * @param content the HTML to scan; may be null
 * @return one list element per accepted paragraph; empty when {@code content} is null. If any
 *     exception occurs it is printed and logged, and the elements gathered so far are
 *     returned.
 */
public static List<String> extractTextByTagP(String content) {
    List<String> paragraphs = new ArrayList<String>(); // each element is one paragraph
    if (content == null) {
        return paragraphs;
    }
    try {
        Parser parser = Parser.createParser(content, "utf8");
        // Matches the content between <p> and </p>. A variant that also matched <br> through
        // an OrFilter was left disabled by the original author.
        TagNameFilter paragraphFilter = new TagNameFilter("p");
        // NOTE: the original author flagged the next call as raising an error.
        NodeList paragraphNodes = parser.extractAllNodesThatMatch(paragraphFilter);
        HashMap<String,Integer> parentWeight = new HashMap<String,Integer>();
        for (int idx = 0; idx < paragraphNodes.size(); idx++) {
            Node current = paragraphNodes.elementAt(idx);
            log.debug(idx + ": " + " content: " + current.toPlainTextString());
            if (isInformative(current, parentWeight)) {
                log.debug(idx + ": " + " content: " + current.toPlainTextString());
                paragraphs.add(current.toPlainTextString());
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
        log.error("Text extractor has encountered a problem!! " + e.getMessage());
    }
    return paragraphs;
}
示例8: filterSelectNode
import org.htmlparser.Parser; //导入方法依赖的package包/类
/**
 * Collects every node of the response body whose text starts with {@code select}, compared
 * without regard to letter case.
 *
 * <p>HTML tag names are case-insensitive, so {@code <SELECT ...>} must match as well as
 * {@code <select ...>}; the comparison therefore ignores case.
 *
 * <p>NOTE(review): the test is a plain prefix match on {@code Node.getText()}, so any node
 * whose text begins with those letters is accepted, not only select tags - confirm this is
 * acceptable for the pages being scraped.
 *
 * @param responseBody the HTML to scan, parsed as ISO-8859-1
 * @return the matching nodes
 * @throws ParserException if the parser fails
 */
private NodeList filterSelectNode(String responseBody) throws ParserException {
    Parser parser = Parser.createParser(responseBody, HTTP.ISO_8859_1);
    return parser.extractAllNodesThatMatch(new NodeFilter() {
        @Override
        public boolean accept(Node node) {
            String text = node.getText();
            // regionMatches(true, ...) is a locale-independent, case-insensitive prefix test;
            // it returns false when the text is shorter than the prefix.
            return text != null && text.regionMatches(true, 0, "select", 0, "select".length());
        }
    });
}
示例9: filterTable
import org.htmlparser.Parser; //导入方法依赖的package包/类
/**
 * Collects every node of the response body whose upper-cased text starts with {@code TABLE}
 * or {@code H3}.
 *
 * @param responseBody the HTML to scan, parsed as ISO-8859-1
 * @return the matching nodes
 * @throws ParserException if the parser fails
 */
private NodeList filterTable(String responseBody) throws ParserException {
    NodeFilter tableOrHeading = new NodeFilter() {
        @Override
        public boolean accept(Node node) {
            String upperText = node.getText().toUpperCase();
            return upperText.startsWith("TABLE") || upperText.startsWith("H3");
        }
    };
    return Parser.createParser(responseBody, HTTP.ISO_8859_1)
            .extractAllNodesThatMatch(tableOrHeading);
}
示例10: getContentText
import org.htmlparser.Parser; //导入方法依赖的package包/类
/**
 * Returns the text of {@code content} with the HTML markup removed.
 *
 * @return the text gathered by a {@code TextExtractingVisitor}, or {@code null} when
 *     {@code content} is null or cannot be parsed
 */
@Transient
public String getContentText() {
    // Parser.createParser is expected to reject a null document with an unchecked exception,
    // which the catch below would not handle; report "no text" the same way a parse
    // failure does instead.
    if (content == null) {
        return null;
    }
    try {
        Parser parser = Parser.createParser(content, "UTF-8");
        TextExtractingVisitor textExtractingVisitor = new TextExtractingVisitor();
        parser.visitAllNodesWith(textExtractingVisitor);
        return textExtractingVisitor.getExtractedText();
    } catch (ParserException e) {
        e.printStackTrace();
        return null;
    }
}
示例11: extractLink
import org.htmlparser.Parser; //导入方法依赖的package包/类
/**
 * Extracts the links of a page whose link text matches the keyword and registers them with
 * {@code LinkDb.addUnvisitedUrl}.
 *
 * <p>A link is treated as valid when {@code containKeyword} accepts its link text. The keyword
 * may be a group of words or a phrase, so a hit may match only some of the words.
 *
 * <p>Valid links usually have consecutive indexes in the list of links, which is used to
 * separate them from advertisements: once a link has been accepted, a later keyword match
 * more than 8 positions after the last accepted link is logged as noisy and skipped. Links
 * whose URL does not start with {@code http} are skipped as well.
 *
 * <p>Any exception is printed and logged; nothing is thrown to the caller.
 *
 * @param content the HTML of the page
 * @param keyword the keyword, group of words, or phrase to look for in the link text
 */
public static void extractLink(String content, String keyword) {
    try {
        Parser parser = Parser.createParser(content, "utf8");
        NodeFilter linkFilter = new NodeClassFilter(LinkTag.class);
        NodeList nodelist = parser.extractAllNodesThatMatch(linkFilter);
        // Index of the last link accepted as valid; -1 means none has been accepted yet.
        // (0 cannot serve as the "none" marker because 0 is itself a valid index.)
        int lastNodeID = -1;
        int disThre = 8; // maximum index distance between consecutive valid links
        for (int i = 0; i < nodelist.size(); i++) {
            LinkTag link = (LinkTag) nodelist.elementAt(i);
            String linkUrl = link.getLink(); // url
            String text = link.getLinkText(); // link text
            if (!containKeyword(text, keyword)) {
                continue;
            }
            if (lastNodeID >= 0 && i - lastNodeID > disThre) {
                log.debug("Noisy link!!!");
                continue;
            }
            if (!linkUrl.startsWith("http")) {
                continue;
            }
            log.debug(i + ":" + linkUrl + ", " + text);
            lastNodeID = i;
            LinkDb.addUnvisitedUrl(linkUrl);
        }
    } catch (Exception e) {
        e.printStackTrace();
        log.error("Link extractor has encountered a problem!! " + e.getMessage());
    }
}
示例12: filter
import org.htmlparser.Parser; //导入方法依赖的package包/类
/**
 * Builds an attribute-to-method translator from the table rows of a documentation page.
 *
 * <p>Rows are {@code <tr>} elements selected by a {@code class} attribute value of either
 * {@code "alt-color api apilevel-"} or {@code " api apilevel-"} (the latter with a leading
 * space). For each row, the plain text of child 1 (with newlines removed) is the attribute
 * and the plain text of child 3 is the method.
 *
 * @param content the HTML of the page, parsed with {@code Config.ENCODE}
 * @return the populated translator
 * @throws AndroidDocException with {@code ATM_FORMAT_ERROR} when a row does not have exactly
 *     7 children, or with {@code AXML_FORMAT_ERROR} when the parser fails
 */
private AX2JClassTranslator filter(String content) {
    try {
        Parser parser = Parser.createParser(content, Config.ENCODE);
        AndFilter altColorRow = new AndFilter(
                new TagNameFilter("tr"),
                new HasAttributeFilter("class","alt-color api apilevel-"));
        // The second class value really does begin with a space.
        AndFilter plainRow = new AndFilter(
                new TagNameFilter("tr"),
                new HasAttributeFilter("class"," api apilevel-"));
        NodeList rows = parser.parse(new OrFilter(altColorRow, plainRow));
        NodeIterator rowIt = rows.elements();
        AX2JClassTranslator translator = new AX2JClassTranslator(type);
        /*
         * Example dump of the nodes of one row (from the original author):
         *
         *   Txt (268[6,37],269[7,0]): \n
         *   Tag (269[7,0],292[7,23]): td class="jd-linkcol"
         *   Tag (292[7,23],381[7,112]): a href="../../../reference/android/view/View.html...
         *   Txt (381[7,112],412[7,143]): android:accessibilityLiveRegion
         *   End (412[7,143],416[7,147]): /a
         *   End (416[7,147],421[7,152]): /td
         *   Txt (421[7,152],422[8,0]): \n
         *   Tag (422[8,0],445[8,23]): td class="jd-linkcol"
         *   Txt (445[8,23],446[9,0]): \n
         *   Tag (446[9,0],530[9,84]): a href="../../../reference/android/view/View.html#s...
         *   Txt (530[9,84],561[9,115]): setAccessibilityLiveRegion(int)
         *   End (561[9,115],565[9,119]): /a
         *   Txt (565[9,119],566[10,0]): \n
         *   End (566[10,0],571[10,5]): /td
         *   Txt (571[10,5],572[11,0]): \n
         *   Tag (572[11,0],609[11,37]): td class="jd-descrcol" width="100%"
         *   Txt (609[11,37],712[14,0]): \nIndicates to accessibility services whether the...
         *   End (712[14,0],717[14,5]): /td
         *   Txt (717[14,5],718[15,0]): \n
         *
         * Judging by this dump, the 7 direct children of a row are presumably
         * text, td, text, td, text, td, text - hence indexes 1 and 3 below.
         */
        while (rowIt.hasMoreNodes()) {
            NodeList cells = rowIt.nextNode().getChildren();
            if (cells.size() != 7) {
                throw new AndroidDocException(AndroidDocException.ATM_FORMAT_ERROR);
            }
            String attr = cells.elementAt(1).toPlainTextString().replace("\n", "");
            String method = cells.elementAt(3).toPlainTextString();
            translator.add(attr, method);
        }
        return translator;
    } catch (ParserException e) {
        throw new AndroidDocException(AndroidDocException.AXML_FORMAT_ERROR);
    }
}
示例13: list
import org.htmlparser.Parser; //导入方法依赖的package包/类
/**
 * Action "sdlist": loads the news list for {@code domain}/{@code listid}/{@code page} and
 * passes it to {@code jsonp}.
 *
 * <p>The list is taken from the "News" cache when present. Otherwise the list page is fetched
 * and every tag whose {@code href} matches {@code .*&#47;page\.htm} becomes a {@code News}
 * entry: id from {@code href}, title from {@code alt}, and publication date from the plain
 * text of the first {@code TableColumn} sibling following the tag's parent, if there is one.
 * The list is cached only when parsing completes without a {@code ParserException}.
 *
 * @return {@code NONE}
 */
@SuppressWarnings({ "rawtypes", "unchecked" })
@Action(value = "sdlist", results = { @Result(type = "json", params = {
        "root", "list" }) })
public String list() {
    Cache newsCache = CacheManager.getInstance().getCache("News");
    String cacheKey =domain+listid + page;
    Element cached = newsCache.get(cacheKey);
    if (CommonUtil.isEmpty(cached)) {
        StringBuffer html = fetch(domain+"/"+listid+"/list"
                + page+".htm");
        Parser parser = Parser.createParser(html.toString(), "utf-8");
        list = new ArrayList<News>();
        try {
            NodeList anchors = parser.extractAllNodesThatMatch(
                    new AttributeRegexFilter("href", ".*/page\\.htm"));
            for (SimpleNodeIterator it = anchors.elements(); it.hasMoreNodes();) {
                Node node = it.nextNode();
                if (!(node instanceof TagNode)) {
                    continue;
                }
                TagNode anchor = (TagNode) node;
                News news = new News();
                String href = anchor.getAttribute("href");
                news.setId(href);
                news.setTitle(anchor.getAttribute("alt"));
                // Walk the siblings after the anchor's parent until a table column is found.
                Node sibling = anchor.getParent().getNextSibling();
                while (sibling != null && !(sibling instanceof TableColumn)) {
                    sibling = sibling.getNextSibling();
                }
                if (sibling != null) {
                    news.setPubdate(sibling.toPlainTextString());
                }
                list.add(news);
            }
            newsCache.put(new Element(cacheKey, list));
        } catch (ParserException e) {
            e.printStackTrace();
        }
    } else {
        list = (List) cached.getObjectValue();
    }
    jsonp(list);
    return NONE;
}
示例14: list
import org.htmlparser.Parser; //导入方法依赖的package包/类
/**
 * Action "newslist": loads the news list for {@code listid}/{@code page} into {@code list}.
 *
 * <p>The list is taken from the "News" cache when present. Otherwise the list page is fetched
 * and every tag with {@code class="date"} becomes a {@code News} entry whose publication date
 * is the tag's plain text; id and title come from the {@code href} and {@code title}
 * attributes of the first following sibling that is a {@code LinkTag}, if there is one. The
 * list is cached only when parsing completes without a {@code ParserException}.
 *
 * @return {@code SUCCESS}
 */
@SuppressWarnings({ "rawtypes", "unchecked" })
@Action(value = "newslist", results = { @Result(type = "json", params = {
        "root", "list" }) })
public String list() {
    Cache newsCache = CacheManager.getInstance().getCache("News");
    String cacheKey = "newslist"+listid + page;
    Element cached = newsCache.get(cacheKey);
    if (CommonUtil.isEmpty(cached)) {
        StringBuffer html = fetch(RD+"/news/"+listid+"/"+page+".html");
        Parser parser = Parser.createParser(html.toString(), "utf-8");
        list = new ArrayList<News>();
        try {
            NodeList dateNodes = parser.extractAllNodesThatMatch(
                    new HasAttributeFilter("class","date"));
            for (SimpleNodeIterator it = dateNodes.elements(); it.hasMoreNodes();) {
                Node node = it.nextNode();
                if (!(node instanceof TagNode)) {
                    continue;
                }
                TagNode dateTag = (TagNode) node;
                News news = new News();
                news.setPubdate(dateTag.toPlainTextString());
                // Walk the following siblings until a link is found.
                Node sibling = dateTag.getNextSibling();
                while (sibling != null && !(sibling instanceof LinkTag)) {
                    sibling = sibling.getNextSibling();
                }
                if (sibling != null) {
                    LinkTag link = (LinkTag) sibling;
                    news.setId(link.getAttribute("href"));
                    news.setTitle(link.getAttribute("title"));
                }
                list.add(news);
            }
            newsCache.put(new Element(cacheKey, list));
        } catch (ParserException e) {
            e.printStackTrace();
        }
    } else {
        list = (List) cached.getObjectValue();
    }
    return SUCCESS;
}
示例15: list
import org.htmlparser.Parser; //导入方法依赖的package包/类
/**
 * Action "eventlist": fetches the recent-events calendar page and writes a fragment of it to
 * the HTTP response.
 *
 * <p>When the "News" cache holds an entry for this page, {@code list} is set from it and
 * nothing is written. Otherwise the page is fetched and searched for nodes with
 * {@code class="clear"}; only when exactly two are found is the text between them re-parsed,
 * every link's children replaced by a single text node holding the link's {@code title}
 * attribute (which is then removed), and the resulting HTML printed to the response.
 *
 * <p>NOTE(review): this method never stores anything in the cache under its key, and the
 * {@code list} created on a cache miss is never filled here - confirm whether that is
 * intended.
 *
 * @return {@code NONE}
 * @throws IOException if obtaining the response writer fails
 */
@SuppressWarnings("rawtypes")
@Action(value = "eventlist")
public String list() throws IOException {
    Cache newsCache = CacheManager.getInstance().getCache("News");
    String cacheKey = "eventlist"+page ;
    Element cached = newsCache.get(cacheKey);
    if (!CommonUtil.isEmpty(cached)) {
        list = (List) cached.getObjectValue();
        return NONE;
    }

    StringBuffer html = fetch(RD+"/calendar/?a=list&&m=recent&range=30&_="+System.currentTimeMillis()+"&type=0&place=0&type="+page );
    Parser parser = Parser.createParser(html.toString(), "utf-8");
    list = new ArrayList<News>();
    try {
        NodeList markers = parser.extractAllNodesThatMatch(new HasAttributeFilter("class","clear"));
        if (markers.size() == 2) {
            // The fragment starts 6 characters past the end of the first marker - presumably
            // to skip a closing tag such as "</div>"; TODO confirm.
            int fragmentStart = markers.elementAt(0).getEndPosition() + 6;
            int fragmentEnd = markers.elementAt(1).getStartPosition();
            ServletActionContext.getResponse().setCharacterEncoding("utf-8");
            parser = Parser.createParser(html.substring(fragmentStart, fragmentEnd), "utf-8");
            NodeList fragment = parser.parse(null);
            NodeList links = fragment.extractAllNodesThatMatch(new NodeClassFilter(LinkTag.class), true);
            for (SimpleNodeIterator it = links.elements(); it.hasMoreNodes();) {
                LinkTag link = (LinkTag) it.nextNode();
                // Replace the link's content with its title text, then drop the attribute.
                NodeList titleOnly = new NodeList();
                titleOnly.add(new TextNode(link.getAttribute("title")));
                link.setChildren(titleOnly);
                link.removeAttribute("title");
            }
            ServletActionContext.getResponse().getWriter().print(fragment.toHtml());
        }
    } catch (ParserException e) {
        e.printStackTrace();
    }
    return NONE;
}