本文整理汇总了Java中org.htmlparser.util.ParserException.printStackTrace方法的典型用法代码示例。如果您正苦于以下问题:Java ParserException.printStackTrace方法的具体用法?Java ParserException.printStackTrace怎么用?Java ParserException.printStackTrace使用的例子?那么,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类org.htmlparser.util.ParserException的用法示例。
在下文中一共展示了ParserException.printStackTrace方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: parserUrl
import org.htmlparser.util.ParserException; //导入方法依赖的package包/类
/**
 * Collects every node whose raw text begins with {@code a href=}.
 *
 * @param parser the parser to pull nodes from
 * @return the matching nodes, or {@code null} when the parser throws a
 *         {@link ParserException} (its stack trace is printed first)
 */
@Override
public NodeList parserUrl(Parser parser) {
    NodeFilter anchorFilter = new NodeFilter() {
        @Override
        public boolean accept(Node candidate) {
            return candidate.getText().startsWith("a href=");
        }
    };
    NodeList matches = null;
    try {
        matches = parser.extractAllNodesThatMatch(anchorFilter);
    } catch (ParserException e) {
        e.printStackTrace();
    }
    return matches;
}
示例2: getPlainText
import org.htmlparser.util.ParserException; //导入方法依赖的package包/类
/**
 * Extracts the text of an HTML string by visiting all parsed nodes with a
 * {@code StringBean}.
 *
 * @param htmlStr the HTML markup to parse
 * @return the strings gathered by the bean, or an empty string when a
 *         {@link ParserException} is thrown (its stack trace is printed)
 */
public static String getPlainText(String htmlStr) {
    Parser htmlParser = new Parser();
    try {
        htmlParser.setInputHTML(htmlStr);
        StringBean textCollector = new StringBean();
        // Do not collect the link information contained in the page.
        textCollector.setLinks(false);
        // Replace non-breaking spaces with regular spaces.
        textCollector.setReplaceNonBreakingSpaces(true);
        // Collapse a run of whitespace into a single space.
        textCollector.setCollapse(true);
        htmlParser.visitAllNodesWith(textCollector);
        return textCollector.getStrings();
    } catch (ParserException e) {
        e.printStackTrace();
        return "";
    }
}
示例3: splitHtml
import org.htmlparser.util.ParserException; //导入方法依赖的package包/类
/**
 * Splits the {@code content} field into HTML fragments.
 *
 * <p>The parsed nodes are handed to {@code recusiveSplitHtml}; afterwards a
 * final fragment is built from the text after {@code startPosition}, prefixed
 * with the opening tags from {@code tagNodeList} that start before that
 * position and whose end tag finishes at or after it.
 *
 * <p>NOTE(review): {@code tagNodeList} and {@code startPosition} are fields not
 * visible here; presumably {@code recusiveSplitHtml} updates them - confirm.
 *
 * @return the fragments collected so far; a {@link ParserException} is printed
 *         and whatever was gathered before it is returned
 */
private List<String> splitHtml() {
    List<String> pages = new ArrayList<String>();
    try {
        NodeList parsedNodes = Parser.createParser(content, "UTF-8").parse(null);
        pages = recusiveSplitHtml(parsedNodes);

        // Re-open the tags that are still open at the split point.
        StringBuilder tail = new StringBuilder();
        for (TagNode openTag : tagNodeList) {
            boolean startsBeforeSplit = openTag.getStartPosition() < startPosition;
            if (startsBeforeSplit && openTag.getEndTag().getEndPosition() >= startPosition) {
                tail.append("<").append(openTag.getText()).append(">");
            }
        }
        tail.append(content.substring(startPosition));

        // Re-parse the assembled remainder and store its HTML form.
        NodeList tailNodes = Parser.createParser(tail.toString(), "UTF-8").parse(null);
        pages.add(tailNodes.toHtml());
    } catch (ParserException e) {
        e.printStackTrace();
    }
    return pages;
}
示例4: extractKeyWordText
import org.htmlparser.util.ParserException; //导入方法依赖的package包/类
/**
 * Parses the resource named by {@code url} and passes every parsed node,
 * together with {@code keyword}, to {@code processNodeList}.
 *
 * <p>A {@link ParserException} is printed and otherwise ignored.
 *
 * @param url     the location handed to the {@code Parser} constructor
 * @param keyword the keyword forwarded to {@code processNodeList}
 */
public static void extractKeyWordText(String url, String keyword) {
    try {
        // Build a parser using the page url as its argument.
        Parser pageParser = new Parser(url);
        // Force the page encoding to utf-8 (the original note mentions gb2312 pages).
        pageParser.setEncoding("utf-8");
        // Passing null means no NodeFilter is applied; walk the resulting node list.
        processNodeList(pageParser.parse(null), keyword);
    } catch (ParserException e) {
        e.printStackTrace();
    }
}
示例5: getContentText
import org.htmlparser.util.ParserException; //导入方法依赖的package包/类
/**
 * Returns the text extracted from the {@code content} field by a
 * {@code TextExtractingVisitor}.
 *
 * @return the extracted text, or {@code null} when a {@link ParserException}
 *         is thrown (its stack trace is printed)
 */
@Transient
public String getContentText() {
    String extracted = null;
    try {
        Parser contentParser = Parser.createParser(content, "UTF-8");
        TextExtractingVisitor visitor = new TextExtractingVisitor();
        contentParser.visitAllNodesWith(visitor);
        extracted = visitor.getExtractedText();
    } catch (ParserException e) {
        e.printStackTrace();
    }
    return extracted;
}
示例6: processResponse
import org.htmlparser.util.ParserException; //导入方法依赖的package包/类
/**
 * Extracts the feed entries from one HTTP response and appends them to
 * {@code root}.
 *
 * <p>Progress and errors are written to standard output. The {@code total}
 * field is incremented once per entry processed.
 *
 * @param resp the HTTP response to read the page from
 * @param doc  the document passed on to {@code extractContent}
 * @param root the element that receives one child per extracted entry
 * @return {@code true} only when the status is OK, the page is at least 500
 *         characters long, every entry was extracted, and exactly 15 entries
 *         were found; {@code false} in every other case
 */
private boolean processResponse(HttpResponse resp, Document doc, Element root) {
    if (resp.getStatusLine().getStatusCode() != HttpStatus.SC_OK) {
        return false;
    }
    System.out.println("[INFO] HTTP Status OK.");
    System.out.println("[INFO] Extracting html page...");

    String html = extractHtml(resp);
    if (html == null) {
        return false;
    }
    System.out.println("[INFO] " + html.length() + "B html page extracted.");

    // A page shorter than 500 characters is treated as the end of the data.
    if (html.length() < 500) {
        System.out.println("[INFO] EOF reached, task completed.");
        return false;
    }

    System.out.println("[INFO] Parsing html page...");
    try {
        Parser parser = new Parser(html);
        NodeList entries = parser.extractAllNodesThatMatch(
                new HasAttributeFilter("action-type", "feed_list_item"));
        System.out.println("[INFO] " + entries.size() + " entries detected.");

        for (SimpleNodeIterator it = entries.elements(); it.hasMoreNodes();) {
            System.out.println("[INFO] processing entry #" + (++total) + "...");
            Element extracted = extractContent(it.nextNode(), doc);
            if (extracted == null) {
                System.out.println("[ERROR] Data extraction failed.");
                return false;
            }
            root.appendChild(extracted);
        }

        // NOTE(review): 15 is presumably the size of a full page - confirm.
        return entries.size() == 15;
    } catch (ParserException e) {
        System.out.println("[ERROR] Parser failed.");
        e.printStackTrace();
        return false;
    }
}
示例7: extracLinks
import org.htmlparser.util.ParserException; //导入方法依赖的package包/类
/**
 * Collects the links on the page at {@code url} that {@code filter} accepts.
 *
 * <p>Both {@code <a>} tags and tags whose raw text starts with
 * {@code frame src=} are considered. The resulting set is printed to standard
 * output before it is returned. A {@link ParserException} is printed and the
 * links gathered so far are returned.
 *
 * @param url    the location handed to the {@code Parser} constructor
 * @param filter decides which URLs are kept
 * @return the accepted URLs; never {@code null}
 */
public static Set<String> extracLinks(String url, LinkFilter filter) {
    Set<String> links = new HashSet<String>();
    try {
        Parser parser = new Parser(url);
        parser.setEncoding("gb2312");
        // Matches <frame> tags by their raw text.
        @SuppressWarnings("serial")
        NodeFilter frameFilter = new NodeFilter() {
            public boolean accept(Node node) {
                return node.getText().startsWith("frame src=");
            }
        };
        // Accept either <a> tags or <frame> tags.
        OrFilter linkFilter = new OrFilter(new NodeClassFilter(LinkTag.class), frameFilter);
        NodeList list = parser.extractAllNodesThatMatch(linkFilter);
        for (int i = 0; i < list.size(); i++) {
            Node tag = list.elementAt(i);
            String candidate;
            if (tag instanceof LinkTag) {
                candidate = ((LinkTag) tag).getLink();
            } else {
                candidate = extractFrameSrc(tag.getText());
                if (candidate == null) {
                    continue;
                }
            }
            if (filter.accept(candidate)) {
                links.add(candidate);
            }
        }
        System.out.println(links);
    } catch (ParserException e) {
        e.printStackTrace();
    }
    return links;
}

/**
 * Pulls the value of the {@code src} attribute out of the raw text of a tag.
 *
 * <p>The previous fixed-offset arithmetic assumed a double-quoted value
 * followed by a space or {@code >}. It threw
 * {@code StringIndexOutOfBoundsException} when neither delimiter was present,
 * and cut characters off unquoted values. This version handles single-quoted,
 * double-quoted and unquoted values.
 *
 * @param tagText raw tag text such as {@code frame src="test.html"}
 * @return the attribute value, or {@code null} when no {@code src=} is present
 */
private static String extractFrameSrc(String tagText) {
    int keyAt = tagText.indexOf("src=");
    if (keyAt < 0) {
        return null;
    }
    int from = keyAt + "src=".length();
    if (from >= tagText.length()) {
        return "";
    }
    char first = tagText.charAt(from);
    if (first == '"' || first == '\'') {
        // Quoted value: everything up to the matching quote (or the end if unterminated).
        int close = tagText.indexOf(first, from + 1);
        return close < 0 ? tagText.substring(from + 1) : tagText.substring(from + 1, close);
    }
    // Unquoted value: runs until whitespace, '>' or the end of the text.
    int to = from;
    while (to < tagText.length()
            && !Character.isWhitespace(tagText.charAt(to))
            && tagText.charAt(to) != '>') {
        to++;
    }
    return tagText.substring(from, to);
}
示例8: extracLinks
import org.htmlparser.util.ParserException; //导入方法依赖的package包/类
/**
 * Prints the links, images and frame sources found on the page at {@code url}.
 *
 * <p>{@code <a>} tags print their URL and link text, {@code <img>} tags print
 * their image URL and raw tag text, and tags whose raw text starts with
 * {@code frame src=} print the value of their {@code src} attribute. A
 * {@link ParserException} is printed and otherwise ignored.
 *
 * @param url the location handed to the {@code Parser} constructor
 */
public static void extracLinks(String url) {
    try {
        Parser parser = new Parser(url);
        parser.setEncoding("utf-8");
        // Filter for <frame> tags, used to pull the link held in the src attribute.
        NodeFilter frameFilter = new NodeFilter() {
            public boolean accept(Node node) {
                return node.getText().startsWith("frame src=");
            }
        };
        // <a>, <img> and <frame> tags are combined with an OR relation.
        OrFilter orFilter = new OrFilter(
                new NodeClassFilter(LinkTag.class), new NodeClassFilter(ImageTag.class));
        OrFilter linkFilter = new OrFilter(orFilter, frameFilter);
        // Fetch every tag that passes the combined filter.
        NodeList list = parser.extractAllNodesThatMatch(linkFilter);
        for (int i = 0; i < list.size(); i++) {
            Node tag = list.elementAt(i);
            if (tag instanceof LinkTag) {
                // <a> tag: URL followed by the link text.
                LinkTag link = (LinkTag) tag;
                String linkUrl = link.getLink();
                String text = link.getLinkText();
                System.out.println(linkUrl + "**********" + text);
            } else if (tag instanceof ImageTag) {
                // <img> tag: image address followed by the raw tag text.
                ImageTag image = (ImageTag) tag;
                System.out.print(image.getImageURL() + "********");
                System.out.println(image.getText());
            } else {
                // <frame> tag: print the src value, e.g. test.html for <frame src="test.html"/>.
                String frameUrl = extractFrameSrc(tag.getText());
                if (frameUrl != null) {
                    System.out.println(frameUrl);
                }
            }
        }
    } catch (ParserException e) {
        e.printStackTrace();
    }
}

/**
 * Pulls the value of the {@code src} attribute out of the raw text of a tag.
 *
 * <p>The previous fixed-offset arithmetic assumed a double-quoted value
 * followed by a space or {@code >}. It threw
 * {@code StringIndexOutOfBoundsException} when neither delimiter was present,
 * and cut characters off unquoted values. This version handles single-quoted,
 * double-quoted and unquoted values.
 *
 * @param tagText raw tag text such as {@code frame src="test.html"}
 * @return the attribute value, or {@code null} when no {@code src=} is present
 */
private static String extractFrameSrc(String tagText) {
    int keyAt = tagText.indexOf("src=");
    if (keyAt < 0) {
        return null;
    }
    int from = keyAt + "src=".length();
    if (from >= tagText.length()) {
        return "";
    }
    char first = tagText.charAt(from);
    if (first == '"' || first == '\'') {
        // Quoted value: everything up to the matching quote (or the end if unterminated).
        int close = tagText.indexOf(first, from + 1);
        return close < 0 ? tagText.substring(from + 1) : tagText.substring(from + 1, close);
    }
    // Unquoted value: runs until whitespace, '>' or the end of the text.
    int to = from;
    while (to < tagText.length()
            && !Character.isWhitespace(tagText.charAt(to))
            && tagText.charAt(to) != '>') {
        to++;
    }
    return tagText.substring(from, to);
}
示例9: extracLinks
import org.htmlparser.util.ParserException; //导入方法依赖的package包/类
/**
 * Collects the links found in a page and adds them to the unvisited queue.
 *
 * <p>{@code <a>} links starting with {@code http} are checked as-is; links
 * starting with {@code /} are prefixed with {@code Main.baseUrl} first. Both
 * are checked against {@code Main.keyWord}. Tags whose raw text starts with
 * {@code frame src=} contribute the value of their {@code src} attribute. A
 * {@link ParserException} is printed and the links gathered so far are still
 * queued and returned.
 *
 * @param content the value handed to the {@code Parser} constructor
 * @param filter  used to filter the links
 * @return the accepted links; never {@code null}
 * @author cxn, 2015-11-05
 */
public static Set<String> extracLinks(String content, LinkFilter filter) {
    Set<String> links = new HashSet<String>();
    try {
        Parser parser = new Parser(content);
        // Filter for <frame> tags, used to pull the link held in the src attribute.
        NodeFilter frameFilter = new NodeFilter() {
            public boolean accept(Node node) {
                return node.getText().startsWith("frame src=");
            }
        };
        // Accept either <a> tags or <frame> tags.
        OrFilter linkFilter = new OrFilter(new NodeClassFilter(LinkTag.class), frameFilter);
        // Fetch every tag that passes the combined filter.
        NodeList list = parser.extractAllNodesThatMatch(linkFilter);
        for (int i = 0; i < list.size(); i++) {
            Node tag = list.elementAt(i);
            if (tag instanceof LinkTag) {
                LinkTag link = (LinkTag) tag;
                String linkUrl = link.getLink();
                if (linkUrl.startsWith("http") && filter.accept(linkUrl, Main.keyWord)) {
                    links.add(linkUrl);
                } else if (linkUrl.startsWith("/") && filter.accept(Main.baseUrl + linkUrl, Main.keyWord)) {
                    links.add(Main.baseUrl + linkUrl);
                }
            } else {
                // <frame> tag: take the src value, e.g. test.html for <frame src="test.html"/>.
                String frameUrl = extractFrameSrc(tag.getText());
                if (frameUrl != null && filter.accept(frameUrl)) {
                    links.add(frameUrl);
                }
            }
        }
    } catch (ParserException e) {
        e.printStackTrace();
    }
    LinkQueue.addUnvisitedUrl(links);
    return links;
}

/**
 * Pulls the value of the {@code src} attribute out of the raw text of a tag.
 *
 * <p>The previous fixed-offset arithmetic assumed a double-quoted value
 * followed by a space or {@code >}. It threw
 * {@code StringIndexOutOfBoundsException} when neither delimiter was present,
 * and cut characters off unquoted values. This version handles single-quoted,
 * double-quoted and unquoted values.
 *
 * @param tagText raw tag text such as {@code frame src="test.html"}
 * @return the attribute value, or {@code null} when no {@code src=} is present
 */
private static String extractFrameSrc(String tagText) {
    int keyAt = tagText.indexOf("src=");
    if (keyAt < 0) {
        return null;
    }
    int from = keyAt + "src=".length();
    if (from >= tagText.length()) {
        return "";
    }
    char first = tagText.charAt(from);
    if (first == '"' || first == '\'') {
        // Quoted value: everything up to the matching quote (or the end if unterminated).
        int close = tagText.indexOf(first, from + 1);
        return close < 0 ? tagText.substring(from + 1) : tagText.substring(from + 1, close);
    }
    // Unquoted value: runs until whitespace, '>' or the end of the text.
    int to = from;
    while (to < tagText.length()
            && !Character.isWhitespace(tagText.charAt(to))
            && tagText.charAt(to) != '>') {
        to++;
    }
    return tagText.substring(from, to);
}
示例10: list
import org.htmlparser.util.ParserException; //导入方法依赖的package包/类
/**
 * Action {@code sdlist}: fills the {@code list} field with news entries and
 * passes it to {@code jsonp}.
 *
 * <p>The entries come from the "News" cache when present under the key built
 * from {@code domain}, {@code listid} and {@code page}. Otherwise the list
 * page is fetched, every tag whose {@code href} matches {@code .*\/page\.htm}
 * becomes a {@code News} item, and the result is cached. A
 * {@link ParserException} is printed; in that case nothing is cached.
 *
 * @return {@code NONE}
 */
@SuppressWarnings({ "rawtypes", "unchecked" })
@Action(value = "sdlist", results = { @Result(type = "json", params = {
        "root", "list" }) })
public String list() {
    Cache newsCache = CacheManager.getInstance().getCache("News");
    String cacheKey = domain+listid + page;
    Element cached = newsCache.get(cacheKey);
    if (CommonUtil.isEmpty(cached)) {
        StringBuffer html = fetch(domain+"/"+listid+"/list"
                + page+".htm");
        Parser parser = Parser.createParser(html.toString(), "utf-8");
        list = new ArrayList<News>();
        try {
            NodeList anchors = parser.extractAllNodesThatMatch(
                    new AttributeRegexFilter("href", ".*/page\\.htm"));
            for (SimpleNodeIterator it = anchors.elements(); it.hasMoreNodes();) {
                Node node = it.nextNode();
                if (!(node instanceof TagNode)) {
                    continue;
                }
                TagNode anchor = (TagNode) node;
                News item = new News();
                item.setId(anchor.getAttribute("href"));
                item.setTitle(anchor.getAttribute("alt"));
                // Walk the siblings after the anchor's parent until a table column is found.
                Node sibling = anchor.getParent().getNextSibling();
                while (sibling != null && !(sibling instanceof TableColumn)) {
                    sibling = sibling.getNextSibling();
                }
                if (sibling != null) {
                    item.setPubdate(sibling.toPlainTextString());
                }
                list.add(item);
            }
            newsCache.put(new Element(cacheKey, list));
        } catch (ParserException e) {
            e.printStackTrace();
        }
    } else {
        list = (List) cached.getObjectValue();
    }
    jsonp(list);
    return NONE;
}
示例11: list
import org.htmlparser.util.ParserException; //导入方法依赖的package包/类
/**
 * Action {@code newslist}: fills the {@code list} field with news entries.
 *
 * <p>The entries come from the "News" cache when present under the key
 * {@code "newslist" + listid + page}. Otherwise the news page is fetched and
 * every tag with {@code class="date"} becomes a {@code News} item. Its
 * publication date is the tag's plain text; its id and title come from the
 * next sibling {@code LinkTag}, when there is one. The result is then cached.
 * A {@link ParserException} is printed; in that case nothing is cached.
 *
 * @return {@code SUCCESS}
 */
@SuppressWarnings({ "rawtypes", "unchecked" })
@Action(value = "newslist", results = { @Result(type = "json", params = {
        "root", "list" }) })
public String list() {
    Cache newsCache = CacheManager.getInstance().getCache("News");
    String cacheKey = "newslist"+listid + page;
    Element cached = newsCache.get(cacheKey);
    if (CommonUtil.isEmpty(cached)) {
        StringBuffer html = fetch(RD+"/news/"+listid+"/"+page+".html");
        Parser parser = Parser.createParser(html.toString(), "utf-8");
        list = new ArrayList<News>();
        try {
            NodeList dateNodes = parser.extractAllNodesThatMatch(
                    new HasAttributeFilter("class", "date"));
            for (SimpleNodeIterator it = dateNodes.elements(); it.hasMoreNodes();) {
                Node node = it.nextNode();
                if (!(node instanceof TagNode)) {
                    continue;
                }
                TagNode dateTag = (TagNode) node;
                News item = new News();
                item.setPubdate(dateTag.toPlainTextString());
                // Find the first link that follows the date tag.
                Node sibling = dateTag.getNextSibling();
                while (sibling != null && !(sibling instanceof LinkTag)) {
                    sibling = sibling.getNextSibling();
                }
                if (sibling != null) {
                    LinkTag link = (LinkTag) sibling;
                    item.setId(link.getAttribute("href"));
                    item.setTitle(link.getAttribute("title"));
                }
                list.add(item);
            }
            newsCache.put(new Element(cacheKey, list));
        } catch (ParserException e) {
            e.printStackTrace();
        }
    } else {
        list = (List) cached.getObjectValue();
    }
    return SUCCESS;
}
示例12: list
import org.htmlparser.util.ParserException; //导入方法依赖的package包/类
/**
 * Action {@code eventlist}: writes a fragment of the fetched calendar page to
 * the HTTP response.
 *
 * <p>On a cache hit the cached value is assigned to {@code list} and nothing
 * is written. Otherwise the calendar page is fetched. When it contains exactly
 * two tags with {@code class="clear"}, the markup between them is re-parsed,
 * every link's children are replaced by a text node holding its {@code title}
 * attribute (which is then removed), and the resulting HTML is printed to the
 * response writer. A {@link ParserException} is printed and otherwise ignored.
 *
 * @return {@code NONE}
 * @throws IOException if the response writer cannot be obtained
 */
@SuppressWarnings("rawtypes")
@Action(value = "eventlist")
public String list() throws IOException {
    Cache newsCache = CacheManager.getInstance().getCache("News");
    String cacheKey = "eventlist"+page ;
    Element cached = newsCache.get(cacheKey);
    if (!CommonUtil.isEmpty(cached)) {
        list = (List) cached.getObjectValue();
        return NONE;
    }

    StringBuffer html = fetch(RD+"/calendar/?a=list&&m=recent&range=30&_="+System.currentTimeMillis()+"&type=0&place=0&type="+page );
    Parser parser = Parser.createParser(html.toString(), "utf-8");
    list = new ArrayList<News>();
    try {
        NodeList markers = parser.extractAllNodesThatMatch(
                new HasAttributeFilter("class", "clear"));
        if (markers.size() == 2) {
            int sliceStart = markers.elementAt(0).getEndPosition();
            int sliceEnd = markers.elementAt(1).getStartPosition();
            ServletActionContext.getResponse().setCharacterEncoding("utf-8");
            // NOTE(review): the +6 presumably skips a closing tag after the first marker - confirm.
            parser = Parser.createParser(html.substring(sliceStart + 6, sliceEnd), "utf-8");
            NodeList fragment = parser.parse(null);
            NodeList links = fragment.extractAllNodesThatMatch(
                    new NodeClassFilter(LinkTag.class), true);
            for (SimpleNodeIterator it = links.elements(); it.hasMoreNodes();) {
                LinkTag link = (LinkTag) it.nextNode();
                // Replace the link body with its title text, then drop the title attribute.
                NodeList replacement = new NodeList();
                replacement.add(new TextNode(link.getAttribute("title")));
                link.setChildren(replacement);
                link.removeAttribute("title");
            }
            ServletActionContext.getResponse().getWriter().print(fragment.toHtml());
        }
    } catch (ParserException e) {
        e.printStackTrace();
    }
    return NONE;
}
示例13: content
import org.htmlparser.util.ParserException; //导入方法依赖的package包/类
/**
 * Action {@code eventcontent}: loads one event into the {@code en} field.
 *
 * <p>The event comes from the "News" cache when present under the key
 * {@code "eventcontent" + newsid}. Otherwise the event page is fetched: the
 * {@code h1} tag supplies the title and the {@code table} tag supplies the
 * content. If the page contains {@code imageurl}, the single-quoted value that
 * follows is prefixed with {@code RD}, hashed with MD5, recorded in the
 * {@code CrawlerImages} collection when not already there, and the hash is
 * stored in the event's pubdate field. The event is cached only when it has
 * content. A {@link ParserException} is printed and otherwise ignored.
 *
 * <p>The quote offsets used to be computed on the raw buffer but applied to a
 * trimmed copy of it, so leading whitespace in the response shifted the
 * extracted URL. Search and slice now use the same string, and an image
 * reference without both quotes is skipped instead of producing an invalid
 * substring range.
 *
 * @return {@code SUCCESS}
 */
@Action(value = "eventcontent", results = { @Result(type = "json", params = {
        "root", "en" }) })
public String content() {
    Cache c = CacheManager.getInstance().getCache("News");
    String ckey = "eventcontent" + newsid;
    Element ele = c.get(ckey);
    if (!CommonUtil.isEmpty(ele)) {
        en = (News) ele.getObjectValue();
    } else {
        StringBuffer retstr = fetch(RD+"/calendar/?a=one&evid="
                + newsid+"&_="+System.currentTimeMillis());
        String body = retstr.toString();
        Parser p = Parser.createParser(body, "utf-8");
        try {
            NodeList nl = p.extractAllNodesThatMatch(new OrFilter(
                    new TagNameFilter("h1"), new TagNameFilter("table")));
            SimpleNodeIterator i = nl.elements();
            en = new News();
            en.setId(newsid);
            while (i.hasMoreNodes()) {
                Node n = i.nextNode();
                if (n instanceof TagNode) {
                    TagNode tn = (TagNode) n;
                    if (tn.getTagName().equalsIgnoreCase("h1")) {
                        en.setTitle(tn.toPlainTextString());
                    }
                    if (tn.getTagName().equalsIgnoreCase("table")) {
                        en.setContent(tn.toHtml());
                    }
                }
            }
            int marker = body.indexOf("imageurl");
            if (marker > 0) {
                // Locate the single-quoted value that follows the marker.
                int open = body.indexOf("'", marker);
                int close = open < 0 ? -1 : body.indexOf("'", open + 1);
                if (close > open) {
                    String imgurl = RD + body.substring(open + 1, close);
                    String imgid = EncodeHelper.digest(imgurl, "MD5");
                    BasicDBObject obj = new BasicDBObject("id", imgid);
                    DBCollection col = MongoUtil.getInstance().getDB()
                            .getCollection("CrawlerImages");
                    DBObject dbo = col.findOne(obj);
                    if (dbo == null) {
                        col.save(obj.append("url", imgurl));
                    }
                    // The image id is stored in the pubdate field.
                    en.setPubdate(imgid);
                }
            }
        } catch (ParserException e) {
            e.printStackTrace();
        }
        if (!CommonUtil.isEmpty(en) && !CommonUtil.isEmpty(en.getContent())) {
            c.put(new Element(ckey, en));
        }
    }
    return SUCCESS;
}
示例14: list
import org.htmlparser.util.ParserException; //导入方法依赖的package包/类
/**
 * Action {@code calist}: fills the {@code list} field with announcements.
 *
 * <p>The entries come from the "News" cache when present under the key
 * {@code "calist" + page}. Otherwise the announcement list page is fetched and
 * every tag whose {@code href} matches {@code announce/\?announceid=\d+}
 * becomes a {@code News} item. Its id is the part of the href after the first
 * {@code =} and its title is the tag's plain text. The result is then cached.
 * A {@link ParserException} is printed; in that case nothing is cached.
 *
 * @return {@code SUCCESS}
 */
@SuppressWarnings({ "rawtypes", "unchecked" })
@Action(value = "calist", results = { @Result(type = "json", params = {
        "root", "list" }) })
public String list() {
    Cache newsCache = CacheManager.getInstance().getCache("News");
    String cacheKey = "calist" + page;
    Element cached = newsCache.get(cacheKey);
    if (CommonUtil.isEmpty(cached)) {
        StringBuffer html = fetch(RD+"/announce/announce_list.php?page="
                + page);
        Parser parser = Parser.createParser(html.toString(), "utf-8");
        list = new ArrayList<News>();
        try {
            NodeList anchors = parser.extractAllNodesThatMatch(
                    new AttributeRegexFilter("href", "announce/\\?announceid=\\d+"));
            for (SimpleNodeIterator it = anchors.elements(); it.hasMoreNodes();) {
                Node node = it.nextNode();
                if (!(node instanceof TagNode)) {
                    continue;
                }
                TagNode anchor = (TagNode) node;
                News item = new News();
                String href = anchor.getAttribute("href");
                int eq = href.indexOf("=");
                if (eq > 0) {
                    item.setId(href.substring(eq + 1));
                }
                item.setTitle(anchor.toPlainTextString());
                list.add(item);
            }
            newsCache.put(new Element(cacheKey, list));
        } catch (ParserException e) {
            e.printStackTrace();
        }
    } else {
        list = (List) cached.getObjectValue();
    }
    return SUCCESS;
}
示例15: list
import org.htmlparser.util.ParserException; //导入方法依赖的package包/类
/**
 * Action {@code liblist}: fills the {@code list} field with library notices.
 *
 * <p>The entries come from the "News" cache when present under the key
 * {@code "liblist" + page}. Otherwise the publish-info list for page
 * {@code page - 1} is fetched and every tag whose {@code href} matches
 * {@code publishInfo\.shtml\?.+} becomes a {@code News} item. Its id is the
 * href and its title is the tag's plain text. When the next sibling is a
 * {@code TextNode} with text, that text is passed through
 * {@code replaceAll} and stored as the publication date. The result is then
 * cached. A {@link ParserException} is printed; in that case nothing is
 * cached.
 *
 * @return {@code SUCCESS}
 */
@SuppressWarnings({ "unchecked", "rawtypes" })
@Action(value = "liblist", results = { @Result(type = "json", params = {
        "root", "list" }) })
public String list() {
    Cache newsCache = CacheManager.getInstance().getCache("News");
    String cacheKey = "liblist" + page;
    Element cached = newsCache.get(cacheKey);
    if (CommonUtil.isEmpty(cached)) {
        StringBuffer html = fetch(RD
                + "/ddlib/getPublishInfoList.shtml?tid=1012&k=&p="
                + (page - 1));
        Parser parser = Parser.createParser(html.toString(), "utf-8");
        list = new ArrayList<News>();
        try {
            NodeList anchors = parser.extractAllNodesThatMatch(
                    new AttributeRegexFilter("href", "publishInfo\\.shtml\\?.+"));
            for (SimpleNodeIterator it = anchors.elements(); it.hasMoreNodes();) {
                Node node = it.nextNode();
                if (!(node instanceof TagNode)) {
                    continue;
                }
                TagNode anchor = (TagNode) node;
                News item = new News();
                item.setId(anchor.getAttribute("href"));
                item.setTitle(anchor.toPlainTextString());
                // instanceof is already false for null, so no separate null check is needed.
                Node next = anchor.getNextSibling();
                if (next instanceof TextNode && next.getText() != null) {
                    item.setPubdate(next.getText().replaceAll(" ", ""));
                }
                list.add(item);
            }
            newsCache.put(new Element(cacheKey, list));
        } catch (ParserException e) {
            e.printStackTrace();
        }
    } else {
        list = (List) cached.getObjectValue();
    }
    return SUCCESS;
}