本文整理汇总了Java中org.htmlparser.tags.LinkTag类的典型用法代码示例。如果您正苦于以下问题:Java LinkTag类的具体用法?Java LinkTag怎么用?Java LinkTag使用的例子?那么,这里精选的类代码示例或许可以为您提供帮助。
LinkTag类属于org.htmlparser.tags包,在下文中一共展示了LinkTag类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: parseDetailInfo
import org.htmlparser.tags.LinkTag; //导入依赖的package包/类
/**
 * Collects the text of the link tags found directly under the first node of
 * {@code nodeList}, keyed by the part of each link's {@code id} attribute that
 * follows its first underscore (the whole id when it contains no underscore).
 *
 * <p>Links whose id is blank or contains {@code "all"} are skipped. A failure
 * while reading one child is logged and does not stop the remaining children
 * from being processed.
 *
 * @param nodeList parsed nodes; only the children of the first element are inspected
 * @return a mutable map from id suffix to link text; empty when there is nothing to read
 */
private Map<String, String> parseDetailInfo(NodeList nodeList) {
    Map<String, String> infoMap = Maps.newHashMap();
    if (nodeList == null || nodeList.size() == 0) {
        return infoMap;
    }
    // getChildren() yields null for a node without children; the original
    // dereferenced it in the loop header, outside the try, and threw.
    NodeList children = nodeList.elementAt(0).getChildren();
    if (children == null) {
        return infoMap;
    }
    for (Node pageNode : children.toNodeArray()) {
        try {
            if (!(pageNode instanceof LinkTag)) {
                continue;
            }
            String rawId = ((LinkTag) pageNode).getAttribute("id");
            if (StringUtils.isBlank(rawId) || rawId.contains("all")) {
                continue;
            }
            // indexOf returns -1 when there is no underscore, so +1 keeps the whole id.
            String id = rawId.substring(rawId.indexOf("_") + 1);
            infoMap.put(id, pageNode.toPlainTextString());
        } catch (Exception e) {
            log.error("parse parseDetailInfo catch Exception:", e);
        }
    }
    return infoMap;
}
示例2: parseLinkTag
import org.htmlparser.tags.LinkTag; //导入依赖的package包/类
/**
 * Parses the building count.
 *
 * @param nodeList nodes to scan, in order
 * @return the plain text of the first {@link LinkTag} in the list, or an empty
 *         string when the list contains no link tag
 */
private String parseLinkTag(NodeList nodeList) {
    final int total = nodeList.size();
    for (int index = 0; index < total; index++) {
        Node candidate = nodeList.elementAt(index);
        if (candidate instanceof LinkTag) {
            return candidate.toPlainTextString();
        }
    }
    return StringUtils.EMPTY;
}
示例3: readTextAndLinkAndTitle
import org.htmlparser.tags.LinkTag; //导入依赖的package包/类
/**
 * Reads the plain text, link targets and title of a page and prints each
 * non-empty value on its own line.
 *
 * @param result the content of the web page
 * @throws Exception if parsing fails
 */
public static void readTextAndLinkAndTitle(String result) throws Exception {
    Parser pageParser = Parser.createParser(result, "utf8");

    // A node is kept when it is a text node, a link tag or a title tag.
    OrFilter anyWanted = new OrFilter();
    anyWanted.setPredicates(new NodeFilter[] {
        new NodeClassFilter(TextNode.class),
        new NodeClassFilter(LinkTag.class),
        new NodeClassFilter(TitleTag.class)
    });

    NodeList matches = pageParser.parse(anyWanted);
    // Declared outside the loop on purpose: the value carries over between nodes.
    String line = "";
    for (Node current : matches.toNodeArray()) {
        if (current instanceof TextNode) {
            line = ((TextNode) current).getText();
        } else if (current instanceof LinkTag) {
            line = ((LinkTag) current).getLink();
        } else if (current instanceof TitleTag) {
            line = ((TitleTag) current).getTitle();
        }
        if (!isTrimEmpty(line)) {
            System.out.println(line);
        }
    }
}
示例4: getLinks
import org.htmlparser.tags.LinkTag; //导入依赖的package包/类
/**
 * Returns the link target of every link tag on the page at {@code url}, in
 * document order.
 *
 * @param url location of the page to parse
 * @return the link targets, one entry per link tag (duplicates are kept)
 * @throws ParserException if the page cannot be opened or parsed
 */
public static List<String> getLinks(String url) throws ParserException {
    Parser pageParser = new Parser(url);
    List<String> targets = new LinkedList<String>();
    NodeList anchors = pageParser.extractAllNodesThatMatch(new NodeClassFilter(LinkTag.class));
    int position = 0;
    while (position < anchors.size()) {
        LinkTag anchor = (LinkTag) anchors.elementAt(position);
        targets.add(anchor.getLink());
        position++;
    }
    return targets;
}
示例5: extracLinks
import org.htmlparser.tags.LinkTag; //导入依赖的package包/类
/**
 * Collects the links found on a web page: the target of every {@code <a>} tag
 * and the {@code src} value of every {@code <frame src=...>} tag.
 *
 * @param url location of the page to parse
 * @return the distinct links found; empty (never {@code null}) when parsing fails
 */
public static Set<String> extracLinks(String url) {
    Set<String> links = new HashSet<String>();
    try {
        Parser parser = new Parser(url);
        parser.setEncoding("utf-8");
        // Accepts nodes whose text starts with "frame src=" so that the src
        // attribute of <frame> tags can be extracted below.
        @SuppressWarnings("serial")
        NodeFilter frameFilter = new NodeFilter() {
            public boolean accept(Node node) {
                return node.getText().startsWith("frame src=");
            }
        };
        // Keep a node when it is an <a> tag or matches the frame filter.
        OrFilter linkFilter = new OrFilter(new NodeClassFilter(LinkTag.class), frameFilter);
        NodeList list = parser.extractAllNodesThatMatch(linkFilter);
        for (int i = 0; i < list.size(); i++) {
            Node tag = list.elementAt(i);
            if (tag instanceof LinkTag) {
                links.add(((LinkTag) tag).getLink());
                continue;
            }
            // Frame node, e.g. text `frame src="test.html"`: cut out what sits
            // between the quotes that follow `src=`.
            String frame = tag.getText();
            frame = frame.substring(frame.indexOf("src="));
            int end = frame.indexOf(" ");
            if (end == -1) {
                end = frame.indexOf(">");
            }
            if (end == -1) {
                // The node text carries no closing '>' (the filter above matches it
                // without a leading '<'), so when src is the last attribute there is
                // no terminator at all; the value then runs to the end of the text.
                end = frame.length();
            }
            // Skip values too short to hold a quoted string instead of letting
            // substring throw an unchecked exception out of this method.
            if (end - 1 >= 5) {
                links.add(frame.substring(5, end - 1));
            }
        }
    } catch (ParserException e) {
        logger.error("", e);
    }
    return links;
}
示例6: parserNode
import org.htmlparser.tags.LinkTag; //导入依赖的package包/类
/**
 * Performs lexical analysis on a node.
 *
 * <p>A text node seen at depth 1 triggers a scan of the node's whole page:
 * every text node whose text is not made up solely of blank characters is
 * passed to {@code segment}. A tag node is handed to {@code dealTag}; when it
 * is a link tag with a non-blank target, the target is first registered with
 * {@code urlInfo}. The {@code depth} counter is incremented on entry and
 * decremented on normal exit.
 *
 * @param node the node to analyse
 */
private void parserNode(Node node) {
    depth++;
    String blankPattern = "[ \b\t\n\f\r]*";
    if (node instanceof TextNode) {
        // Text node: segment the page text, but only from the top level.
        if (depth == 1) {
            System.out.println("TextNode!");
            Lexer pageLexer = new Lexer(node.getPage());
            Parser pageParser = new Parser(pageLexer, Parser.STDOUT);
            //TODO filter script & style
            // NOTE(review): the tag names carry a trailing space and the two
            // negations are OR-ed, so this filter probably accepts every node - confirm.
            NotFilter notScript = new NotFilter(new TagNameFilter("script "));
            NotFilter notStyle = new NotFilter(new TagNameFilter("style "));
            OrFilter nodeFilter = new OrFilter(notScript, notStyle);
            try {
                NodeList found = pageParser.extractAllNodesThatMatch(nodeFilter);
                for (NodeIterator cursor = found.elements(); cursor.hasMoreNodes();) {
                    Node current = cursor.nextNode();
                    if (!(current instanceof TextNode)) {
                        continue;
                    }
                    String text = current.getText();
                    // Only non-blank text is worth segmenting.
                    if (!text.matches(blankPattern)) {
                        segment(text);
                    }
                }
            } catch (ParserException exc) {
                System.out.println("ParserException");
            }
        }
    } else if (node instanceof TagNode) {
        // Link tag: record the outgoing link before the generic tag handling.
        if (node instanceof LinkTag) {
            String target = ((LinkTag) node).getLink();
            if (!target.matches(blankPattern)) {
                urlInfo.addExtendedURL(target);
            }
        }
        dealTag(node);
    }
    depth--;
}
示例7: extracLinks
import org.htmlparser.tags.LinkTag; //导入依赖的package包/类
/**
 * Collects the links on a web page that pass {@code filter}: the target of
 * every {@code <a>} tag and the {@code src} value of every
 * {@code <frame src=...>} tag. The resulting set is also printed to standard
 * output.
 *
 * @param url    location of the page to parse (read as gb2312)
 * @param filter decides which links are kept
 * @return the distinct accepted links; empty (never {@code null}) when parsing fails
 */
public static Set<String> extracLinks(String url, LinkFilter filter) {
    Set<String> links = new HashSet<String>();
    try {
        Parser parser = new Parser(url);
        parser.setEncoding("gb2312");
        // Accepts nodes whose text starts with "frame src=" (i.e. <frame> tags).
        @SuppressWarnings("serial")
        NodeFilter frameFilter = new NodeFilter() {
            public boolean accept(Node node) {
                return node.getText().startsWith("frame src=");
            }
        };
        // Keep a node when it is an <a> tag or matches the frame filter.
        OrFilter linkFilter = new OrFilter(new NodeClassFilter(LinkTag.class), frameFilter);
        NodeList list = parser.extractAllNodesThatMatch(linkFilter);
        for (int i = 0; i < list.size(); i++) {
            Node tag = list.elementAt(i);
            if (tag instanceof LinkTag) {
                String linkUrl = ((LinkTag) tag).getLink();
                if (filter.accept(linkUrl)) {
                    links.add(linkUrl);
                }
                continue;
            }
            // Frame node, e.g. text `frame src="test.html"`.
            String frame = tag.getText();
            frame = frame.substring(frame.indexOf("src="));
            int end = frame.indexOf(" ");
            if (end == -1) {
                end = frame.indexOf(">");
            }
            if (end == -1) {
                // The node text carries no closing '>' (the filter above matches it
                // without a leading '<'); when src is the last attribute the value
                // runs to the end of the text. Previously this case threw.
                end = frame.length();
            }
            // Too short to hold a quoted value: skip rather than throw.
            if (end - 1 >= 5) {
                String frameUrl = frame.substring(5, end - 1);
                if (filter.accept(frameUrl)) {
                    links.add(frameUrl);
                }
            }
        }
        System.out.println(links);
    } catch (ParserException e) {
        e.printStackTrace();
    }
    return links;
}
示例8: getGames
import org.htmlparser.tags.LinkTag; //导入依赖的package包/类
/**
 * Scrapes http://livescores.com/ and builds one FootballGame per game row found.
 *
 * The page is parsed for table tags, narrowed to those with bgcolor "#666666" and
 * then width "331"; the children of the first remaining table are walked in order.
 * Header rows update the current country/league, game rows produce a FootballGame.
 * Everything depends on fixed child positions, so any layout mismatch raises an
 * exception; it is caught, its stack trace printed, and the games collected up to
 * that point are returned.
 *
 * @return the games collected (possibly empty, never null)
 */
public ArrayList<FootballGame> getGames() {
Parser parser = new Parser();
ArrayList<FootballGame> games = new ArrayList<FootballGame>();
try {
NodeFilter tagNameFilter = new TagNameFilter("table");
HasAttributeFilter attrFilter = new HasAttributeFilter("bgcolor", "#666666");
parser.setResource("http://livescores.com/");
NodeList nl = parser.parse(tagNameFilter);
// Narrow the tables down by bgcolor, then by width.
nl = nl.extractAllNodesThatMatch(attrFilter);
attrFilter = new HasAttributeFilter("width", "331");
nl = nl.extractAllNodesThatMatch(attrFilter);
// Take the first matching table; throws (and is caught below) when none matched.
Node node = nl.remove(0);
nl = node.getChildren();
Node[] nodes = nl.toNodeArray();
Tag tag;
// These values persist across loop iterations: country and league are set by a
// header row and reused for every game row that follows it.
String country = "";
String league = "";
String hometeam = "";
String awayteam = "";
String gametime = "";
String link = "";
String result = "";
for (int i = 0; i < nodes.length; i++) {
if (nodes[i] instanceof Tag) {
tag = (Tag) nodes[i];
// The row kind is decided from the bgcolor attribute value.
String str = tag.getAttribute("bgcolor");
if (str != null) {
//if(str.contains("11111"))
//NEW LEAGUE!
// ;
if (str.contains("3333")) {
// Header row: when its first child has a class containing "title",
// country and league are read from that child's children at index 2 and 4.
tag = (Tag) tag.getFirstChild();
str = tag.getAttribute("class");
if (str != null && str.contains("title")) {
country = tag.getChildren().toNodeArray()[2].getText();
league = tag.getChildren().toNodeArray()[4].getText();
}
} else if (str.contains("f")) {
// Game row: child 0 = time, 1 = home team, 2 = result, 3 = away team.
Node[] tempnodes = tag.getChildren().toNodeArray();
// Time is the second ';'-separated piece of the first child's text when
// present, otherwise the text two siblings further on.
String[] t = tempnodes[0].getFirstChild().getText().split(";");
if (t.length > 1)
gametime = t[1];
else
gametime = tempnodes[0].getFirstChild().getNextSibling().getNextSibling().getText();
hometeam = tempnodes[1].getFirstChild().getText();
awayteam = tempnodes[3].getFirstChild().getText();
// RESULT
if (tempnodes[2].getFirstChild().getFirstChild() != null) {
// WITH LINK: the result cell's first child has its own child, so it is
// cast to LinkTag; the result is the nested text, the link its target.
result = tempnodes[2].getFirstChild().getFirstChild().getText();
link = ((LinkTag) (tempnodes[2].getFirstChild())).extractLink();
} else {
// WITHOUT LINK: the result is the first child's own text.
result = tempnodes[2].getFirstChild().getText();
link = null;
}
// Events are fetched through getScorers only when the result carried a link.
ArrayList<FootballEvent> ev = new ArrayList<FootballEvent>();
if (link != null) {
ev = getScorers(link);
}
games.add(new FootballGame(country, league, hometeam, awayteam, gametime, ev, result));
}
}
}
}
} catch (Exception e) {
// Best effort: report the failure and fall through to return what was collected.
e.printStackTrace();
}
return games;
}
示例9: extracLinks
import org.htmlparser.tags.LinkTag; //导入依赖的package包/类
/**
 * Prints the links found on a web page to standard output: for every
 * {@code <a>} tag its target and link text, for every {@code <img>} tag its
 * image URL and tag text, and for every {@code <frame src=...>} tag its
 * {@code src} value.
 *
 * @param url location of the page to parse
 */
public static void extracLinks(String url) {
    try {
        Parser parser = new Parser(url);
        parser.setEncoding("utf-8");// gb2312
        // Accepts nodes whose text starts with "frame src=" so that the src
        // attribute of <frame> tags can be extracted below.
        NodeFilter frameFilter = new NodeFilter() {
            public boolean accept(Node node) {
                return node.getText().startsWith("frame src=");
            }
        };
        // <a>, <img> and <frame> are combined with OR.
        OrFilter orFilter = new OrFilter(
                new NodeClassFilter(LinkTag.class), new NodeClassFilter(ImageTag.class));
        OrFilter linkFilter = new OrFilter(orFilter, frameFilter);
        NodeList list = parser.extractAllNodesThatMatch(linkFilter);
        for (int i = 0; i < list.size(); i++) {
            Node tag = list.elementAt(i);
            if (tag instanceof LinkTag) {
                // <a> tag: target and link text.
                LinkTag link = (LinkTag) tag;
                String linkUrl = link.getLink();
                String text = link.getLinkText();
                System.out.println(linkUrl + "**********" + text);
            } else if (tag instanceof ImageTag) {
                // <img> tag: image address and tag text.
                ImageTag image = (ImageTag) tag;
                System.out.print(image.getImageURL() + "********");
                System.out.println(image.getText());
            } else {
                // Frame node, e.g. text `frame src="test.html"`.
                String frame = tag.getText();
                frame = frame.substring(frame.indexOf("src="));
                int end = frame.indexOf(" ");
                if (end == -1) {
                    end = frame.indexOf(">");
                }
                if (end == -1) {
                    // The node text carries no closing '>' (the filter above matches
                    // it without a leading '<'); when src is the last attribute the
                    // value runs to the end of the text. Previously this case threw.
                    end = frame.length();
                }
                // Too short to hold a quoted value: skip rather than throw.
                if (end - 1 >= 5) {
                    System.out.println(frame.substring(5, end - 1));
                }
            }
        }
    } catch (ParserException e) {
        e.printStackTrace();
    }
}
示例10: extractLink
import org.htmlparser.tags.LinkTag; //导入依赖的package包/类
/**
 * Queues the links in {@code content} whose link text matches {@code keyword}.
 *
 * <p>Whether a link is relevant is decided by {@code containKeyword} on its
 * text. The keyword may be a group of words or a phrase, so a hit may match
 * only part of it. Relevant links usually sit at consecutive positions, so a
 * match more than 8 positions after the last accepted one is treated as noise
 * (for example an advertisement) and skipped. Only links starting with
 * {@code "http"} are queued through {@code LinkDb.addUnvisitedUrl}.
 *
 * @param content HTML to scan
 * @param keyword word(s) the link text is matched against
 */
public static void extractLink(String content, String keyword) {
    try {
        Parser htmlParser = Parser.createParser(content, "utf8");
        NodeFilter anchorsOnly = new NodeClassFilter(LinkTag.class);
        NodeList anchors = htmlParser.extractAllNodesThatMatch(anchorsOnly);
        final int maxGap = 8;
        // Position of the last accepted link.
        // NOTE(review): 0 doubles as "none yet", so an accepted link at position 0
        // does not arm the gap check - confirm whether that is intended.
        int lastAccepted = 0;
        for (int idx = 0; idx < anchors.size(); idx++) {
            LinkTag anchor = (LinkTag) anchors.elementAt(idx);
            String href = anchor.getLink();
            String caption = anchor.getLinkText();
            if (!containKeyword(caption, keyword)) {
                continue;
            }
            boolean tooFarFromLast = lastAccepted > 0 && idx - lastAccepted > maxGap;
            if (tooFarFromLast) {
                log.debug("Noisy link!!!");
                continue;
            }
            if (href.startsWith("http")) {
                log.debug(idx + ":" + href + ", " + caption);
                lastAccepted = idx;
                LinkDb.addUnvisitedUrl(href);
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
        log.error("Link extractor has encountered a problem!! " + e.getMessage());
    }
}
示例11: extracLinks
import org.htmlparser.tags.LinkTag; //导入依赖的package包/类
/**
 * Collects the links on a page that pass {@code filter} and adds them to the
 * queue of unvisited URLs.
 *
 * <p>Targets of {@code <a>} tags starting with {@code "http"} are tested as
 * they are; targets starting with {@code "/"} are prefixed with
 * {@code Main.baseUrl} first. The {@code src} value of
 * {@code <frame src=...>} tags is tested as it is.
 *
 * @param content the page handed to the parser
 * @param filter  decides which links are kept
 * @return the distinct accepted links; empty (never {@code null}) when parsing fails
 * @author cxn 2015-11-05
 */
public static Set<String> extracLinks(String content, LinkFilter filter) {
    Set<String> links = new HashSet<String>();
    try {
        Parser parser = new Parser(content);
        // Accepts nodes whose text starts with "frame src=" so that the src
        // attribute of <frame> tags can be extracted below.
        NodeFilter frameFilter = new NodeFilter() {
            public boolean accept(Node node) {
                return node.getText().startsWith("frame src=");
            }
        };
        // Keep a node when it is an <a> tag or matches the frame filter.
        OrFilter linkFilter = new OrFilter(new NodeClassFilter(LinkTag.class), frameFilter);
        NodeList list = parser.extractAllNodesThatMatch(linkFilter);
        for (int i = 0; i < list.size(); i++) {
            Node tag = list.elementAt(i);
            if (tag instanceof LinkTag) {
                String linkUrl = ((LinkTag) tag).getLink();
                if (linkUrl.startsWith("http") && filter.accept(linkUrl, Main.keyWord)) {
                    links.add(linkUrl);
                } else if (linkUrl.startsWith("/") && filter.accept(Main.baseUrl + linkUrl, Main.keyWord)) {
                    links.add(Main.baseUrl + linkUrl);
                }
                continue;
            }
            // Frame node, e.g. text `frame src="test.html"`.
            String frame = tag.getText();
            frame = frame.substring(frame.indexOf("src="));
            int end = frame.indexOf(" ");
            if (end == -1) {
                end = frame.indexOf(">");
            }
            if (end == -1) {
                // The node text carries no closing '>' (the filter above matches it
                // without a leading '<'); when src is the last attribute the value
                // runs to the end of the text. Previously this case threw.
                end = frame.length();
            }
            // Too short to hold a quoted value: skip rather than throw.
            if (end - 1 >= 5) {
                String frameUrl = frame.substring(5, end - 1);
                if (filter.accept(frameUrl)) {
                    links.add(frameUrl);
                }
            }
        }
    } catch (ParserException e) {
        e.printStackTrace();
    }
    LinkQueue.addUnvisitedUrl(links);
    return links;
}
示例12: visit
import org.htmlparser.tags.LinkTag; //导入依赖的package包/类
/**
* 更新最新的部门发文
* @param httpConn
* @throws Exception
*/
public List visit(boolean isByCookie) throws Exception{
log4.info("======访问网站============cookie="+isByCookie);
String newUrls = null;
TableColumn[] arrColumns;
LinkTag lt = null;
String title, dates, codes, link;
HtmlPage page = null;
TableTag tableContent[] = null;
int order_count = 0;
List list = new LinkedList();
if(this.getUrl().startsWith("http:")){
newUrls = this.getUrl();
}
else{
if(this.base_url.endsWith("/")){
newUrls = this.base_url+"/"+this.getUrl();
}
else{
newUrls = this.base_url+this.getUrl();
}
}
// if(true){
// page = move2Urls(this.getHttpURLConnection(), newUrls);
// page.getBody().toHtml();
//// return page.getBody().toHtml();
// }
log4.info("newUrls="+newUrls);
String str = this.move2UrlsHtml(this.getHttpURLConnection(), newUrls, isByCookie);
String[] msgs = str.split("\n");
List htmls = new LinkedList();
for(String msg: msgs){
msg = msg.trim();
msg = msg.replaceAll(" ", "");
msg = msg.replaceAll("<", "");
msg = msg.replaceAll(">", "");
msg = msg.replaceAll(""", "");
msg = msg.replaceAll("td", "");
msg = msg.replaceAll("tr", "");
// msg = msg.replaceAll("&", "&");
// msg = msg.replaceAll("<", "<");
// msg = msg.replaceAll(">", ">");
// msg = msg.replaceAll("\"", """);
// msg = msg.replaceAll("'", "'");
if(!ErrorCode.isEmpty(msg)){
htmls.add(msg);
}
}
return htmls;
}
示例13: list
import org.htmlparser.tags.LinkTag; //导入依赖的package包/类
/**
 * Action "newslist": fills {@code list} with the news entries of the current
 * list id and page, and exposes it as the JSON result.
 *
 * <p>The entries come from the "News" cache when present. Otherwise the
 * listing page is fetched and every tag with {@code class="date"} yields one
 * entry: its plain text is the publication date, and the next sibling link tag
 * (if any) supplies the id ({@code href}) and the title ({@code title}). The
 * list is cached only when parsing succeeded.
 *
 * @return {@code SUCCESS}
 */
@SuppressWarnings({ "rawtypes", "unchecked" })
@Action(value = "newslist", results = { @Result(type = "json", params = {
        "root", "list" }) })
public String list() {
    Cache newsCache = CacheManager.getInstance().getCache("News");
    String cacheKey = "newslist"+listid + page;
    Element cached = newsCache.get(cacheKey);
    if (!CommonUtil.isEmpty(cached)) {
        list = (List) cached.getObjectValue();
        return SUCCESS;
    }

    StringBuffer html = fetch(RD+"/news/"+listid+"/"+page+".html");
    Parser pageParser = Parser.createParser(html.toString(), "utf-8");
    list = new ArrayList<News>();
    try {
        NodeList dateNodes = pageParser.extractAllNodesThatMatch(new HasAttributeFilter("class","date"));
        for (SimpleNodeIterator cursor = dateNodes.elements(); cursor.hasMoreNodes();) {
            Node current = cursor.nextNode();
            if (!(current instanceof TagNode)) {
                continue;
            }
            TagNode dateTag = (TagNode) current;
            News item = new News();
            item.setPubdate(dateTag.toPlainTextString());

            // Walk forward to the first sibling that is a link tag, if there is one.
            Node sibling = dateTag.getNextSibling();
            while (sibling != null && !(sibling instanceof LinkTag)) {
                sibling = sibling.getNextSibling();
            }
            if (sibling != null) {
                LinkTag anchor = (LinkTag) sibling;
                item.setId(anchor.getAttribute("href"));
                item.setTitle(anchor.getAttribute("title"));
            }
            // The entry is added even when no link followed the date.
            list.add(item);
        }
        newsCache.put(new Element(cacheKey, list));
    } catch (ParserException e) {
        e.printStackTrace();
    }
    return SUCCESS;
}
示例14: list
import org.htmlparser.tags.LinkTag; //导入依赖的package包/类
/**
 * Action "eventlist": writes the recent-events fragment of the calendar page
 * straight to the HTTP response.
 *
 * <p>On a "News" cache hit only {@code list} is set. Otherwise the calendar
 * page is fetched. When it contains exactly two nodes with
 * {@code class="clear"}, the markup between them is re-parsed, every link
 * tag's children are replaced by a text node holding its {@code title}
 * attribute, the attribute is removed, and the resulting HTML is printed to
 * the response writer.
 *
 * @return {@code NONE}
 * @throws IOException if the response writer cannot be obtained
 */
@SuppressWarnings("rawtypes")
@Action(value = "eventlist")
public String list() throws IOException {
    Cache newsCache = CacheManager.getInstance().getCache("News");
    String cacheKey = "eventlist"+page ;
    Element cached = newsCache.get(cacheKey);
    if (!CommonUtil.isEmpty(cached)) {
        list = (List) cached.getObjectValue();
        return NONE;
    }

    StringBuffer html = fetch(RD+"/calendar/?a=list&&m=recent&range=30&_="+System.currentTimeMillis()+"&type=0&place=0&type="+page );
    Parser pageParser = Parser.createParser(html.toString(), "utf-8");
    list = new ArrayList<News>();
    try {
        NodeList markers = pageParser.extractAllNodesThatMatch(new HasAttributeFilter("class","clear"));
        if (markers.size() != 2) {
            return NONE;
        }
        // The fragment of interest lies between the two markers; the +6 offset
        // is taken over from the original as-is.
        int fragmentStart = markers.elementAt(0).getEndPosition();
        int fragmentEnd = markers.elementAt(1).getStartPosition();
        ServletActionContext.getResponse().setCharacterEncoding("utf-8");
        Parser fragmentParser = Parser.createParser(html.substring(fragmentStart+6, fragmentEnd), "utf-8");
        NodeList fragment = fragmentParser.parse(null);
        NodeList anchors = fragment.extractAllNodesThatMatch(new NodeClassFilter(LinkTag.class),true);
        for (SimpleNodeIterator cursor = anchors.elements(); cursor.hasMoreNodes();) {
            LinkTag anchor = (LinkTag) cursor.nextNode();
            // Replace the link's content with its title text, then drop the attribute.
            NodeList replacement = new NodeList();
            replacement.add(new TextNode(anchor.getAttribute("title")));
            anchor.setChildren(replacement);
            anchor.removeAttribute("title");
        }
        ServletActionContext.getResponse().getWriter().print(fragment.toHtml());
    } catch (ParserException e) {
        e.printStackTrace();
    }
    return NONE;
}
示例15: extracLinks
import org.htmlparser.tags.LinkTag; //导入依赖的package包/类
/**
 * Collects the links in a fetched page that pass {@code filter}: the target of
 * every {@code <a>} tag and the {@code src} value of every
 * {@code <frame src=...>} tag.
 *
 * @param pageResult supplies the page content and the charset to parse it with
 * @param filter     decides which links are kept
 * @return the distinct accepted links; empty (never {@code null}) when parsing fails
 */
public Set<String> extracLinks(PageResult pageResult, LinkFilter filter) {
    Set<String> links = new HashSet<String>();
    try {
        Parser parser = new Parser(pageResult.getContent());
        parser.setEncoding(pageResult.getCharSet());
        // Accepts nodes whose text starts with "frame src=" so that the src
        // attribute of <frame> tags can be extracted below.
        NodeFilter frameFilter = new NodeFilter() {
            public boolean accept(Node node) {
                return node.getText().startsWith("frame src=");
            }
        };
        // Keep a node when it is an <a> tag or matches the frame filter.
        OrFilter linkFilter = new OrFilter(new NodeClassFilter(LinkTag.class), frameFilter);
        NodeList list = parser.extractAllNodesThatMatch(linkFilter);
        for (int i = 0; i < list.size(); i++) {
            Node tag = list.elementAt(i);
            if (tag instanceof LinkTag) {
                // <a> tag
                String linkUrl = ((LinkTag) tag).getLink();
                if (filter.accept(linkUrl)) {
                    links.add(linkUrl);
                }
                continue;
            }
            // Frame node, e.g. text `frame src="test.html"`.
            String frame = tag.getText();
            frame = frame.substring(frame.indexOf("src="));
            int end = frame.indexOf(" ");
            if (end == -1) {
                end = frame.indexOf(">");
            }
            if (end == -1) {
                // The node text carries no closing '>' (the filter above matches it
                // without a leading '<'); when src is the last attribute the value
                // runs to the end of the text. Previously this case threw.
                end = frame.length();
            }
            // Too short to hold a quoted value: skip rather than throw.
            if (end - 1 >= 5) {
                String frameUrl = frame.substring(5, end - 1);
                if (filter.accept(frameUrl)) {
                    links.add(frameUrl);
                }
            }
        }
    } catch (ParserException e) {
        e.printStackTrace();
    }
    return links;
}