本文整理汇总了Java中org.htmlparser.util.NodeList.elementAt方法的典型用法代码示例。如果您正苦于以下问题:Java NodeList.elementAt方法的具体用法?Java NodeList.elementAt怎么用?Java NodeList.elementAt使用的例子?那么,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类org.htmlparser.util.NodeList的用法示例。
在下文中一共展示了NodeList.elementAt方法的12个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: parseFlashEmbedTag
import org.htmlparser.util.NodeList; //导入方法依赖的package包/类
/**
 * Walks a list of EMBED nodes and copies the attributes of every usable tag
 * into the given flash object.
 * <p>
 * A node is used when it is a {@code Tag} that is either not an end tag or is
 * an empty XML tag (something like {@code <TAG />}). Every attribute of such a
 * tag is offered to {@code flashObjToFill.setNameValue}; a {@code false} result
 * is logged as an unknown attribute. Nodes that are skipped are logged as
 * warnings.
 *
 * @param embeds the nodes expected to be EMBED tags; may be {@code null}, in
 *               which case nothing is processed
 * @param flashObjToFill the flash object to fill in with data
 * @return the same {@code flashObjToFill} instance that was passed in
 */
@SuppressWarnings("unchecked")
private FlashEmbeddedObject parseFlashEmbedTag( NodeList embeds, final FlashEmbeddedObject flashObjToFill ) {
    if( embeds == null ) {
        logger.debug( "The list of embed-tag nodes is null" );
        return flashObjToFill;
    }

    logger.debug( "The number of embed-tag nodes is " + embeds.size() );
    for( int index = 0; index < embeds.size(); index++ ) {
        final Node candidate = embeds.elementAt( index );
        if( !( candidate instanceof Tag ) ) {
            logger.warn( "Encountered a EMBED node: " + candidate + " that is not an EMBED tag!" );
            continue;
        }

        final Tag tag = (Tag) candidate;
        // Pure end tags carry no attributes worth reading; an empty XML tag
        // (<TAG />) is still processed.
        if( tag.isEndTag() && !tag.isEmptyXmlTag() ) {
            logger.warn( "Encountered an EMBED node: " + tag + " that is an end tag!" );
            continue;
        }

        logger.debug("Processing embed node's '" + tag + "' attributes");
        final Vector<Attribute> attributes = (Vector<Attribute>) tag.getAttributesEx();
        if( attributes == null ) {
            continue;
        }
        for( Attribute attribute : attributes ) {
            final String attrName = attribute.getName();
            final String attrValue = attribute.getValue();
            final boolean accepted = flashObjToFill.setNameValue( attrName, attrValue );
            if( accepted ) {
                logger.debug("Set the EMBED attribute, name='" + attrName + "' value='" + attrValue + "'");
            } else {
                logger.warn("An unknown EMBED attribute, name='" + attrName + "' value='" + attrValue + "'" );
            }
        }
    }
    return flashObjToFill;
}
示例2: extractTextByTextNode
import org.htmlparser.util.NodeList; //导入方法依赖的package包/类
/**
 * Parses an HTML string and collects the plain text of every text node that
 * {@code isInformativeStricter} accepts.
 * <p>
 * Any exception raised while parsing is printed and logged; whatever was
 * collected up to that point is still returned.
 *
 * @param content the HTML markup to parse; {@code null} yields an empty list
 * @return the accepted text fragments in the order the parser returned them
 */
public static List<String> extractTextByTextNode(String content){
    final List<String> paragraphs = new ArrayList<String>(); // one element per paragraph
    if (null == content) {
        return paragraphs;
    }
    try {
        final NodeList textNodes = Parser.createParser(content, "utf8")
                .extractAllNodesThatMatch(new NodeClassFilter(TextNode.class));
        final HashMap<String,Integer> weights = new HashMap<String,Integer>();
        int pos = 0;
        while (pos < textNodes.size()) {
            final Node current = textNodes.elementAt(pos);
            // Only non-blank nodes are traced before the informativeness check.
            if (!current.toPlainTextString().trim().isEmpty()) {
                log.debug(pos+": "+" content: "+current.toPlainTextString());
            }
            if (isInformativeStricter(current, weights)) {
                log.debug(pos+": "+" content: "+current.toPlainTextString());
                paragraphs.add(current.toPlainTextString());
            }
            pos++;
        }
    } catch (Exception e) {
        e.printStackTrace();
        log.error("Text extractor has encountered a problem!! "+e.getMessage());
    }
    return paragraphs;
}
示例3: extractTextByTagP
import org.htmlparser.util.NodeList; //导入方法依赖的package包/类
/**
 * Parses an HTML string and collects the plain text of every {@code <p>}
 * element that {@code isInformative} accepts.
 * <p>
 * Any exception raised while parsing is printed and logged; whatever was
 * collected up to that point is still returned.
 *
 * @param content the HTML markup to parse; {@code null} yields an empty list
 * @return the accepted paragraph texts in the order the parser returned them
 */
public static List<String> extractTextByTagP(String content){
    final List<String> paragraphs = new ArrayList<String>(); // one element per paragraph
    if (null == content) {
        return paragraphs;
    }
    try {
        final Parser htmlParser = Parser.createParser(content, "utf8");
        // Matches the content between <p> and </p>.
        final TagNameFilter paragraphFilter = new TagNameFilter("p");
        // An earlier variant OR-ed in a second TagNameFilter("br"); it is not used.
        // The original author flagged the next call as having thrown an error.
        final NodeList matches = htmlParser.extractAllNodesThatMatch(paragraphFilter);
        final HashMap<String,Integer> weights = new HashMap<String,Integer>();
        int pos = 0;
        while (pos < matches.size()) {
            final Node current = matches.elementAt(pos);
            log.debug(pos+": "+" content: "+current.toPlainTextString());
            if (isInformative(current, weights)) {
                log.debug(pos+": "+" content: "+current.toPlainTextString());
                paragraphs.add(current.toPlainTextString());
            }
            pos++;
        }
    } catch (Exception e) {
        e.printStackTrace();
        log.error("Text extractor has encountered a problem!! "+e.getMessage());
    }
    return paragraphs;
}
示例4: listarCidades
import org.htmlparser.util.NodeList; //导入方法依赖的package包/类
/**
 * Lists the city names of a state, scraping them on the first request for
 * that state and serving them from {@code mapaCidades} afterwards.
 * <p>
 * The cities are read from the option tags under the third node returned by
 * {@code filterSelectNode}. Options whose text contains "Selecione" are
 * skipped.
 *
 * @param state the state name, used as key into {@code mapaEstados} and
 *              {@code mapaCidades}
 * @return a wrapper holding the city names cached for {@code state}
 * @throws Exception if fetching or parsing the remote data fails; in that case
 *                   nothing is cached for {@code state}, so a later call retries
 */
@ApiMethod(name = "listarCidades")
public ListaEstadosCidades listarCidades(@Named("state") String state) throws Exception {
    inicializaMapaEstados();
    if (mapaCidades == null) {
        mapaCidades = new HashMap<String, Map<String, String>>();
    }
    if (!mapaCidades.containsKey(state)) {
        // Fill a local map and publish it only after scraping succeeded.
        // Caching the empty map up front would make a failed request look
        // like "this state has no cities" on every later call.
        Map<String, String> mapa = new HashMap<String, String>();
        String responseBody = recuperarDados(mapaEstados.get(state), null);
        NodeList nodeList = filterSelectNode(responseBody);
        Node cidadeNode = nodeList.elementAt(2);
        SimpleNodeIterator iteratorCidade = cidadeNode.getChildren().elements();
        while (iteratorCidade.hasMoreNodes()) {
            Node child = iteratorCidade.nextNode();
            // Anything that is not an option tag cannot describe a city;
            // skip it instead of failing on the cast.
            if (!(child instanceof OptionTag)) {
                continue;
            }
            OptionTag node = (OptionTag) child;
            String cidadeId = node.getValue();
            String cidadeNome = node.getChildren().elements().nextNode().getText();
            if (!cidadeNome.contains("Selecione")) {
                mapa.put(cidadeNome, cidadeId);
            }
        }
        mapaCidades.put(state, mapa);
    }
    ListaEstadosCidades listaEstados = new ListaEstadosCidades();
    listaEstados.setLista(new ArrayList<String>(mapaCidades.get(state).keySet()));
    return listaEstados;
}
示例5: preencheMapaEstados
import org.htmlparser.util.NodeList; //导入方法依赖的package包/类
/**
 * Fills {@code mapaEstados} with one entry per option tag found under the
 * second node returned by {@code filterSelectNode}, mapping the option's text
 * to its value.
 *
 * @throws IOException declared for the data retrieval step
 * @throws ParserException declared for the HTML parsing step
 */
private void preencheMapaEstados() throws IOException, ParserException {
    final NodeList selects = filterSelectNode(recuperarDados(null, null));
    final Node estadoSelect = selects.elementAt(1);
    for (SimpleNodeIterator options = estadoSelect.getChildren().elements(); options.hasMoreNodes(); ) {
        final OptionTag option = (OptionTag) options.nextNode();
        final String id = option.getValue();
        // The display name is the text of the option's first child node.
        final String nome = option.getChildren().elements().nextNode().getText();
        mapaEstados.put(nome, id);
    }
}
示例6: getLinks
import org.htmlparser.util.NodeList; //导入方法依赖的package包/类
/**
 * Returns the target of every link tag found in the resource at {@code url}.
 *
 * @param url the resource handed to the {@code Parser} constructor
 * @return the link targets in the order the parser returned them
 * @throws ParserException if the parser cannot be created or extraction fails
 */
public static List<String> getLinks(String url) throws ParserException {
    final Parser htmlParser = new Parser(url);
    final List<String> found = new LinkedList<String>();
    final NodeList anchors = htmlParser.extractAllNodesThatMatch(new NodeClassFilter(LinkTag.class));
    int idx = 0;
    while (idx < anchors.size()) {
        final LinkTag anchor = (LinkTag) anchors.elementAt(idx);
        found.add(anchor.getLink());
        idx++;
    }
    return found;
}
示例7: extracLinks
import org.htmlparser.util.NodeList; //导入方法依赖的package包/类
/**
 * Collects the links of a page: the target of every {@code <a>} tag and the
 * {@code src} value of every node whose text starts with {@code frame src=}.
 * <p>
 * A {@code ParserException} is logged and the links gathered so far (possibly
 * none) are returned.
 *
 * @param url the resource handed to the {@code Parser} constructor
 * @return the set of extracted links
 */
public static Set<String> extracLinks(String url) {
    Set<String> links = new HashSet<String>();
    try {
        Parser parser = new Parser(url);
        parser.setEncoding("utf-8");
        // Filter for <frame> tags, used to pull the link out of the src attribute.
        @SuppressWarnings("serial")
        NodeFilter frameFilter = new NodeFilter() {
            public boolean accept(Node node) {
                return node.getText().startsWith("frame src=");
            }
        };
        // OrFilter accepting both <a> tags and <frame> tags.
        OrFilter linkFilter = new OrFilter(new NodeClassFilter(LinkTag.class), frameFilter);
        // All nodes that pass the filter.
        NodeList list = parser.extractAllNodesThatMatch(linkFilter);
        for (int i = 0; i < list.size(); i++) {
            Node tag = list.elementAt(i);
            if (tag instanceof LinkTag) {
                // <a> tag
                LinkTag link = (LinkTag) tag;
                links.add(link.getLink());
            } else {
                // Extract the src link of a frame such as <frame src="test.html"/>.
                String frame = tag.getText();
                frame = frame.substring(frame.indexOf("src="));
                int end = frame.indexOf(" ");
                if (end == -1) {
                    end = frame.indexOf(">");
                }
                if (end == -1) {
                    // src is the last attribute and the text has no terminator:
                    // run to the end of the text, dropping a trailing "/" in case
                    // a self-closing tag's text includes it. Previously end stayed
                    // -1 and substring threw StringIndexOutOfBoundsException.
                    end = frame.endsWith("/") ? frame.length() - 1 : frame.length();
                }
                // Offsets 5 and end - 1 skip the leading src=" and the closing quote.
                if (end - 1 >= 5) {
                    links.add(frame.substring(5, end - 1));
                }
            }
        }
    } catch (ParserException e) {
        logger.error("", e);
    }
    return links;
}
示例8: extracLinks
import org.htmlparser.util.NodeList; //导入方法依赖的package包/类
/**
 * Collects the links of a page that the given filter accepts: the target of
 * every {@code <a>} tag and the {@code src} value of every node whose text
 * starts with {@code frame src=}.
 * <p>
 * The resulting set is also printed to standard output. A
 * {@code ParserException} has its stack trace printed and the links gathered
 * so far (possibly none) are returned.
 *
 * @param url the resource handed to the {@code Parser} constructor
 * @param filter decides which extracted links are kept
 * @return the set of accepted links
 */
public static Set<String> extracLinks(String url, LinkFilter filter) {
    Set<String> links = new HashSet<String>();
    try {
        Parser parser = new Parser(url);
        parser.setEncoding("gb2312");
        // Filter for <frame> tags.
        @SuppressWarnings("serial")
        NodeFilter frameFilter = new NodeFilter() {
            public boolean accept(Node node) {
                return node.getText().startsWith("frame src=");
            }
        };
        // Accept <a> tags or <frame> tags.
        OrFilter linkFilter = new OrFilter(new NodeClassFilter(
                LinkTag.class), frameFilter);
        NodeList list = parser.extractAllNodesThatMatch(linkFilter);
        for (int i = 0; i < list.size(); i++) {
            Node tag = list.elementAt(i);
            if (tag instanceof LinkTag) {
                // <a> tag
                LinkTag link = (LinkTag) tag;
                String linkUrl = link.getLink();
                if (filter.accept(linkUrl)) {
                    links.add(linkUrl);
                }
            } else {
                String frame = tag.getText();
                frame = frame.substring(frame.indexOf("src="));
                int end = frame.indexOf(" ");
                if (end == -1) {
                    end = frame.indexOf(">");
                }
                if (end == -1) {
                    // src is the last attribute and the text has no terminator:
                    // run to the end of the text, dropping a trailing "/" in case
                    // a self-closing tag's text includes it. Previously end stayed
                    // -1 and substring threw StringIndexOutOfBoundsException.
                    end = frame.endsWith("/") ? frame.length() - 1 : frame.length();
                }
                // Offsets 5 and end - 1 skip the leading src=" and the closing quote.
                if (end - 1 >= 5) {
                    String frameUrl = frame.substring(5, end - 1);
                    if (filter.accept(frameUrl)) {
                        links.add(frameUrl);
                    }
                }
            }
        }
        System.out.println(links);
    } catch (ParserException e) {
        e.printStackTrace();
    }
    return links;
}
示例9: extracLinks
import org.htmlparser.util.NodeList; //导入方法依赖的package包/类
/**
 * Prints the links of a page to standard output: link target and link text
 * for {@code <a>} tags, image URL and tag text for {@code <img>} tags, and
 * the {@code src} value for nodes whose text starts with {@code frame src=}.
 * <p>
 * A {@code ParserException} has its stack trace printed and ends the scan.
 *
 * @param url the resource handed to the {@code Parser} constructor
 */
public static void extracLinks(String url) {
    try {
        Parser parser = new Parser(url);
        parser.setEncoding("utf-8");// gb2312
        // Filter for <frame> tags, used to pull the link out of the src attribute.
        NodeFilter frameFilter = new NodeFilter() {
            public boolean accept(Node node) {
                return node.getText().startsWith("frame src=");
            }
        };
        // OrFilter accepting <a> tags, <img> tags and <frame> tags (any of the three).
        OrFilter orFilter = new OrFilter(
                new NodeClassFilter(LinkTag.class), new NodeClassFilter(
                        ImageTag.class));
        OrFilter linkFilter = new OrFilter(orFilter, frameFilter);
        // All nodes that pass the filter.
        NodeList list = parser.extractAllNodesThatMatch(linkFilter);
        for (int i = 0; i < list.size(); i++) {
            Node tag = list.elementAt(i);
            if (tag instanceof LinkTag) {
                // <a> tag
                LinkTag link = (LinkTag) tag;
                String linkUrl = link.getLink();// url
                String text = link.getLinkText();// link text
                System.out.println(linkUrl + "**********" + text);
            } else if (tag instanceof ImageTag) {
                // <img> tag
                ImageTag image = (ImageTag) tag;
                System.out.print(image.getImageURL() + "********");// image address
                System.out.println(image.getText());// image tag text
            } else {
                // <frame> tag: extract the src link, e.g. from <frame src="test.html"/>.
                String frame = tag.getText();
                frame = frame.substring(frame.indexOf("src="));
                int end = frame.indexOf(" ");
                if (end == -1) {
                    end = frame.indexOf(">");
                }
                if (end == -1) {
                    // src is the last attribute and the text has no terminator:
                    // run to the end of the text, dropping a trailing "/" in case
                    // a self-closing tag's text includes it. Previously end stayed
                    // -1 and substring threw StringIndexOutOfBoundsException.
                    end = frame.endsWith("/") ? frame.length() - 1 : frame.length();
                }
                // Offsets 5 and end - 1 skip the leading src=" and the closing quote.
                if (end - 1 >= 5) {
                    System.out.println(frame.substring(5, end - 1));
                }
            }
        }
    } catch (ParserException e) {
        e.printStackTrace();
    }
}
示例10: extractLink
import org.htmlparser.util.NodeList; //导入方法依赖的package包/类
/**
 * Scans the links of an HTML string and queues those judged valid via
 * {@code LinkDb.addUnvisitedUrl}.
 * <p>
 * A link is treated as valid when {@code containKeyword} accepts its link
 * text for {@code keyword}. Note: the keyword may be a group of words or a
 * phrase, and a hit may match only some of those words. Accepted links must
 * also start with "http" and must not lie more than a fixed number of
 * positions after the previously accepted link.
 * <p>
 * Any exception is printed and logged; nothing is rethrown.
 *
 * @param content the HTML markup to parse
 * @param keyword the keyword(s) passed to {@code containKeyword}
 */
public static void extractLink(String content, String keyword) {
    try {
        final NodeList anchors = Parser.createParser(content, "utf8")
                .extractAllNodesThatMatch(new NodeClassFilter(LinkTag.class));
        // Index of the last node accepted as a valid link (0 also means "none yet").
        int lastAccepted = 0;
        // Valid result links usually have consecutive indices, so a large gap
        // is used to tell them apart from advertisement links.
        final int maxGap = 8;
        for (int i = 0; i < anchors.size(); i++) {
            final LinkTag anchor = (LinkTag) anchors.elementAt(i);
            final String href = anchor.getLink();      // url
            final String label = anchor.getLinkText(); // link text
            if (!containKeyword(label, keyword)) {
                continue;
            }
            final boolean tooFarFromLast = lastAccepted > 0 && i - lastAccepted > maxGap;
            if (tooFarFromLast) {
                log.debug("Noisy link!!!");
                continue;
            }
            if (!href.startsWith("http")) {
                continue;
            }
            log.debug(i+":"+href+", "+label);
            lastAccepted = i;
            LinkDb.addUnvisitedUrl(href);
        }
    } catch (Exception e) {
        e.printStackTrace();
        log.error("Link extractor has encountered a problem!! "+e.getMessage());
    }
}
示例11: extracLinks
import org.htmlparser.util.NodeList; //导入方法依赖的package包/类
/**
 * Extracts the links of a page and adds them to the queue of unvisited URLs.
 * <p>
 * For {@code <a>} tags, absolute links starting with "http" are kept as they
 * are and links starting with "/" are prefixed with {@code Main.baseUrl};
 * both are checked with {@code filter.accept(url, Main.keyWord)}. For nodes
 * whose text starts with {@code frame src=}, the {@code src} value is checked
 * with {@code filter.accept(url)}. A {@code ParserException} has its stack
 * trace printed; the links gathered so far are still queued and returned.
 *
 * @param content the input handed to the {@code Parser} constructor
 * @param filter used to filter the links
 * @return the set of accepted links, which is also passed to
 *         {@code LinkQueue.addUnvisitedUrl}
 * @author cxn 2015-11-05
 */
public static Set<String> extracLinks(String content, LinkFilter filter) {
    Set<String> links = new HashSet<String>();
    try {
        Parser parser = new Parser(content);
        // parser.setEncoding("utf-8");
        // Filter for <frame> tags, used to pull the link out of the src attribute.
        NodeFilter frameFilter = new NodeFilter() {
            public boolean accept(Node node) {
                return node.getText().startsWith("frame src=");
            }
        };
        // OrFilter accepting both <a> tags and <frame> tags.
        OrFilter linkFilter = new OrFilter(new NodeClassFilter(
                LinkTag.class), frameFilter);
        // All nodes that pass the filter.
        NodeList list = parser.extractAllNodesThatMatch(linkFilter);
        for (int i = 0; i < list.size(); i++) {
            Node tag = list.elementAt(i);
            if (tag instanceof LinkTag) {
                LinkTag link = (LinkTag) tag;
                String linkUrl = link.getLink();
                if (linkUrl.startsWith("http") && filter.accept(linkUrl, Main.keyWord)) {
                    links.add(linkUrl);
                } else if (linkUrl.startsWith("/") && filter.accept(Main.baseUrl+linkUrl, Main.keyWord)) {
                    links.add(Main.baseUrl+linkUrl);
                }
            } else {
                // Extract the src link of a frame such as <frame src="test.html"/>.
                String frame = tag.getText();
                frame = frame.substring(frame.indexOf("src="));
                int end = frame.indexOf(" ");
                if (end == -1) {
                    end = frame.indexOf(">");
                }
                if (end == -1) {
                    // src is the last attribute and the text has no terminator:
                    // run to the end of the text, dropping a trailing "/" in case
                    // a self-closing tag's text includes it. Previously end stayed
                    // -1 and substring threw StringIndexOutOfBoundsException.
                    end = frame.endsWith("/") ? frame.length() - 1 : frame.length();
                }
                // Offsets 5 and end - 1 skip the leading src=" and the closing quote.
                if (end - 1 >= 5) {
                    String frameUrl = frame.substring(5, end - 1);
                    if (filter.accept(frameUrl)) {
                        links.add(frameUrl);
                    }
                }
            }
        }
    } catch (ParserException e) {
        e.printStackTrace();
    }
    LinkQueue.addUnvisitedUrl(links);
    return links;
}
示例12: extracLinks
import org.htmlparser.util.NodeList; //导入方法依赖的package包/类
/**
 * Collects the links of a fetched page that the given filter accepts: the
 * target of every {@code <a>} tag and the {@code src} value of every node
 * whose text starts with {@code frame src=}.
 * <p>
 * The parser is created from {@code pageResult.getContent()} and set to the
 * encoding from {@code pageResult.getCharSet()}. A {@code ParserException}
 * has its stack trace printed and the links gathered so far (possibly none)
 * are returned.
 *
 * @param pageResult supplies the page content and its character set
 * @param filter decides which extracted links are kept
 * @return the set of accepted links
 */
public Set<String> extracLinks(PageResult pageResult, LinkFilter filter) {
    Set<String> links = new HashSet<String>();
    try {
        Parser parser = new Parser(pageResult.getContent());
        parser.setEncoding(pageResult.getCharSet());
        // Filter for <frame> tags, used to pull the link out of the src attribute.
        NodeFilter frameFilter = new NodeFilter() {
            public boolean accept(Node node) {
                return node.getText().startsWith("frame src=");
            }
        };
        // OrFilter accepting both <a> tags and <frame> tags.
        OrFilter linkFilter = new OrFilter(new NodeClassFilter(
                LinkTag.class), frameFilter);
        // All nodes that pass the filter.
        NodeList list = parser.extractAllNodesThatMatch(linkFilter);
        for (int i = 0; i < list.size(); i++) {
            Node tag = list.elementAt(i);
            if (tag instanceof LinkTag) {
                // <a> tag
                LinkTag link = (LinkTag) tag;
                String linkUrl = link.getLink();// url
                if (filter.accept(linkUrl)) {
                    links.add(linkUrl);
                }
            } else {
                // <frame> tag: extract the src link, e.g. from <frame src="test.html"/>.
                String frame = tag.getText();
                frame = frame.substring(frame.indexOf("src="));
                int end = frame.indexOf(" ");
                if (end == -1) {
                    end = frame.indexOf(">");
                }
                if (end == -1) {
                    // src is the last attribute and the text has no terminator:
                    // run to the end of the text, dropping a trailing "/" in case
                    // a self-closing tag's text includes it. Previously end stayed
                    // -1 and substring threw StringIndexOutOfBoundsException.
                    end = frame.endsWith("/") ? frame.length() - 1 : frame.length();
                }
                // Offsets 5 and end - 1 skip the leading src=" and the closing quote.
                if (end - 1 >= 5) {
                    String frameUrl = frame.substring(5, end - 1);
                    if (filter.accept(frameUrl)) {
                        links.add(frameUrl);
                    }
                }
            }
        }
    } catch (ParserException e) {
        e.printStackTrace();
    }
    return links;
}