本文整理匯總了Java中org.htmlparser.Parser.visitAllNodesWith方法的典型用法代碼示例。如果您正苦於以下問題:Java Parser.visitAllNodesWith方法的具體用法?Java Parser.visitAllNodesWith怎麼用?Java Parser.visitAllNodesWith使用的例子?那麼,這裡精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在類org.htmlparser.Parser的用法示例。
在下文中一共展示了Parser.visitAllNodesWith方法的8個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Java代碼示例。
示例1: getPlainText
import org.htmlparser.Parser; //導入方法依賴的package包/類
/**
 * Extracts the plain text of an HTML fragment.
 *
 * <p>Links are left out, non-breaking spaces are replaced by regular spaces
 * and runs of whitespace are collapsed into a single space.
 *
 * @param htmlStr HTML markup to convert
 * @return the extracted text, or an empty string when parsing fails or the
 *         markup yields no text; never <code>null</code>
 */
public static String getPlainText(String htmlStr) {
    Parser parser = new Parser();
    String plainText = "";
    try {
        parser.setInputHTML(htmlStr);
        StringBean stringBean = new StringBean();
        // Do not collect the links contained in the page.
        stringBean.setLinks(false);
        // Replace non-breaking spaces with regular spaces.
        stringBean.setReplaceNonBreakingSpaces(true);
        // Collapse each run of whitespace into a single space.
        stringBean.setCollapse(true);
        parser.visitAllNodesWith(stringBean);
        String extracted = stringBean.getStrings();
        // The bean may hand back null when it collected no text; keep the
        // "" default so callers get the same value as in the failure case.
        if (extracted != null) {
            plainText = extracted;
        }
    } catch (ParserException e) {
        // Best effort: report the problem and fall through to the "" default.
        e.printStackTrace();
    }
    return plainText;
}
示例2: html2text
import org.htmlparser.Parser; //導入方法依賴的package包/類
/**
 * Strips the markup from an HTML document and returns the remaining text.
 *
 * <p>Links are omitted, non-breaking spaces are replaced and sequences of
 * whitespace are collapsed.
 *
 * @param html HTML document
 * @return plain text (empty if the document has no content) or
 *         <code>null</code> if the conversion failed
 */
public static synchronized String html2text(String html) {
    // configure the text extractor
    final StringBean extractor = new StringBean();
    extractor.setLinks(false);
    extractor.setReplaceNonBreakingSpaces(true);
    extractor.setCollapse(true);

    // run the extractor over the whole document
    final Parser htmlParser = new Parser();
    try {
        htmlParser.setInputHTML(html);
        htmlParser.visitAllNodesWith(extractor);
    } catch (ParserException e) {
        return null;
    }

    // a missing result means the document had no content
    final String text = extractor.getStrings();
    return (text != null) ? text : "";
}
示例3: file2text
import org.htmlparser.Parser; //導入方法依賴的package包/類
/**
 * Reads an HTML document from a file and converts it into plain text.
 *
 * <p>Links are omitted, non-breaking spaces are replaced and sequences of
 * whitespace are collapsed.
 *
 * @param filename name of the file containing the HTML document
 * @return plain text (empty if the document has no content) or
 *         <code>null</code> if the reading or conversion failed
 */
public static synchronized String file2text(String filename) {
    // read from file and convert HTML document
    StringBean sb = new StringBean();
    sb.setLinks(false); // no links
    sb.setReplaceNonBreakingSpaces(true); // replace non-breaking spaces
    sb.setCollapse(true); // replace sequences of whitespaces
    Parser parser = new Parser();
    try {
        parser.setResource(filename);
        parser.visitAllNodesWith(sb);
    } catch (ParserException e) {
        return null;
    }
    String docText = sb.getStrings();
    // null is reserved for failure; an empty document yields "" instead,
    // matching what html2text does for the same situation.
    if (docText == null) {
        docText = "";
    }
    return docText;
}
示例4: PostCleaner
import org.htmlparser.Parser; //導入方法依賴的package包/類
/**
 * Cleans the given HTML and stores the resulting text in {@code mText}.
 *
 * <p>The markup is walked with a {@code PostCleanerVisitor}; if the parser
 * rejects the input, the visitor's simple conversion is used instead.
 *
 * @param html         markup to clean
 * @param minCodeChars passed through to the visitor
 * @param excludeCode  passed through to the visitor
 */
public PostCleaner(String html, int minCodeChars, boolean excludeCode) {
    String cleaned;
    try {
        Parser parser = Parser.createParser(html, "utf8");
        PostCleanerVisitor visitor = new PostCleanerVisitor(minCodeChars, excludeCode);
        parser.visitAllNodesWith(visitor);
        cleaned = visitor.getText();
    } catch (ParserException e) {
        System.err.println(" Parser exception: " + e + " trying simple conversion");
        // Fallback path when full parsing is not possible.
        cleaned = PostCleanerVisitor.simpleProc(html);
    }
    mText = cleaned;
}
示例5: readByHtml
import org.htmlparser.Parser; //導入方法依賴的package包/類
/**
 * Handles the content as a whole page: parses it as a standard HTML page
 * and prints the page title followed by the trimmed text of the body to
 * standard output.
 *
 * @param content the content of the web page
 * @throws Exception if the page cannot be processed
 */
public static void readByHtml(String content) throws Exception {
    Parser pageParser = Parser.createParser(content, "utf8");
    HtmlPage page = new HtmlPage(pageParser);
    pageParser.visitAllNodesWith(page);

    // title goes on its own line
    System.out.println(page.getTitle());

    // body text follows, without a trailing line break
    NodeList bodyNodes = page.getBody();
    String bodyText = bodyNodes.asString().trim();
    System.out.print(bodyText);
}
示例6: move2Urls
import org.htmlparser.Parser; //導入方法依賴的package包/類
/**
 * Disconnects the given connection, opens a new one to {@code newUrls} that
 * carries the current cookies, and parses the response into an
 * {@link HtmlPage}.
 *
 * @param httpConn connection to disconnect before the new request is made
 * @param newUrls  URL to request next
 * @return the page filled in by the parser; if parsing fails the failure is
 *         logged and the page is returned as far as it was populated
 * @throws Exception on any failure before parsing starts, e.g. a malformed
 *         URL
 */
private HtmlPage move2Urls(HttpURLConnection httpConn, String newUrls)
        throws Exception {
    httpConn.disconnect();
    String cookies = this.getCookies();
    URL newURL = new URL(newUrls);
    log4.info("-----move2Urls the URL=" + newUrls);

    // Open the follow-up connection in a fresh local instead of reusing the
    // parameter, so it is obvious which connection is being configured.
    HttpURLConnection nextConn = (HttpURLConnection) newURL.openConnection();
    nextConn.setRequestProperty("User-Agent",
            "Mozilla/5.0 (compatible; MSIE 6.0; Windows NT)");
    nextConn.setRequestProperty("Content-Type",
            "application/x-www-form-urlencoded");
    nextConn.setRequestProperty("Cookie", cookies);
    nextConn.setDoInput(true);

    Parser parser = new Parser(nextConn);
    HtmlPage page = new HtmlPage(parser);
    try {
        parser.visitAllNodesWith(page);
    } catch (ParserException e) {
        // Still best effort, but leave a trace instead of discarding the
        // exception silently.
        log4.warn("move2Urls failed to parse the page at " + newUrls, e);
    }
    return page;
}
示例7: getContentText
import org.htmlparser.Parser; //導入方法依賴的package包/類
/**
 * Returns the text of {@code content} with the HTML markup removed.
 *
 * @return the extracted text, or <code>null</code> when there is no content
 *         or the content cannot be parsed
 */
@Transient
public String getContentText() {
    // Nothing to extract; also avoids handing a null document to the parser.
    if (content == null) {
        return null;
    }
    try {
        Parser parser = Parser.createParser(content, "UTF-8");
        TextExtractingVisitor textExtractingVisitor = new TextExtractingVisitor();
        parser.visitAllNodesWith(textExtractingVisitor);
        return textExtractingVisitor.getExtractedText();
    } catch (ParserException e) {
        e.printStackTrace();
        return null;
    }
}
示例8: parse
import org.htmlparser.Parser; //導入方法依賴的package包/類
/**
 * Reads an HTML document from {@code source}, extracts its title and text,
 * and stores the result in {@code docData}.
 *
 * <p>The body is the title, description, keywords and body text joined by
 * single spaces. If the HTML parser rejects the document, a simple
 * conversion is used instead.
 *
 * @param docData object to clear and fill with the parsed document
 * @param name    name to store in {@code docData}
 * @param date    date to store in {@code docData}
 * @param source  supplies the character stream holding the HTML
 * @param trecSrc not used by this method
 * @return the same {@code docData} instance, populated
 * @throws IOException if reading from {@code source} fails
 */
public DocData parse(DocData docData,
                     String name,
                     Date date,
                     InputSource source,
                     ContentSourceDateUtil trecSrc)
        throws IOException {
    String title = "";
    String bodyText = "";
    String baseHref = "http://fake-domain.com";
    String encoding = "utf8";

    /*
     * This is clearly not the most efficient way to parse,
     * but it is much more stable.
     */
    StringBuilder buffer = new StringBuilder();
    BufferedReader br = new BufferedReader(source.getCharacterStream());
    try {
        String line;
        while (null != (line = br.readLine())) {
            // readLine() strips the line terminator; put one back so the last
            // word of a line is not fused with the first word of the next.
            buffer.append(line).append('\n');
        }
    } finally {
        // Release the reader even when reading fails part-way.
        br.close();
    }
    String html = buffer.toString();

    try {
        Parser htmlParser = Parser.createParser(html, encoding);
        LeoCleanerUtil res = new LeoCleanerUtil(baseHref);
        htmlParser.visitAllNodesWith(res);
        title = res.GetTitleText();
        bodyText = title + " " +
                res.GetDescriptionText() + " " +
                res.GetKeywordText() + " " +
                res.GetBodyText();
    } catch (ParserException e) {
        System.err.println(" Parser exception: " + e + " trying simple conversion");
        // Fallback: simple conversion that yields a (title, body) pair.
        Pair<String,String> sres = LeoCleanerUtil.SimpleProc(html);
        title = sres.getFirst();
        bodyText = title + " " + sres.getSecond();
    }

    docData.clear();
    docData.setName(name);
    docData.setTitle(title);
    docData.setBody(bodyText);
    docData.setProps(new Properties());
    docData.setDate(date);
    return docData;
}