本文整理汇总了Java中org.cyberneko.html.parsers.DOMParser.getDocument方法的典型用法代码示例。如果您正苦于以下问题：Java DOMParser.getDocument方法的具体用法？Java DOMParser.getDocument怎么用？Java DOMParser.getDocument使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类org.cyberneko.html.parsers.DOMParser的用法示例。
在下文中一共展示了DOMParser.getDocument方法的11个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: readHtmlDocument
import org.cyberneko.html.parsers.DOMParser; //导入方法依赖的package包/类
/**
 * Resolves the given location string and parses the HTML found there into a DOM tree.
 *
 * <p>Namespace processing is switched off on the parser. Problems are logged through
 * {@code Debug.logError} instead of being propagated to the caller.
 *
 * @param str location of the HTML document, as understood by
 *            {@code FlexibleLocation.resolveLocation}
 * @return the parsed document, or {@code null} when the location cannot be resolved
 *         or any exception is raised while resolving or parsing
 */
public static Document readHtmlDocument(String str) {
    try {
        URL location = FlexibleLocation.resolveLocation(str);
        if (location == null) {
            Debug.logError("Unable to locate HTML document " + str, module);
            return null;
        }
        DOMParser htmlParser = new DOMParser();
        htmlParser.setFeature("http://xml.org/sax/features/namespaces", false);
        htmlParser.parse(location.toExternalForm());
        return htmlParser.getDocument();
    } catch (Exception e) {
        Debug.logError(e, "Error while reading HTML document " + str, module);
        return null;
    }
}
示例2: main
import org.cyberneko.html.parsers.DOMParser; //导入方法依赖的package包/类
/**
 * Demo entry point: parses a sample HTML file, wraps the resulting DOM in a
 * {@code TreeNode}, assigns post-order indexes and prints the tree, its post-order
 * traversal and its key roots.
 *
 * @param args ignored
 * @throws Exception if the sample file cannot be read or parsed
 */
public static void main(String[] args) throws Exception {
    DOMParser parser = new DOMParser();
    String pathname = "src/test/resources/html/simple/3.html";
    // try-with-resources guarantees the file handle is released even if parsing fails.
    try (FileReader reader = new FileReader(pathname)) {
        parser.parse(new InputSource(reader));
    }
    Document document = parser.getDocument();
    TreeNode node = new TreeNode(document, null);
    node.postOrderIndex(new AtomicInteger(1));
    node.prettyPrint();
    List<TreeNode> nodes = node.postOrderTraverse();
    System.out.println(nodes);
    System.out.println(node.getKeyRoots());
}
示例3: getSource
import org.cyberneko.html.parsers.DOMParser; //导入方法依赖的package包/类
/**
 * Builds the {@link Source} from which the given input stream will be read.
 *
 * <p>When the {@code html} flag is set, the stream is parsed with the NekoHTML
 * {@code DOMParser} and returned as a {@link DOMSource}. Otherwise the stream is
 * wrapped in a {@link SAXSource} backed by an XML reader that does not load external
 * DTDs and resolves every external entity to an empty stream.
 *
 * @param ctx  processing context (unused)
 * @param file stream containing the document to read
 * @return a DOM source for HTML input, a SAX source for XML input
 * @throws SAXException if the HTML cannot be parsed or a parser feature is rejected
 * @throws IOException if reading the stream fails
 * @throws ParserConfigurationException if the SAX parser cannot be created
 */
@TimeThis(task="read-file", category=TimerCategory.LOAD_RESOURCE)
protected Source getSource(@SuppressWarnings("unused") ProcessingContext<Corpus> ctx, InputStream file) throws SAXException, IOException, ParserConfigurationException {
    if (html) {
        DOMParser parser = new DOMParser();
        parser.setFeature("http://xml.org/sax/features/namespaces", false);
        parser.setFeature("http://cyberneko.org/html/features/scanner/cdata-sections", true);
        parser.setFeature("http://cyberneko.org/html/features/parse-noscript-content", false);
        parser.setProperty("http://cyberneko.org/html/properties/default-encoding", sourcePath.getCharset());
        // "match" keeps element names as written; "lower" folds them to lower case.
        parser.setProperty("http://cyberneko.org/html/properties/names/elems", rawTagNames ? "match" : "lower");
        parser.parse(new InputSource(file));
        Document doc = parser.getDocument();
        return new DOMSource(doc);
    }
    SAXParserFactory spf = SAXParserFactory.newInstance();
    // Consumers such as XSLT transformers require a namespace-aware reader when they
    // are handed a SAXSource.
    spf.setNamespaceAware(true);
    spf.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
    org.xml.sax.XMLReader xmlReader = spf.newSAXParser().getXMLReader();
    xmlReader.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
    xmlReader.setEntityResolver(new EntityResolver() {
        @Override
        public InputSource resolveEntity(String pid, String sid) throws SAXException {
            // Never fetch external entities: substitute an empty document instead.
            return new InputSource(new ByteArrayInputStream(new byte[] {}));
        }
    });
    // Previously this SAXSource was built and thrown away while a plain StreamSource
    // was returned, so none of the reader hardening above ever took effect.
    return new SAXSource(xmlReader, new InputSource(file));
}
示例4: getDoc
import org.cyberneko.html.parsers.DOMParser; //导入方法依赖的package包/类
/**
 * Parses HTML read from a byte stream into a DOM document.
 *
 * <p>The stream is decoded with the supplied encoding before it reaches the parser;
 * the same encoding is also registered as the parser's default encoding. Namespace
 * processing is disabled. The stream is not closed by this method.
 *
 * @param in       stream containing the HTML
 * @param encoding name of the character encoding used to decode {@code in}
 * @return the parsed document
 * @throws Exception if the encoding is unsupported or reading/parsing fails
 */
public static Document getDoc(InputStream in, String encoding)
        throws Exception {
    final String defaultEncodingKey = "http://cyberneko.org/html/properties/default-encoding";
    final String namespacesKey = "http://xml.org/sax/features/namespaces";

    DOMParser htmlParser = new DOMParser();
    htmlParser.setProperty(defaultEncodingKey, encoding);
    htmlParser.setFeature(namespacesKey, false);

    InputStreamReader decoder = new InputStreamReader(in, encoding);
    InputSource htmlSource = new InputSource(new BufferedReader(decoder));
    htmlParser.parse(htmlSource);

    return htmlParser.getDocument();
}
示例5: computeDistance
import org.cyberneko.html.parsers.DOMParser; //导入方法依赖的package包/类
/**
 * Computes the edit distance between two HTML files.
 *
 * <p>Both files are parsed with the same parser instance (reset in between) and the
 * resulting DOM trees are compared with {@code ZSTEDComputer}.
 *
 * @param file1 first html file
 * @param file2 second html file
 * @return edit distance measure
 * @throws IOException when a file cannot be opened or read
 * @throws SAXException when parser fails
 */
public static double computeDistance(File file1, File file2)
        throws IOException, SAXException {
    DOMParser domParser = new DOMParser();

    // Each reader is scoped to its own parse so the file handle is released
    // immediately, including when parsing throws.
    Document doc1;
    try (FileReader reader1 = new FileReader(file1)) {
        domParser.parse(new InputSource(reader1));
        doc1 = domParser.getDocument();
    }

    domParser.reset();

    Document doc2;
    try (FileReader reader2 = new FileReader(file2)) {
        domParser.parse(new InputSource(reader2));
        doc2 = domParser.getDocument();
    }

    ZSTEDComputer computer = new ZSTEDComputer();
    return computer.computeDistance(new TreeNode(doc1, null), new TreeNode(doc2, null));
}
示例6: VSMVector
import org.cyberneko.html.parsers.DOMParser; //导入方法依赖的package包/类
/**
 * Builds a vector either from an HTML form or from a plain text file.
 *
 * <p>When {@code isForm} is true, {@code file} is treated as inline markup if it
 * contains {@code "<form "} (case-insensitively) and otherwise as a location handed
 * to the parser; the first {@code form} element is extracted and stemmed. When
 * {@code isForm} is false, {@code file} is a path whose contents are read line by
 * line, then titled and stemmed.
 *
 * @param file     form markup, a location of a page containing a form, or a file path
 * @param isForm   whether {@code file} denotes a form rather than a plain file
 * @param stoplist stop list used while building the vector
 * @throws MalformedURLException if the placeholder page URL cannot be built
 * @throws IOException if reading the input fails
 * @throws SAXException if the form markup cannot be parsed
 */
public VSMVector(String file, boolean isForm, StopList stoplist) throws MalformedURLException, IOException, SAXException {
    this.stoplist = stoplist;
    this.elems = new HashMap<>();
    if (isForm) {
        DOMParser parser = new DOMParser();
        // Decide whether the string is the form content itself or the name of a file.
        if (file.toLowerCase().contains("<form ")) {
            parser.parse(new InputSource(new BufferedReader(new StringReader(file))));
        } else {
            parser.parse(file);
        }
        Document doc = parser.getDocument();
        NodeList list = doc.getElementsByTagName("form");
        StringBuffer source = new StringBuffer();
        parse(list.item(0), source, new StringBuffer(), "html", stoplist);
        String srcForm = source.toString().toLowerCase();
        PaginaURL formPage = new PaginaURL(new URL("http://www"), srcForm, stoplist);
        stemPage(formPage, true);
    } else {
        StringBuilder content = new StringBuilder();
        // try-with-resources closes the file even when readLine() throws.
        try (BufferedReader input = new BufferedReader(new FileReader(new File(file)))) {
            for (String line = input.readLine(); line != null; line = input.readLine()) {
                content.append(line);
                content.append("\n");
            }
        }
        String src = content.toString();
        PaginaURL page = new PaginaURL(new URL("http://www"), src, stoplist);
        addTitle(page, stoplist);
        stemPage(page, false);
    }
}
示例7: asDocument
import org.cyberneko.html.parsers.DOMParser; //导入方法依赖的package包/类
/**
 * Transforms a string into a Document object. TODO This needs more optimizations. As it seems
 * the getDocument is called way too many times, causing a lot of parsing which is slow and not
 * necessary.
 *
 * <p>Element names keep the case they have in the input and namespace processing is
 * disabled.
 *
 * @param html
 *            the HTML string.
 * @return The DOM Document version of the HTML string.
 * @throws IOException
 *             if configuring the parser or parsing the HTML string fails for any reason;
 *             the underlying exception (including any SAXException) is attached as the cause.
 */
public static Document asDocument(String html) throws IOException {
    DOMParser domParser = new DOMParser();
    try {
        domParser.setProperty("http://cyberneko.org/html/properties/names/elems", "match");
        domParser.setFeature("http://xml.org/sax/features/namespaces", false);
        domParser.parse(new InputSource(new StringReader(html)));
    } catch (Exception e) {
        // Every failure is reported as an IOException. The cause is always chained so the
        // caller can log it, rather than being dumped to stderr and then discarded.
        throw new IOException("Error while reading HTML: " + html, e);
    }
    return domParser.getDocument();
}
示例8: parse
import org.cyberneko.html.parsers.DOMParser; //导入方法依赖的package包/类
/**
 * Parses the document source with NekoHTML and returns the resulting DOM.
 *
 * <p>Element names are folded to lower case. If a charset has been configured it is
 * used as the parser's default encoding.
 *
 * @return the parsed document
 * @throws SAXException if the parser rejects a setting or the input cannot be parsed
 * @throws IOException if the document source cannot be read
 */
@Override
public Document parse() throws SAXException, IOException
{
    final DOMParser htmlParser = new DOMParser(new HTMLConfiguration());
    htmlParser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
    if (charset != null) {
        htmlParser.setProperty("http://cyberneko.org/html/properties/default-encoding", charset);
    }
    final org.xml.sax.InputSource input =
            new org.xml.sax.InputSource(getDocumentSource().getInputStream());
    htmlParser.parse(input);
    return htmlParser.getDocument();
}
示例9: main
import org.cyberneko.html.parsers.DOMParser; //导入方法依赖的package包/类
/**
 * Manual test driver: downloads a page, runs it through {@code Readability} and prints
 * the extracted article HTML, the article text mapping and the article images.
 *
 * @param argv
 *            ignored
 * @throws Exception
 *             if the page cannot be fetched or parsed
 */
public static void main(String[] argv) throws Exception {
    // URL input = new URL("file:///home/dd/Programming/Readability4J/t.html");
    // URL input = new URL("http://news.bbc.co.uk/1/hi/politics/10362367.stm");
    final URL input = new URL("http://blog.confluent.io/2015/01/29/making-sense-of-stream-processing/");
    // URL input = new URL("http://euobserver.com/9/30465");
    // URL input = new URL("http://euobserver.com/?aid=23383");
    // URL input = new URL("http://abandoninplace.squarespace.com/blog/2010/6/8/wwdc-monday.html");
    // URL input = new URL("file:///Users/jsh2/Desktop/test.html");
    // URL input = new URL("http://mobile.engadget.com/2010/06/17/htc-aria-review/");
    // URL input = new URL("http://thedailywtf.com/Articles/Benched.aspx");
    // URL input = new
    // URL("http://www.dailymail.co.uk/news/article-1287625/Woman-sparked-150-000-manhunt-slashing-face-crying-rape-faces-jail.html");
    // URL input = new
    // URL("http://mrpaparazzi.com/post/11619/Lindsay-Lohan-Tests-Negative-For-Alcohol-Goes-Clubbing-To-Celebrate.aspx");
    // URL input = new URL("http://www.bbc.co.uk/news/world-middle-east-11415719");
    // URL input = new URL("http://www.thebigproject.co.uk/news/");
    // URL input = new
    // URL("http://blogs.euobserver.com/popescu/2009/12/15/on-euro-optimism-pessimism-and-failures/#more-958");
    // URL input = new
    // URL("http://www.cnn.com/2010/WORLD/meast/09/27/west.bank.settlement.construction/index.html?hpt=T2");
    // URL input = new
    // URL("http://www.huffingtonpost.com/steven-cohen/its-time-to-enact-congest_b_740315.html");
    // URL input = new URL("http://uk.mac.ign.com/articles/573/573319p1.html");
    final DOMParser parser = new DOMParser();
    // Close the network stream as soon as parsing is done (or has failed); the DOM is
    // fully built by then, so nothing below needs the connection.
    try (java.io.InputStream pageStream = input.openStream()) {
        parser.parse(new InputSource(pageStream));
    }
    final Readability r = new Readability(parser.getDocument(), true, true);
    // System.out.println(r.getArticleTitle());
    System.out.println(r.getArticleHTML());
    // System.out.println(r.getAllLinks());
    // System.out.println(r.getArticleText());
    System.out.println();
    System.out.println("***");
    System.out.println();
    for (final MappingNode s : r.getArticleTextMapping()) {
        System.out.println(s);
    }
    // PrintStream out = new PrintStream("news-sites");
    // for (Anchor anchor : r.getAllLinks()) {
    // out.println(anchor.getHref() + "\t" + anchor.getText());
    // }
    // out.close();
    System.out.println(r.getArticleImages());
    // System.out.println(r.getArticleSubheadings());
    // System.out.println(r.getArticleHTML());
    // System.out.println(r.getArticleHTML_DOM());
    // System.out.println(r.getArticleDateString());
    // System.out.println(r.getArticleDate());
}
示例10: getReadability
import org.cyberneko.html.parsers.DOMParser; //导入方法依赖的package包/类
/**
 * Parses an HTML string and wraps the resulting DOM in a new {@link Readability}.
 *
 * @param html
 *            the markup to parse
 * @param addTitle
 *            whether the title should be added to the generated article
 * @return a {@link Readability} built from the parsed document
 * @throws SAXException
 *             if the markup cannot be parsed
 * @throws IOException
 *             if reading the markup fails
 */
public static Readability getReadability(String html, boolean addTitle) throws SAXException, IOException {
    final InputSource markup = new InputSource(new StringReader(html));
    final DOMParser htmlParser = new DOMParser();
    htmlParser.parse(markup);
    final Document dom = htmlParser.getDocument();
    return new Readability(dom, false, addTitle);
}
示例11: getDocumentNoBalance
import org.cyberneko.html.parsers.DOMParser; //导入方法依赖的package包/类
/**
 * Parses an HTML string into a DOM document with NekoHTML's tag balancing switched
 * off and element names left in the case they were written.
 *
 * @param html
 *            the HTML string.
 * @return the Document produced from the HTML string.
 * @throws SAXException
 *             if a parser setting is rejected or the HTML string cannot be parsed.
 * @throws IOException
 *             if an IO failure occurs.
 */
public static Document getDocumentNoBalance(String html) throws SAXException, IOException {
    final String elementNameCaseKey = "http://cyberneko.org/html/properties/names/elems";
    final String balanceTagsKey = "http://cyberneko.org/html/features/balance-tags";

    DOMParser unbalancedParser = new DOMParser();
    unbalancedParser.setProperty(elementNameCaseKey, "match");
    unbalancedParser.setFeature(balanceTagsKey, false);

    InputSource markup = new InputSource(new StringReader(html));
    unbalancedParser.parse(markup);
    return unbalancedParser.getDocument();
}