本文整理汇总了Java中org.cyberneko.html.parsers.DOMParser类的典型用法代码示例。如果您正苦于以下问题:Java DOMParser类的具体用法?Java DOMParser怎么用?Java DOMParser使用的例子?那么,这里精选的类代码示例或许可以为您提供帮助。
DOMParser类属于org.cyberneko.html.parsers包,在下文中一共展示了DOMParser类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: readHtmlDocument
import org.cyberneko.html.parsers.DOMParser; //导入依赖的package包/类
/**
 * Resolves {@code str} to a URL and parses the HTML found there into a DOM document.
 *
 * <p>Failures are logged rather than propagated: the method returns {@code null} both
 * when the location cannot be resolved and when any exception occurs while parsing.
 *
 * @param str the location string handed to {@code FlexibleLocation.resolveLocation}
 * @return the parsed document, or {@code null} if it could not be located or read
 */
public static Document readHtmlDocument(String str) {
    try {
        URL location = FlexibleLocation.resolveLocation(str);
        if (location == null) {
            Debug.logError("Unable to locate HTML document " + str, module);
            return null;
        }

        DOMParser htmlParser = new DOMParser();
        htmlParser.setFeature("http://xml.org/sax/features/namespaces", false);
        htmlParser.parse(location.toExternalForm());
        return htmlParser.getDocument();
    } catch (Exception e) {
        Debug.logError(e, "Error while reading HTML document " + str, module);
        return null;
    }
}
示例2: testPost
import org.cyberneko.html.parsers.DOMParser; //导入依赖的package包/类
/**
 * With {@code PROP_USE_REDIRECT} set to false, checks that {@code LoginHandler.handleGet}
 * writes a page containing exactly one form named {@code loginform} whose action is the
 * IdP's single sign-on location, and that no redirect is sent.
 */
@Test
public void testPost() throws Exception {
    cfg.setProperty(WSFedConstants.PROP_USE_REDIRECT, false);

    // Capture everything the handler writes to the response.
    StringWriter responseBody = new StringWriter();
    when(res.getWriter()).thenReturn(new PrintWriter(responseBody));

    LoginHandler handler = new LoginHandler();
    handler.handleGet(rc);

    // This mocked window is configured but not referenced by the assertions below.
    WebWindow window = mock(WebWindow.class);
    when(window.getScriptObject()).thenThrow(new RuntimeException("test"));
    when(window.getWebClient()).thenReturn(new WebClient(BrowserVersion.FIREFOX_2));

    // Parse the captured output back into a DOM tree.
    DOMParser htmlParser = new DOMParser();
    byte[] renderedBytes = responseBody.toString().getBytes();
    htmlParser.parse(new InputSource(new ByteArrayInputStream(renderedBytes)));
    HTMLElement root = (HTMLElement) htmlParser.getDocument().getDocumentElement();

    NodeList formNodes = root.getElementsByTagName("form");
    assertEquals(1, formNodes.getLength());

    Element loginForm = (Element) formNodes.item(0);
    assertEquals("loginform", loginForm.getAttribute("name"));
    assertEquals(
            rc.getIdpMetadata().getFirstMetadata().getSingleSignonServiceLocation(WSFedConstants.WSFED_PROTOCOL),
            loginForm.getAttribute("action"));

    verify(res, never()).sendRedirect(anyString());
}
示例3: main
import org.cyberneko.html.parsers.DOMParser; //导入依赖的package包/类
/**
 * Demo entry point: parses a bundled sample HTML file, wraps the DOM in a
 * {@code TreeNode}, assigns post-order indices starting at 1, and prints the tree,
 * its post-order traversal and its key roots to standard output.
 *
 * @param args ignored
 * @throws Exception if the sample file cannot be read or parsed
 */
public static void main(String[] args) throws Exception {
    String pathname = "src/test/resources/html/simple/3.html";
    DOMParser parser = new DOMParser();
    // Close the file handle as soon as parsing is done, even if parsing fails.
    try (FileReader reader = new FileReader(pathname)) {
        parser.parse(new InputSource(reader));
    }
    Document document = parser.getDocument();

    TreeNode node = new TreeNode(document, null);
    node.postOrderIndex(new AtomicInteger(1));
    node.prettyPrint();

    List<TreeNode> nodes = node.postOrderTraverse();
    System.out.println(nodes);
    System.out.println(node.getKeyRoots());
}
示例4: getSource
import org.cyberneko.html.parsers.DOMParser; //导入依赖的package包/类
/**
 * Builds the transformation {@link Source} for an input stream.
 *
 * <p>When the {@code html} flag is set, the stream is parsed with the NekoHTML
 * {@code DOMParser} and returned as a {@link DOMSource}. The parser is configured with
 * the feature/property URIs listed below; the default encoding comes from
 * {@code sourcePath.getCharset()} and element-name handling is {@code "match"} when
 * {@code rawTagNames} is set, {@code "lower"} otherwise.
 *
 * <p>Otherwise the stream is returned as a {@link SAXSource} backed by a reader that
 * has external DTD loading switched off and that resolves every entity to an empty
 * stream. The previous code built this reader but then returned a plain
 * {@code StreamSource}, so none of that configuration was ever applied.
 *
 * @param ctx  unused
 * @param file the stream to read the document from
 * @return a source wrapping the parsed HTML DOM, or a SAX source over {@code file}
 * @throws SAXException                 if a parser feature/property is rejected or parsing fails
 * @throws IOException                  if reading {@code file} fails
 * @throws ParserConfigurationException if the SAX parser cannot be created
 */
@TimeThis(task="read-file", category=TimerCategory.LOAD_RESOURCE)
protected Source getSource(@SuppressWarnings("unused") ProcessingContext<Corpus> ctx, InputStream file) throws SAXException, IOException, ParserConfigurationException {
    if (html) {
        DOMParser parser = new DOMParser();
        parser.setFeature("http://xml.org/sax/features/namespaces", false);
        parser.setFeature("http://cyberneko.org/html/features/scanner/cdata-sections", true);
        parser.setFeature("http://cyberneko.org/html/features/parse-noscript-content", false);
        parser.setProperty("http://cyberneko.org/html/properties/default-encoding", sourcePath.getCharset());
        if (rawTagNames) {
            parser.setProperty("http://cyberneko.org/html/properties/names/elems", "match");
        }
        else {
            parser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
        }
        parser.parse(new InputSource(file));
        Document doc = parser.getDocument();
        return new DOMSource(doc);
    }
    SAXParserFactory spf = SAXParserFactory.newInstance();
    // A transformer parsing a StreamSource itself is namespace-aware; keep that
    // behaviour now that we hand it our own reader.
    // NOTE(review): confirm downstream consumers expect namespace-aware events.
    spf.setNamespaceAware(true);
    spf.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
    org.xml.sax.XMLReader xmlReader = spf.newSAXParser().getXMLReader();
    xmlReader.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
    // Resolve every external entity to an empty stream so nothing is fetched.
    xmlReader.setEntityResolver(new EntityResolver() {
        @Override
        public InputSource resolveEntity(String pid, String sid) throws SAXException {
            return new InputSource(new ByteArrayInputStream(new byte[] {}));
        }
    });
    // Return the source that actually carries the configured reader.
    return new SAXSource(xmlReader, new InputSource(file));
}
示例5: getDoc
import org.cyberneko.html.parsers.DOMParser; //导入依赖的package包/类
/**
 * Parses HTML from a byte stream into a DOM document using the given character encoding.
 *
 * <p>The encoding is used twice: as the parser's default-encoding property and to
 * decode {@code in} into characters before parsing. The namespaces feature is disabled.
 *
 * @param in       the stream holding the HTML bytes
 * @param encoding the charset name used to decode {@code in}
 * @return the parsed document
 * @throws Exception if the encoding is unsupported or reading/parsing fails
 */
public static Document getDoc(InputStream in, String encoding)
        throws Exception {
    final String defaultEncodingProperty = "http://cyberneko.org/html/properties/default-encoding";
    final String namespacesFeature = "http://xml.org/sax/features/namespaces";

    DOMParser htmlParser = new DOMParser();
    htmlParser.setProperty(defaultEncodingProperty, encoding);
    htmlParser.setFeature(namespacesFeature, false);

    InputStreamReader decoder = new InputStreamReader(in, encoding);
    InputSource characters = new InputSource(new BufferedReader(decoder));
    htmlParser.parse(characters);

    return htmlParser.getDocument();
}
示例6: computeDistance
import org.cyberneko.html.parsers.DOMParser; //导入依赖的package包/类
/**
 * Computes the edit distance between two HTML files.
 *
 * <p>Both files are parsed with the same parser instance (reset in between), wrapped
 * in {@code TreeNode}s and compared with a {@code ZSTEDComputer}. Each file reader is
 * closed as soon as its document has been parsed.
 *
 * @param file1 first html file
 * @param file2 second html file
 * @return edit distance measure
 * @throws IOException when a file cannot be opened or read
 * @throws SAXException when the parser fails
 */
public static double computeDistance(File file1, File file2)
        throws IOException, SAXException {
    DOMParser domParser = new DOMParser();

    Document doc1;
    try (FileReader reader = new FileReader(file1)) {
        domParser.parse(new InputSource(reader));
        doc1 = domParser.getDocument();
    }

    domParser.reset();

    Document doc2;
    try (FileReader reader = new FileReader(file2)) {
        domParser.parse(new InputSource(reader));
        doc2 = domParser.getDocument();
    }

    ZSTEDComputer computer = new ZSTEDComputer();
    return computer.computeDistance(new TreeNode(doc1, null), new TreeNode(doc2, null));
}
示例7: VSMVector
import org.cyberneko.html.parsers.DOMParser; //导入依赖的package包/类
/**
 * Builds a vector either from an HTML form or from a plain page file.
 *
 * <p>When {@code isForm} is true, {@code file} is treated as inline markup if it
 * contains {@code "<form "} (case-insensitively) and otherwise as a system id passed
 * straight to the parser; the first {@code form} element is then flattened by
 * {@code parse(...)} and stemmed. When {@code isForm} is false, {@code file} is a path
 * whose content is read line by line, wrapped in a {@code PaginaURL}, and processed by
 * {@code addTitle} and {@code stemPage}.
 *
 * @param file     form markup, a document location, or a file path (see above)
 * @param isForm   whether {@code file} describes a form
 * @param stoplist stop list handed to the page/stemming helpers
 * @throws MalformedURLException declared by the {@code URL} constructor
 * @throws IOException           if reading the input fails
 * @throws SAXException          if the HTML parser fails
 */
public VSMVector(String file, boolean isForm, StopList stoplist) throws MalformedURLException, IOException, SAXException {
    this.stoplist = stoplist;
    this.elems = new HashMap<>();
    if (isForm) {
        DOMParser parser = new DOMParser();
        // Decide whether the string is form content or the name of a document to load.
        if ((file.toLowerCase()).indexOf("<form ") != -1) {
            parser.parse(new InputSource(new BufferedReader(new StringReader(file))));
        } else {
            parser.parse(file);
        }
        Document doc = parser.getDocument();
        NodeList list = doc.getElementsByTagName("form");
        StringBuffer source = new StringBuffer();
        // NOTE(review): item(0) is null when the document has no form element;
        // confirm that parse(...) tolerates a null node.
        parse(list.item(0), source, new StringBuffer(), "html", stoplist);
        String srcForm = source.toString().toLowerCase();
        PaginaURL formPage = new PaginaURL(new URL("http://www"), srcForm, stoplist);
        stemPage(formPage, true);
    } else {
        StringBuilder content = new StringBuilder();
        // try-with-resources so the file handle is released even if a read fails.
        try (BufferedReader input = new BufferedReader(new FileReader(new File(file)))) {
            for (String line = input.readLine(); line != null; line = input.readLine()) {
                content.append(line);
                content.append("\n");
            }
        }
        String src = content.toString();
        PaginaURL page = new PaginaURL(new URL("http://www"), src, stoplist);
        addTitle(page, stoplist);
        stemPage(page, false);
    }
}
示例8: asDocument
import org.cyberneko.html.parsers.DOMParser; //导入依赖的package包/类
/**
 * Transforms a string into a Document object. TODO This needs more optimizations. As it seems
 * the getDocument is called way too many times, causing a lot of parsing which is slow and not
 * necessary.
 *
 * <p>Element names keep their original case ({@code names/elems = "match"}) and the
 * namespaces feature is disabled.
 *
 * @param html
 *            the HTML string.
 * @return The DOM Document version of the HTML string.
 * @throws IOException
 *             if configuring the parser or parsing the HTML string fails for any reason;
 *             the underlying exception (including any {@code SAXException}) is attached
 *             as the cause.
 */
public static Document asDocument(String html) throws IOException {
    DOMParser domParser = new DOMParser();
    try {
        domParser.setProperty("http://cyberneko.org/html/properties/names/elems", "match");
        domParser.setFeature("http://xml.org/sax/features/namespaces", false);
        domParser.parse(new InputSource(new StringReader(html)));
    } catch (Exception e) {
        // Always keep the original failure as the cause instead of printing it
        // and throwing an exception that has lost the stack trace.
        throw new IOException("Error while reading HTML: " + html, e);
    }
    return domParser.getDocument();
}
示例9: parse
import org.cyberneko.html.parsers.DOMParser; //导入依赖的package包/类
/**
 * Parses the stream supplied by {@code getDocumentSource()} into a DOM document.
 *
 * <p>Element names are lower-cased; when {@code charset} is non-null it is installed
 * as the parser's default encoding.
 *
 * @return the parsed document
 * @throws SAXException if the parser rejects a setting or fails to parse
 * @throws IOException  if reading the document source fails
 */
@Override
public Document parse() throws SAXException, IOException
{
    final String elementNamesProperty = "http://cyberneko.org/html/properties/names/elems";
    final String defaultEncodingProperty = "http://cyberneko.org/html/properties/default-encoding";

    DOMParser htmlParser = new DOMParser(new HTMLConfiguration());
    htmlParser.setProperty(elementNamesProperty, "lower");
    if (charset != null)
    {
        htmlParser.setProperty(defaultEncodingProperty, charset);
    }

    org.xml.sax.InputSource input = new org.xml.sax.InputSource(getDocumentSource().getInputStream());
    htmlParser.parse(input);
    return htmlParser.getDocument();
}
示例10: computeDistances
import org.cyberneko.html.parsers.DOMParser; //导入依赖的package包/类
/**
 * Computes the pairwise edit distances and structural similarities between the
 * files in a directory and prints them to standard output.
 *
 * <p>Only regular files directly inside {@code inputDir} are parsed; sub-directories
 * are skipped. The output is an index-to-path table followed by the distance matrix
 * and the similarity matrix.
 *
 * @param inputDir directory of html pages
 * @throws IOException if {@code inputDir} cannot be listed or a file cannot be read
 * @throws SAXException if a file cannot be parsed
 * @throws RuntimeException if fewer than two files were found
 */
private static void computeDistances(File inputDir) throws IOException, SAXException {
    File[] files = inputDir.listFiles();
    if (files == null) {
        // listFiles() returns null when the path is not a directory or cannot be read.
        throw new IOException("Unable to list files in directory " + inputDir);
    }

    List<TreeNode> docs = new ArrayList<>();
    List<String> htmlPaths = new ArrayList<>();
    DOMParser parser = new DOMParser();
    for (File file : files) {
        if (!file.isFile()) {
            //skip
            continue;
        }
        try (FileReader reader = new FileReader(file)) {
            parser.parse(new InputSource(reader));
            htmlPaths.add(file.getAbsolutePath());
            docs.add(new TreeNode(parser.getDocument(), null));
            parser.reset();
        }
    }

    int n = docs.size();
    if (n < 2) {
        throw new RuntimeException("At least 2 html/xml files should be present in the input directory");
    }

    ZSTEDComputer edComputer = new ZSTEDComputer();
    StructureSimComputer simComputer = new StructureSimComputer(edComputer);
    double[][] distMatrix = edComputer.computeDistanceMatrix(docs);

    int[] treeSizes = new int[n];
    for (int i = 0; i < n; i++) {
        treeSizes[i] = docs.get(i).getSize();
    }
    double[][] simMatrix = simComputer.compute(treeSizes, distMatrix);

    System.out.println("#Index\tFile Path");
    for (int i = 0; i < htmlPaths.size(); i++) {
        System.out.println(i + "\t" + htmlPaths.get(i));
    }
    System.out.println("\n#Distance Matrix");
    MatrixUtils.printMatrix(distMatrix);
    System.out.println("\n#Similarity Matrix");
    MatrixUtils.printMatrix(simMatrix);
}
示例11: main
import org.cyberneko.html.parsers.DOMParser; //导入依赖的package包/类
/**
 * Manual test driver: downloads a hard-coded article URL, parses it, runs
 * {@code Readability} over the DOM and prints the article HTML, a {@code ***}
 * separator, every entry of the article text mapping, and the article images.
 *
 * @param argv ignored
 * @throws Exception if the page cannot be fetched or parsed
 */
public static void main(String[] argv) throws Exception {
    // Alternative inputs that have been used with this driver:
    // URL input = new URL("file:///home/dd/Programming/Readability4J/t.html");
    // URL input = new URL("http://news.bbc.co.uk/1/hi/politics/10362367.stm");
    // URL input = new URL("http://euobserver.com/9/30465");
    // URL input = new URL("http://euobserver.com/?aid=23383");
    // URL input = new URL("http://abandoninplace.squarespace.com/blog/2010/6/8/wwdc-monday.html");
    // URL input = new URL("file:///Users/jsh2/Desktop/test.html");
    // URL input = new URL("http://mobile.engadget.com/2010/06/17/htc-aria-review/");
    // URL input = new URL("http://thedailywtf.com/Articles/Benched.aspx");
    // URL input = new URL("http://www.dailymail.co.uk/news/article-1287625/Woman-sparked-150-000-manhunt-slashing-face-crying-rape-faces-jail.html");
    // URL input = new URL("http://mrpaparazzi.com/post/11619/Lindsay-Lohan-Tests-Negative-For-Alcohol-Goes-Clubbing-To-Celebrate.aspx");
    // URL input = new URL("http://www.bbc.co.uk/news/world-middle-east-11415719");
    // URL input = new URL("http://www.thebigproject.co.uk/news/");
    // URL input = new URL("http://blogs.euobserver.com/popescu/2009/12/15/on-euro-optimism-pessimism-and-failures/#more-958");
    // URL input = new URL("http://www.cnn.com/2010/WORLD/meast/09/27/west.bank.settlement.construction/index.html?hpt=T2");
    // URL input = new URL("http://www.huffingtonpost.com/steven-cohen/its-time-to-enact-congest_b_740315.html");
    // URL input = new URL("http://uk.mac.ign.com/articles/573/573319p1.html");
    final URL input = new URL("http://blog.confluent.io/2015/01/29/making-sense-of-stream-processing/");

    final DOMParser parser = new DOMParser();
    // Close the network stream once the page has been parsed, even on failure.
    // The type is fully qualified so that no additional import is required.
    try (java.io.InputStream pageStream = input.openStream()) {
        parser.parse(new InputSource(pageStream));
    }

    final Readability r = new Readability(parser.getDocument(), true, true);
    // System.out.println(r.getArticleTitle());
    System.out.println(r.getArticleHTML());
    // System.out.println(r.getAllLinks());
    // System.out.println(r.getArticleText());
    System.out.println();
    System.out.println("***");
    System.out.println();
    for (final MappingNode s : r.getArticleTextMapping()) {
        System.out.println(s);
    }
    // PrintStream out = new PrintStream("news-sites");
    // for (Anchor anchor : r.getAllLinks()) {
    // out.println(anchor.getHref() + "\t" + anchor.getText());
    // }
    // out.close();
    System.out.println(r.getArticleImages());
    // System.out.println(r.getArticleSubheadings());
    // System.out.println(r.getArticleHTML());
    // System.out.println(r.getArticleHTML_DOM());
    // System.out.println(r.getArticleDateString());
    // System.out.println(r.getArticleDate());
}
示例12: evaluate
import org.cyberneko.html.parsers.DOMParser; //导入依赖的package包/类
/**
 * Evaluates the {@code context} and {@code location} phrases to strings, resolves the
 * location against the context URL, parses the referenced document, converts it with
 * {@code DOMReader} and returns its root element.
 *
 * @param req the task request passed to both phrases
 * @param res the task response passed to both phrases
 * @return the root element of the converted document
 * @throws RuntimeException wrapping any {@code Throwable} raised while building the
 *         URLs, parsing or converting; the message lists the context and location
 */
public Object evaluate(TaskRequest req, TaskResponse res) {
    final String contextSpec = (String) context.evaluate(req, res);
    final String locationSpec = (String) location.evaluate(req, res);

    try {
        URL base = new URL(contextSpec);
        URL target = new URL(base, locationSpec);

        DOMParser htmlParser = new DOMParser();
        htmlParser.parse(target.toString());

        Document converted = new DOMReader().read(htmlParser.getDocument());
        return converted.getRootElement();
    } catch (Throwable t) {
        String msg = "Unable to read the specified document:"
                + "\n\tCONTEXT=" + contextSpec
                + "\n\tLOCATION=" + locationSpec;
        throw new RuntimeException(msg, t);
    }
}
示例13: getReadability
import org.cyberneko.html.parsers.DOMParser; //导入依赖的package包/类
/**
 * Factory shortcut: parses an HTML string and wraps the resulting DOM in a
 * {@link Readability} instance.
 *
 * @param html
 *            the HTML markup to parse
 * @param addTitle
 *            forwarded as the last constructor argument of {@link Readability};
 *            whether the title should be added to the generated article
 * @return new {@link Readability} instance.
 * @throws SAXException
 *             if the markup cannot be parsed
 * @throws IOException
 *             if reading from the string fails
 */
public static Readability getReadability(String html, boolean addTitle) throws SAXException, IOException {
    final DOMParser htmlParser = new DOMParser();
    final InputSource markup = new InputSource(new StringReader(html));
    htmlParser.parse(markup);

    final Readability readability = new Readability(htmlParser.getDocument(), false, addTitle);
    return readability;
}
示例14: LSMEnglishHymnalHTMLParser
import org.cyberneko.html.parsers.DOMParser; //导入依赖的package包/类
public LSMEnglishHymnalHTMLParser() throws Exception {
parser = new DOMParser();
//parser.setProperty("http://cyberneko.org/html/properties/default-encoding","" );
hymns = new ArrayList<Hymn>(2000);
hymnal = new Hymnal(new AlphanumComparator());
hymnal.id = PUB_PREFIX + "English";
hymnal.title = "Hymns";
}
示例15: getDocumentNoBalance
import org.cyberneko.html.parsers.DOMParser; //导入依赖的package包/类
/**
 * Parses an HTML string into a DOM document with the parser's tag balancing
 * feature switched off and element names kept in their original case.
 *
 * @param html
 *            the HTML string.
 * @return a Document object made from the HTML string.
 * @throws SAXException
 *             if a parser setting is rejected or the HTML string cannot be parsed.
 * @throws IOException
 *             if an IO failure occurs.
 */
public static Document getDocumentNoBalance(String html) throws SAXException, IOException {
    final String elementNamesProperty = "http://cyberneko.org/html/properties/names/elems";
    final String balanceTagsFeature = "http://cyberneko.org/html/features/balance-tags";

    DOMParser unbalancedParser = new DOMParser();
    unbalancedParser.setProperty(elementNamesProperty, "match");
    unbalancedParser.setFeature(balanceTagsFeature, false);

    InputSource markup = new InputSource(new StringReader(html));
    unbalancedParser.parse(markup);
    return unbalancedParser.getDocument();
}