当前位置: 首页>>代码示例>>Java>>正文


Java OutputDocument类代码示例

本文整理汇总了Java中net.htmlparser.jericho.OutputDocument的典型用法代码示例。如果您正苦于以下问题:Java OutputDocument类的具体用法?Java OutputDocument怎么用?Java OutputDocument使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。


OutputDocument类属于net.htmlparser.jericho包,在下文中一共展示了OutputDocument类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: processPage

import net.htmlparser.jericho.OutputDocument; //导入依赖的package包/类
/**
 * Reads the page at {@code pagePath}, passes it through {@code modifyDocument} and returns the
 * resulting markup.
 *
 * @param baseDir origin used both to obtain the reader for the page and by {@code modifyDocument}
 * @param pagePath path of the page, handed to {@code getFileInputStream}
 * @return the string form of the output document after {@code modifyDocument} has run
 * @throws IOException declared for the stream access performed while loading the page
 */
protected String processPage( PathOrigin baseDir, String pagePath ) throws IOException {
  final long startedAt = System.currentTimeMillis();
  InputStream pageStream = null;
  try {
    pageStream = baseDir.getReader( getRepo() ).getFileInputStream( pagePath );
    final Source parsed = new Source( pageStream );
    final OutputDocument rewritten = new OutputDocument( parsed );
    // apply the document transformations onto the output document
    modifyDocument( parsed, baseDir, rewritten );
    return rewritten.toString();
  } finally {
    // the stream is closed whether or not the transformation succeeded
    IOUtils.closeQuietly( pageStream );
    if ( log.isDebugEnabled() ) {
      final long elapsed = System.currentTimeMillis() - startedAt;
      log.debug( String.format( "processPage for %s took %dms", pagePath, elapsed ) );
    }
  }
}
 
开发者ID:webdetails,项目名称:cte,代码行数:20,代码来源:ProcessedHtmlPage.java

示例2: replaceUrlAttribute

import net.htmlparser.jericho.OutputDocument; //导入依赖的package包/类
/**
 * Rewrites one attribute on each of the given start tags.
 * <p>
 * For every tag whose {@code pathAttribute} value is accepted by {@code shouldProcessPath}, the
 * value is replaced in {@code doc} with the result of {@code processPath}.
 *
 * @param tags start tags to inspect
 * @param pathAttribute name of the attribute holding the path (for example {@code href})
 * @param baseDir origin passed on to {@code processPath}
 * @param doc output document that receives the attribute replacements
 * @return the number of tags whose attribute was rewritten
 */
protected int replaceUrlAttribute ( Iterable<StartTag> tags, final String pathAttribute, PathOrigin baseDir, OutputDocument doc ) {
  int count = 0;
  for ( StartTag tag : tags ) {
    Attributes attr = tag.parseAttributes();
    if ( attr == null ) {
      // attributes could not be parsed for this tag; there is nothing to rewrite
      continue;
    }
    String path = attr.getValue( pathAttribute );
    if ( shouldProcessPath( path ) ) {
      String newPath = processPath( baseDir, path, getUrlProvider() );
      if ( log.isTraceEnabled() ) {
        // four %s placeholders matching the four arguments: tag@attribute "old" --> "new"
        log.trace( String.format( "replaced: in %s@%s \"%s\" --> \"%s\"", tag.getName(), pathAttribute, path, newPath ) );
      }
      doc.replace( attr, true ).put( pathAttribute, newPath );
      count++;
    }
  }
  return count;
}
 
开发者ID:webdetails,项目名称:cte,代码行数:17,代码来源:ProcessedHtmlPage.java

示例3: strip

import net.htmlparser.jericho.OutputDocument; //导入依赖的package包/类
/**
	 * Removes unwanted tags from the given markup.
	 * <p>
	 * Every tag accepted by {@code processTag} is flagged with {@code VALID_MARKER}; every other
	 * tag is removed from the output. Text between tags is not re-encoded by this variant.
	 *
	 * @param html markup to clean; may be {@code null}
	 * @return the cleaned markup, or an empty string when {@code html} is {@code null}
	 */
	public String strip(String html) {
		if (html == null) {
			return "";
		}

		final Source parsed = new Source(html);
		parsed.fullSequentialParse();
		final OutputDocument cleaned = new OutputDocument(parsed);

		for (Tag candidate : parsed.getAllTags()) {
			final boolean accepted = processTag(candidate, cleaned);
			if (!accepted) {
				cleaned.remove(candidate);
				continue;
			}
			candidate.setUserData(VALID_MARKER);
		}
		return cleaned.toString();
	}
 
开发者ID:camaradosdeputadosoficial,项目名称:edemocracia,代码行数:28,代码来源:HtmlStripperDiscussion.java

示例4: strip

import net.htmlparser.jericho.OutputDocument; //导入依赖的package包/类
/**
 * Removes unwanted tags from the given markup and re-encodes the text between tags.
 * <p>
 * Tags accepted by {@code processTag} are flagged with {@code VALID_MARKER}; all other tags are
 * removed. The text preceding each tag, and the text after the last tag, is passed to
 * {@code reencodeTextSegment}.
 *
 * @param html markup to clean; may be {@code null}
 * @return the cleaned markup, or an empty string when {@code html} is {@code null}
 */
public String strip(String html) {
	if (html == null) {
		return "";
	}

	final Source parsed = new Source(html);
	parsed.fullSequentialParse();
	final OutputDocument cleaned = new OutputDocument(parsed);

	// start offset of the text run that has not been re-encoded yet
	int textStart = 0;
	for (Tag current : parsed.getAllTags()) {
		final boolean keep = processTag(current, cleaned);
		if (!keep) {
			cleaned.remove(current);
		} else {
			current.setUserData(VALID_MARKER);
		}
		reencodeTextSegment(parsed, cleaned, textStart, current.getBegin());
		textStart = current.getEnd();
	}
	// trailing text after the last tag
	reencodeTextSegment(parsed, cleaned, textStart, parsed.getEnd());
	return cleaned.toString();
}
 
开发者ID:camaradosdeputadosoficial,项目名称:edemocracia,代码行数:27,代码来源:HtmlStripper.java

示例5: printHTMLPage

import net.htmlparser.jericho.OutputDocument; //导入依赖的package包/类
/**
 * Writes the content of the document's {@code body} element to {@code this.out} as UTF-8.
 * <p>
 * The text of all {@code style} elements is collected and handed to {@code Style.getStyles};
 * the returned replacements are then passed, together with every start tag following the
 * {@code body} start tag, to {@code replaceAttrs}. Nothing is written when the document has no
 * {@code body} start tag.
 *
 * @param source parsed document to print
 * @throws UnsupportedEncodingException declared for the UTF-8 writer construction
 * @throws IOException if writing to the underlying stream fails
 */
public void printHTMLPage(Source source)
        throws UnsupportedEncodingException, IOException {
    // Collect the text of every <style> element. getContent() also covers elements
    // whose end tag is missing, where getEndTag() would be null.
    final StringBuilder css = new StringBuilder();
    for (StartTag styleTag : source.getAllStartTags(HTMLElementName.STYLE)) {
        css.append(styleTag.getElement().getContent().toString());
    }
    Vector<ReplaceRight> rights = Style.getStyles(css.toString(), styles, counter);

    // Generate the page with the updated styles.

    // Advance the iterator to the <body> start tag; the tags left in the iterator
    // afterwards are exactly those that follow it.
    final Iterator<StartTag> tags = source.getAllStartTags().iterator();
    StartTag body = null;
    while (tags.hasNext()) {
        final StartTag candidate = tags.next();
        if (HTMLElementName.BODY.equals(candidate.getName())) {
            body = candidate;
            break;
        }
    }
    if (body == null) {
        return;
    }

    final OutputDocument document = new OutputDocument(source);
    while (tags.hasNext()) {
        replaceAttrs(tags.next(), document, rights);
    }

    // </body> is optional in HTML, so use the element content bounds rather than
    // dereferencing a possibly absent end tag.
    final Segment bodyContent = body.getElement().getContent();
    OutputStreamWriter writer = new OutputStreamWriter(this.out, "UTF-8");
    document.writeTo(writer, bodyContent.getBegin(), bodyContent.getEnd());
    writer.flush();
}
 
开发者ID:Vitaliy-Yakovchuk,项目名称:ramus,代码行数:41,代码来源:Out.java

示例6: modifyDocument

import net.htmlparser.jericho.OutputDocument; //导入依赖的package包/类
/**
 * Rewrites the URL-carrying attributes of the document via {@code replaceUrlAttribute}:
 * {@code href} on {@code link} tags, then {@code src} on {@code script} tags, then {@code src}
 * on {@code img} tags.
 *
 * @param html the document
 * @param baseDir html location
 * @param out processed document
 */
protected void modifyDocument( Source html, PathOrigin baseDir, OutputDocument out ) {
  // { element name, attribute name } pairs, processed in this order
  final String[][] rewrites = {
    { HTMLElementName.LINK, "href" },
    { HTMLElementName.SCRIPT, "src" },
    { HTMLElementName.IMG, "src" },
  };
  for ( String[] rewrite : rewrites ) {
    replaceUrlAttribute( html.getAllStartTags( rewrite[0] ), rewrite[1], baseDir, out );
  }
  // Disabled: insertion of a backend-assignments snippet before the end of <head>.
  //int insertPos = html.getFirstElement( HTMLElementName.HEAD ).getEndTag().getBegin();
  //out.insert( insertPos, getCodeSnippet( getBackendAssignments( getUrlProvider() ) ) );
}
 
开发者ID:webdetails,项目名称:cte,代码行数:14,代码来源:ProcessedHtmlPage.java

示例7: sanitise

import net.htmlparser.jericho.OutputDocument; //导入依赖的package包/类
/**
 * Sanitises a fragment of (possibly malformed) HTML.
 * <p>
 * Each tag is passed to {@code processTag}. Accepted tags are flagged with
 * {@code VALID_MARKER}. Rejected tags are left untouched when {@code stripInvalidElements} is
 * {@code false}; otherwise they are removed, and for a {@code style} start tag the text
 * following it up to the next tag is removed as well.
 *
 * @param pseudoHTML markup to sanitise
 * @param formatWhiteSpace not used by this implementation (text segments are not re-encoded here)
 * @param stripInvalidElements whether rejected tags are removed from the output
 * @return the sanitised markup
 */
private static String sanitise(String pseudoHTML, boolean formatWhiteSpace, boolean stripInvalidElements) {
	Source source=new Source(pseudoHTML);
	source.fullSequentialParse();
	OutputDocument outputDocument=new OutputDocument(source);
	List<Tag> tags=source.getAllTags();
	for (Tag tag : tags) {
		if (processTag(tag,outputDocument)) {
			tag.setUserData(VALID_MARKER);
			continue;
		}
		if (!stripInvalidElements) continue; // rejected tag stays in the output as is
		// Only the style START tag takes its trailing text with it. getName() is also "style"
		// for </style>, and treating the end tag the same way would delete the ordinary text
		// that follows the element.
		boolean styleStartTag=tag.getName().equalsIgnoreCase("style")
				&& tag.getElement()!=null
				&& tag.getElement().getStartTag()==tag;
		if (styleStartTag) {
			Tag nextTag=tag.getNextTag();
			// The end offset of a removal is exclusive, so the next tag's begin (not begin-1)
			// is the correct bound; otherwise the last character before it would survive.
			int endPos=nextTag!=null ? nextTag.getBegin() : source.getEnd();
			outputDocument.remove(tag.getBegin(),endPos);
		} else {
			outputDocument.remove(tag);
		}
	}
	return outputDocument.toString();
}
 
开发者ID:trackplus,项目名称:Genji,代码行数:32,代码来源:HTMLSanitiser.java

示例8: processTag

import net.htmlparser.jericho.OutputDocument; //导入依赖的package包/类
/**
 * Decides whether a tag is kept and, if so, registers its replacement in the output document.
 * <p>
 * A tag is rejected when its name is not in {@code VALID_ELEMENT_NAMES}, when it is neither a
 * normal start tag nor a normal end tag, when one of the table/list validity helpers rejects
 * it, when a start tag's required end tag is missing, or when an end tag has no element.
 * Accepted start tags are replaced with the output of {@code getStartTagHTML}, accepted end
 * tags with the output of {@code getEndTagHTML}; a missing optional end tag is inserted at the
 * end of the element.
 *
 * @param tag tag to examine
 * @param outputDocument output document receiving replacements and insertions
 * @return {@code true} if the tag was accepted, {@code false} if it was rejected
 */
private static boolean processTag(Tag tag, OutputDocument outputDocument) {
	String elementName=tag.getName();
	if (!VALID_ELEMENT_NAMES.contains(elementName)) return false;
	// Element names are compared with equals() so that correctness does not depend on the
	// name strings being the very same instances as the HTMLElementName constants.
	if (tag.getTagType()==StartTagType.NORMAL) {
		Element element=tag.getElement();
		if (HTMLElementName.THEAD.equals(elementName) && !isValidTbodyTHeadTag(tag)) return false;
		if (HTMLElementName.TBODY.equals(elementName) && !isValidTbodyTHeadTag(tag)) return false;
		if (HTMLElementName.TR.equals(elementName) && !isValidTRTag(tag)) return false;
		if (HTMLElementName.TD.equals(elementName) && !isValidTDTHTag(tag)) return false;
		if (HTMLElementName.TH.equals(elementName) && !isValidTDTHTag(tag)) return false;
		if (HTMLElements.getEndTagRequiredElementNames().contains(elementName)) {
			if (element.getEndTag()==null) return false; // reject start tag if its required end tag is missing
		} else if (HTMLElements.getEndTagOptionalElementNames().contains(elementName)) {
			if (HTMLElementName.LI.equals(elementName) && !isValidLITag(tag)) return false; // reject invalid LI tags
			if (element.getEndTag()==null) outputDocument.insert(element.getEnd(),getEndTagHTML(elementName)); // insert optional end tag if it is missing
		}
		outputDocument.replace(tag,getStartTagHTML(element.getStartTag()));
	} else if (tag.getTagType()==EndTagType.NORMAL) {
		if (tag.getElement()==null) return false; // reject end tags that aren't associated with a start tag
		if (HTMLElementName.THEAD.equals(elementName) && !isValidTbodyTHeadTag(tag)) return false;
		if (HTMLElementName.TBODY.equals(elementName) && !isValidTbodyTHeadTag(tag)) return false;
		if (HTMLElementName.TR.equals(elementName) && !isValidTRTag(tag)) return false;
		if (HTMLElementName.TD.equals(elementName) && !isValidTDTHTag(tag)) return false;
		if (HTMLElementName.TH.equals(elementName) && !isValidTDTHTag(tag)) return false;
		if (HTMLElementName.LI.equals(elementName) && !isValidLITag(tag)) return false; // reject invalid LI tags
		outputDocument.replace(tag,getEndTagHTML(elementName));
	} else {
		return false; // reject abnormal tags
	}
	return true;
}
 
开发者ID:trackplus,项目名称:Genji,代码行数:35,代码来源:HTMLSanitiser.java

示例9: reencodeTextSegment

import net.htmlparser.jericho.OutputDocument; //导入依赖的package包/类
/**
 * Decodes the character references in the source range {@code [begin, end)} and registers the
 * re-encoded text as its replacement in the output document. Does nothing for an empty or
 * inverted range.
 *
 * @param source document the range refers to
 * @param outputDocument output document receiving the replacement
 * @param begin start offset of the text range
 * @param end end offset of the text range
 * @param formatWhiteSpace selects {@code encodeWithWhiteSpaceFormatting} instead of {@code encode}
 */
private static void reencodeTextSegment(Source source, OutputDocument outputDocument, int begin, int end, boolean formatWhiteSpace) {
	if (end<=begin) return;
	final Segment span=new Segment(source,begin,end);
	final String plain=CharacterReference.decode(span);
	final String escaped;
	if (formatWhiteSpace) {
		escaped=CharacterReference.encodeWithWhiteSpaceFormatting(plain);
	} else {
		escaped=CharacterReference.encode(plain);
	}
	outputDocument.replace(span,escaped);
}
 
开发者ID:trackplus,项目名称:Genji,代码行数:8,代码来源:HTMLSanitiser.java

示例10: processTag

import net.htmlparser.jericho.OutputDocument; //导入依赖的package包/类
/**
 * Decides whether a tag is kept and, if so, registers its replacement in the output document.
 * <p>
 * A tag is rejected when its lower-cased name is not in {@code allowedTags}, when it is
 * neither a normal start tag nor a normal end tag, when a start tag's required end tag is
 * missing, when {@code isValidLITag} rejects an {@code li} tag, or when an end tag has no
 * element. Accepted start tags are replaced with the output of {@code getStartTagHTML},
 * accepted end tags with the output of {@code getEndTagHTML}.
 *
 * @param tag tag to examine
 * @param output output document receiving replacements and insertions
 * @return {@code true} if the tag was accepted, {@code false} if it was rejected
 */
private boolean processTag(Tag tag, OutputDocument output) {
	String elementName = tag.getName().toLowerCase();
	if (!allowedTags.contains(elementName))
		return false;
	if (tag.getTagType() == StartTagType.NORMAL) {
		Element element = tag.getElement();
		if (HTMLElements.getEndTagRequiredElementNames().contains(elementName)) {
			// reject start tag if its required end tag is missing
			if (element.getEndTag() == null)
				return false;
		} else if (HTMLElements.getEndTagOptionalElementNames().contains(elementName)) {
			// equals(), not ==: elementName comes out of toLowerCase() and is not
			// guaranteed to be the same instance as the constant
			if (HTMLElementName.LI.equals(elementName) && !isValidLITag(tag))
				return false; // reject invalid LI tags
			// insert optional end tag if it is missing
			if (element.getEndTag() == null)
				output.insert(element.getEnd(), getEndTagHTML(elementName));
		}
		output.replace(tag, getStartTagHTML(element.getStartTag()));
	} else if (tag.getTagType() == EndTagType.NORMAL) {
		// reject end tags that aren't associated with a start tag
		if (tag.getElement() == null)
			return false;
		if (HTMLElementName.LI.equals(elementName) && !isValidLITag(tag))
			return false; // reject invalid LI tags
		output.replace(tag, getEndTagHTML(elementName));
	} else {
		return false; // reject abnormal tags
	}
	return true;
}
 
开发者ID:camaradosdeputadosoficial,项目名称:edemocracia,代码行数:39,代码来源:HtmlStripperDiscussion.java

示例11: reencodeTextSegment

import net.htmlparser.jericho.OutputDocument; //导入依赖的package包/类
/**
 * Decodes the character references in the source range {@code [begin, end)} and registers the
 * re-encoded text as its replacement in the output document. Does nothing for an empty or
 * inverted range.
 *
 * @param source document the range refers to
 * @param output output document receiving the replacement
 * @param begin start offset of the text range
 * @param end end offset of the text range
 */
private void reencodeTextSegment(Source source, OutputDocument output, int begin, int end) {
	final boolean emptyRange = begin >= end;
	if (emptyRange) {
		return;
	}
	final Segment rawText = new Segment(source, begin, end);
	final String unescaped = CharacterReference.decode(rawText);
	output.replace(rawText, CharacterReference.encode(unescaped));
}
 
开发者ID:camaradosdeputadosoficial,项目名称:edemocracia,代码行数:9,代码来源:HtmlStripper.java

示例12: processTag

import net.htmlparser.jericho.OutputDocument; //导入依赖的package包/类
/**
 * Validates a single tag and registers the matching edit in the output document.
 * <p>
 * Returns {@code false} without replacing the tag when: the lower-cased tag name is not in
 * {@code allowedTags}; the tag is not a normal start or end tag; a start tag lacks its
 * required end tag; an {@code li} tag fails {@code isValidLITag}; or an end tag has no
 * element. Otherwise the tag is replaced with the output of {@code getStartTagHTML} or
 * {@code getEndTagHTML}, and a missing optional end tag is inserted at the element's end.
 *
 * @param tag tag to examine
 * @param output output document receiving replacements and insertions
 * @return {@code true} if the tag was accepted, {@code false} if it was rejected
 */
private boolean processTag(Tag tag, OutputDocument output) {
	String elementName = tag.getName().toLowerCase();
	if (!allowedTags.contains(elementName))
		return false;
	// String identity (==) only holds while the lower-cased name happens to be the same
	// instance as the HTMLElementName constant; equals() expresses the intended comparison.
	final boolean listItem = HTMLElementName.LI.equals(elementName);
	if (tag.getTagType() == StartTagType.NORMAL) {
		Element element = tag.getElement();
		if (HTMLElements.getEndTagRequiredElementNames().contains(elementName)) {
			if (element.getEndTag() == null)
				return false; // reject start tag if its required end tag is missing
		} else if (HTMLElements.getEndTagOptionalElementNames().contains(elementName)) {
			if (listItem && !isValidLITag(tag))
				return false; // reject invalid LI tags
			if (element.getEndTag() == null)
				// insert optional end tag if it is missing
				output.insert(element.getEnd(), getEndTagHTML(elementName));
		}
		output.replace(tag, getStartTagHTML(element.getStartTag()));
	} else if (tag.getTagType() == EndTagType.NORMAL) {
		if (tag.getElement() == null)
			return false; // reject end tags that aren't associated with a start tag
		if (listItem && !isValidLITag(tag))
			return false; // reject invalid LI tags
		output.replace(tag, getEndTagHTML(elementName));
	} else {
		return false; // reject abnormal tags
	}
	return true;
}
 
开发者ID:camaradosdeputadosoficial,项目名称:edemocracia,代码行数:31,代码来源:HtmlStripper.java

示例13: reencodeTextSegment

import net.htmlparser.jericho.OutputDocument; //导入依赖的package包/类
/**
 * Replaces the source text in {@code [begin, end)} with the same text after a character
 * reference decode followed by an encode. An empty or inverted range is ignored.
 *
 * @param source document the range refers to
 * @param output output document receiving the replacement
 * @param begin start offset of the text range
 * @param end end offset of the text range
 */
private void reencodeTextSegment(Source source, OutputDocument output,
		int begin, int end) {
	if (begin < end) {
		final Segment span = new Segment(source, begin, end);
		final String decoded = CharacterReference.decode(span);
		output.replace(span, CharacterReference.encode(decoded));
	}
}
 
开发者ID:camaradosdeputadosoficial,项目名称:edemocracia,代码行数:10,代码来源:HtmlStripper.java

示例14: realWriteWithHTMLUpdate

import net.htmlparser.jericho.OutputDocument; //导入依赖的package包/类
/**
 * Writes the buffered page to {@code outputStream}, injecting the collected styles.
 * <p>
 * The buffered bytes are decoded as UTF-8 and parsed. If the page has no {@code html} start
 * tag, {@code realWrite()} is called instead and nothing else happens. Otherwise the string
 * forms of {@code this.styles} are inserted:
 * <ul>
 * <li>at the end of the first {@code style} element's content, if one exists;</li>
 * <li>else wrapped in a new {@code style} element at the end of the first {@code head}
 * element's content;</li>
 * <li>else wrapped in new {@code head} and {@code style} elements directly after the first
 * {@code html} start tag.</li>
 * </ul>
 *
 * @throws IOException if writing to the underlying stream fails
 */
public void realWriteWithHTMLUpdate() throws IOException {
    flush();
    ByteArrayOutputStream buffered = (ByteArrayOutputStream) this.out;
    Source source = new Source(new String(buffered.toByteArray(), "UTF-8"));
    source.fullSequentialParse();
    List<StartTag> htmlTags = source.getAllStartTags("html");
    if (htmlTags.isEmpty()) {
        realWrite();
        return;
    }

    this.out = outputStream;

    OutputStreamWriter writer = new OutputStreamWriter(this.out, "UTF-8");
    OutputDocument document = new OutputDocument(source);

    StringBuilder style = new StringBuilder();
    for (Style style2 : this.styles) {
        style.append(style2.toString());
    }

    // getContent().getEnd() is the begin of the end tag when one exists and the end of the
    // element otherwise, so a missing </style> or </head> no longer causes a null dereference.
    List<StartTag> styleTags = source.getAllStartTags("style");
    if (!styleTags.isEmpty()) {
        document.insert(styleTags.get(0).getElement().getContent().getEnd(), style);
    } else {
        style.insert(0, "\n<style>\n");
        style.append("</style>\n");

        List<StartTag> headTags = source.getAllStartTags("head");
        if (!headTags.isEmpty()) {
            document.insert(headTags.get(0).getElement().getContent().getEnd(), style);
        } else {
            // No <head> at all: the list is empty here, so create the head element and place
            // it right after the <html> start tag (known to exist from the check above).
            style.insert(0, "\n<head>\n");
            style.append("</head>\n");
            document.insert(htmlTags.get(0).getEnd(), style);
        }
    }
    document.writeTo(writer);
    writer.flush();
}
 
开发者ID:Vitaliy-Yakovchuk,项目名称:ramus,代码行数:46,代码来源:Out.java

示例15: removeNotAllowedTags

import net.htmlparser.jericho.OutputDocument; //导入依赖的package包/类
/**
    * Cleans an HTML fragment: strips attributes, absolutises anchor links and removes
    * disallowed tags.
    * <p>
    * For every element, all attributes are cleared unless the element is an anchor
    * ({@code a}). For anchors, an {@code href} that does not contain {@code "http"} is resolved
    * against {@code docUri}; if it is not a valid URI the whole element is removed. Elements
    * whose name is in {@code NOT_ALLOWED_HTML_TAGS} lose their start and end tags, and for
    * {@code script}, {@code style} and {@code form} their content as well. Finally all newline
    * and tab characters are deleted from the result.
    *
    * @param htmlFragment markup to clean
    * @param docUri URI of the document, used to resolve anchor links
    * @return the cleaned markup without newline or tab characters
    */
   private String removeNotAllowedTags(String htmlFragment, URI docUri) {
       Source source = new Source(htmlFragment);
       OutputDocument outputDocument = new OutputDocument(source);
       List<Element> elements = source.getAllElements();

       for (Element element : elements) {
           String name = element.getName();

           // Elements such as comments carry no attributes; skip attribute handling for them.
           Attributes attrs = element.getAttributes();
           if (attrs != null) {
               Map<String, String> attrsUpdate = outputDocument.replace(attrs, true);
               // Exact match on the anchor element. A substring test would also treat
               // "table", "span", "area", ... as anchors and keep their attributes.
               if (!"a".equals(name)) {
                   attrsUpdate.clear();
               } else {
                   String link = attrsUpdate.get("href");
                   if (link != null && !link.contains("http")) {
                       try {
                           URI result = docUri.resolve(new URI(link));
                           attrsUpdate.put("href", result.toString());
                       } catch (URISyntaxException e) {
                           // an anchor whose link cannot be parsed is dropped entirely
                           outputDocument.remove(element);
                       }
                   }
               }
           }

           if (NOT_ALLOWED_HTML_TAGS.contains(name)) {
               if ("script".equals(name) || "style".equals(name) || "form".equals(name)) {
                   outputDocument.remove(element.getContent());
               }
               outputDocument.remove(element.getStartTag());

               // The end tag can be absent (for example an unclosed element), in which
               // case there is nothing further to remove.
               if (!element.getStartTag().isSyntacticalEmptyElementTag()
                       && element.getEndTag() != null) {
                   outputDocument.remove(element.getEndTag());
               }
           }
       }

       String out = outputDocument.toString();
       out = out.replace("\n", "");
       out = out.replace("\t", "");

       return out;
   }
 
开发者ID:BartoszJarocki,项目名称:android-boilerpipe,代码行数:58,代码来源:HtmlArticleExtractor.java


注:本文中的net.htmlparser.jericho.OutputDocument类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。