

Java UriToDocumentTextAnnotator Class Code Examples

This article collects typical usage examples of the Java class org.cleartk.util.ae.UriToDocumentTextAnnotator. If you are wondering what the UriToDocumentTextAnnotator class does, how to use it, or what real code that uses it looks like, the selected examples below should help.


The UriToDocumentTextAnnotator class belongs to the org.cleartk.util.ae package. A total of 15 code examples of the class are shown below, sorted by popularity by default.
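
Before looking at the individual examples, the snippet below is a minimal sketch (not taken from any of the projects listed here) of the pattern they all share: UriCollectionReader produces one CAS per file that contains only the file's URI, UriToDocumentTextAnnotator then loads that URI's content into the document text, and downstream annotators work on the loaded text. The package paths for UriCollectionReader, SentenceAnnotator, and TokenAnnotator are assumptions based on ClearTK 2.x and may need adjusting for your version.

import java.io.File;

import org.apache.uima.collection.CollectionReader;
import org.apache.uima.fit.pipeline.SimplePipeline;
import org.cleartk.opennlp.tools.SentenceAnnotator; // package path assumed (ClearTK 2.x)
import org.cleartk.token.tokenizer.TokenAnnotator;  // package path assumed (ClearTK 2.x)
import org.cleartk.util.ae.UriToDocumentTextAnnotator;
import org.cleartk.util.cr.UriCollectionReader;     // package path assumed (ClearTK 2.x)

public class MinimalUriPipeline {

  public static void main(String[] args) throws Exception {
    // One CAS per file in the input directory, each holding only the file's URI
    CollectionReader reader =
        UriCollectionReader.getCollectionReaderFromDirectory(new File(args[0]));

    SimplePipeline.runPipeline(
        reader,
        UriToDocumentTextAnnotator.getDescription(), // loads each URI's content as the document text
        SentenceAnnotator.getDescription(),          // downstream annotators now see that text
        TokenAnnotator.getDescription());
  }
}

This is the skeleton that examples 1 through 15 extend with task-specific annotators and writers.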

Example 1: train

import org.cleartk.util.ae.UriToDocumentTextAnnotator; // import the required package/class
@Override
public void train(CollectionReader collectionReader, File outputDirectory) throws Exception {
  AggregateBuilder builder = new AggregateBuilder();
  builder.add(UriToDocumentTextAnnotator.getDescription());
  builder.add(SentenceAnnotator.getDescription());
  builder.add(TokenAnnotator.getDescription());
  builder.add(PosTaggerAnnotator.getDescription());
  builder.add(DefaultSnowballStemmer.getDescription("English"));
  builder.add(AnalysisEngineFactory.createEngineDescription(GoldQuestionCategoryAnnotator.class));
  AnalysisEngineDescription documentClassificationAnnotator = AnalysisEngineFactory.createEngineDescription(
      QuestionCategoryAnnotator.class, CleartkAnnotator.PARAM_IS_TRAINING, true,
      DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY, outputDirectory,
      DefaultDataWriterFactory.PARAM_DATA_WRITER_CLASS_NAME, LibSvmStringOutcomeDataWriter.class.getName());
  builder.add(documentClassificationAnnotator);
  SimplePipeline.runPipeline(collectionReader, builder.createAggregateDescription());
  System.err.println("Train model and write model.jar file.");
  HideOutput hider = new HideOutput();
  Train.main(outputDirectory, this.trainingArguments.toArray(new String[this.trainingArguments.size()]));
  hider.restoreOutput();
}
 
Developer: utk4rsh, Project: question-classifier, Lines: 21, Source: QuestionCategoryEvaluation.java

Example 2: main

import org.cleartk.util.ae.UriToDocumentTextAnnotator; // import the required package/class
public static void main(String[] args) throws Exception {
  Options options = CliFactory.parseArguments(Options.class, args);
  CollectionReader reader = UriCollectionReader.getCollectionReaderFromDirectory(options.getTestDirectory(),
      UriCollectionReader.RejectSystemFiles.class, UriCollectionReader.RejectSystemDirectories.class);
  AggregateBuilder builder = new AggregateBuilder();
  builder.add(UriToDocumentTextAnnotator.getDescription());
  builder.add(SentenceAnnotator.getDescription());
  builder.add(TokenAnnotator.getDescription());
  builder.add(PosTaggerAnnotator.getDescription());
  builder.add(DefaultSnowballStemmer.getDescription("English"));
  builder.add(AnalysisEngineFactory.createEngineDescription(QuestionCategoryAnnotator.class,
      CleartkAnnotator.PARAM_IS_TRAINING, false, GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH,
      JarClassifierBuilder.getModelJarFile(options.getModelsDirectory())));
  SimplePipeline.runPipeline(reader, builder.createAggregateDescription(),
      AnalysisEngineFactory.createEngineDescription(PrintClassificationsAnnotator.class));
}
 
Developer: utk4rsh, Project: question-classifier, Lines: 17, Source: App.java

Example 3: main

import org.cleartk.util.ae.UriToDocumentTextAnnotator; // import the required package/class
public static void main(String[] args) throws Exception {

    Options options = CliFactory.parseArguments(Options.class, args);

    CollectionReader reader = UriCollectionReader.getCollectionReaderFromDirectory(options.getInputDirectory());

    AnalysisEngineDescription uriToText = UriToDocumentTextAnnotator.getDescription();

    AnalysisEngineDescription sentences = SentenceAnnotator.getDescription();

    AnalysisEngineDescription tokenizer = TokenAnnotator.getDescription();

    AnalysisEngineDescription posTagger = PosTaggerAnnotator.getDescription();

    AnalysisEngineDescription lineWriter = AnalysisEngineFactory.createEngineDescription(
        LineWriter.class,
        LineWriter.PARAM_OUTPUT_FILE_NAME,
        options.getOutputFile(),
        LineWriter.PARAM_OUTPUT_ANNOTATION_CLASS_NAME,
        Token.class.getName(),
        LineWriter.PARAM_ANNOTATION_WRITER_CLASS_NAME,
        TokenAnnotationWriter.class.getName());

    SimplePipeline.runPipeline(reader, uriToText, sentences, tokenizer, posTagger, lineWriter);
    System.out.println("results written to " + options.getOutputFile());
  }
 
Developer: ClearTK, Project: cleartk, Lines: 27, Source: Docs2Tokens.java

Example 4: main

import org.cleartk.util.ae.UriToDocumentTextAnnotator; // import the required package/class
public static void main(String[] args) throws Exception {
  Options options = CliFactory.parseArguments(Options.class, args);

  CollectionReader reader = UriCollectionReader.getCollectionReaderFromDirectory(options.getInputDirectory());

  AggregateBuilder builder = new AggregateBuilder();
  builder.add(UriToDocumentTextAnnotator.getDescription());
  builder.add(SentenceAnnotator.getDescription());
  builder.add(AnalysisEngineFactory.createEngineDescription(
      LineWriter.class,
      LineWriter.PARAM_OUTPUT_FILE_NAME,
      options.getOutputFile(),
      LineWriter.PARAM_OUTPUT_ANNOTATION_CLASS_NAME,
      Sentence.class.getName()));

  SimplePipeline.runPipeline(reader, builder.createAggregateDescription());
  System.out.println("results written to " + options.getOutputFile());

}
 
Developer: ClearTK, Project: cleartk, Lines: 20, Source: Docs2Sentences.java

Example 5: executeUIMAAnnotator

import org.cleartk.util.ae.UriToDocumentTextAnnotator; // import the required package/class
/**
 * Execute Stanford annotators.
 * The annotators are: Tokenize, SSplit, POS, Lemma, NER, Parse, Dcoref + Snowball Stemmer
 *
 * @param _inputFile the _input file
 * @param _outputFile the _output file
 */
public void executeUIMAAnnotator(String _inputFile, String _outputFile) {
  logger.info("Running executeUIMAAnnotator with: " + _inputFile + " and " + _outputFile);
  File file = new File(_inputFile);
  Stopwatch stopwatch = new Stopwatch().start();
  try {
    SimplePipeline.runPipeline(
        UriCollectionReader.getCollectionReaderFromFiles(Arrays.asList(file)),
        UriToDocumentTextAnnotator.getDescription(),
        this.stanfordNLP, // Stanford tokenize, ssplit, pos, lemma, ner, parse, dcoref
        DefaultSnowballStemmer.getDescription("English"), // stemmer
        createAEDescription(UIMA_RUTA_SCRIPT), // RUTA analysis engine
        AnnotationRemover.getDescription(), // remove useless annotations
        DesignDecisionSentenceRemover.getDescription(), // remove sentence annotations that are also design decisions
        AnalysisEngineFactory.createEngineDescription( // result files
            XCasWriter.class,
            XCasWriter.PARAM_OUTPUT_FILE_NAME,
            _outputFile));
  } catch (Exception e) {
    logger.error("Error executing the uima annotator.", e);
  }
  stopwatch.stop(); // optional
  logger.info("executeUIMAAnnotator took: " + stopwatch); // formatted string like "12.3 ms"
}
 
Developer: germanattanasio, Project: traceability-assistant, Lines: 32, Source: CarchaPipeline.java

Example 6: executeSentenceAnnotator

import org.cleartk.util.ae.UriToDocumentTextAnnotator; // import the required package/class
/**
 * Execute sentence annotator.
 *
 * @param _inputFile the _input file
 * @param _outputFile the _output file
 */
public void executeSentenceAnnotator(String _inputFile, String _outputFile) {
  logger.info("Running executeSentenceAnnotator with: " + _inputFile + " and " + _outputFile);
  File file = new File(_inputFile);
  Stopwatch stopwatch = new Stopwatch().start();
  try {
    SimplePipeline.runPipeline(
        UriCollectionReader.getCollectionReaderFromFiles(Arrays.asList(file)),
        UriToDocumentTextAnnotator.getDescription(),
        this.stanfordNLP, // Stanford tokenize, ssplit, pos, lemma, ner, parse, dcoref
        AnnotationRemover.getDescription(), // remove useless annotations
        AnalysisEngineFactory.createEngineDescription( // result files
            XCasWriter.class,
            XCasWriter.PARAM_OUTPUT_FILE_NAME,
            _outputFile));
  } catch (Exception e) {
    logger.error("Error executing the uima annotator.", e);
  }
  stopwatch.stop(); // optional
  logger.info("executeSentenceAnnotator took: " + stopwatch); // formatted string like "12.3 ms"
}
 
Developer: germanattanasio, Project: traceability-assistant, Lines: 28, Source: CarchaPipeline.java

Example 7: main

import org.cleartk.util.ae.UriToDocumentTextAnnotator; // import the required package/class
public static void main(String[] args) throws Exception {
  Options options = CliFactory.parseArguments(Options.class, args);

  // Loads URIS specified by files into URI view
  String[] suffixes = options.getTreebankFileSuffixes().toArray(
      new String[options.getTreebankFileSuffixes().size()]);
  File treebankDir = options.getTreebankDirectory();
  Collection<File> files = FileUtils.listFiles(treebankDir, suffixes, false);
  CollectionReader reader = UriCollectionReader.getCollectionReaderFromFiles(files);

  AggregateBuilder builder = new AggregateBuilder();

  // Reads text into TREEBANK_VIEW
  builder.add(UriToDocumentTextAnnotator.getDescriptionForView(PennTreebankReader.TREEBANK_VIEW));

  // Ensures GOLD_VIEW is present
  builder.add(AnalysisEngineFactory.createEngineDescription(
      ViewCreatorAnnotator.class,
      ViewCreatorAnnotator.PARAM_VIEW_NAME,
      EvaluationConstants.GOLD_VIEW));

  // Parses treebank text into gold view (instead of default sofa view)
  builder.add(
      TreebankGoldAnnotator.getDescription(),
      CAS.NAME_DEFAULT_SOFA,
      EvaluationConstants.GOLD_VIEW);

  // XMI Writer
  builder.add(XmiWriter.getDescription(new File(options.getOutputDirectory())));

  SimplePipeline.runPipeline(reader, builder.createAggregateDescription());

}
 
Developer: ClearTK, Project: cleartk, Lines: 34, Source: TreebankParsingExample.java

Example 8: main

import org.cleartk.util.ae.UriToDocumentTextAnnotator; // import the required package/class
public static void main(String[] args) throws Exception {

    File filesDirectory = new File(args[0]);

    SimplePipeline.runPipeline(
        UriCollectionReader.getCollectionReaderFromDirectory(filesDirectory),
        UriToDocumentTextAnnotator.getDescription(),
        SentenceAnnotator.getDescription(),
        TokenAnnotator.getDescription(),
        PosTaggerAnnotator.getDescription(),
        ParserAnnotator.getDescription());
  }
 
Developer: ClearTK, Project: cleartk, Lines: 13, Source: ParserExample.java

Example 9: main

import org.cleartk.util.ae.UriToDocumentTextAnnotator; // import the required package/class
public static void main(String[] args) throws Exception {
  String samplePath = "data/sample/2008_Sichuan_earthquake.txt";
  List<File> files = Arrays.asList(new File(samplePath));

  // A collection reader that creates one CAS per file, containing the file's URI
  CollectionReader reader = UriCollectionReader.getCollectionReaderFromFiles(files);

  // The pipeline of annotators
  AggregateBuilder builder = new AggregateBuilder();

  // An annotator that reads in the file text
  builder.add(UriToDocumentTextAnnotator.getDescription());

  // An annotator that adds Sentence annotations
  builder.add(SentenceAnnotator.getDescription());

  // An annotator that adds Token annotations
  builder.add(TokenAnnotator.getDescription());

  // The POS annotator, configured to make predictions
  builder.add(ExamplePosAnnotator.getClassifierDescription(
      JarClassifierBuilder.getModelJarFile(ExamplePosAnnotator.DEFAULT_OUTPUT_DIRECTORY).getPath()));

  // An annotator that writes out the tokens and their part-of-speech tags
  builder.add(AnalysisEngineFactory.createEngineDescription(
      ExamplePosPlainTextWriter.class,
      ExamplePosPlainTextWriter.PARAM_OUTPUT_DIRECTORY_NAME,
      ExamplePosPlainTextWriter.DEFAULT_OUTPUT_DIRECTORY));

  // Run the pipeline of annotators on each of the CASes produced by the reader
  SimplePipeline.runPipeline(reader, builder.createAggregateDescription());

  System.out.println("Please look at the file generated by this program: "
      + ExamplePosPlainTextWriter.DEFAULT_OUTPUT_DIRECTORY + "/2008_Sichuan_earthquake.txt.pos");
}
 
Developer: ClearTK, Project: cleartk, Lines: 37, Source: RunExamplePosAnnotator.java

Example 10: main

import org.cleartk.util.ae.UriToDocumentTextAnnotator; // import the required package/class
public static void main(String[] args) throws Exception {
  Options options = CliFactory.parseArguments(Options.class, args);

  // ////////////////////////////////////////
  // Create collection reader to load URIs
  // ////////////////////////////////////////
  CollectionReader reader = UriCollectionReader.getCollectionReaderFromDirectory(
      options.getTestDirectory(),
      UriCollectionReader.RejectSystemFiles.class,
      UriCollectionReader.RejectSystemDirectories.class);

  // ////////////////////////////////////////
  // Create document classification pipeline
  // ////////////////////////////////////////
  AggregateBuilder builder = new AggregateBuilder();

  // Convert URIs in CAS URI View to Plain Text
  builder.add(UriToDocumentTextAnnotator.getDescription());

  // NLP pre-processing components
  builder.add(SentenceAnnotator.getDescription()); // Sentence segmentation
  builder.add(TokenAnnotator.getDescription()); // Tokenization
  builder.add(DefaultSnowballStemmer.getDescription("English")); // Stemming

  // Simple document classification annotator
  builder.add(AnalysisEngineFactory.createEngineDescription(
      BasicDocumentClassificationAnnotator.class,
      CleartkAnnotator.PARAM_IS_TRAINING,
      false,
      GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH,
      JarClassifierBuilder.getModelJarFile(options.getModelsDirectory())));

  // //////////////////////////////////////////////////////////////////////////////
  // Run pipeline and classify documents
  // //////////////////////////////////////////////////////////////////////////////
  SimplePipeline.runPipeline(
      reader,
      builder.createAggregateDescription(),
      AnalysisEngineFactory.createEngineDescription(PrintClassificationsAnnotator.class));
}
 
Developer: ClearTK, Project: cleartk, Lines: 41, Source: RunModel.java

Example 11: train

import org.cleartk.util.ae.UriToDocumentTextAnnotator; // import the required package/class
@Override
public void train(CollectionReader collectionReader, File outputDirectory) throws Exception {
  // assemble the training pipeline
  AggregateBuilder aggregate = new AggregateBuilder();

  // an annotator that loads the text from the training file URIs
  aggregate.add(UriToDocumentTextAnnotator.getDescription());

  // an annotator that parses and loads MASC named entity annotations (and tokens)
  aggregate.add(MascGoldAnnotator.getDescription());

  // an annotator that adds part-of-speech tags
  aggregate.add(PosTaggerAnnotator.getDescription());

  // our NamedEntityChunker annotator, configured to write Mallet CRF training data
  aggregate.add(AnalysisEngineFactory.createEngineDescription(
      NamedEntityChunker.class,
      CleartkSequenceAnnotator.PARAM_IS_TRAINING,
      true,
      DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY,
      outputDirectory,
      DefaultSequenceDataWriterFactory.PARAM_DATA_WRITER_CLASS_NAME,
      MalletCrfStringOutcomeDataWriter.class));

  // run the pipeline over the training corpus
  SimplePipeline.runPipeline(collectionReader, aggregate.createAggregateDescription());

  // quiet Mallet down a bit (but still leave likelihoods so you can see progress)
  Logger malletLogger = Logger.getLogger("cc.mallet");
  malletLogger.setLevel(Level.WARNING);
  Logger likelihoodLogger = Logger.getLogger("cc.mallet.fst.CRFOptimizableByLabelLikelihood");
  likelihoodLogger.setLevel(Level.INFO);

  // train a Mallet CRF model on the training data
  Train.main(outputDirectory);

}
 
Developer: ClearTK, Project: cleartk, Lines: 38, Source: EvaluateNamedEntityChunker.java

Example 12: main

import org.cleartk.util.ae.UriToDocumentTextAnnotator; // import the required package/class
public static void main(String[] args) throws Exception {
  Options options = CliFactory.parseArguments(Options.class, args);

  // a reader that loads the URIs of the text file
  CollectionReader reader = UriCollectionReader.getCollectionReaderFromFiles(Arrays.asList(options.getTextFile()));

  // assemble the classification pipeline
  AggregateBuilder aggregate = new AggregateBuilder();

  // an annotator that loads the text from the training file URIs
  aggregate.add(UriToDocumentTextAnnotator.getDescription());

  // annotators that identify sentences, tokens and part-of-speech tags in the text
  aggregate.add(SentenceAnnotator.getDescription());
  aggregate.add(TokenAnnotator.getDescription());
  aggregate.add(PosTaggerAnnotator.getDescription());

  // our NamedEntityChunker annotator, configured to classify on the new texts
  aggregate.add(AnalysisEngineFactory.createEngineDescription(
      NamedEntityChunker.class,
      CleartkSequenceAnnotator.PARAM_IS_TRAINING,
      false,
      GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH,
      JarClassifierBuilder.getModelJarFile(options.getModelDirectory())));

  // a very simple annotator that just prints out any named entities we found
  aggregate.add(AnalysisEngineFactory.createEngineDescription(PrintNamedEntityMentions.class));

  // run the classification pipeline on the new texts
  SimplePipeline.runPipeline(reader, aggregate.createAggregateDescription());
}
 
Developer: ClearTK, Project: cleartk, Lines: 32, Source: RunNamedEntityChunker.java

Example 13: main

import org.cleartk.util.ae.UriToDocumentTextAnnotator; // import the required package/class
public static void main(String[] args) throws Exception {
  Options options = CliFactory.parseArguments(Options.class, args);

  // a reader that loads the URIs of the training files
  CollectionReaderDescription reader = UriCollectionReader.getDescriptionFromDirectory(
      options.getTrainDirectory(),
      MascTextFileFilter.class,
      null);

  // assemble the training pipeline
  AggregateBuilder aggregate = new AggregateBuilder();

  // an annotator that loads the text from the training file URIs
  aggregate.add(UriToDocumentTextAnnotator.getDescription());

  // an annotator that parses and loads MASC named entity annotations (and tokens)
  aggregate.add(MascGoldAnnotator.getDescription());

  // an annotator that adds part-of-speech tags (so we can use them for features)
  aggregate.add(PosTaggerAnnotator.getDescription());

  // our NamedEntityChunker annotator, configured to write Mallet CRF training data
  aggregate.add(AnalysisEngineFactory.createEngineDescription(
      NamedEntityChunker.class,
      CleartkSequenceAnnotator.PARAM_IS_TRAINING,
      true,
      DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY,
      options.getModelDirectory(),
      DefaultSequenceDataWriterFactory.PARAM_DATA_WRITER_CLASS_NAME,
      MalletCrfStringOutcomeDataWriter.class));

  // run the pipeline over the training corpus
  SimplePipeline.runPipeline(reader, aggregate.createAggregateDescription());

  // train a Mallet CRF model on the training data
  Train.main(options.getModelDirectory());
}
 
Developer: ClearTK, Project: cleartk, Lines: 38, Source: TrainNamedEntityChunker.java

Example 14: main

import org.cleartk.util.ae.UriToDocumentTextAnnotator; // import the required package/class
public static void main(String[] args) throws Exception {
  // check arguments
  if (args.length != 1 && args.length != 2) {
    error("wrong number of arguments");
  } else if (!new File(args[0]).exists()) {
    error("file or directory not found: " + args[0]);
  }

  // parse arguments
  File inputFileOrDir = new File(args[0]);
  File outputDir;
  if (args.length == 2) {
    outputDir = new File(args[1]);
  } else {
    outputDir = new File(".");
  }
  if (!outputDir.exists()) {
    outputDir.mkdirs();
  }
  
  CollectionReader uriReader = (inputFileOrDir.isDirectory()) 
      ? UriCollectionReader.getCollectionReaderFromDirectory(inputFileOrDir) 
      : UriCollectionReader.getCollectionReaderFromFiles(Arrays.asList(inputFileOrDir));
      
  // run the components on the selected documents
  SimplePipeline.runPipeline(
      uriReader,
      UriToDocumentTextAnnotator.getDescription(),
      SentenceAnnotator.getDescription(),
      TokenAnnotator.getDescription(),
      PosTaggerAnnotator.getDescription(),
      DefaultSnowballStemmer.getDescription("English"),
      ParserAnnotator.getDescription(),
      VerbClauseTemporalAnnotator.FACTORY.getAnnotatorDescription(),
      TempEval2007Writer.getDescription(outputDir.getPath()));
}
 
Developer: ClearTK, Project: cleartk, Lines: 37, Source: VerbClauseTemporalAnnotate.java

Example 15: buildTrainingAggregate

import org.cleartk.util.ae.UriToDocumentTextAnnotator; // import the required package/class
public AggregateBuilder buildTrainingAggregate() throws ResourceInitializationException {

    AggregateBuilder builder = new AggregateBuilder();

    builder.add(UriToDocumentTextAnnotator.getDescription());

    // NLP pre-processing components
    builder.add(SentenceAnnotator.getDescription());
    builder.add(TokenAnnotator.getDescription());
    builder.add(PosTaggerAnnotator.getDescription());
    builder.add(DefaultSnowballStemmer.getDescription("English"));

    // This will extract the features for summarization
    builder.add(AnalysisEngineFactory.createEngineDescription(
        SumBasicAnnotator.class,
        DefaultDataWriterFactory.PARAM_DATA_WRITER_CLASS_NAME,
        SumBasicDataWriter.class.getName(),
        DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY,
        this.modelDirectory.getPath(),
        SumBasicAnnotator.PARAM_TOKEN_FIELD,
        this.tokenField.name(),
        SumBasicAnnotator.PARAM_STOPWORDS_URI,
        stopwordsFile.toURI()));

    // Save off xmis for re-reading
    builder.add(AnalysisEngineFactory.createEngineDescription(
        XmiWriter.class,
        XmiWriter.PARAM_OUTPUT_DIRECTORY,
        xmiDirectory.getPath()));

    return builder;
  }
 
Developer: ClearTK, Project: cleartk, Lines: 33, Source: SumBasic.java


Note: The org.cleartk.util.ae.UriToDocumentTextAnnotator class examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other open-source code and documentation platforms. The snippets were selected from open-source projects; copyright remains with the original authors, and redistribution or reuse should follow the license of the corresponding project. Please do not republish without permission.