This article collects typical usage examples of the Java class org.cleartk.util.ae.UriToDocumentTextAnnotator. If you have been wondering what the UriToDocumentTextAnnotator class is for and how to use it, the curated class examples below may help.
The UriToDocumentTextAnnotator class belongs to the org.cleartk.util.ae package. The sections below show 15 code examples of the class, sorted by popularity.
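Before the examples, here is a minimal sketch of the core pattern they all share: a UriCollectionReader produces one CAS per file holding only the file's URI, and UriToDocumentTextAnnotator then loads the text behind that URI into the CAS. The input path below is a placeholder, and the import locations assume ClearTK's usual packages.
import java.io.File;
import java.util.Arrays;
import org.apache.uima.fit.pipeline.SimplePipeline;
import org.cleartk.util.ae.UriToDocumentTextAnnotator;
import org.cleartk.util.cr.UriCollectionReader;

public class MinimalUriToTextExample {
    public static void main(String[] args) throws Exception {
        File input = new File("data/input.txt"); // placeholder path
        SimplePipeline.runPipeline(
            // one CAS per file, initially containing only the file's URI
            UriCollectionReader.getCollectionReaderFromFiles(Arrays.asList(input)),
            // reads the text behind the URI into the document text
            UriToDocumentTextAnnotator.getDescription());
    }
}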
Example 1: train
import org.cleartk.util.ae.UriToDocumentTextAnnotator; // import the required package/class
@Override
public void train(CollectionReader collectionReader, File outputDirectory) throws Exception {
AggregateBuilder builder = new AggregateBuilder();
builder.add(UriToDocumentTextAnnotator.getDescription());
builder.add(SentenceAnnotator.getDescription());
builder.add(TokenAnnotator.getDescription());
builder.add(PosTaggerAnnotator.getDescription());
builder.add(DefaultSnowballStemmer.getDescription("English"));
builder.add(AnalysisEngineFactory.createEngineDescription(GoldQuestionCategoryAnnotator.class));
AnalysisEngineDescription documentClassificationAnnotator = AnalysisEngineFactory.createEngineDescription(
QuestionCategoryAnnotator.class, CleartkAnnotator.PARAM_IS_TRAINING, true,
DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY, outputDirectory,
DefaultDataWriterFactory.PARAM_DATA_WRITER_CLASS_NAME, LibSvmStringOutcomeDataWriter.class.getName());
builder.add(documentClassificationAnnotator);
SimplePipeline.runPipeline(collectionReader, builder.createAggregateDescription());
System.err.println("Train model and write model.jar file.");
HideOutput hider = new HideOutput();
Train.main(outputDirectory, this.trainingArguments.toArray(new String[this.trainingArguments.size()]));
hider.restoreOutput();
}
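How this train method receives its CollectionReader is outside the snippet. A hypothetical driver, reusing the UriCollectionReader pattern from the other examples (the class name and paths below are placeholders, not from the original source), might look like:
// Hypothetical driver; QuestionCategoryTrainer and both paths are placeholders.
CollectionReader trainingReader = UriCollectionReader.getCollectionReaderFromDirectory(
    new File("data/questions/train"),
    UriCollectionReader.RejectSystemFiles.class,
    UriCollectionReader.RejectSystemDirectories.class);
new QuestionCategoryTrainer().train(trainingReader, new File("target/models"));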
Example 2: main
import org.cleartk.util.ae.UriToDocumentTextAnnotator; // import the required package/class
public static void main(String[] args) throws Exception {
Options options = CliFactory.parseArguments(Options.class, args);
CollectionReader reader = UriCollectionReader.getCollectionReaderFromDirectory(options.getTestDirectory(),
UriCollectionReader.RejectSystemFiles.class, UriCollectionReader.RejectSystemDirectories.class);
AggregateBuilder builder = new AggregateBuilder();
builder.add(UriToDocumentTextAnnotator.getDescription());
builder.add(SentenceAnnotator.getDescription());
builder.add(TokenAnnotator.getDescription());
builder.add(PosTaggerAnnotator.getDescription());
builder.add(DefaultSnowballStemmer.getDescription("English"));
builder.add(AnalysisEngineFactory.createEngineDescription(QuestionCategoryAnnotator.class,
CleartkAnnotator.PARAM_IS_TRAINING, false, GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH,
JarClassifierBuilder.getModelJarFile(options.getModelsDirectory())));
SimplePipeline.runPipeline(reader, builder.createAggregateDescription(),
AnalysisEngineFactory.createEngineDescription(PrintClassificationsAnnotator.class));
}
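Several of these main methods parse their arguments with JewelCli's CliFactory but never show the Options interface. A sketch of what Example 2 assumes (the @Option long names are illustrative, not taken from the original source) could be:
import java.io.File;
import com.lexicalscope.jewel.cli.Option;

public interface Options {
    @Option(longName = "test-dir") // option name is an assumption
    File getTestDirectory();

    @Option(longName = "models-dir") // option name is an assumption
    File getModelsDirectory();
}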
Example 3: main
import org.cleartk.util.ae.UriToDocumentTextAnnotator; // import the required package/class
public static void main(String[] args) throws Exception {
Options options = CliFactory.parseArguments(Options.class, args);
CollectionReader reader = UriCollectionReader.getCollectionReaderFromDirectory(options.getInputDirectory());
AnalysisEngineDescription uriToText = UriToDocumentTextAnnotator.getDescription();
AnalysisEngineDescription sentences = SentenceAnnotator.getDescription();
AnalysisEngineDescription tokenizer = TokenAnnotator.getDescription();
AnalysisEngineDescription posTagger = PosTaggerAnnotator.getDescription();
AnalysisEngineDescription lineWriter = AnalysisEngineFactory.createEngineDescription(
LineWriter.class,
LineWriter.PARAM_OUTPUT_FILE_NAME,
options.getOutputFile(),
LineWriter.PARAM_OUTPUT_ANNOTATION_CLASS_NAME,
Token.class.getName(),
LineWriter.PARAM_ANNOTATION_WRITER_CLASS_NAME,
TokenAnnotationWriter.class.getName());
SimplePipeline.runPipeline(reader, uriToText, sentences, tokenizer, posTagger, lineWriter);
System.out.println("results written to " + options.getOutputFile());
}
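TokenAnnotationWriter is passed to the LineWriter by name but not listed here. A sketch of such a writer, assuming ClearTK's AnnotationWriter contract from org.cleartk.util.ae.linewriter (an initialize method plus a writeAnnotation method returning one output line per annotation), might be:
// A sketch, assuming the AnnotationWriter interface; the output format is arbitrary.
public class TokenAnnotationWriter implements AnnotationWriter<Token> {
    @Override
    public void initialize(UimaContext context) throws ResourceInitializationException {
        // no configuration required
    }

    @Override
    public String writeAnnotation(JCas jCas, Token token) {
        // one line per token: surface form plus its part-of-speech tag
        return token.getCoveredText() + "\t" + token.getPos();
    }
}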
Example 4: main
import org.cleartk.util.ae.UriToDocumentTextAnnotator; // import the required package/class
public static void main(String[] args) throws Exception {
Options options = CliFactory.parseArguments(Options.class, args);
CollectionReader reader = UriCollectionReader.getCollectionReaderFromDirectory(options.getInputDirectory());
AggregateBuilder builder = new AggregateBuilder();
builder.add(UriToDocumentTextAnnotator.getDescription());
builder.add(SentenceAnnotator.getDescription());
builder.add(AnalysisEngineFactory.createEngineDescription(
LineWriter.class,
LineWriter.PARAM_OUTPUT_FILE_NAME,
options.getOutputFile(),
LineWriter.PARAM_OUTPUT_ANNOTATION_CLASS_NAME,
Sentence.class.getName()));
SimplePipeline.runPipeline(reader, builder.createAggregateDescription());
System.out.println("results written to " + options.getOutputFile());
}
Example 5: executeUIMAAnnotator
import org.cleartk.util.ae.UriToDocumentTextAnnotator; // import the required package/class
/**
 * Execute the Stanford annotators plus post-processing.
 * The annotators are: Tokenize, SSplit, POS, Lemma, NER, Parse, Dcoref + Snowball stemmer,
 * followed by a UIMA RUTA script, annotation removers, and an XCas writer.
 *
 * @param _inputFile the input file
 * @param _outputFile the output file
 */
public void executeUIMAAnnotator(String _inputFile, String _outputFile) {
logger.info("Running executeUIMAAnnotator with: "+_inputFile+" and "+_outputFile);
File file = new File(_inputFile);
Stopwatch stopwatch = new Stopwatch().start();
try {
SimplePipeline.runPipeline(
UriCollectionReader.getCollectionReaderFromFiles(Arrays.asList(file)),
UriToDocumentTextAnnotator.getDescription(),
this.stanfordNLP, //stanford tokenize, ssplit, pos, lemma, ner, parse, dcoref
DefaultSnowballStemmer.getDescription("English"), //stemmer
createAEDescription(UIMA_RUTA_SCRIPT), //RUTA Analysis Engine
AnnotationRemover.getDescription(), //Remove useless annotations
DesignDecisionSentenceRemover.getDescription(), //Remove sentence annotations that are also design decisions
AnalysisEngineFactory.createEngineDescription(//result files
XCasWriter.class,
XCasWriter.PARAM_OUTPUT_FILE_NAME,
_outputFile)
);
} catch (Exception e) {
logger.error("Error executing the uima annotator.",e);
}
stopwatch.stop(); // optional
logger.info("executeUIMAAnnotator took: " + stopwatch); // formatted string like "12.3 ms"
}
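createAEDescription(UIMA_RUTA_SCRIPT) is a local helper that is not shown. One plausible sketch, assuming the standard UIMA Ruta RutaEngine component and its main-script parameter, is:
// A sketch, assuming org.apache.uima.ruta.engine.RutaEngine; the script constant is the caller's.
private AnalysisEngineDescription createAEDescription(String mainScript)
        throws ResourceInitializationException {
    return AnalysisEngineFactory.createEngineDescription(
        RutaEngine.class,
        RutaEngine.PARAM_MAIN_SCRIPT, mainScript);
}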
Example 6: executeSentenceAnnotator
import org.cleartk.util.ae.UriToDocumentTextAnnotator; // import the required package/class
/**
 * Execute the sentence annotator.
 *
 * @param _inputFile the input file
 * @param _outputFile the output file
 */
public void executeSentenceAnnotator(String _inputFile, String _outputFile) {
logger.info("Running executeSentenceAnnotator with: "+_inputFile+" and "+_outputFile);
File file = new File(_inputFile);
Stopwatch stopwatch = new Stopwatch().start();
try {
SimplePipeline.runPipeline(
UriCollectionReader.getCollectionReaderFromFiles(Arrays.asList(file)),
UriToDocumentTextAnnotator.getDescription(),
this.stanfordNLP, //stanford tokenize, ssplit, pos, lemma, ner, parse, dcoref
AnnotationRemover.getDescription(), //Remove useless annotations
AnalysisEngineFactory.createEngineDescription(//result files
XCasWriter.class,
XCasWriter.PARAM_OUTPUT_FILE_NAME,
_outputFile)
);
} catch (Exception e) {
logger.error("Error executing the uima annotator.",e);
}
stopwatch.stop(); // optional
logger.info("executeSentenceAnnotator took: " + stopwatch); // formatted string like "12.3 ms"
}
Example 7: main
import org.cleartk.util.ae.UriToDocumentTextAnnotator; // import the required package/class
public static void main(String[] args) throws Exception {
Options options = CliFactory.parseArguments(Options.class, args);
// Loads URIs specified by files into the URI view
String[] suffixes = options.getTreebankFileSuffixes().toArray(
new String[options.getTreebankFileSuffixes().size()]);
File treebankDir = options.getTreebankDirectory();
Collection<File> files = FileUtils.listFiles(treebankDir, suffixes, false);
CollectionReader reader = UriCollectionReader.getCollectionReaderFromFiles(files);
AggregateBuilder builder = new AggregateBuilder();
// Reads text into TREEBANK_VIEW
builder.add(UriToDocumentTextAnnotator.getDescriptionForView(PennTreebankReader.TREEBANK_VIEW));
// Ensures GOLD_VIEW is present
builder.add(AnalysisEngineFactory.createEngineDescription(
ViewCreatorAnnotator.class,
ViewCreatorAnnotator.PARAM_VIEW_NAME,
EvaluationConstants.GOLD_VIEW));
// Parses treebank text into gold view (instead of default sofa view)
builder.add(
TreebankGoldAnnotator.getDescription(),
CAS.NAME_DEFAULT_SOFA,
EvaluationConstants.GOLD_VIEW);
// XMI Writer
builder.add(XmiWriter.getDescription(new File(options.getOutputDirectory())));
SimplePipeline.runPipeline(reader, builder.createAggregateDescription());
}
Example 8: main
import org.cleartk.util.ae.UriToDocumentTextAnnotator; // import the required package/class
public static void main(String[] args) throws Exception {
File filesDirectory = new File(args[0]);
SimplePipeline.runPipeline(
UriCollectionReader.getCollectionReaderFromDirectory(filesDirectory),
UriToDocumentTextAnnotator.getDescription(),
SentenceAnnotator.getDescription(),
TokenAnnotator.getDescription(),
PosTaggerAnnotator.getDescription(),
ParserAnnotator.getDescription());
}
Example 9: main
import org.cleartk.util.ae.UriToDocumentTextAnnotator; // import the required package/class
public static void main(String[] args) throws Exception {
String samplePath = "data/sample/2008_Sichuan_earthquake.txt";
List<File> files = Arrays.asList(new File(samplePath));
// A collection reader that creates one CAS per file, containing the file's URI
CollectionReader reader = UriCollectionReader.getCollectionReaderFromFiles(files);
// The pipeline of annotators
AggregateBuilder builder = new AggregateBuilder();
// An annotator that reads in the file text
builder.add(UriToDocumentTextAnnotator.getDescription());
// An annotator that adds Sentence annotations
builder.add(SentenceAnnotator.getDescription());
// An annotator that adds Token annotations
builder.add(TokenAnnotator.getDescription());
// The POS annotator, configured to make predictions
builder.add(ExamplePosAnnotator
.getClassifierDescription(JarClassifierBuilder.getModelJarFile(
ExamplePosAnnotator.DEFAULT_OUTPUT_DIRECTORY).getPath()));
// An annotator that writes out the tokens and their part-of-speech tags
builder.add(AnalysisEngineFactory.createEngineDescription(
ExamplePosPlainTextWriter.class,
ExamplePosPlainTextWriter.PARAM_OUTPUT_DIRECTORY_NAME,
ExamplePosPlainTextWriter.DEFAULT_OUTPUT_DIRECTORY));
// Run the pipeline of annotators on each of the CASes produced by the reader
SimplePipeline.runPipeline(reader, builder.createAggregateDescription());
System.out.println("Please look at the file generated by this program: "
+ ExamplePosPlainTextWriter.DEFAULT_OUTPUT_DIRECTORY + "/2008_Sichuan_earthquake.txt.pos");
}
Example 10: main
import org.cleartk.util.ae.UriToDocumentTextAnnotator; // import the required package/class
public static void main(String[] args) throws Exception {
Options options = CliFactory.parseArguments(Options.class, args);
// ////////////////////////////////////////
// Create collection reader to load URIs
// ////////////////////////////////////////
CollectionReader reader = UriCollectionReader.getCollectionReaderFromDirectory(
options.getTestDirectory(),
UriCollectionReader.RejectSystemFiles.class,
UriCollectionReader.RejectSystemDirectories.class);
// ////////////////////////////////////////
// Create document classification pipeline
// ////////////////////////////////////////
AggregateBuilder builder = new AggregateBuilder();
// Convert URIs in CAS URI View to Plain Text
builder.add(UriToDocumentTextAnnotator.getDescription());
// NLP pre-processing components
builder.add(SentenceAnnotator.getDescription()); // Sentence segmentation
builder.add(TokenAnnotator.getDescription()); // Tokenization
builder.add(DefaultSnowballStemmer.getDescription("English")); // Stemming
// Simple document classification annotator
builder.add(AnalysisEngineFactory.createEngineDescription(
BasicDocumentClassificationAnnotator.class,
CleartkAnnotator.PARAM_IS_TRAINING,
false,
GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH,
JarClassifierBuilder.getModelJarFile(options.getModelsDirectory())));
// //////////////////////////////////////////////////////////////////////////////
// Run pipeline and classify documents
// //////////////////////////////////////////////////////////////////////////////
SimplePipeline.runPipeline(
reader,
builder.createAggregateDescription(),
AnalysisEngineFactory.createEngineDescription(PrintClassificationsAnnotator.class));
}
Example 11: train
import org.cleartk.util.ae.UriToDocumentTextAnnotator; // import the required package/class
@Override
public void train(CollectionReader collectionReader, File outputDirectory) throws Exception {
// assemble the training pipeline
AggregateBuilder aggregate = new AggregateBuilder();
// an annotator that loads the text from the training file URIs
aggregate.add(UriToDocumentTextAnnotator.getDescription());
// an annotator that parses and loads MASC named entity annotations (and tokens)
aggregate.add(MascGoldAnnotator.getDescription());
// an annotator that adds part-of-speech tags
aggregate.add(PosTaggerAnnotator.getDescription());
// our NamedEntityChunker annotator, configured to write Mallet CRF training data
aggregate.add(AnalysisEngineFactory.createEngineDescription(
NamedEntityChunker.class,
CleartkSequenceAnnotator.PARAM_IS_TRAINING,
true,
DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY,
outputDirectory,
DefaultSequenceDataWriterFactory.PARAM_DATA_WRITER_CLASS_NAME,
MalletCrfStringOutcomeDataWriter.class));
// run the pipeline over the training corpus
SimplePipeline.runPipeline(collectionReader, aggregate.createAggregateDescription());
// quiet Mallet down a bit (but still leave likelihoods so you can see progress)
Logger malletLogger = Logger.getLogger("cc.mallet");
malletLogger.setLevel(Level.WARNING);
Logger likelihoodLogger = Logger.getLogger("cc.mallet.fst.CRFOptimizableByLabelLikelihood");
likelihoodLogger.setLevel(Level.INFO);
// train a Mallet CRF model on the training data
Train.main(outputDirectory);
}
Example 12: main
import org.cleartk.util.ae.UriToDocumentTextAnnotator; // import the required package/class
public static void main(String[] args) throws Exception {
Options options = CliFactory.parseArguments(Options.class, args);
// a reader that loads the URIs of the text file
CollectionReader reader = UriCollectionReader.getCollectionReaderFromFiles(Arrays.asList(options.getTextFile()));
// assemble the classification pipeline
AggregateBuilder aggregate = new AggregateBuilder();
// an annotator that loads the text from the training file URIs
aggregate.add(UriToDocumentTextAnnotator.getDescription());
// annotators that identify sentences, tokens and part-of-speech tags in the text
aggregate.add(SentenceAnnotator.getDescription());
aggregate.add(TokenAnnotator.getDescription());
aggregate.add(PosTaggerAnnotator.getDescription());
// our NamedEntityChunker annotator, configured to classify on the new texts
aggregate.add(AnalysisEngineFactory.createEngineDescription(
NamedEntityChunker.class,
CleartkSequenceAnnotator.PARAM_IS_TRAINING,
false,
GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH,
JarClassifierBuilder.getModelJarFile(options.getModelDirectory())));
// a very simple annotator that just prints out any named entities we found
aggregate.add(AnalysisEngineFactory.createEngineDescription(PrintNamedEntityMentions.class));
// run the classification pipeline on the new texts
SimplePipeline.runPipeline(reader, aggregate.createAggregateDescription());
}
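PrintNamedEntityMentions is referenced but not listed. A minimal sketch, assuming ClearTK's org.cleartk.ne.type.NamedEntityMention type and its getMentionType() accessor, could be:
// A sketch; prints every named entity mention found in the CAS.
public class PrintNamedEntityMentions extends JCasAnnotator_ImplBase {
    @Override
    public void process(JCas jCas) throws AnalysisEngineProcessException {
        for (NamedEntityMention mention : JCasUtil.select(jCas, NamedEntityMention.class)) {
            System.out.println(mention.getMentionType() + ": " + mention.getCoveredText());
        }
    }
}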
Example 13: main
import org.cleartk.util.ae.UriToDocumentTextAnnotator; // import the required package/class
public static void main(String[] args) throws Exception {
Options options = CliFactory.parseArguments(Options.class, args);
// a reader that loads the URIs of the training files
CollectionReaderDescription reader = UriCollectionReader.getDescriptionFromDirectory(
options.getTrainDirectory(),
MascTextFileFilter.class,
null);
// assemble the training pipeline
AggregateBuilder aggregate = new AggregateBuilder();
// an annotator that loads the text from the training file URIs
aggregate.add(UriToDocumentTextAnnotator.getDescription());
// an annotator that parses and loads MASC named entity annotations (and tokens)
aggregate.add(MascGoldAnnotator.getDescription());
// an annotator that adds part-of-speech tags (so we can use them for features)
aggregate.add(PosTaggerAnnotator.getDescription());
// our NamedEntityChunker annotator, configured to write Mallet CRF training data
aggregate.add(AnalysisEngineFactory.createEngineDescription(
NamedEntityChunker.class,
CleartkSequenceAnnotator.PARAM_IS_TRAINING,
true,
DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY,
options.getModelDirectory(),
DefaultSequenceDataWriterFactory.PARAM_DATA_WRITER_CLASS_NAME,
MalletCrfStringOutcomeDataWriter.class));
// run the pipeline over the training corpus
SimplePipeline.runPipeline(reader, aggregate.createAggregateDescription());
// train a Mallet CRF model on the training data
Train.main(options.getModelDirectory());
}
Example 14: main
import org.cleartk.util.ae.UriToDocumentTextAnnotator; // import the required package/class
public static void main(String[] args) throws Exception {
// check arguments
if (args.length != 1 && args.length != 2) {
error("wrong number of arguments");
} else if (!new File(args[0]).exists()) {
error("file or directory not found: " + args[0]);
}
// parse arguments
File inputFileOrDir = new File(args[0]);
File outputDir;
if (args.length == 2) {
outputDir = new File(args[1]);
} else {
outputDir = new File(".");
}
if (!outputDir.exists()) {
outputDir.mkdirs();
}
CollectionReader uriReader = (inputFileOrDir.isDirectory())
? UriCollectionReader.getCollectionReaderFromDirectory(inputFileOrDir)
: UriCollectionReader.getCollectionReaderFromFiles(Arrays.asList(inputFileOrDir));
// run the components on the selected documents
SimplePipeline.runPipeline(
uriReader,
UriToDocumentTextAnnotator.getDescription(),
SentenceAnnotator.getDescription(),
TokenAnnotator.getDescription(),
PosTaggerAnnotator.getDescription(),
DefaultSnowballStemmer.getDescription("English"),
ParserAnnotator.getDescription(),
VerbClauseTemporalAnnotator.FACTORY.getAnnotatorDescription(),
TempEval2007Writer.getDescription(outputDir.getPath()));
}
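The error helper used for argument checking above is not part of this listing; a minimal sketch might be:
// Hypothetical helper: report the problem and exit with a non-zero status.
private static void error(String message) {
    System.err.println("error: " + message);
    System.err.println("usage: <input-file-or-dir> [output-dir]");
    System.exit(1);
}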
Example 15: buildTrainingAggregate
import org.cleartk.util.ae.UriToDocumentTextAnnotator; // import the required package/class
public AggregateBuilder buildTrainingAggregate() throws ResourceInitializationException {
AggregateBuilder builder = new AggregateBuilder();
builder.add(UriToDocumentTextAnnotator.getDescription());
// NLP pre-processing components
builder.add(SentenceAnnotator.getDescription());
builder.add(TokenAnnotator.getDescription());
builder.add(PosTaggerAnnotator.getDescription());
builder.add(DefaultSnowballStemmer.getDescription("English"));
// This will extract the features for summarization
builder.add(AnalysisEngineFactory.createEngineDescription(
SumBasicAnnotator.class,
DefaultDataWriterFactory.PARAM_DATA_WRITER_CLASS_NAME,
SumBasicDataWriter.class.getName(),
DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY,
this.modelDirectory.getPath(),
SumBasicAnnotator.PARAM_TOKEN_FIELD,
this.tokenField.name(),
SumBasicAnnotator.PARAM_STOPWORDS_URI,
stopwordsFile.toURI()));
// Save off xmis for re-reading
builder.add(AnalysisEngineFactory.createEngineDescription(
XmiWriter.class,
XmiWriter.PARAM_OUTPUT_DIRECTORY,
xmiDirectory.getPath()));
return builder;
}
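A hypothetical caller for this builder, following the reader-then-Train pattern of the earlier examples (the input directory is a placeholder), might run:
// Hypothetical usage of buildTrainingAggregate().
CollectionReader reader = UriCollectionReader.getCollectionReaderFromDirectory(new File("data/documents"));
SimplePipeline.runPipeline(reader, buildTrainingAggregate().createAggregateDescription());
// train on the instances written by the SumBasicDataWriter
Train.main(this.modelDirectory);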