本文整理汇总了Java中opennlp.tools.util.PlainTextByLineStream类的典型用法代码示例。如果您正苦于以下问题：Java PlainTextByLineStream类的具体用法？Java PlainTextByLineStream怎么用？Java PlainTextByLineStream使用的例子？那么，这里精选的类代码示例或许可以为您提供帮助。
PlainTextByLineStream类属于opennlp.tools.util包，在下文中一共展示了PlainTextByLineStream类的12个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: buildModel
import opennlp.tools.util.PlainTextByLineStream; //导入依赖的package包/类
/**
 * Trains a name-finder model for the given entity type from the annotated
 * training data file and writes the serialized model to the configured model file.
 *
 * <p>Failures are reported on standard error instead of being propagated, because
 * this overriding method declares no checked exceptions. Both the training stream
 * and the model output stream are closed even when training or serialization fails.
 *
 * @param entityType the entity type handed to the name-finder trainer
 */
@Override
public void buildModel(String entityType) {
    try {
        System.out.println("\tBuilding Model using " + annotatedSentences.size() + " annotations");
        System.out.println("\t\treading training data...");
        Charset charset = Charset.forName("UTF-8");
        TokenNameFinderModel model;
        // try-with-resources closes the sample stream (and the wrapped line stream) on every path.
        try (ObjectStream<NameSample> sampleStream = new NameSampleDataStream(
                new PlainTextByLineStream(
                        new MarkableFileInputStreamFactory(params.getAnnotatedTrainingDataFile()), charset))) {
            model = NameFinderME.train("en", entityType, sampleStream, null);
        }
        try (OutputStream modelOut = new BufferedOutputStream(new FileOutputStream(params.getModelFile()))) {
            model.serialize(modelOut);
        }
        System.out.println("\tmodel generated");
    } catch (Exception e) {
        // Previously swallowed silently; surface the failure so a missing model can be diagnosed.
        System.err.println("\tfailed to build model for entity type '" + entityType + "': " + e);
        e.printStackTrace();
    }
}
示例2: getNLPModel
import opennlp.tools.util.PlainTextByLineStream; //导入依赖的package包/类
/**
 * Trains a document categorizer model from the given training file, evaluates it
 * against that same file, and returns it.
 *
 * <p>Training uses a bag-of-words feature generator, the whitespace tokenizer,
 * a cutoff of 0 and 4000 iterations. The name of the training algorithm is
 * printed to standard output.
 *
 * @param openNLPTraining the training data file, read as UTF-8
 * @return the trained model
 * @throws IOException if the training data cannot be read or evaluation fails with an I/O error
 */
public static DoccatModel getNLPModel(File openNLPTraining) throws IOException {
    FeatureGenerator[] def = { new BagOfWordsFeatureGenerator() };
    WhitespaceTokenizer tokenizer = WhitespaceTokenizer.INSTANCE;
    DoccatFactory factory = new DoccatFactory(tokenizer, def);

    TrainingParameters params = TrainingParameters.defaultParams();
    System.out.println(params.algorithm());
    params.put(TrainingParameters.CUTOFF_PARAM, Integer.toString(0));
    params.put(TrainingParameters.ITERATIONS_PARAM, Integer.toString(4000));

    InputStreamFactory isf = new MarkableFileInputStreamFactory(openNLPTraining);
    DoccatModel model;
    // The sample stream was never closed before; close it as soon as training is done.
    try (ObjectStream<DocumentSample> sampleStream =
            new DocumentSampleStream(new PlainTextByLineStream(isf, "UTF-8"))) {
        model = DocumentCategorizerME.train("en", sampleStream, params, factory);
    }

    evaluateDoccatModel(model, openNLPTraining);
    return model;
}
示例3: trainSentences
import opennlp.tools.util.PlainTextByLineStream; //导入依赖的package包/类
/**
 * Trains an English sentence detector model from a classpath resource and writes
 * the serialized model to a file.
 *
 * <p>Training uses the MAXENT algorithm with 100 iterations and a cutoff of 0, and
 * the characters {@code . ? !} as end-of-sentence characters.
 *
 * @param inResource classpath resource (resolved via {@code Trainer.class}) holding the training data, read as UTF-8
 * @param outFile path of the file the trained model is written to
 * @throws IOException if the resource is missing, or reading the data or writing the model fails
 */
public static void trainSentences(final String inResource, String outFile) throws IOException {
    InputStreamFactory inputStreamFactory = new InputStreamFactory() {
        @Override
        public InputStream createInputStream() throws IOException {
            InputStream resourceStream = Trainer.class.getResourceAsStream(inResource);
            if (resourceStream == null) {
                // getResourceAsStream signals a missing resource with null; fail with a clear message.
                throw new IOException("Training resource not found: " + inResource);
            }
            return resourceStream;
        }
    };

    TrainingParameters trainingParameters = new TrainingParameters();
    trainingParameters.put(TrainingParameters.ALGORITHM_PARAM, ModelType.MAXENT.name());
    trainingParameters.put(TrainingParameters.ITERATIONS_PARAM, "100");
    trainingParameters.put(TrainingParameters.CUTOFF_PARAM, "0");
    SentenceDetectorFactory sentenceDetectorFactory =
            SentenceDetectorFactory.create(null, "en", true, null, ".?!".toCharArray());

    SentenceModel sentdetectModel;
    // Close the sample stream even when training throws.
    try (SentenceSampleStream samples = new SentenceSampleStream(
            new PlainTextByLineStream(inputStreamFactory, StandardCharsets.UTF_8))) {
        sentdetectModel = SentenceDetectorME.train("en", samples, sentenceDetectorFactory, trainingParameters);
    }

    // Close the output file even when serialization throws.
    try (FileOutputStream out = new FileOutputStream(outFile)) {
        sentdetectModel.serialize(out);
    }
}
示例4: trainChunker
import opennlp.tools.util.PlainTextByLineStream; //导入依赖的package包/类
/**
 * Trains an English chunker model from a classpath resource and writes the
 * serialized model to a file.
 *
 * <p>Training uses 70 iterations and a cutoff of 1.
 *
 * @param inResource classpath resource (resolved via {@code Trainer.class}) holding the training data, read as UTF-8
 * @param outFile path of the file the trained model is written to
 * @throws IOException if the resource is missing, or reading the data or writing the model fails
 */
public static void trainChunker(final String inResource, String outFile) throws IOException {
    InputStreamFactory inputStreamFactory = new InputStreamFactory() {
        @Override
        public InputStream createInputStream() throws IOException {
            InputStream resourceStream = Trainer.class.getResourceAsStream(inResource);
            if (resourceStream == null) {
                // getResourceAsStream signals a missing resource with null; fail with a clear message.
                throw new IOException("Training resource not found: " + inResource);
            }
            return resourceStream;
        }
    };

    TrainingParameters trainingParameters = new TrainingParameters();
    trainingParameters.put(TrainingParameters.ITERATIONS_PARAM, "70");
    trainingParameters.put(TrainingParameters.CUTOFF_PARAM, "1");
    ChunkerFactory chunkerFactory = ChunkerFactory.create(null);

    ChunkerModel model;
    // Close the sample stream even when training throws.
    try (ChunkSampleStream samples = new ChunkSampleStream(
            new PlainTextByLineStream(inputStreamFactory, StandardCharsets.UTF_8))) {
        model = ChunkerME.train("en", samples, trainingParameters, chunkerFactory);
    }

    // Close the output file even when serialization throws.
    try (FileOutputStream out = new FileOutputStream(outFile)) {
        model.serialize(out);
    }
}
示例5: trainNameFinder
import opennlp.tools.util.PlainTextByLineStream; //导入依赖的package包/类
/**
 * Trains an English "person" name-finder model from a classpath resource and
 * writes the serialized model to a file.
 *
 * <p>Training uses 5 iterations and a cutoff of 200, the BIO sequence codec,
 * no custom feature generator descriptor and no additional resources.
 *
 * @param inResource classpath resource (resolved via {@code Trainer.class}) holding the training data, read as UTF-8
 * @param outFile path of the file the trained model is written to
 * @throws IOException if the resource is missing, or reading the data or writing the model fails
 */
public static void trainNameFinder(final String inResource, String outFile) throws IOException {
    InputStreamFactory inputStreamFactory = new InputStreamFactory() {
        @Override
        public InputStream createInputStream() throws IOException {
            InputStream resourceStream = Trainer.class.getResourceAsStream(inResource);
            if (resourceStream == null) {
                // getResourceAsStream signals a missing resource with null; fail with a clear message.
                throw new IOException("Training resource not found: " + inResource);
            }
            return resourceStream;
        }
    };
    // The extra, never-used resource stream the original opened here was leaked; it is gone.

    TrainingParameters trainingParameters = new TrainingParameters();
    trainingParameters.put(TrainingParameters.ITERATIONS_PARAM, "5");
    trainingParameters.put(TrainingParameters.CUTOFF_PARAM, "200");
    byte[] featureGeneratorBytes = null;
    Map<String, Object> resources = Collections.<String, Object>emptyMap();
    SequenceCodec<String> seqCodec = new BioCodec();
    TokenNameFinderFactory tokenNameFinderFactory =
            TokenNameFinderFactory.create(null, featureGeneratorBytes, resources, seqCodec);

    TokenNameFinderModel model;
    // Close the sample stream even when training throws.
    try (NameSampleDataStream samples = new NameSampleDataStream(
            new PlainTextByLineStream(inputStreamFactory, StandardCharsets.UTF_8))) {
        model = NameFinderME.train("en", "person", samples, trainingParameters, tokenNameFinderFactory);
    }

    // Close the output file even when serialization throws.
    try (FileOutputStream out = new FileOutputStream(outFile)) {
        model.serialize(out);
    }
}
示例6: openSampleData
import opennlp.tools.util.PlainTextByLineStream; //导入依赖的package包/类
/**
 * Opens a sample data file as a stream of POS samples.
 *
 * <p>The file is first passed to {@code CmdLineUtil.checkInputFile} under the
 * label {@code sampleDataName + " Data"}, then opened with
 * {@code CmdLineUtil.openInFile}; its channel is read line by line in the given
 * encoding and wrapped in a {@code WordTagSampleStream}. The returned stream is
 * handed to the caller still open.
 *
 * @param sampleDataName short name of the data set, used to label the input-file check
 * @param sampleDataFile the file to read samples from
 * @param encoding character encoding of the file
 * @return a stream of POS samples backed by the file
 */
static ObjectStream<POSSample> openSampleData(String sampleDataName, File sampleDataFile, Charset encoding) {
    final String inputLabel = sampleDataName + " Data";
    CmdLineUtil.checkInputFile(inputLabel, sampleDataFile);

    return new WordTagSampleStream(
            new PlainTextByLineStream(CmdLineUtil.openInFile(sampleDataFile).getChannel(), encoding));
}
示例7: evaluateDoccatModel
import opennlp.tools.util.PlainTextByLineStream; //导入依赖的package包/类
/**
 * Evaluates a document categorizer model against the samples in the given file
 * and prints the evaluator to standard output.
 *
 * <p>An error listener and a fine-grained report listener are attached to the
 * evaluation.
 *
 * @param model the model to evaluate
 * @param openNLPTraining file holding the evaluation samples, read as UTF-8
 * @throws IOException if the sample file cannot be read
 */
public static void evaluateDoccatModel(DoccatModel model, File openNLPTraining) throws IOException {
    DoccatEvaluationMonitor[] listeners = {
        new DoccatEvaluationErrorListener(),
        new DoccatFineGrainedReportListener()
    };
    DocumentCategorizerEvaluator eval =
            new DocumentCategorizerEvaluator(new DocumentCategorizerME(model), listeners);

    InputStreamFactory isf = new MarkableFileInputStreamFactory(openNLPTraining);
    // The sample stream was never closed before; close it once evaluation finishes or fails.
    try (ObjectStream<DocumentSample> sampleStream =
            new DocumentSampleStream(new PlainTextByLineStream(isf, "UTF-8"))) {
        eval.evaluate(sampleStream);
    }
    System.out.println(eval);
}
示例8: run
import opennlp.tools.util.PlainTextByLineStream; //导入依赖的package包/类
/**
 * Trains a Portuguese document categorizer from the file at {@code train} and
 * writes the serialized model to the file at {@code bin}.
 *
 * <p>I/O failures are logged through {@code LOGGER} and not rethrown.
 */
@PostConstruct
@SuppressWarnings("deprecation")
public void run() {
    OutputStream modelOut = null;
    try {
        // Train the categorizer from the training file.
        InputStreamFactory dataIn = new MarkableFileInputStreamFactory(Paths.get(train).toFile());
        DoccatModel model;
        // The sample stream was never closed before; release it as soon as training ends.
        try (ObjectStream<DocumentSample> sampleStream =
                new DocumentSampleStream(new PlainTextByLineStream(dataIn, "UTF-8"))) {
            model = DocumentCategorizerME.train("pt", sampleStream);
        }

        // Write out the model that was learned.
        modelOut = new BufferedOutputStream(new FileOutputStream(Paths.get(bin).toFile()));
        model.serialize(modelOut);
    } catch (IOException e) {
        LOGGER.error(ExceptionUtils.getStackTrace(e));
    } finally {
        if (Objects.nonNull(modelOut)) {
            closeOutputStream(modelOut);
        }
    }
}
示例9: train
import opennlp.tools.util.PlainTextByLineStream; //导入依赖的package包/类
/**
 * Trains an English document categorizer with default training parameters and
 * the default factory, then writes the serialized model to a file.
 *
 * @param file_train path of the training data file, read as UTF-8
 * @param file_model path of the file the trained model is written to
 * @throws IOException if the training data cannot be read or the model cannot be written
 */
public static void train(String file_train, String file_model) throws IOException {
    TrainingParameters param = TrainingParameters.defaultParams();
    DoccatFactory factory = new DoccatFactory();

    DoccatModel model;
    // Close the sample stream even when training throws.
    try (ObjectStream<DocumentSample> sampleStream = new DocumentSampleStream(
            new PlainTextByLineStream(
                    new MarkableFileInputStreamFactory(new File(file_train)), "UTF-8"))) {
        model = DocumentCategorizerME.train("en", sampleStream, param, factory);
    }

    // The output stream was never closed before, leaking the file handle.
    try (FileOutputStream modelOut = new FileOutputStream(file_model)) {
        model.serialize(modelOut);
    }
}
示例10: main
import opennlp.tools.util.PlainTextByLineStream; //导入依赖的package包/类
/**
 * Trains an English sentence detector from {@code model/openNPLTraining.txt},
 * using {@code ini/stop_words.txt} as the dictionary passed to the sentence
 * detector factory, and prints {@code done} when training completes.
 *
 * @param args command-line arguments (unused)
 * @throws IOException if the training data or the dictionary cannot be read
 */
public static void main(String[] args) throws IOException {
    Charset charset = Charset.forName("UTF-8");
    InputStreamFactory isf = new MarkableFileInputStreamFactory(new File("model/openNPLTraining.txt"));

    // Both the sample stream and the dictionary input are closed on every path;
    // the dictionary stream was previously never closed.
    try (ObjectStream<SentenceSample> sampleStream =
                 new SentenceSampleStream(new PlainTextByLineStream(isf, charset));
         FileInputStream dictIn = new FileInputStream(new File("ini/stop_words.txt"))) {
        Dictionary dict = new Dictionary(dictIn);
        SentenceDetectorFactory sdf = new SentenceDetectorFactory("en", true, dict, null);
        TrainingParameters params = TrainingParameters.defaultParams();
        // NOTE(review): the trained model is discarded, exactly as in the original code;
        // serialize the return value here if the model is meant to be kept.
        SentenceDetectorME.train("en", sampleStream, sdf, params);
    }
    System.out.println("done");
}
示例11: trainTokenizer
import opennlp.tools.util.PlainTextByLineStream; //导入依赖的package包/类
/**
 * Trains an English tokenizer model from a classpath resource and writes the
 * serialized model to a file.
 *
 * <p>Training uses 100 iterations and a cutoff of 5. The tokenizer factory is
 * created with the flag at its fourth argument set to {@code true}, and with no
 * factory subclass, dictionary or alphanumeric pattern supplied.
 *
 * @param inResource classpath resource (resolved via {@code Trainer.class}) holding the training data, read as UTF-8
 * @param outFile path of the file the trained model is written to
 * @throws IOException if the resource is missing, or reading the data or writing the model fails
 */
public static void trainTokenizer(final String inResource, String outFile) throws IOException {
    InputStreamFactory inputStreamFactory = new InputStreamFactory() {
        @Override
        public InputStream createInputStream() throws IOException {
            InputStream resourceStream = Trainer.class.getResourceAsStream(inResource);
            if (resourceStream == null) {
                // getResourceAsStream signals a missing resource with null; fail with a clear message.
                throw new IOException("Training resource not found: " + inResource);
            }
            return resourceStream;
        }
    };

    TrainingParameters trainingParameters = new TrainingParameters();
    trainingParameters.put(TrainingParameters.ITERATIONS_PARAM, "100");
    trainingParameters.put(TrainingParameters.CUTOFF_PARAM, "5");
    // Typed null locals keep the factory call self-describing.
    String subclassname = null;
    String langcode = "en";
    Dictionary dict = null;
    Pattern alphanumericpattern = null;
    opennlp.tools.tokenize.TokenizerFactory tokenizerFactory =
            TokenizerFactory.create(subclassname, langcode, dict, true, alphanumericpattern);

    TokenizerModel model;
    // Close the sample stream even when training throws.
    try (ObjectStream<TokenSample> samples = new TokenSampleStream(
            new PlainTextByLineStream(inputStreamFactory, StandardCharsets.UTF_8))) {
        model = TokenizerME.train(samples, tokenizerFactory, trainingParameters);
    }

    // Close the output file even when serialization throws.
    try (FileOutputStream out = new FileOutputStream(outFile)) {
        model.serialize(out);
    }
}
示例12: trainPOS
import opennlp.tools.util.PlainTextByLineStream; //导入依赖的package包/类
/**
 * Trains an English part-of-speech tagger model from a classpath resource and
 * writes the serialized model to a file.
 *
 * <p>Training uses the MAXENT algorithm with 100 iterations and a cutoff of 5,
 * without an n-gram dictionary or a tag dictionary.
 *
 * @param inResource classpath resource (resolved via {@code Trainer.class}) holding the training data, read as UTF-8
 * @param outFile path of the file the trained model is written to
 * @throws IOException if the resource is missing, or reading the data or writing the model fails
 */
public static void trainPOS(final String inResource, String outFile) throws IOException {
    InputStreamFactory inputStreamFactory = new InputStreamFactory() {
        @Override
        public InputStream createInputStream() throws IOException {
            InputStream resourceStream = Trainer.class.getResourceAsStream(inResource);
            if (resourceStream == null) {
                // getResourceAsStream signals a missing resource with null; fail with a clear message.
                throw new IOException("Training resource not found: " + inResource);
            }
            return resourceStream;
        }
    };

    TrainingParameters trainingParameters = new TrainingParameters();
    trainingParameters.put(TrainingParameters.ALGORITHM_PARAM, ModelType.MAXENT.name());
    trainingParameters.put(TrainingParameters.ITERATIONS_PARAM, "100");
    trainingParameters.put(TrainingParameters.CUTOFF_PARAM, "5");
    // Typed null locals keep the factory call unambiguous and self-describing.
    Dictionary ngramDictionary = null;
    POSDictionary posDictionary = null;
    POSTaggerFactory posTaggerFactory = POSTaggerFactory.create(null, ngramDictionary, posDictionary);

    POSModel model;
    // Close the sample stream even when training throws.
    try (WordTagSampleStream samples = new WordTagSampleStream(
            new PlainTextByLineStream(inputStreamFactory, StandardCharsets.UTF_8))) {
        model = POSTaggerME.train("en", samples, trainingParameters, posTaggerFactory);
    }

    // Close the output file even when serialization throws.
    try (FileOutputStream out = new FileOutputStream(outFile)) {
        model.serialize(out);
    }
}