This article collects typical usage examples of the Java class edu.stanford.nlp.process.TokenizerFactory. If you are wondering what TokenizerFactory does and how to use it, the curated class examples below may help.
TokenizerFactory belongs to the edu.stanford.nlp.process package. Fifteen code examples of the class are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Java examples.
Example 1: init
import edu.stanford.nlp.process.TokenizerFactory; // import the required package/class
public void init(SeqClassifierFlags flags) {
  String options = "tokenizeNLs=false,invertible=true";
  if (flags.tokenizerOptions != null) {
    options = options + "," + flags.tokenizerOptions;
  }
  TokenizerFactory<IN> factory;
  if (flags.tokenizerFactory != null) {
    try {
      Class<TokenizerFactory<? extends HasWord>> clazz = ErasureUtils.uncheckedCast(Class.forName(flags.tokenizerFactory));
      Method factoryMethod = clazz.getMethod("newCoreLabelTokenizerFactory", String.class);
      factory = ErasureUtils.uncheckedCast(factoryMethod.invoke(null, options));
    } catch (Exception e) {
      throw new RuntimeException(e);
    }
  } else {
    factory = ErasureUtils.uncheckedCast(PTBTokenizer.PTBTokenizerFactory.newCoreLabelTokenizerFactory(options));
  }
  init(flags, factory);
}
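The reflection branch above requires the class named in flags.tokenizerFactory to expose a static newCoreLabelTokenizerFactory(String) method. A minimal sketch of such a class, assuming it simply delegates to the standard PTB tokenizer (the class name MyTokenizerFactory is hypothetical):

import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.process.TokenizerFactory;

public class MyTokenizerFactory {
  // Must match the exact static signature looked up by reflection in init().
  public static TokenizerFactory<CoreLabel> newCoreLabelTokenizerFactory(String options) {
    return PTBTokenizer.PTBTokenizerFactory.newCoreLabelTokenizerFactory(options);
  }
}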
Example 2: runTagger
import edu.stanford.nlp.process.TokenizerFactory; // import the required package/class
/**
 * This method runs the tagger on the provided reader and writer.
 *
 * It reads input from the given <code>reader</code>, applies the
 * tagger to it one sentence at a time (sentences are determined by
 * the documentPreprocessor), and writes the output to the given
 * <code>writer</code>.
 *
 * The document is broken into sentences using the sentence
 * processor determined in the tagger's TaggerConfig.
 *
 * If <code>tagInside</code> is set to a non-empty value, the tagger
 * runs in XML mode: instead of processing the document as one large
 * text blob, it treats each region between the given tags as a
 * separate text blob.
 */
public void runTagger(BufferedReader reader, BufferedWriter writer,
                      String tagInside, OutputStyle outputStyle)
    throws IOException {
  String sentenceDelimiter = config.getSentenceDelimiter();
  if (sentenceDelimiter != null && sentenceDelimiter.equals("newline")) {
    sentenceDelimiter = "\n";
  }
  final TokenizerFactory<? extends HasWord> tokenizerFactory = chooseTokenizerFactory();
  // Now we do everything through the doc preprocessor
  final DocumentPreprocessor docProcessor;
  if (tagInside.length() > 0) {
    docProcessor = new DocumentPreprocessor(reader, DocumentPreprocessor.DocType.XML);
    docProcessor.setElementDelimiter(tagInside);
  } else {
    docProcessor = new DocumentPreprocessor(reader);
    docProcessor.setSentenceDelimiter(sentenceDelimiter);
  }
  docProcessor.setTokenizerFactory(tokenizerFactory);
  runTagger(docProcessor, writer, outputStyle);
}
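A hedged usage sketch, assuming this method lives on a MaxentTagger-style class; the model path, the input file name, and the <text> element are illustrative assumptions:

MaxentTagger tagger = new MaxentTagger("models/english-left3words-distsim.tagger");
BufferedReader reader = new BufferedReader(
    new InputStreamReader(new FileInputStream("input.xml"), "utf-8"));
BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(System.out, "utf-8"));
// A non-empty tagInside switches to XML mode: only the content of
// <text>...</text> regions is tagged.
tagger.runTagger(reader, writer, "text", OutputStyle.SLASH_TAGS);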
Example 3: writeImage
import edu.stanford.nlp.process.TokenizerFactory; // import the required package/class
public static void writeImage(String sentence, String outFile, int scale) throws Exception {
  LexicalizedParser lp = null;
  try {
    lp = LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
  } catch (Exception e) {
    System.err.println("Could not load file englishPCFG.ser.gz. Try placing this file in the same directory as Dependencee.jar");
    return;
  }
  lp.setOptionFlags(new String[]{"-maxLength", "500", "-retainTmpSubcategories"});
  TokenizerFactory<CoreLabel> tokenizerFactory =
      PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
  List<CoreLabel> wordList = tokenizerFactory.getTokenizer(new StringReader(sentence)).tokenize();
  Tree tree = lp.apply(wordList);
  writeImage(tree, outFile, scale);
}
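A one-line usage sketch (the sentence, output file, and scale are arbitrary):

writeImage("The quick brown fox jumped over the lazy dog.", "parse.png", 3);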
Example 4: testWriteImage
import edu.stanford.nlp.process.TokenizerFactory; // import the required package/class
/**
 * Test of writeImage method, of class Main.
 */
@Test
public void testWriteImage() throws Exception {
  String text = "A quick brown fox jumped over the lazy dog.";
  TreebankLanguagePack tlp = new PennTreebankLanguagePack();
  GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
  LexicalizedParser lp = LexicalizedParser.loadModel();
  lp.setOptionFlags(new String[]{"-maxLength", "500", "-retainTmpSubcategories"});
  TokenizerFactory<CoreLabel> tokenizerFactory =
      PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
  List<CoreLabel> wordList = tokenizerFactory.getTokenizer(new StringReader(text)).tokenize();
  Tree tree = lp.apply(wordList);
  GrammaticalStructure gs = gsf.newGrammaticalStructure(tree);
  Collection<TypedDependency> tdl = gs.typedDependenciesCollapsed();
  Main.writeImage(tdl, "image.png", 3);
  assert (new File("image.png").exists());
}
Example 5: main
import edu.stanford.nlp.process.TokenizerFactory; // import the required package/class
public static void main(String[] args) throws Exception {
  if (args.length != 2) {
    System.err.println("usage: java TaggerDemo modelFile fileToTag");
    return;
  }
  MaxentTagger tagger = new MaxentTagger(args[0]);
  TokenizerFactory<CoreLabel> ptbTokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(),
      "untokenizable=noneKeep");
  BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(args[1]), "utf-8"));
  PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, "utf-8"));
  DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor(r);
  documentPreprocessor.setTokenizerFactory(ptbTokenizerFactory);
  for (List<HasWord> sentence : documentPreprocessor) {
    List<TaggedWord> tSentence = tagger.tagSentence(sentence);
    pw.println(Sentence.listToString(tSentence, false));
  }
  pw.close();
}
Example 6: getResult
import edu.stanford.nlp.process.TokenizerFactory; // import the required package/class
public List<Word> getResult(String sentence) throws InvalidInputException {
  if (sentence == null || sentence.length() == 0) {
    throw new InvalidInputException();
  }
  TokenizerFactory<Word> tf = PTBTokenizer.factory();
  return tf.getTokenizer(new StringReader(sentence)).tokenize();
}
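A short usage sketch, assuming the enclosing class is instantiated as tokenizer (a hypothetical variable name):

List<Word> tokens = tokenizer.getResult("Colorless green ideas sleep furiously.");
for (Word token : tokens) {
  System.out.println(token.word());
}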
Example 7: getSentencesFromText
import edu.stanford.nlp.process.TokenizerFactory; // import the required package/class
public static List<List<HasWord>> getSentencesFromText(String str, boolean invertible, String options) {
  List<List<HasWord>> sentences = new ArrayList<List<HasWord>>();
  StringReader reader = new StringReader(str);
  DocumentPreprocessor dp = new DocumentPreprocessor(reader);
  TokenizerFactory factory = null;
  if (invertible) {
    factory = PTBTokenizer.factory(true, true);
    if (options != null && options.length() > 0) {
      options = "invertible=true, " + options;
    } else {
      options = "invertible=true";
    }
  } else {
    factory = PTBTokenizer.factory();
  }
  // System.out.println("Setting splitter options=" + options);
  factory.setOptions(options);
  dp.setTokenizerFactory(factory);
  Iterator<List<HasWord>> iter = dp.iterator();
  while (iter.hasNext()) {
    List<HasWord> sentence = iter.next();
    sentences.add(sentence);
  }
  return sentences;
}
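A hedged usage sketch; the input text and the ptb3Escaping option string are illustrative:

String text = "This is the first sentence. This is the second one.";
// invertible=true lets callers later recover the original text from the tokens.
List<List<HasWord>> sentences = getSentencesFromText(text, true, "ptb3Escaping=false");
System.out.println(sentences.size());  // 2 sentences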
Example 8: main
import edu.stanford.nlp.process.TokenizerFactory; // import the required package/class
public static void main(String[] args) throws Exception {
  if (args.length != 2) {
    System.err.println("usage: java TaggerDemo2 modelFile fileToTag");
    return;
  }
  MaxentTagger tagger = new MaxentTagger(args[0]);
  TokenizerFactory<CoreLabel> ptbTokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(),
      "untokenizable=noneKeep");
  BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(args[1]), "utf-8"));
  PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, "utf-8"));
  DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor(r);
  documentPreprocessor.setTokenizerFactory(ptbTokenizerFactory);
  for (List<HasWord> sentence : documentPreprocessor) {
    List<TaggedWord> tSentence = tagger.tagSentence(sentence);
    pw.println(Sentence.listToString(tSentence, false));
  }
  // Print the adjectives in one more sentence. This shows how to get at
  // words and tags in a tagged sentence.
  List<HasWord> sent = Sentence.toWordList("The", "slimy", "slug", "crawled", "over", "the", "long", ",", "green", "grass", ".");
  List<TaggedWord> taggedSent = tagger.tagSentence(sent);
  for (TaggedWord tw : taggedSent) {
    if (tw.tag().startsWith("JJ")) {
      pw.println(tw.word());
    }
  }
  pw.close();
}
Example 9: demoAPI
import edu.stanford.nlp.process.TokenizerFactory; // import the required package/class
/**
 * demoAPI demonstrates other ways of calling the parser with already
 * tokenized text, or in some cases, raw text that needs to be tokenized as
 * a single sentence. Output is handled with a TreePrint object. Note that
 * the options used when creating the TreePrint can determine what results
 * to print out. Once again, one can capture the output by passing a
 * PrintWriter to TreePrint.printTree.
 */
public static void demoAPI(LexicalizedParser lp) {
  // This option shows parsing a list of correctly tokenized words
  String[] sent = { "This", "is", "an", "easy", "sentence", "." };
  List<CoreLabel> rawWords = Sentence.toCoreLabelList(sent);
  Tree parse = lp.apply(rawWords);
  parse.pennPrint();
  System.out.println();

  // This option shows loading and using an explicit tokenizer
  String sent2 = "Hey @Apple, pretty much all your products are amazing. You blow minds every time you launch a new gizmo."
      + " that said, your hold music is crap";
  TokenizerFactory<CoreLabel> tokenizerFactory = PTBTokenizer.factory(
      new CoreLabelTokenFactory(), "");
  Tokenizer<CoreLabel> tok = tokenizerFactory.getTokenizer(new StringReader(sent2));
  List<CoreLabel> rawWords2 = tok.tokenize();
  parse = lp.apply(rawWords2);

  TreebankLanguagePack tlp = new PennTreebankLanguagePack();
  GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
  GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
  List<TypedDependency> tdl = gs.typedDependenciesCCprocessed();
  System.out.println(tdl);
  System.out.println();

  // You can also use a TreePrint object to print trees and dependencies
  TreePrint tp = new TreePrint("penn,typedDependenciesCollapsed");
  tp.printTree(parse);
}
Example 10: getTokenizerFactory
import edu.stanford.nlp.process.TokenizerFactory; // import the required package/class
@Override
public TokenizerFactory<? extends HasWord> getTokenizerFactory() {
  if (tf != null) {
    return tf;
  } else {
    return super.getTokenizerFactory();
  }
}
Example 11: atbFactory
import edu.stanford.nlp.process.TokenizerFactory; // import the required package/class
public static TokenizerFactory<CoreLabel> atbFactory() {
  TokenizerFactory<CoreLabel> tf = ArabicTokenizerFactory.newTokenizerFactory();
  // Apply each Arabic Treebank (ATB) normalization option in turn.
  for (String option : atbOptions.stringPropertyNames()) {
    tf.setOptions(option);
  }
  return tf;
}
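A usage sketch of the returned factory; rawArabicText stands in for real input and is an assumption:

TokenizerFactory<CoreLabel> tf = atbFactory();
Tokenizer<CoreLabel> tokenizer = tf.getTokenizer(new StringReader(rawArabicText));
List<CoreLabel> tokens = tokenizer.tokenize();  // ATB-style segmentation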
Example 12: chooseTokenizerFactory
import edu.stanford.nlp.process.TokenizerFactory; // import the required package/class
/**
 * Figures out what tokenizer factory might be described by the
 * config. If it's described by name in the config, uses reflection
 * to get the factory (which may cause an exception, of course...).
 */
protected TokenizerFactory<? extends HasWord> chooseTokenizerFactory() {
  return chooseTokenizerFactory(config.getTokenize(),
                                config.getTokenizerFactory(),
                                config.getTokenizerOptions(),
                                config.getTokenizerInvertible());
}
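The four-argument overload it delegates to is not shown here. A hedged sketch of what such a dispatch could look like, mirroring the reflection pattern of Example 1 (this is an illustration, not the library's actual implementation):

protected static TokenizerFactory<? extends HasWord> chooseTokenizerFactory(
    boolean tokenize, String tokenizerFactory, String tokenizerOptions, boolean invertible) {
  if (tokenize && tokenizerFactory != null && tokenizerFactory.trim().length() > 0) {
    try {
      // Load a user-specified factory class by name, as in Example 1.
      Class<TokenizerFactory<? extends HasWord>> clazz =
          ErasureUtils.uncheckedCast(Class.forName(tokenizerFactory.trim()));
      Method factoryMethod = clazz.getMethod("newTokenizerFactory", String.class);
      return ErasureUtils.uncheckedCast(factoryMethod.invoke(null, tokenizerOptions));
    } catch (Exception e) {
      throw new RuntimeException("Could not load tokenizer factory " + tokenizerFactory, e);
    }
  } else if (tokenize) {
    if (invertible) {
      // Invertible tokenization requires CoreLabels to carry offsets.
      return PTBTokenizer.PTBTokenizerFactory.newCoreLabelTokenizerFactory(
          tokenizerOptions == null || tokenizerOptions.isEmpty()
              ? "invertible=true" : tokenizerOptions + ",invertible=true");
    }
    return PTBTokenizer.PTBTokenizerFactory.newWordTokenizerFactory(tokenizerOptions);
  } else {
    // No tokenization requested: split on whitespace only.
    return WhitespaceTokenizer.factory();
  }
}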
Example 13: tokenizeText
import edu.stanford.nlp.process.TokenizerFactory; // import the required package/class
/**
 * Reads data from r, tokenizes it with the given tokenizer, and
 * returns a List of Lists of (extends) HasWord objects, which can then be
 * fed into tagSentence.
 *
 * @param r Reader where untokenized text is read
 * @param tokenizerFactory Tokenizer. This can be <code>null</code>, in which case
 *     the default English tokenizer (PTBTokenizerFactory) is used.
 * @return List of tokenized sentences
 */
public static List<List<HasWord>> tokenizeText(Reader r,
                                               TokenizerFactory<? extends HasWord> tokenizerFactory) {
  DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor(r);
  if (tokenizerFactory != null) {
    documentPreprocessor.setTokenizerFactory(tokenizerFactory);
  }
  List<List<HasWord>> out = new ArrayList<List<HasWord>>();
  for (List<HasWord> item : documentPreprocessor) {
    out.add(item);
  }
  return out;
}
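A hedged usage sketch combining this helper with a tagger, following Example 5 (the model path is an assumption):

MaxentTagger tagger = new MaxentTagger("models/english-left3words-distsim.tagger");
Reader r = new StringReader("The first sentence. The second sentence.");
// Passing null falls back to the default English PTBTokenizerFactory.
for (List<HasWord> sentence : tokenizeText(r, null)) {
  System.out.println(Sentence.listToString(tagger.tagSentence(sentence), false));
}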
Example 14: getGraph
import edu.stanford.nlp.process.TokenizerFactory; // import the required package/class
public static Graph getGraph(String sentence) throws Exception {
  LexicalizedParser lp = LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
  lp.setOptionFlags(new String[]{"-maxLength", "500", "-retainTmpSubcategories"});
  TokenizerFactory<CoreLabel> tokenizerFactory =
      PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
  List<CoreLabel> wordList = tokenizerFactory.getTokenizer(new StringReader(sentence)).tokenize();
  Tree tree = lp.apply(wordList);
  // gsf is a GrammaticalStructureFactory field defined elsewhere in the class.
  GrammaticalStructure gs = gsf.newGrammaticalStructure(tree);
  Collection<TypedDependency> tdl = gs.typedDependencies();
  return getGraph(tree, tdl);
}
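The snippet references a gsf field defined elsewhere in its class; following Examples 4 and 9, a plausible declaration would be:

private static final TreebankLanguagePack tlp = new PennTreebankLanguagePack();
private static final GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();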
Example 15: initialValue
import edu.stanford.nlp.process.TokenizerFactory; // import the required package/class
@Override
protected TokenizerFactory<CoreLabel> initialValue() {
  return PTBTokenizer.factory(false, true);
}
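This override only makes sense inside a ThreadLocal, which gives each thread its own tokenizer factory (individual tokenizers are stateful, so sharing one across threads is unsafe). A sketch of the enclosing declaration; the field name is an assumption:

private static final ThreadLocal<TokenizerFactory<CoreLabel>> tokenizerFactory =
    new ThreadLocal<TokenizerFactory<CoreLabel>>() {
      @Override
      protected TokenizerFactory<CoreLabel> initialValue() {
        // Assumed parameter meaning: factory(tokenizeNLs, invertible).
        return PTBTokenizer.factory(false, true);
      }
    };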