This article collects typical usage examples of the Java method edu.stanford.nlp.process.PTBTokenizer.factory. If you are wondering what PTBTokenizer.factory does, how to use it, or where to find working examples, the curated code samples below should help. You can also explore further usage of the containing class, edu.stanford.nlp.process.PTBTokenizer.

Below are 15 code examples of the PTBTokenizer.factory method, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Java examples.
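Before the examples, here is a minimal, self-contained sketch of the pattern most of them share: build a factory once, then create one tokenizer per input Reader. The class name PTBTokenizerFactoryDemo is illustrative only; everything else follows the calls used in the examples below.

import java.io.StringReader;
import java.util.List;

import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.process.CoreLabelTokenFactory;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.process.TokenizerFactory;

public class PTBTokenizerFactoryDemo {
    public static void main(String[] args) {
        // A factory is reusable; tokenizers are created per Reader.
        TokenizerFactory<CoreLabel> factory =
                PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
        List<CoreLabel> tokens =
                factory.getTokenizer(new StringReader("A quick brown fox jumped over the lazy dog.")).tokenize();
        for (CoreLabel token : tokens) {
            System.out.println(token.word());
        }
    }
}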
Example 1: ExportExamplesToSentences

import edu.stanford.nlp.process.PTBTokenizer; // import the package/class this method depends on

public ExportExamplesToSentences(String targetFile, String sourceDir, int ngramSize,
                                 SourceType type, String fileExtension,
                                 boolean replaceNumbers, boolean toLowerCase,
                                 boolean stripWords, String tagDelimiter) {
    this.target = targetFile;
    this.source = sourceDir;
    this.ngramSize = ngramSize;
    this.tokenizer = PTBTokenizer.factory();
    this.tagger = new MaxentTagger(MaxentTagger.DEFAULT_JAR_PATH);
    this.type = type;
    this.fileExtension = fileExtension;
    this.replaceNumbers = replaceNumbers;
    this.toLowerCase = toLowerCase;
    this.stripWords = stripWords;
    this.tagDelimiter = tagDelimiter;
}
Example 2: writeImage

import edu.stanford.nlp.process.PTBTokenizer; // import the package/class this method depends on

public static void writeImage(String sentence, String outFile, int scale) throws Exception {
    LexicalizedParser lp = null;
    try {
        lp = LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
    } catch (Exception e) {
        System.err.println("Could not load file englishPCFG.ser.gz. Try placing this file in the same directory as Dependencee.jar");
        return;
    }
    lp.setOptionFlags(new String[]{"-maxLength", "500", "-retainTmpSubcategories"});
    TokenizerFactory<CoreLabel> tokenizerFactory =
            PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    List<CoreLabel> wordList = tokenizerFactory.getTokenizer(new StringReader(sentence)).tokenize();
    Tree tree = lp.apply(wordList);
    writeImage(tree, outFile, scale);
}
Example 3: testWriteImage

import edu.stanford.nlp.process.PTBTokenizer; // import the package/class this method depends on

/**
 * Test of writeImage method, of class Main.
 */
@Test
public void testWriteImage() throws Exception {
    String text = "A quick brown fox jumped over the lazy dog.";
    TreebankLanguagePack tlp = new PennTreebankLanguagePack();
    GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
    LexicalizedParser lp = LexicalizedParser.loadModel();
    lp.setOptionFlags(new String[]{"-maxLength", "500", "-retainTmpSubcategories"});
    TokenizerFactory<CoreLabel> tokenizerFactory =
            PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    List<CoreLabel> wordList = tokenizerFactory.getTokenizer(new StringReader(text)).tokenize();
    Tree tree = lp.apply(wordList);
    GrammaticalStructure gs = gsf.newGrammaticalStructure(tree);
    Collection<TypedDependency> tdl = gs.typedDependenciesCollapsed();
    Main.writeImage(tdl, "image.png", 3);
    assert (new File("image.png").exists());
}
Example 4: main

import edu.stanford.nlp.process.PTBTokenizer; // import the package/class this method depends on

public static void main(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.println("usage: java TaggerDemo modelFile fileToTag");
        return;
    }
    MaxentTagger tagger = new MaxentTagger(args[0]);
    TokenizerFactory<CoreLabel> ptbTokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(),
            "untokenizable=noneKeep");
    BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(args[1]), "utf-8"));
    PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, "utf-8"));
    DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor(r);
    documentPreprocessor.setTokenizerFactory(ptbTokenizerFactory);
    for (List<HasWord> sentence : documentPreprocessor) {
        List<TaggedWord> tSentence = tagger.tagSentence(sentence);
        pw.println(Sentence.listToString(tSentence, false));
    }
    pw.close();
}
Example 5: getResult

import edu.stanford.nlp.process.PTBTokenizer; // import the package/class this method depends on

public List<Word> getResult(String sentence) throws InvalidInputException {
    if (sentence == null || sentence.length() == 0)
        throw new InvalidInputException();
    TokenizerFactory<Word> tf = PTBTokenizer.factory();
    List<Word> tokens = tf.getTokenizer(new StringReader(sentence)).tokenize();
    return tokens;
}
Example 6: getSentencesFromText

import edu.stanford.nlp.process.PTBTokenizer; // import the package/class this method depends on

public static List<List<HasWord>> getSentencesFromText(String str, boolean invertible, String options) {
    List<List<HasWord>> sentences = new ArrayList<List<HasWord>>();
    StringReader reader = new StringReader(str);
    DocumentPreprocessor dp = new DocumentPreprocessor(reader);
    TokenizerFactory factory = null;
    if (invertible) {
        factory = PTBTokenizer.factory(true, true);
        if (options != null && options.length() > 0)
            options = "invertible=true, " + options;
        else
            options = "invertible=true";
    } else {
        factory = PTBTokenizer.factory();
    }
    // System.out.println("Setting splitter options=" + options);
    factory.setOptions(options);
    dp.setTokenizerFactory(factory);
    Iterator<List<HasWord>> iter = dp.iterator();
    while (iter.hasNext()) {
        List<HasWord> sentence = iter.next();
        sentences.add(sentence);
    }
    return sentences;
}
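When the invertible flag in Example 6 is set, each token carries enough context to reconstruct the original string. A minimal sketch of that idiom, assuming the CoreAnnotations keys that an invertible PTBTokenizer is documented to populate:

import java.io.StringReader;
import java.util.List;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.process.TokenizerFactory;

public class InvertibleDemo {
    public static void main(String[] args) {
        // factory(tokenizeNLs, invertible): invertible tokens remember their
        // original text and surrounding whitespace (assumed annotation keys below).
        TokenizerFactory<CoreLabel> factory = PTBTokenizer.factory(false, true);
        List<CoreLabel> tokens =
                factory.getTokenizer(new StringReader("Don't  panic!")).tokenize();
        StringBuilder original = new StringBuilder();
        for (CoreLabel token : tokens) {
            original.append(token.get(CoreAnnotations.BeforeAnnotation.class));
            original.append(token.get(CoreAnnotations.OriginalTextAnnotation.class));
        }
        if (!tokens.isEmpty()) {
            // trailing text after the last token
            original.append(tokens.get(tokens.size() - 1).get(CoreAnnotations.AfterAnnotation.class));
        }
        System.out.println(original); // should print the input verbatim
    }
}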
Example 7: getWordsFromString

import edu.stanford.nlp.process.PTBTokenizer; // import the package/class this method depends on

public static List<Word> getWordsFromString(String str) {
    PTBTokenizerFactory<Word> factory = (PTBTokenizerFactory<Word>) PTBTokenizer.factory();
    // Stanford's tokenizer Americanizes spellings by default, altering our original text. Stop it!
    factory.setOptions("americanize=false");
    Tokenizer<Word> tokenizer = factory.getTokenizer(new BufferedReader(new StringReader(str)));
    return tokenizer.tokenize();
}
Example 8: main

import edu.stanford.nlp.process.PTBTokenizer; // import the package/class this method depends on

public static void main(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.println("usage: java TaggerDemo2 modelFile fileToTag");
        return;
    }
    MaxentTagger tagger = new MaxentTagger(args[0]);
    TokenizerFactory<CoreLabel> ptbTokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(),
            "untokenizable=noneKeep");
    BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(args[1]), "utf-8"));
    PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, "utf-8"));
    DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor(r);
    documentPreprocessor.setTokenizerFactory(ptbTokenizerFactory);
    for (List<HasWord> sentence : documentPreprocessor) {
        List<TaggedWord> tSentence = tagger.tagSentence(sentence);
        pw.println(Sentence.listToString(tSentence, false));
    }
    // Print the adjectives in one more sentence. This shows how to get at words and tags in a tagged sentence.
    List<HasWord> sent = Sentence.toWordList("The", "slimy", "slug", "crawled", "over", "the", "long", ",", "green", "grass", ".");
    List<TaggedWord> taggedSent = tagger.tagSentence(sent);
    for (TaggedWord tw : taggedSent) {
        if (tw.tag().startsWith("JJ")) {
            pw.println(tw.word());
        }
    }
    pw.close();
}
Example 9: demoAPI

import edu.stanford.nlp.process.PTBTokenizer; // import the package/class this method depends on

/**
 * demoAPI demonstrates other ways of calling the parser with already
 * tokenized text, or in some cases, raw text that needs to be tokenized as
 * a single sentence. Output is handled with a TreePrint object. Note that
 * the options used when creating the TreePrint can determine what results
 * to print out. Once again, one can capture the output by passing a
 * PrintWriter to TreePrint.printTree.
 */
public static void demoAPI(LexicalizedParser lp) {
    // This option shows parsing a list of correctly tokenized words
    String[] sent = { "This", "is", "an", "easy", "sentence", "." };
    List<CoreLabel> rawWords = Sentence.toCoreLabelList(sent);
    Tree parse = lp.apply(rawWords);
    parse.pennPrint();
    System.out.println();
    // This option shows loading and using an explicit tokenizer
    String sent2 = "Hey @Apple, pretty much all your products are amazing. You blow minds every time you launch a new gizmo."
            + " that said, your hold music is crap";
    TokenizerFactory<CoreLabel> tokenizerFactory = PTBTokenizer.factory(
            new CoreLabelTokenFactory(), "");
    Tokenizer<CoreLabel> tok = tokenizerFactory
            .getTokenizer(new StringReader(sent2));
    List<CoreLabel> rawWords2 = tok.tokenize();
    parse = lp.apply(rawWords2);
    TreebankLanguagePack tlp = new PennTreebankLanguagePack();
    GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
    GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
    List<TypedDependency> tdl = gs.typedDependenciesCCprocessed();
    System.out.println(tdl);
    System.out.println();
    // You can also use a TreePrint object to print trees and dependencies
    TreePrint tp = new TreePrint("penn,typedDependenciesCollapsed");
    tp.printTree(parse);
}
Example 10: MUCMentionExtractor

import edu.stanford.nlp.process.PTBTokenizer; // import the package/class this method depends on

public MUCMentionExtractor(Dictionaries dict, Properties props, Semantics semantics) throws Exception {
    super(dict, semantics);
    String fileName = props.getProperty(Constants.MUC_PROP);
    fileContents = IOUtils.slurpFile(fileName);
    currentOffset = 0;
    tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(false), "");
    stanfordProcessor = loadStanfordProcessor(props);
}
Example 11: getGraph

import edu.stanford.nlp.process.PTBTokenizer; // import the package/class this method depends on

public static Graph getGraph(String sentence) throws Exception {
    LexicalizedParser lp = LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
    lp.setOptionFlags(new String[]{"-maxLength", "500", "-retainTmpSubcategories"});
    TokenizerFactory<CoreLabel> tokenizerFactory =
            PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    List<CoreLabel> wordList = tokenizerFactory.getTokenizer(new StringReader(sentence)).tokenize();
    Tree tree = lp.apply(wordList);
    // gsf is a class-level field in the original source; built locally here so the snippet stands alone
    GrammaticalStructureFactory gsf = new PennTreebankLanguagePack().grammaticalStructureFactory();
    GrammaticalStructure gs = gsf.newGrammaticalStructure(tree);
    Collection<TypedDependency> tdl = gs.typedDependencies();
    return getGraph(tree, tdl);
}
Example 12: demoAPI

import edu.stanford.nlp.process.PTBTokenizer; // import the package/class this method depends on

public static void demoAPI(LexicalizedParser lp) {
    // This option shows parsing a list of correctly tokenized words
    String[] sent = { "This", "is", "an", "easy", "sentence", "." };
    List<CoreLabel> rawWords = Sentence.toCoreLabelList(sent);
    Tree parse = lp.apply(rawWords);
    parse.pennPrint();
    System.out.println();
    // This option shows loading and using an explicit tokenizer
    String sent2 = "This is another sentence.";
    TokenizerFactory<CoreLabel> tokenizerFactory =
            PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    List<CoreLabel> rawWords2 =
            tokenizerFactory.getTokenizer(new StringReader(sent2)).tokenize();
    parse = lp.apply(rawWords2);
    TreebankLanguagePack tlp = new PennTreebankLanguagePack();
    GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
    GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
    List<TypedDependency> tdl = gs.typedDependenciesCCprocessed();
    System.out.println(tdl);
    System.out.println();
    TreePrint tp = new TreePrint("penn,typedDependenciesCollapsed");
    tp.printTree(parse);
}
Example 13: NonUIMAPreprocessor

import edu.stanford.nlp.process.PTBTokenizer; // import the package/class this method depends on

private NonUIMAPreprocessor() {
    this.tokFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    this.lemmatizer = new MorphaAnnotator(false);
    this.sw = new Stopwords("lists/stopwords_en_semilar.txt");
    this.posPuncSet = new HashSet<String>(Arrays.asList(posPunc));
}
Example 14: load

import edu.stanford.nlp.process.PTBTokenizer; // import the package/class this method depends on

@Override
public void load() {
    mTokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(),
            "normalizeParentheses=false,normalizeOtherBrackets=false,untokenizable=allKeep,escapeForwardSlashAsterisk=false,ptb3Escaping=false");
}
Example 15: initialValue

import edu.stanford.nlp.process.PTBTokenizer; // import the package/class this method depends on

@Override
protected TokenizerFactory<CoreLabel> initialValue() {
    return PTBTokenizer.factory(false, true);
}