This article collects typical usage examples of the Java class edu.stanford.nlp.process.PTBTokenizer. If you are wondering what the PTBTokenizer class is for, or how to use it in Java, the curated code examples below may help.
The PTBTokenizer class belongs to the edu.stanford.nlp.process package. Fifteen code examples of the class are shown below, sorted by popularity by default.
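Before the individual examples, here is a minimal, self-contained sketch of the most common usage pattern, built from the same constructor the examples below use. The class name PTBTokenizerMinimalDemo and the sample sentence are illustrative, not taken from any example.

import java.io.StringReader;
import java.util.List;

import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.process.CoreLabelTokenFactory;
import edu.stanford.nlp.process.PTBTokenizer;

public class PTBTokenizerMinimalDemo {
    public static void main(String[] args) {
        // Tokenize one sentence into CoreLabels using default options ("").
        PTBTokenizer<CoreLabel> tokenizer = new PTBTokenizer<>(
                new StringReader("Dr. Smith paid $1,200 for the \"new\" laptop."),
                new CoreLabelTokenFactory(), "");
        List<CoreLabel> tokens = tokenizer.tokenize();
        for (CoreLabel token : tokens) {
            System.out.println(token.word());
        }
    }
}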
Example 1: initialize
import edu.stanford.nlp.process.PTBTokenizer; // import the required package/class
/**
* Initializes the tokenizer to detect date columns.
*/
public void initialize() {
Properties props = new Properties();
pipeline.addAnnotator(new TokenizerAnnotator(false) {
@Override
public Tokenizer<CoreLabel> getTokenizer(Reader r) {
// Use a PTBTokenizer over CoreLabels with default options.
return new PTBTokenizer<CoreLabel>(r, new CoreLabelTokenFactory(), "");
}
});
pipeline.addAnnotator(new WordsToSentencesAnnotator(false));
pipeline.addAnnotator(new POSTaggerAnnotator(false));
pipeline.addAnnotator(new TimeAnnotator("sutime", props));
}
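A hedged sketch of how the pipeline configured above is typically driven, following the standard SUTime demo pattern; the pipeline field, the sample text, and the document date are assumptions of this sketch:

// Assumes an AnnotationPipeline field named pipeline, set up by initialize().
Annotation annotation = new Annotation("The payment is due on March 3, 2014.");
annotation.set(CoreAnnotations.DocDateAnnotation.class, "2014-01-01");
pipeline.annotate(annotation);
for (CoreMap timex : annotation.get(TimeAnnotations.TimexAnnotations.class)) {
    System.out.println(timex + " -> "
            + timex.get(TimeExpression.Annotation.class).getTemporal());
}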
Example 2: ExportExamplesToSentences
import edu.stanford.nlp.process.PTBTokenizer; // import the required package/class
public ExportExamplesToSentences(String targetFile, String sourceDir, int ngramSize,
SourceType type, String fileExtension,
boolean replaceNumbers, boolean toLowerCase,
boolean stripWords, String tagDelimiter) {
this.target = targetFile;
this.source = sourceDir;
this.ngramSize = ngramSize;
this.tokenizer = PTBTokenizer.factory();
this.tagger = new MaxentTagger(MaxentTagger.DEFAULT_JAR_PATH);
this.type = type;
this.fileExtension = fileExtension;
this.replaceNumbers = replaceNumbers;
this.toLowerCase = toLowerCase;
this.stripWords = stripWords;
this.tagDelimiter = tagDelimiter;
}
Example 3: applyPTBTokenizer
import edu.stanford.nlp.process.PTBTokenizer; // import the required package/class
private static List<String> applyPTBTokenizer(DocumentPreprocessor dp, boolean tokenizeNLs, boolean ptb3Escaping) {
PTBTokenizerFactory<Word> tf = PTBTokenizer.PTBTokenizerFactory.newWordTokenizerFactory("tokenizeNLs=" + tokenizeNLs + ",ptb3Escaping=" + ptb3Escaping + ",asciiQuotes=true");
dp.setTokenizerFactory(tf);
List<String> sentences = new ArrayList<>();
for (List<HasWord> wordList : dp) {
String sentence = "";
for (HasWord word : wordList) {
sentence += " " + splitCompounds(word.word());
}
sentences.add(sentence);
}
return sentences;
}
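A hypothetical call site for the helper above. The German sample text is illustrative, and splitCompounds is a helper of the original class that is not shown in this excerpt.

// Wrap raw text in a DocumentPreprocessor, then tokenize sentence by sentence.
DocumentPreprocessor dp = new DocumentPreprocessor(
        new StringReader("Der Blumenstrauß steht auf dem Tisch. Es regnet."));
List<String> sentences = applyPTBTokenizer(dp, false, true);
sentences.forEach(System.out::println);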
Example 4: tokenize
import edu.stanford.nlp.process.PTBTokenizer; // import the required package/class
@Override
public String[] tokenize(String sentence) {
Reader r = new StringReader(sentence);
PTBTokenizer<Word> tokenizer = PTBTokenizer.newPTBTokenizer(r);
List<String> l = new ArrayList<String>();
while (tokenizer.hasNext()) {
Word w = tokenizer.next();
l.add(w.word());
}
// Reserve index 0 for the artificial ROOT token expected by the parser.
String[] tok = new String[l.size() + 1];
tok[0] = is2.io.CONLLReader09.ROOT;
int i = 1;
for (String s : l)
tok[i++] = s;
return tok;
}
Example 5: tokenizeplus
import edu.stanford.nlp.process.PTBTokenizer; // import the required package/class
public StringInText[] tokenizeplus(String sentence) {
Reader r = new StringReader(sentence);
PTBTokenizer<Word> tokenizer = PTBTokenizer.newPTBTokenizer(r);
List<StringInText> l = new ArrayList<StringInText>();
while (tokenizer.hasNext()) {
Word w = tokenizer.next();
l.add(new StringInText(w.word(), w.beginPosition() + startpos,
w.endPosition() + startpos));
}
StringInText[] tok = new StringInText[l.size() + 1];
tok[0] = new StringInText(is2.io.CONLLReader09.ROOT, 0, 0);
int i = 1;
for (StringInText s : l)
tok[i++] = s;
startpos += (1 + sentence.length());
return tok;
}
Example 6: tokenize
import edu.stanford.nlp.process.PTBTokenizer; // import the required package/class
public List<Word> tokenize(String string)
{
this.tokenizer =
new PTBTokenizer<Word>(
new StringReader(string),
new WordTokenFactory(),
"untokenizable=noneDelete,ptb3Escaping=true");
try
{
return tokenizer.tokenize();
}
catch (Exception e)
{
// Fall back to the pennTokenizer field (not shown in this excerpt).
System.err.println(e.getMessage());
final List<Word> tokens = new ArrayList<Word>();
for (String token : pennTokenizer.tokenize(string).split("\\s+"))
{
tokens.add(new Word(token));
}
return tokens;
}
}
Example 7: init
import edu.stanford.nlp.process.PTBTokenizer; // import the required package/class
public void init(SeqClassifierFlags flags) {
String options = "tokenizeNLs=false,invertible=true";
if (flags.tokenizerOptions != null) {
options = options + "," + flags.tokenizerOptions;
}
TokenizerFactory<IN> factory;
if (flags.tokenizerFactory != null) {
try {
Class<TokenizerFactory<? extends HasWord>> clazz = ErasureUtils.uncheckedCast(Class.forName(flags.tokenizerFactory));
Method factoryMethod = clazz.getMethod("newCoreLabelTokenizerFactory", String.class);
factory = ErasureUtils.uncheckedCast(factoryMethod.invoke(null, options));
} catch (Exception e) {
throw new RuntimeException(e);
}
} else {
factory = ErasureUtils.uncheckedCast(PTBTokenizer.PTBTokenizerFactory.newCoreLabelTokenizerFactory(options));
}
init(flags, factory);
}
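When no custom factory class is configured, the default branch above is equivalent to the following stand-alone sketch (the sample sentence is illustrative):

// Invertible PTBTokenizer factory over CoreLabels, as built by the default branch.
TokenizerFactory<CoreLabel> factory = PTBTokenizer.PTBTokenizerFactory
        .newCoreLabelTokenizerFactory("tokenizeNLs=false,invertible=true");
Tokenizer<CoreLabel> tok = factory.getTokenizer(new StringReader("Hello, world."));
while (tok.hasNext()) {
    System.out.println(tok.next().word());
}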
Example 8: tokenizeDate
import edu.stanford.nlp.process.PTBTokenizer; // import the required package/class
private void tokenizeDate(String inputDate) {
tokens = new ArrayList<String>();
Pattern pat = Pattern.compile("[-]");
if (inputDate == null) {
System.out.println("Null input date");
return; // avoid a NullPointerException below
}
Matcher m = pat.matcher(inputDate);
String str = m.replaceAll(" - ");
str = str.replaceAll(",", " ");
PTBTokenizer<Word> tokenizer = PTBTokenizer.newPTBTokenizer(new BufferedReader(new StringReader(str)));
while (tokenizer.hasNext()) {
Word nextToken = tokenizer.next();
tokens.add(nextToken.toString());
}
if(DEBUG) {
System.out.println("tokens:" + tokens);
}
}
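The normalization step in isolation, on an illustrative input:

String str = "2014-03-03,Monday".replaceAll("[-]", " - ").replaceAll(",", " ");
// str is now "2014 - 03 - 03 Monday"; PTBTokenizer then yields the tokens
// [2014, -, 03, -, 03, Monday].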
Example 9: writeImage
import edu.stanford.nlp.process.PTBTokenizer; // import the required package/class
public static void writeImage(String sentence, String outFile, int scale) throws Exception {
LexicalizedParser lp = null;
try {
lp = LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
} catch (Exception e) {
System.err.println("Could not load file englishPCFG.ser.gz. Try placing this file in the same directory as Dependencee.jar");
return;
}
lp.setOptionFlags(new String[]{"-maxLength", "500", "-retainTmpSubcategories"});
TokenizerFactory<CoreLabel> tokenizerFactory =
PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
List<CoreLabel> wordList = tokenizerFactory.getTokenizer(new StringReader(sentence)).tokenize();
Tree tree = lp.apply(wordList);
writeImage(tree, outFile, scale);
}
Example 10: testWriteImage
import edu.stanford.nlp.process.PTBTokenizer; // import the required package/class
/**
* Test of writeImage method, of class Main.
*/
@Test
public void testWriteImage() throws Exception {
String text = "A quick brown fox jumped over the lazy dog.";
TreebankLanguagePack tlp = new PennTreebankLanguagePack();
GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
LexicalizedParser lp = LexicalizedParser.loadModel();
lp.setOptionFlags(new String[]{"-maxLength", "500", "-retainTmpSubcategories"});
TokenizerFactory<CoreLabel> tokenizerFactory =
PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
List<CoreLabel> wordList = tokenizerFactory.getTokenizer(new StringReader(text)).tokenize();
Tree tree = lp.apply(wordList);
GrammaticalStructure gs = gsf.newGrammaticalStructure(tree);
Collection<TypedDependency> tdl = gs.typedDependenciesCollapsed();
Main.writeImage(tdl, "image.png", 3);
assert (new File("image.png").exists());
}
Example 11: main
import edu.stanford.nlp.process.PTBTokenizer; // import the required package/class
public static void main(String[] args) throws Exception {
if (args.length != 2) {
System.err.println("usage: java TaggerDemo modelFile fileToTag");
return;
}
MaxentTagger tagger = new MaxentTagger(args[0]);
TokenizerFactory<CoreLabel> ptbTokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(),
"untokenizable=noneKeep");
BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(args[1]), "utf-8"));
PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, "utf-8"));
DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor(r);
documentPreprocessor.setTokenizerFactory(ptbTokenizerFactory);
for (List<HasWord> sentence : documentPreprocessor) {
List<TaggedWord> tSentence = tagger.tagSentence(sentence);
pw.println(Sentence.listToString(tSentence, false));
}
pw.close();
}
Example 12: getResult
import edu.stanford.nlp.process.PTBTokenizer; // import the required package/class
public List<Word> getResult(String sentence) throws InvalidInputException {
if (sentence == null || sentence.length() == 0)
throw new InvalidInputException();
TokenizerFactory<Word> tf = null;
if (tf == null)
tf = PTBTokenizer.factory();
List<Word> tokens_words = tf.getTokenizer(new StringReader(sentence)).tokenize();
return tokens_words;
}
Example 13: produceBagOfWords_Token
import edu.stanford.nlp.process.PTBTokenizer; // import the required package/class
/**
* Loads a document from file and transforms it into a token multiset using the Stanford PTBTokenizer.
* @param documentPath
* @return
* @throws IOException
*/
public HashMultiset<String> produceBagOfWords_Token(String documentPath) throws IOException{
HashMultiset<String>tokenMultiset = HashMultiset.create();
PTBTokenizer<CoreLabel> ptbt = new PTBTokenizer<>(new FileReader(documentPath),
new CoreLabelTokenFactory(), "");
while (ptbt.hasNext()) {
CoreLabel label = ptbt.next();
tokenMultiset.add(label.toString());
// System.out.println(label);
}
// System.out.println("\n\nMULTISET:\n\n");
// for (String token: tokenMultiset) System.out.println(token +" "+ tokenMultiset.count(token));
return tokenMultiset;
}
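A hedged usage sketch; the file path is a placeholder:

HashMultiset<String> bag = produceBagOfWords_Token("/path/to/document.txt");
// Query token frequencies, e.g. how often "the" occurs in the document.
System.out.println("the: " + bag.count("the"));
System.out.println("distinct tokens: " + bag.elementSet().size());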
Example 14: tokenize
import edu.stanford.nlp.process.PTBTokenizer; // import the required package/class
public String[] tokenize(String s) {
// Collapse "soft hyphen + space" artifacts left over from line-wrapped text.
s = s.replaceAll("\u00ad ", "\u00ad");
PTBTokenizer<CoreLabel> tokenizer = new PTBTokenizer<CoreLabel>(
new StringReader(s),
new CoreLabelTokenFactory(),
"invertible=false,ptb3Escaping=false");
List<CoreLabel> words = tokenizer.tokenize();
String[] result = new String[words.size()];
for (int i = 0; i < words.size(); i++)
result[i] = words.get(i).toString();
return result;
}
Example 15: tokenize
import edu.stanford.nlp.process.PTBTokenizer; // import the required package/class
@Override
public TokenSeq tokenize(String sentence) {
// Optionally replace hyphens with spaces before tokenizing.
String s = dehyphenate ? sentence.replace("-", " ") : sentence;
return TokenSeq.of(PTBTokenizer.newPTBTokenizer(new StringReader(s))
.tokenize().stream()
.map(Word::value)
.collect(Collectors.toList()));
}