This page collects typical usage examples of the Java class edu.stanford.nlp.ling.Word. If you are wondering what the Word class does, how to use it, and what real-world code that uses it looks like, the curated examples below may help.
The Word class belongs to the edu.stanford.nlp.ling package. Fourteen code examples are shown below, sorted by popularity.
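Before the examples, a minimal sketch of the Word API itself. The String constructor, the word()/setWord() methods from the HasWord interface, and value() are standard edu.stanford.nlp.ling members; the snippet is illustrative and not taken from any of the projects quoted below.

import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Word;

public class WordDemo {
    public static void main(String[] args) {
        Word w = new Word("Stanford");  // a Word is a minimal single-token label
        HasWord hw = w;                 // Word implements the HasWord interface
        System.out.println(hw.word());  // prints "Stanford"
        hw.setWord("Berkeley");         // tokens are mutable through HasWord
        System.out.println(w.value());  // prints "Berkeley"; value() mirrors word()
    }
}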
Example 1: perClusterUpdateSen
import edu.stanford.nlp.ling.Word; // import the required package/class
public static void perClusterUpdateSen(ArrayList<List<HasWord>> processedText,
                                       int common_sentNum, int representative_sentNum,
                                       int coreStartIndex, int coreEndIndex,
                                       int commonStartIndex, int commonEndIndex) {
    // Sentence numbers and token indexes are 1-based.
    List<HasWord> representative_sentence = processedText.get(representative_sentNum - 1);
    List<HasWord> common_sentence = processedText.get(common_sentNum - 1);
    // Concatenate the core span of the representative sentence into one token.
    HasWord replace = new Word();
    StringBuilder replaceStr = new StringBuilder();
    for (int i = coreStartIndex - 1; i < coreEndIndex - 1; i++) {
        replaceStr.append(representative_sentence.get(i).toString()).append(' ');
    }
    replace.setWord(replaceStr.toString().trim());
    // Blank out the common span, then drop the replacement into its first slot.
    for (int i = commonStartIndex - 1; i < commonEndIndex - 1; i++) {
        common_sentence.set(i, new Word());
        common_sentence.get(i).setWord("");
    }
    common_sentence.set(commonStartIndex - 1, replace);
}
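A hypothetical driver for the example above, assuming the 1-based index convention the code implies; the sentences and index arguments are made up for illustration:

// Two toy sentences as mutable HasWord lists (data is hypothetical).
ArrayList<List<HasWord>> text = new ArrayList<>();
text.add(new ArrayList<>(Arrays.asList(new Word("Barack"), new Word("Obama"), new Word("spoke"))));
text.add(new ArrayList<>(Arrays.asList(new Word("He"), new Word("spoke"))));
// Replace token 1 of sentence 2 with tokens 1-2 of sentence 1.
perClusterUpdateSen(text, 2, 1, 1, 3, 1, 2);
System.out.println(text.get(1)); // expected: [Barack Obama, spoke]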
Example 2: computeCorpusStatistics
import edu.stanford.nlp.ling.Word; // import the required package/class
public void computeCorpusStatistics() {
    Set<String> stopWords = new HashSet<>(Arrays.asList(Utils.readLines("lib/stopwords.txt")));
    List<String> lines = Arrays.asList(Utils.readLines(source));
    HistMap<String> unigrams = new HistMap<>();      // word counts
    HistMap<String> unigramsTyped = new HistMap<>(); // word/POS-pair counts
    HistMap<String> pos = new HistMap<>();           // POS-tag counts
    for (String line : lines) {
        List<Word> tokens = tokenizer.getTokenizer(new StringReader(line)).tokenize();
        // Count lowercased unigrams, skipping stop words.
        tokens.stream()
                .filter(word -> !stopWords.contains(word.word().toLowerCase()))
                .forEach(word -> unigrams.add(word.word().toLowerCase()));
        // Tag the sentence, then count word/tag pairs (proper nouns keep their
        // case) as well as the tags themselves.
        tagger.tagSentence(tokens).stream()
                .filter(word -> !stopWords.contains(word.word().toLowerCase()))
                .map(tagged -> {
                    unigramsTyped.add(String.format("%s/%s",
                            tagged.tag().equals("NNP") ? tagged.word() : tagged.word().toLowerCase(),
                            tagged.tag()));
                    return tagged.tag();
                })
                .forEach(pos::add);
    }
    Utils.write(String.format("%s.unigrams", source), unigrams.toString());
    Utils.write(String.format("%s.unigramsTyped", source), unigramsTyped.toString());
    Utils.write(String.format("%s.pos", source), pos.toString());
}
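HistMap is a helper from the example's own project, not from CoreNLP. In case it is unfamiliar, here is a minimal stand-in showing only the behavior the example relies on (add() counts occurrences); the real class's API may differ:

import java.util.HashMap;
import java.util.Map;

/** Minimal stand-in for the example's HistMap: a histogram of items. */
class HistMap<E> {
    private final Map<E, Integer> counts = new HashMap<>();

    public void add(E item) {
        counts.merge(item, 1, Integer::sum); // increment this item's count
    }

    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        counts.forEach((k, v) -> sb.append(k).append('\t').append(v).append('\n'));
        return sb.toString();
    }
}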
Example 3: tag
import edu.stanford.nlp.ling.Word; // import the required package/class
public Sentence tag(Sentence sent) {
    // Wrap each token as a Stanford HasWord so the tagger can consume it.
    List<HasWord> ss = new ArrayList<HasWord>();
    for (Token t : sent) {
        HasWord hw = new Word();
        hw.setWord(t.toString());
        ss.add(hw);
    }
    List<TaggedWord> sst = tagger.tagSentence(ss);
    // x.enumerate pairs each element with its index (a Python-style helper
    // from this project); write each tag back onto the original tokens.
    for (tuple2<Integer, TaggedWord> item : x.enumerate(sst)) {
        Token tk = sent.get(item.key);
        tk.annotate("pos", item.value.tag());
        sent.setAt(item.key).value(tk);
    }
    return sent;
}
Example 4: applyPTBTokenizer
import edu.stanford.nlp.ling.Word; // import the required package/class
private static List<String> applyPTBTokenizer(DocumentPreprocessor dp, boolean tokenizeNLs, boolean ptb3Escaping) {
    // Configure a PTB tokenizer factory and hand it to the sentence splitter.
    PTBTokenizerFactory<Word> tf = PTBTokenizer.PTBTokenizerFactory.newWordTokenizerFactory(
            "tokenizeNLs=" + tokenizeNLs + ",ptb3Escaping=" + ptb3Escaping + ",asciiQuotes=true");
    dp.setTokenizerFactory(tf);
    List<String> sentences = new ArrayList<>();
    for (List<HasWord> wordList : dp) {
        StringBuilder sentence = new StringBuilder();
        for (HasWord word : wordList) {
            sentence.append(' ').append(splitCompounds(word.word()));
        }
        sentences.add(sentence.toString().trim()); // drop the leading space
    }
    return sentences;
}
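A possible call site for the method above. DocumentPreprocessor and its String-path constructor are standard CoreNLP; the file name here is hypothetical, and splitCompounds is the surrounding class's own helper:

// Sentence-split and tokenize a file using the factory configured above.
DocumentPreprocessor dp = new DocumentPreprocessor("input.txt"); // hypothetical path
List<String> sentences = applyPTBTokenizer(dp, false, true);
sentences.forEach(System.out::println);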
Example 5: getWords
import edu.stanford.nlp.ling.Word; // import the required package/class
public static List<Word> getWords(Element el) {
    String str = stringFromElement(el);
    // Some elements have newlines in them, and we don't want these :)
    str = removeNewlines(str);
    List<Word> words = Ling.getWordsFromString(str);
    // The Stanford tokenizer sometimes adds a period where it thinks a sentence
    // ends, particularly if something like "Corp." was the last token: it will
    // change this to "Corp. .". We want to remove that extra period since it is
    // not in the actual text.
    if (words.size() > 1 &&
            words.get(words.size() - 1).value().equals(".") &&
            words.get(words.size() - 2).value().endsWith(".")) {
        words.remove(words.size() - 1);
    }
    return words;
}
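To make the trailing-period guard concrete, a hedged illustration; the tokenizer behavior shown in the comments is typical for PTB-style tokenization but not guaranteed:

// Tokenization of abbreviation-final text usually looks like this:
List<Word> raw = Ling.getWordsFromString("He works for Acme Corp.");
// raw is typically [He, works, for, Acme, Corp., .] -- note the added final "."
// The guard in getWords() removes it, leaving [He, works, for, Acme, Corp.]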
Example 6: findEventInEnamex
import edu.stanford.nlp.ling.Word; // import the required package/class
/**
 * Searches an ENAMEX XML Element for nested EVENT Elements to save.
 */
public void findEventInEnamex(Element enamex, List<TextEvent> localEvents, int loc, int sid) {
    NodeList children = enamex.getChildNodes();
    for (int j = 0; j < children.getLength(); j++) {
        Node child = children.item(j);
        if (child.getNodeType() == Node.ELEMENT_NODE) {
            Element el = (Element) child;
            // Save the EVENT
            if (el.getTagName().equals("EVENT")) {
                // Count all the words in the sibling elements that appear
                // before the EVENT, to find its word offset.
                int numwords = 0;
                for (int k = 0; k < j; k++) {
                    Node prev = children.item(k);
                    if (prev.getNodeType() == Node.ELEMENT_NODE) {
                        numwords += getWords((Element) prev).size();
                    }
                }
                // Save the EVENT at its word offset within the sentence.
                TextEvent event = new TextEvent(el.getAttribute("eid"), sid, loc + numwords, el);
                localEvents.add(event);
            }
        }
    }
}
Example 7: tokenize
import edu.stanford.nlp.ling.Word; // import the required package/class
@Override
public String[] tokenize(String sentence) {
    Reader r = new StringReader(sentence);
    PTBTokenizer<Word> tokenizer = PTBTokenizer.newPTBTokenizer(r);
    List<String> l = new ArrayList<String>();
    while (tokenizer.hasNext()) {
        Word w = tokenizer.next();
        l.add(w.word());
    }
    // Slot 0 holds the artificial ROOT token expected by the CoNLL-09 reader.
    String[] tok = new String[l.size() + 1];
    tok[0] = is2.io.CONLLReader09.ROOT;
    int i = 1;
    for (String s : l)
        tok[i++] = s;
    return tok;
}
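A hypothetical call from inside the declaring class; slot 0 carries the artificial ROOT token that the mate-tools CoNLL-09 reader expects before the real tokens:

String[] toks = tokenize("Mr. Smith arrived in Paris.");
// toks[0] == is2.io.CONLLReader09.ROOT; the actual words start at index 1.
for (int i = 1; i < toks.length; i++) {
    System.out.println(i + "\t" + toks[i]);
}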
Example 8: tokenizeplus
import edu.stanford.nlp.ling.Word; // import the required package/class
public StringInText[] tokenizeplus(String sentence) {
    Reader r = new StringReader(sentence);
    PTBTokenizer<Word> tokenizer = PTBTokenizer.newPTBTokenizer(r);
    List<StringInText> l = new ArrayList<StringInText>();
    while (tokenizer.hasNext()) {
        // Keep each token together with its character span in the full text.
        Word w = tokenizer.next();
        l.add(new StringInText(w.word(), w.beginPosition() + startpos, w.endPosition() + startpos));
    }
    StringInText[] tok = new StringInText[l.size() + 1];
    tok[0] = new StringInText(is2.io.CONLLReader09.ROOT, 0, 0);
    int i = 1;
    for (StringInText s : l)
        tok[i++] = s;
    // Advance the running offset past this sentence and its separator.
    startpos += (1 + sentence.length());
    return tok;
}
Example 9: processLines
import edu.stanford.nlp.ling.Word; // import the required package/class
public void processLines(String[] sentencesToProcess, Integer[] sentencesStarts, int parentOffset) throws IOException {
    for (int line_number = 0; line_number < sentencesToProcess.length; line_number++) {
        String line = sentencesToProcess[line_number];
        List<Word> tokensInSentence = getTokens(line);
        ArrayList<Integer> tokens_indexes = new ArrayList<Integer>();
        ArrayList<String> tokens = new ArrayList<String>();
        for (int token_index = 0; token_index < tokensInSentence.size(); token_index++) {
            // Token offset = position within the line + line start + parent offset.
            tokens_indexes.add(tokensInSentence.get(token_index).beginPosition()
                    + sentencesStarts[line_number]
                    + parentOffset);
            tokens.add(tokensInSentence.get(token_index).word());
        }
        sentences_tokens_indexes.put(line_number, tokens_indexes);
        sentences_tokens.put(line_number, tokensInSentence);
        sentences_tokens_string.put(line_number, tokens);
        sentences.put(line_number, line);
    }
}
Example 10: processFile
import edu.stanford.nlp.ling.Word; // import the required package/class
public void processFile() throws IOException {
    List<String> lines = FileUtil.loadLineByLine(txt_file_path);
    int sentence_start = 0;
    for (int line_number = 0; line_number < lines.size(); line_number++) {
        String line = lines.get(line_number);
        List<Word> tokensInSentence = getTokens(line);
        ArrayList<Integer> tokens_indexes = new ArrayList<Integer>();
        for (int token_index = 0; token_index < tokensInSentence.size(); token_index++) {
            // File offset: position in line + preceding characters
            // (+ line_number newlines, + 1).
            tokens_indexes.add(tokensInSentence.get(token_index).beginPosition()
                    + sentence_start + line_number + 1);
        }
        sentences_tokens_indexes.put(line_number, tokens_indexes);
        sentences_tokens.put(line_number, tokensInSentence);
        sentences.put(line_number, line);
        sentence_start += line.length();
    }
}
Example 11: processSentence
import edu.stanford.nlp.ling.Word; // import the required package/class
public List<WordLemmaTag> processSentence(String sentence, boolean isTokenized) {
    final StanfordLemmatizer lemmatizer = StanfordLemmatizer.getInstance();
    final StanfordPOSTagger tagger = StanfordPOSTagger.getInstance();
    final List<WordLemmaTag> tlSentence = new ArrayList<WordLemmaTag>();
    // the tagged sentence
    List<TaggedWord> tSentence = null;
    if (isTokenized) {
        tSentence = tagger.tag(sentence);
    } else {
        StanfordTokenizer tokenizer = StanfordTokenizer.getInstance();
        List<Word> tokens = tokenizer.tokenize(sentence);
        tSentence = tagger.tag(tokens);
    }
    // add each tagged word to the lemmatized sentence
    for (TaggedWord tw : tSentence)
        tlSentence.add(lemmatizer.lemmatize(tw));
    return tlSentence;
}
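A sketch of calling the method above. StanfordLemmatizer, StanfordPOSTagger, and StanfordTokenizer are this project's own singleton wrappers, so only the WordLemmaTag accessors (word(), lemma(), tag()) are standard CoreNLP:

List<WordLemmaTag> analyzed = processSentence("The children were singing.", false);
for (WordLemmaTag wlt : analyzed) {
    // e.g. "singing / sing / VBG" -- surface form, lemma, and POS tag
    System.out.println(wlt.word() + " / " + wlt.lemma() + " / " + wlt.tag());
}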
Example 12: getPCFGScore
import edu.stanford.nlp.ling.Word; // import the required package/class
/**
 * Parses a sentence and returns the PCFG score as a confidence measure.
 *
 * @param sentence a sentence
 * @return PCFG score
 */
@SuppressWarnings("unchecked")
public static double getPCFGScore(String sentence) {
    if (tlp == null || parser == null)
        throw new RuntimeException("Parser has not been initialized");
    // parse the sentence to produce the PCFG score
    log.debug("Parsing sentence");
    double score;
    synchronized (parser) { // serialize access to the shared parser
        Tokenizer tokenizer = tlp.getTokenizerFactory().getTokenizer(new StringReader(sentence));
        List<Word> words = tokenizer.tokenize();
        log.debug("Tokenization: " + words);
        parser.parse(new Sentence(words));
        score = parser.getPCFGScore();
    }
    return score;
}
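A possible use of the score as a rough fluency filter, assuming the class's static parser and tlp fields were initialized beforehand; the comparison below is only illustrative, since PCFG log scores are length-sensitive and loosely comparable at best:

// Log scores are negative; less negative loosely means "easier to parse".
double fluent = getPCFGScore("The cat sat on the mat.");
double shuffled = getPCFGScore("Mat the on sat cat the.");
// For same-length sentences one would usually expect fluent > shuffled.
System.out.printf("fluent=%.2f shuffled=%.2f%n", fluent, shuffled);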
Example 13: getNext
import edu.stanford.nlp.ling.Word; // import the required package/class
/**
 * Internally fetches the next token.
 *
 * @return the next token in the token stream, or null if none exists.
 */
@Override
protected Word getNext() {
    Word token = null;
    if (lexer == null) {
        return token;
    }
    try {
        token = lexer.next();
        // Skip carriage-return tokens unless end-of-line is significant.
        while (token == ArabicLexer.crValue) {
            if (eolIsSignificant) {
                return token;
            } else {
                token = lexer.next();
            }
        }
    } catch (IOException e) {
        // do nothing; fall through and return whatever we have (possibly null)
    }
    return token;
}
Example 14: main
import edu.stanford.nlp.ling.Word; // import the required package/class
/**
 * Reads a file from the argument and prints its tokens one per line.
 * This is mainly a testing aid, but it can also be quite useful
 * standalone to turn a corpus into a one-token-per-line file.
 * <p/>
 * Usage: <code>java edu.stanford.nlp.process.ArabicTokenizer filename</code>
 *
 * @param args Command line arguments
 * @throws IOException If it can't open files, etc.
 */
public static void main(String[] args) throws IOException {
    if (args.length < 1) {
        System.err.println("usage: java edu.stanford.nlp.process.ArabicTokenizer [-cr] filename");
        return;
    }
    // An optional leading -cr flag makes carriage returns significant tokens.
    ArabicTokenizer tokenizer = new ArabicTokenizer(
            new InputStreamReader(new FileInputStream(args[args.length - 1]), "UTF-8"),
            args[0].equals("-cr"));
    PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, "UTF-8"), true);
    while (tokenizer.hasNext()) {
        Word w = tokenizer.next();
        if (w == ArabicLexer.crValue) {
            pw.println("***CR***");
        } else {
            pw.println(w);
        }
    }
}