

Java Word Class Code Examples

This article collects typical usage examples of the Java class edu.stanford.nlp.ling.Word. If you are unsure what the Word class does, how to use it, or want to see it in real code, the curated examples below should help.


The Word class belongs to the edu.stanford.nlp.ling package. Fifteen code examples of the class are shown below, sorted by popularity by default.
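As a quick orientation before the examples: Word is Stanford NLP's simplest token class. It implements HasWord and carries a token string plus optional character offsets. Below is a minimal sketch of both direct construction and tokenizer output (the API calls are standard Stanford CoreNLP; the sample sentence is invented):

import java.io.StringReader;
import java.util.List;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.process.PTBTokenizer;

public class WordDemo {
	public static void main(String[] args) {
		// Construct a Word directly and read its string back
		Word w = new Word("example");
		System.out.println(w.word()); // prints "example"

		// Obtain Words from the Penn Treebank tokenizer
		PTBTokenizer<Word> tokenizer = PTBTokenizer.newPTBTokenizer(
				new StringReader("Stanford NLP tokenizes text."));
		List<Word> tokens = tokenizer.tokenize();
		for (Word t : tokens)
			System.out.println(t.word() + " [" + t.beginPosition() + "," + t.endPosition() + ")");
	}
}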

Example 1: perClusterUpdateSen

import edu.stanford.nlp.ling.Word; // import the required package/class
public static void perClusterUpdateSen(ArrayList<List<HasWord>> processedText,
		int common_sentNum, int representative_sentNum,
		int coreStartIndex, int coreEndIndex,
		int commonStartIndex, int commonEndIndex){
	
	List<HasWord> representative_sentence = 
			processedText.get(representative_sentNum-1);
	List<HasWord> common_sentence = 
			processedText.get(common_sentNum-1);
	
	// Build a single replacement token out of the representative span
	HasWord replace = new Word();
	String replaceStr = "";
	for (int i = coreStartIndex-1; i < coreEndIndex - 1; i++){
		replaceStr += representative_sentence.get(i).toString();
		replaceStr += " ";
	}
	replace.setWord(replaceStr.trim());
	// Blank out the common span, then substitute the replacement at its first position
	for (int i=commonStartIndex-1; i < commonEndIndex-1; i++){
		common_sentence.set(i,new Word());
		common_sentence.get(i).setWord("");
	}
	common_sentence.set(commonStartIndex-1, replace);
	
}
 
Developer ID: cs-zyluo, Project: CausalNet, Lines: 25, Source file: Coreferencer.java
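The method mutates processedText in place: the representative span is collapsed into a single replacement Word, every token of the common span is blanked, and the replacement is written at the span's first position. A hypothetical call, assuming the 1-based indices that the -1 arithmetic implies:

// Replace tokens 2..4 of sentence 3 with tokens 1..3 of sentence 5 (indices assumed 1-based)
perClusterUpdateSen(processedText, 3, 5, 1, 4, 2, 5);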

Example 2: computeCorpusStatistics

import edu.stanford.nlp.ling.Word; // import the required package/class
public void computeCorpusStatistics() {
    Set<String> stopWords = new HashSet<>(Arrays.asList(Utils.readLines("lib/stopwords.txt")));
    List<String> lines = Arrays.asList(Utils.readLines(source));
    HistMap<String> unigrams = new HistMap<>();
    HistMap<String> unigramsTyped = new HistMap<>();
    HistMap<String> pos = new HistMap<>();
    for (String line : lines) {
        List<Word> tokens = tokenizer.getTokenizer(new StringReader(line)).tokenize();            
        tokens.stream()
                .filter(word -> !stopWords.contains(word.word().toLowerCase()))
                .forEach(word -> unigrams.add(word.word().toLowerCase()));
        tagger.tagSentence(tokens).stream()
                .filter(word -> !stopWords.contains(word.word().toLowerCase()))
                .map(tagged -> {
                    unigramsTyped.add(String.format("%s/%s", tagged.tag().equals("NNP") ? tagged.word() : tagged.word().toLowerCase(), tagged.tag()));
                    return tagged.tag();
                })
                .forEach(pos::add);
    }        
    Utils.write(String.format("%s.unigrams", source), unigrams.toString());
    Utils.write(String.format("%s.unigramsTyped", source), unigramsTyped.toString());
    Utils.write(String.format("%s.pos", source), pos.toString());
}
 
Developer ID: sinantie, Project: Generator, Lines: 24, Source file: ExportExamplesToSentences.java
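Note that HistMap and Utils are the Generator project's own helpers, not Stanford classes; the CoreNLP surface here is the tokenizer factory's getTokenizer(Reader), which yields the List<Word> tokens, and the tagger's tagSentence.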

Example 3: tag

import edu.stanford.nlp.ling.Word; // import the required package/class
public Sentence tag(Sentence sent) {
	List<HasWord> ss = new ArrayList<HasWord>();
	for (Token t : sent) {
		HasWord hw = new Word();
		hw.setWord(t.toString());
		ss.add(hw);
	} 
	List<TaggedWord> sst = tagger.tagSentence(ss);
	for (tuple2<Integer,TaggedWord> item : x.enumerate(sst)) {
		Token tk = sent.get(item.key);
		tk.annotate("pos", item.value.tag());
		sent.setAt(item.key).value(tk);
	}
	  
	return sent;
}
 
Developer ID: WantedTechnologies, Project: xpresso, Lines: 17, Source file: MaxentPosTagger.java
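Here tuple2 and x.enumerate are xpresso's Python-style helpers (the analogue of Python's enumerate), not Stanford classes; the CoreNLP part of the example is building the List<HasWord> from Word instances and calling tagger.tagSentence.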

Example 4: applyPTBTokenizer

import edu.stanford.nlp.ling.Word; // import the required package/class
private static List<String> applyPTBTokenizer(DocumentPreprocessor dp, boolean tokenizeNLs, boolean ptb3Escaping) {
	PTBTokenizerFactory<Word> tf = PTBTokenizer.PTBTokenizerFactory.newWordTokenizerFactory("tokenizeNLs=" + tokenizeNLs + ",ptb3Escaping=" + ptb3Escaping + ",asciiQuotes=true");
	dp.setTokenizerFactory(tf);
	List<String> sentences = new ArrayList<>();
	for (List<HasWord> wordList : dp) {
		String sentence = "";
		for (HasWord word : wordList) {
			sentence += " " + splitCompounds(word.word());
		}
		sentences.add(sentence);
	}
	return sentences;
}
 
Developer ID: infolis, Project: infoLink, Lines: 14, Source file: TokenizerStanford.java
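A hypothetical driver for the method above. DocumentPreprocessor is CoreNLP's standard sentence splitter; since applyPTBTokenizer is private, the call is assumed to be made from inside TokenizerStanford:

DocumentPreprocessor dp = new DocumentPreprocessor(
		new StringReader("Dr. Smith stayed home. He was tired."));
List<String> sentences = applyPTBTokenizer(dp, false, true);
// each entry is one sentence, its tokens joined by single spaces (note the leading space)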

Example 5: getWords

import edu.stanford.nlp.ling.Word; // import the required package/class
public static List<Word> getWords(Element el) {
    String str = stringFromElement(el);
    // Some elements have newlines in them, and we don't want these :)
    str = removeNewlines(str);
    List<Word> words = Ling.getWordsFromString(str);
    
    // The Stanford tokenizer sometimes adds a period where it thinks a sentence ends, particularly
    // if something like "Corp." was the last token, then it will change this to "Corp. ."
    // We want to remove that period since it is not in the actual text.
    if( words.size() > 1 && 
        words.get(words.size()-1).value().equals(".") &&
        words.get(words.size()-2).value().endsWith(".") ) {
//      System.out.println("getWords() removing period from: " + words);
      words.remove(words.size()-1);
//      System.out.println("\t-> " + words);
    }
        
//    System.out.println("getWords(Element) str = " + str);
    return words;
  }
 
Developer ID: nchambers, Project: schemas, Lines: 21, Source file: TimebankParser.java

Example 6: findEventInEnamex

import edu.stanford.nlp.ling.Word; // import the required package/class
/**
 * Searches an ENAMEX XML Element for nested EVENT Elements to save
 */
public void findEventInEnamex(Element enamex, List<TextEvent> localEvents, int loc, int sid) {
  NodeList children = enamex.getChildNodes();
  for( int j = 0; j < children.getLength(); j++ ) {
    Node child = children.item(j);
    if( child.getNodeType() == Node.ELEMENT_NODE ) {
      Element el = (Element)child;
      // Save the EVENT
      if( el.getTagName().equals("EVENT") ) {
        int numwords = 0;
        // Count all the words that appear before the EVENT
        for( int k = 0; k < j; k++ ) {
          List<Word> words = getWords(el);
          numwords += words.size();
        }
        // Save the EVENT
        TextEvent event = new TextEvent(el.getAttribute("eid"),sid,loc+numwords,el);
        localEvents.add(event);
        //		    events.put(el.getAttribute("eid"), event);
      }
    }
  }
}
 
Developer ID: nchambers, Project: schemas, Lines: 26, Source file: TimebankParser.java

Example 7: tokenize

import edu.stanford.nlp.ling.Word; // import the required package/class
@Override
public String[] tokenize(String sentence) {
	Reader r = new StringReader(sentence);
	PTBTokenizer<Word> tokenizer = PTBTokenizer.newPTBTokenizer(r);
	List<String> l = new ArrayList<String>();
	while (tokenizer.hasNext()) {
		Word w = tokenizer.next();
		l.add(w.word());
	}
	String[] tok = new String[l.size() + 1];
	tok[0] = is2.io.CONLLReader09.ROOT;
	int i = 1;
	for (String s : l)
		tok[i++] = s;
	return tok;
}
 
Developer ID: microth, Project: mateplus, Lines: 17, Source file: StanfordPTBTokenizer.java
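Usage is a one-liner, assuming StanfordPTBTokenizer has a no-argument constructor (the constructor is not part of this excerpt):

String[] tokens = new StanfordPTBTokenizer().tokenize("John loves Mary.");
// tokens[0] is the artificial CoNLL-2009 ROOT token; the real words start at index 1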

Example 8: tokenizeplus

import edu.stanford.nlp.ling.Word; // import the required package/class
public StringInText[] tokenizeplus(String sentence) {
	Reader r = new StringReader(sentence);
	PTBTokenizer<Word> tokenizer = PTBTokenizer.newPTBTokenizer(r);
	List<StringInText> l = new ArrayList<StringInText>();
	while (tokenizer.hasNext()) {
		Word w = tokenizer.next();
		l.add(new StringInText(w.word(), w.beginPosition() + startpos, w
				.endPosition() + startpos));
	}
	StringInText[] tok = new StringInText[l.size() + 1];
	tok[0] = new StringInText(is2.io.CONLLReader09.ROOT, 0, 0);
	int i = 1;
	for (StringInText s : l)
		tok[i++] = s;

	startpos += (1 + sentence.length());

	return tok;
}
 
Developer ID: microth, Project: mateplus, Lines: 20, Source file: StanfordPTBTokenizer.java
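Compared with Example 7, tokenizeplus also records character offsets via Word.beginPosition() and endPosition(), shifting them by the running startpos field so that offsets remain absolute across successive sentences of a document.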

Example 9: processLines

import edu.stanford.nlp.ling.Word; // import the required package/class
public void processLines(String[] sentencesToProcess, Integer[] sentencesStarts, int parentOffset) throws IOException{
	for(int line_number = 0;line_number<sentencesToProcess.length;line_number++){
		String line = sentencesToProcess[line_number];
		List<Word> tokensInSentence = getTokens(line);
		ArrayList<Integer> tokens_indexes = new ArrayList<Integer>();
		ArrayList<String> tokens = new ArrayList<String>();
		
		for(int token_index = 0;token_index<tokensInSentence.size();token_index++)
		{
			tokens_indexes.add(tokensInSentence.get(token_index).beginPosition()+
					sentencesStarts[line_number]+
					parentOffset);
			tokens.add(tokensInSentence.get(token_index).word());
		}
		
		sentences_tokens_indexes.put(line_number, tokens_indexes);
		sentences_tokens.put(line_number, tokensInSentence);
		sentences_tokens_string.put(line_number, tokens);
		sentences.put(line_number, line);
	}
	
}
 
Developer ID: ehsane, Project: rainbownlp, Lines: 23, Source file: Tokenizer.java

Example 10: processFile

import edu.stanford.nlp.ling.Word; // import the required package/class
public void processFile() throws IOException{
	List<String> lines = FileUtil.loadLineByLine(txt_file_path);
	
	int sentence_start=0;
	for(int line_number = 0;line_number<lines.size();line_number++){
		String line = lines.get(line_number);
		List<Word> tokensInSentence = getTokens(line);
		ArrayList<Integer> tokens_indexes = new ArrayList<Integer>();
		
		for(int token_index = 0;token_index<tokensInSentence.size();token_index++)
		{
			tokens_indexes.add(tokensInSentence.get(token_index).beginPosition()+sentence_start+line_number+1);
		}
		
		sentences_tokens_indexes.put(line_number, tokens_indexes);
		sentences_tokens.put(line_number, tokensInSentence);
		sentences.put(line_number, line);
		sentence_start+= line.length();
	}
	
}
 
Developer ID: ehsane, Project: rainbownlp, Lines: 22, Source file: Tokenizer.java

Example 11: processSentence

import edu.stanford.nlp.ling.Word; // import the required package/class
public List<WordLemmaTag> processSentence(String sentence, boolean isTokenized)
{
	final StanfordLemmatizer lemmatizer = StanfordLemmatizer.getInstance();
	final StanfordPOSTagger tagger = StanfordPOSTagger.getInstance();
	final List<WordLemmaTag> tlSentence = new ArrayList<WordLemmaTag>();

	// the tagged sentence
	List<TaggedWord> tSentence = null;
	if (isTokenized) tSentence = tagger.tag(sentence);
	else
	{
		StanfordTokenizer tokenizer = StanfordTokenizer.getInstance();
		List<Word> tokens = tokenizer.tokenize(sentence);
		tSentence = tagger.tag(tokens);
	}

	// add to the lemmatized sentence
	for (TaggedWord tw : tSentence)
		tlSentence.add(lemmatizer.lemmatize(tw));

	return tlSentence;
}
 
Developer ID: pschuette22, Project: Zeppa-AppEngine, Lines: 23, Source file: SentenceProcessor.java
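A hypothetical call; StanfordLemmatizer, StanfordPOSTagger, and StanfordTokenizer are the project's own singleton wrappers, while WordLemmaTag is the standard edu.stanford.nlp.ling word/lemma/tag triple:

List<WordLemmaTag> analyzed = processSentence("The dogs were barking.", false);
for (WordLemmaTag wlt : analyzed)
	System.out.println(wlt.word() + "/" + wlt.lemma() + "/" + wlt.tag());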

Example 12: getPCFGScore

import edu.stanford.nlp.ling.Word; // import the required package/class
/**
 * Parses a sentence and returns the PCFG score as a confidence measure.
 * 
 * @param sentence a sentence
 * @return PCFG score
 */
@SuppressWarnings("unchecked")
public static double getPCFGScore(String sentence) {
    if (tlp == null || parser == null)
        throw new RuntimeException("Parser has not been initialized");

    // parse the sentence to produce PCFG score
    log.debug("Parsing sentence");
    double score;
    synchronized (parser) {
        Tokenizer tokenizer = tlp.getTokenizerFactory().getTokenizer(new StringReader(sentence));
        List<Word> words = tokenizer.tokenize();
        log.debug("Tokenization: " + words);
        parser.parse(new Sentence(words));
        score = parser.getPCFGScore();
    }

    return score;
}
 
Developer ID: claritylab, Project: lucida, Lines: 25, Source file: StanfordParser.java
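A hedged usage sketch: tlp and parser are static fields that must be populated first, so the initialization call below is assumed rather than taken from the excerpt:

// StanfordParser.initialize(...); // project-specific setup, assumed
double score = StanfordParser.getPCFGScore("The quick brown fox jumps over the lazy dog.");
System.out.println("PCFG score: " + score);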

Example 13: getPCFGScore

import edu.stanford.nlp.ling.Word; // import the required package/class
/**
 * Parses a sentence and returns the PCFG score as a confidence measure.
 * 
 * @param sentence
 *            a sentence
 * @return PCFG score
 */
@SuppressWarnings("unchecked")
public static double getPCFGScore(String sentence)
{
    if (tlp == null || parser == null)
        throw new RuntimeException("Parser has not been initialized");

    // parse the sentence to produce PCFG score
    log.debug("Parsing sentence");
    double score;
    synchronized (parser)
    {
        Tokenizer tokenizer = tlp.getTokenizerFactory().getTokenizer(
            new StringReader(sentence));
        List<Word> words = tokenizer.tokenize();
        log.debug("Tokenization: " + words);
        parser.parse(new Sentence(words));
        score = parser.getPCFGScore();
    }

    return score;
}
 
Developer ID: TScottJ, Project: OpenEphyra, Lines: 29, Source file: StanfordParser.java
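Example 13 is the same getPCFGScore method as Example 12, drawn from OpenEphyra rather than Lucida; the two projects appear to share this parser wrapper, and only the formatting differs.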

Example 14: getNext

import edu.stanford.nlp.ling.Word; // import the required package/class
/**
 * Internally fetches the next token.
 *
 * @return the next token in the token stream, or null if none exists.
 */
@Override
protected Word getNext() {
  Word token = null;
  if (lexer == null) {
    return token;
  }
  try {
    token = lexer.next();
    while (token == ArabicLexer.crValue) {
      if (eolIsSignificant) {
        return token;
      } else {
        token = lexer.next();
      }
    }
  } catch (IOException e) {
    // do nothing, return null
  }
  return token;
}
 
Developer ID: FabianFriedrich, Project: Text2Process, Lines: 26, Source file: ArabicTokenizer.java

Example 15: main

import edu.stanford.nlp.ling.Word; // import the required package/class
/**
 * Reads a file from the argument and prints its tokens one per line.
 * This is mainly as a testing aid, but it can also be quite useful
 * standalone to turn a corpus into a one token per line file of tokens.
 * <p/>
 * Usage: <code>java edu.stanford.nlp.process.ArabicTokenizer filename
 * </code>
 *
 * @param args Command line arguments
 * @throws IOException If can't open files, etc.
 */
public static void main(String[] args) throws IOException {
  if (args.length < 1) {
    System.err.println("usage: java edu.stanford.nlp.process.ArabicTokenizer [-cr] filename");
    return;
  }
  ArabicTokenizer tokenizer = new ArabicTokenizer(new InputStreamReader(new FileInputStream(args[args.length - 1]), "UTF-8"), args[0].equals("-cr"));
  PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, "UTF-8"), true);
  while (tokenizer.hasNext()) {
    Word w = tokenizer.next();
    if (w == ArabicLexer.crValue) {
      pw.println("***CR***");
    } else {
      pw.println(w);
    }
  }
}
 
Developer ID: FabianFriedrich, Project: Text2Process, Lines: 28, Source file: ArabicTokenizer.java


Note: The edu.stanford.nlp.ling.Word examples above were compiled by 纯净天空 from open-source code hosted on GitHub, MSDocs, and similar platforms. The snippets were selected from projects contributed by open-source developers, and copyright remains with the original authors; consult each project's license before distributing or reusing the code, and do not republish without permission.