当前位置: 首页>>代码示例>>Java>>正文


Java TokenizationKind类代码示例

本文整理汇总了Java中edu.jhu.hlt.concrete.TokenizationKind的典型用法代码示例。如果您正苦于以下问题:Java TokenizationKind类的具体用法?Java TokenizationKind怎么用?Java TokenizationKind使用的例子?那么, 这里精选的类代码示例或许可以为您提供帮助。


TokenizationKind类属于edu.jhu.hlt.concrete包,在下文中一共展示了TokenizationKind类的9个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: generateConcreteTokenization

import edu.jhu.hlt.concrete.TokenizationKind; //导入依赖的package包/类
/**
 * Builds a TOKEN_LIST {@link Tokenization} from parallel arrays of token strings and
 * character offsets, shifting every span by {@code startPos}.
 *
 * @param tokens
 *          the token strings, in sentence order
 * @param offsets
 *          character offset of each token, relative to {@code startPos}
 * @param startPos
 *          absolute position of the first text character
 * @return a {@link Tokenization} whose token indices equal the array positions
 */
public static Tokenization generateConcreteTokenization(String[] tokens, int[] offsets, int startPos) {
  Tokenization tokenization = new Tokenization();
  tokenization.setKind(TokenizationKind.TOKEN_LIST);
  tokenization.setMetadata(new AnnotationMetadata(tiftMetadata));
  tokenization.setUuid(UUIDFactory.newUUID());

  TokenList tokenList = new TokenList();
  for (int i = 0; i < tokens.length; i++) {
    // The token's position in the input array doubles as its token id.
    int begin = startPos + offsets[i];
    Token t = new Token();
    t.setText(tokens[i]);
    t.setTokenIndex(i);
    t.setTextSpan(new TextSpan(begin, begin + tokens[i].length()));
    tokenList.addToTokenList(t);
  }

  tokenization.setTokenList(tokenList);
  return tokenization;
}
 
开发者ID:hltcoe,项目名称:concrete-java,代码行数:33,代码来源:ConcreteTokenization.java

示例2: coreLabelToTokenization

import edu.jhu.hlt.concrete.TokenizationKind; //导入依赖的package包/类
/**
 * Creates a fresh TOKEN_LIST {@link Tokenization} with an empty token list and
 * current-time metadata, then delegates token population to the two-argument overload.
 *
 * @param cOffset character offset passed through to the overload
 * @return the populated Tokenization
 * @throws AnalyticException propagated from the overload
 * @throws ConcreteException propagated from tokenization creation
 */
private Tokenization coreLabelToTokenization(int cOffset) throws AnalyticException, ConcreteException {
  Tokenization tokenization = new TokenizationFactory(this.gen).create();
  tokenization.setKind(TokenizationKind.TOKEN_LIST);
  tokenization.setTokenList(new TokenList(new ArrayList<Token>()));
  tokenization.setMetadata(
      AnnotationMetadataFactory.fromCurrentLocalTime().setTool("Stanford CoreNLP PTB"));
  return this.coreLabelToTokenization(cOffset, tokenization);
}
 
开发者ID:hltcoe,项目名称:concrete-stanford-deprecated2,代码行数:10,代码来源:CoreMapWrapper.java

示例3: isValid

import edu.jhu.hlt.concrete.TokenizationKind; //导入依赖的package包/类
/**
 * Check:
 * <ol>
 * <li>UUID is valid</li>
 * <li>Metadata is set</li>
 * <li>Metadata is valid</li>
 * <li>TokenizationKind is set</li>
 * </ol>
 *
 * <ul>
 * <li>If TokenizationKind == Lattice, check Lattice exists and List does not</li>
 * <li>If TokenizationKind == List, check List exists and Lattice does not; validate List[Token]</li>
 * </ul>
 */
/* (non-Javadoc)
 * @see edu.jhu.hlt.concrete.validation.AbstractAnnotation#isValid()
 */
@Override
public boolean isValid() {
  // Structural checks shared by both tokenization kinds.
  boolean basics = this.validateUUID(this.annotation.getUuid())
      && this.printStatus("Metadata must be set", this.annotation.isSetMetadata())
      && this.printStatus("Metadata must be valid", new ValidatableMetadata(this.annotation.getMetadata()).isValid())
      && this.printStatus("TokenizationKind must be set.", this.annotation.isSetKind());
  if (!basics)
    return false;
  else {
    boolean validByType = true;
    if (this.annotation.getKind() == TokenizationKind.TOKEN_LATTICE)
      // Lattice kind: the lattice field, and only the lattice field, may be populated.
      validByType = this.printStatus("Kind == LATTICE, so lattice must be set, AND list must NOT be set.", this.annotation.isSetLattice() && !this.annotation.isSetTokenList());

    else {
      // List kind: the token list, and only the token list, may be populated, and it
      // must be non-empty and valid. BUG FIX: the diagnostic previously read
      // "AND list must NOT be set", contradicting the !isSetLattice() condition.
      validByType = this.printStatus("Kind == LIST, so list must be set, AND lattice must NOT be set.", this.annotation.isSetTokenList() && !this.annotation.isSetLattice())
          && this.printStatus("TokenList must not be empty.", this.annotation.getTokenList().getTokenListSize() > 0)
          && this.printStatus("TokenList must be valid.", this.validateTokenList());
      if (validByType) {
        // Token taggings are only checked once the list itself is known-good.
        validByType = this.validateTokenTaggings();
      }
    }

    return validByType;
  }
}
 
开发者ID:hltcoe,项目名称:concrete-java,代码行数:43,代码来源:ValidatableTokenization.java

示例4: testChinese1

import edu.jhu.hlt.concrete.TokenizationKind; //导入依赖的package包/类
// Builds a Communication with pre-segmented Chinese text (section, sentence, and
// tokenization wired by hand), validates it, then runs the Stanford pre-coref
// analytic over it and logs the resulting annotations.
@Test
public void testChinese1() throws Exception {
  Communication chineseComm = this.cf.communication().setText(chineseText1);
  // Timestamp converted from millis to seconds via the /1000.
  AnnotationMetadata md = new AnnotationMetadata().setTool(
      "concrete-stanford:test").setTimestamp(
      System.currentTimeMillis() / 1000);
  chineseComm.setMetadata(md);
  // Single section covering the full text; kind mirrors an SGML-style tag.
  Section section = new Section()
      .setUuid(UUIDFactory.newUUID())
      .setTextSpan(
          new TextSpan().setStart(0).setEnding(chineseText1.length()))
      .setKind("</TEXT>");
  chineseComm.addToSectionList(section);
  // Single sentence spanning the same range as the section.
  Sentence sentence = new Sentence().setUuid(UUIDFactory.newUUID())
      .setTextSpan(
          new TextSpan().setStart(0).setEnding(chineseText1.length()));
  section.addToSentenceList(sentence);
  Tokenization tokenization = new Tokenization()
      .setUuid(UUIDFactory.newUUID()).setMetadata(md)
      .setKind(TokenizationKind.TOKEN_LIST);
  TokenList tokenList = new TokenList();
  int tokId = 0;
  int tokenStart = 0, tokenEnd = 0;
  // Tokens are the whitespace-separated chunks of the pre-segmented text; the
  // span arithmetic assumes exactly one separator character between tokens.
  for (String tokenStr : chineseText1.split(" +")) {
    tokenEnd += tokenStr.length();
    Token token = new Token().setTokenIndex(tokId++).setText(tokenStr)
        .setTextSpan(new TextSpan().setStart(tokenStart).setEnding(tokenEnd));
    tokenStart = tokenEnd + 1;
    tokenEnd = tokenStart;
    tokenList.addToTokenList(token);
  }
  tokenization.setTokenTaggingList(new ArrayList<>());
  tokenization.setTokenList(tokenList);
  sentence.setTokenization(tokenization);

  assertTrue(new CommunicationValidator(chineseComm).validate());

  TokenizedCommunication tc = new CachedTokenizationCommunication(chineseComm);
  // NOTE(review): wDepParse is never read below (its only uses are commented
  // out); presumably annotate() is exercised for absence of exceptions — confirm.
  TokenizedCommunication wDepParse = new ConcreteStanfordPreCorefAnalytic(PipelineLanguage.CHINESE).annotate(tc);

  // Log everything that came back on the original wrapped communication.
  List<Section> sectList = tc.getSections();
  sectList.forEach(s -> {
    LOGGER.info("Got kind: {}", s.getKind());
    LOGGER.info("Got text span: {}", new SuperTextSpan(s.getTextSpan(), chineseComm).getText());

    s.getSentenceList().forEach(st -> {
      LOGGER.info("Got sentence: {}", new SuperTextSpan(st.getTextSpan(), chineseComm).getText());
      Tokenization tkz = st.getTokenization();
      tkz.getTokenList().getTokenList().forEach(tok -> {
        LOGGER.info("Got token text: {}", new SuperTextSpan(tok.getTextSpan(), chineseComm).getText());
      });
      tkz.getTokenTaggingList().forEach(tt -> {
        LOGGER.info("Got TT: {} [kind: {}]", tt.getUuid().getUuidString(), tt.getTaggingType());
      });
    });
  });

  // List<Tokenization> ntkzList = wDepParse.getTokenizations();
  // Tokenization ntkz = ntkzList.get(0);
  // parse disabled by default
  // assertTrue(ntkz.isSetParseList());
  // assertEquals(1, ntkz.getParseListSize());
}
 
开发者ID:hltcoe,项目名称:concrete-stanford-deprecated2,代码行数:64,代码来源:NonPTBChineseTextTest.java

示例5: getAnnoSentence

import edu.jhu.hlt.concrete.TokenizationKind; //导入依赖的package包/类
/**
 * Converts a Concrete {@link Tokenization} into an {@code AnnoSentence}, copying
 * words, tag layers (POS, coarse POS, lemma, chunk), and any dependency or
 * constituency parse selected by the tool names configured in {@code prm}.
 *
 * @param tokenization a TOKEN_LIST tokenization to convert
 * @return the populated sentence
 * @throws IllegalArgumentException if the tokenization is not of kind TOKEN_LIST
 */
private AnnoSentence getAnnoSentence(Tokenization tokenization) {
    TokenizationKind tokKind = tokenization.getKind();
    if (tokKind != TokenizationKind.TOKEN_LIST) {
        throw new IllegalArgumentException("tokens must be of kind TOKEN_LIST: " + tokKind);
    }

    AnnoSentence sent = new AnnoSentence();

    // Words: surface form of every token, in order.
    List<String> wordList = new ArrayList<String>();
    for (Token token : tokenization.getTokenList().getTokenList()) {
        wordList.add(token.getText());
    }
    sent.setWords(wordList);

    // POS tags, Lemmas, and Chunks — each layer looked up by type and tool name.
    sent.setPosTags(getTagging(ConcreteUtils.getFirstXTagsWithName(tokenization, TagTypes.POS.name(), prm.posTool)));
    sent.setCposTags(getTagging(ConcreteUtils.getFirstXTagsWithName(tokenization, TagTypes.POS.name(), prm.cposTool)));
    sent.setLemmas(getTagging(ConcreteUtils.getFirstXTagsWithName(tokenization, TagTypes.LEMMA.name(), prm.lemmaTool)));
    sent.setChunks(getTagging(ConcreteUtils.getFirstXTagsWithName(tokenization, "CHUNK", prm.chunkTool)));

    // Dependency Parse
    if (tokenization.isSetDependencyParseList()) {
        int numWords = wordList.size();
        log.trace("Reading dependency parse with name {}", prm.depParseTool);
        DependencyParse depParse = ConcreteUtils.getFirstDependencyParseWithName(tokenization, prm.depParseTool);
        Pair<int[], List<String>> parentsAndDeprels = getParentsDeprels(depParse, numWords);
        if (parentsAndDeprels != null) {
            sent.setParents(parentsAndDeprels.get1());
            sent.setDeprels(parentsAndDeprels.get2());
        }
    }

    // Constituency Parse
    if (tokenization.isSetParseList()) {
        sent.setNaryTree(getParse(ConcreteUtils.getFirstParseWithName(tokenization, prm.parseTool)));
    }

    return sent;
}
 
开发者ID:mgormley,项目名称:pacaya-nlp,代码行数:47,代码来源:ConcreteReader.java

示例6: ingestText

import edu.jhu.hlt.concrete.TokenizationKind; //导入依赖的package包/类
/**
 * Builds a Concrete {@link Communication} from plain text: one section/sentence per
 * non-blank line, tokens split on whitespace. Character offsets are computed against
 * the normalized text (tokens joined by single spaces, lines joined by newlines),
 * which is also stored as the communication's text.
 *
 * @param text raw input text (lines separated by '\n')
 * @param commId id assigned to the communication
 * @param commTool tool name recorded in the communication metadata
 * @param tokTool tool name recorded in each tokenization's metadata
 * @return a communication whose token spans index into its own (normalized) text
 */
public static Communication ingestText(String text, String commId, String commTool, String tokTool) {
    // NOTE(review): original javadoc declared @throws ConcreteException but the
    // signature has no throws clause; dropped here — confirm nothing checked escapes.
    AnnotationMetadata commMeta = new AnnotationMetadata();
    commMeta.setTimestamp(System.currentTimeMillis());
    commMeta.setTool(commTool);
    AnnotationMetadata tokMeta = new AnnotationMetadata();
    tokMeta.setTimestamp(System.currentTimeMillis());
    tokMeta.setTool(tokTool);

    Communication comm = new Communication();
    comm.setId(commId);
    comm.setType("corpus");
    comm.setMetadata(commMeta);

    List<Section> sectionList = new ArrayList<>();
    List<String> normalizedLines = new ArrayList<>();
    int charOffset = 0; // running offset into the normalized output text
    for (String rawLine : text.trim().split("\\n")) {
        String line = rawLine.trim();
        if (line.isEmpty()) { continue; } // skip blank lines entirely
        List<String> tokenTexts = new ArrayList<>();
        List<Token> tokenObjs = new ArrayList<>();
        int index = 0;
        for (String rawTok : line.split("\\s+")) {
            String tokText = rawTok.trim();

            Token tok = new Token();
            tok.setTokenIndex(index);
            tok.setTextSpan(new TextSpan(charOffset, charOffset + tokText.length()));
            tok.setText(tokText);

            tokenObjs.add(tok);
            tokenTexts.add(tokText);
            index += 1;
            // +1 accounts for the single space/newline that follows in the output text.
            charOffset += tokText.length() + 1;
        }
        Tokenization tokenization = new Tokenization(UUIDFactory.newUUID(), tokMeta, TokenizationKind.TOKEN_LIST);
        Section sect = new Section(UUIDFactory.newUUID(), "section");
        Sentence sent = new Sentence(UUIDFactory.newUUID());

        tokenization.setTokenList(new TokenList(tokenObjs));
        sent.setTokenization(tokenization);
        sect.addToSentenceList(sent);

        sectionList.add(sect);
        normalizedLines.add(String.join(" ", tokenTexts));
    }
    comm.setSectionList(sectionList);
    comm.setText(String.join("\n", normalizedLines));
    return comm;
}
 
开发者ID:mgormley,项目名称:pacaya-nlp,代码行数:54,代码来源:ConcreteUtils.java

示例7: createSimpleCommunication

import edu.jhu.hlt.concrete.TokenizationKind; //导入依赖的package包/类
/**
 * Builds a minimal single-sentence {@link Communication} fixture
 * ("vice pres says jump") with one section, one sentence, and a four-token
 * tokenization, for use in writer tests.
 *
 * @return a fully wired test Communication
 * @throws Exception propagated from UUID generation
 */
public static Communication createSimpleCommunication() throws Exception {
    Communication comm = new Communication();
    comm.setId("Gore-y Landing");
    comm.setText("vice pres says jump");
    // 0123456789012345678
    comm.setType("Test");
    comm.setUuid(getUUID());

    AnnotationMetadata commMetadata = new AnnotationMetadata();
    commMetadata.setTimestamp(System.currentTimeMillis());
    commMetadata.setTool("TestTool");
    comm.setMetadata(commMetadata);

    Tokenization tokenization = new Tokenization();
    tokenization.setKind(TokenizationKind.TOKEN_LIST);
    tokenization.setUuid(getUUID());

    List<Token> listOfTokens = new ArrayList<Token>();

    // Token index doubles as the token's position in the sentence.
    String[] tokens = new String[] { "vice", "pres", "says", "jump" };
    for (int i = 0; i < tokens.length; i++) {
        Token token = new Token();
        token.setText(tokens[i]);
        token.setTokenIndex(i);
        listOfTokens.add(i, token);
    }
    TokenList tokenList = new TokenList();
    tokenList.setTokenList(listOfTokens);
    tokenization.setTokenList(tokenList);

    AnnotationMetadata tokenizationMetadata = new AnnotationMetadata();
    tokenizationMetadata.setTimestamp(System.currentTimeMillis());
    tokenizationMetadata.setTool("TestTool");
    tokenization.setMetadata(tokenizationMetadata);

    Sentence sentence = new Sentence();
    sentence.setTokenization(tokenization);
    sentence.setUuid(getUUID());
    TextSpan sentenceSpan = new TextSpan();
    sentenceSpan.setStart(0);
    sentenceSpan.setEnding(18);
    sentence.setTextSpan(sentenceSpan);

    Section section = new Section();
    section.addToSentenceList(sentence);
    section.setKind("SectionKind");
    section.setUuid(getUUID());
    TextSpan sectionSpan = new TextSpan();
    sectionSpan.setStart(0);
    sectionSpan.setEnding(18);
    // BUG FIX: the span was constructed but never attached, leaving the section
    // without a text span (and sectionSpan unused).
    section.setTextSpan(sectionSpan);

    comm.addToSectionList(section);

    return comm;
}
 
开发者ID:mgormley,项目名称:pacaya-nlp,代码行数:56,代码来源:ConcreteWriterTest.java

示例8: createSimpleCommunication

import edu.jhu.hlt.concrete.TokenizationKind; //导入依赖的package包/类
/**
 * Builds a richly annotated single-sentence {@link Communication} fixture
 * ("vice pres says jump"): tokens, competing POS/lemma/chunk taggings from two
 * tools, two dependency and constituency parses, plus entity and situation
 * mention sets — for reader round-trip tests.
 *
 * @return a fully wired test Communication
 * @throws Exception propagated from UUID generation
 */
public static Communication createSimpleCommunication() throws Exception {
    Communication comm = new Communication();
    comm.setId("Gore-y Landing");
    comm.setText("vice pres says jump");
    // 0123456789012345678
    comm.setType("Test");
    comm.setUuid(getUUID());

    String toolName = "TestTool";
    comm.setMetadata(getMetadata(toolName));

    // Tokens
    Tokenization tokenization = new Tokenization();
    tokenization.setKind(TokenizationKind.TOKEN_LIST);
    tokenization.setUuid(getUUID());
    List<Token> listOfTokens = new ArrayList<Token>();
    String[] tokens = new String[] { "vice", "pres", "says", "jump" };
    for (int i = 0; i < tokens.length; i++) {
        Token token = new Token();
        token.setText(tokens[i]);
        token.setTokenIndex(i);
        listOfTokens.add(i, token);
    }
    TokenList tokenList = new TokenList();
    tokenList.setTokenList(listOfTokens);
    tokenization.setTokenList(tokenList);
    tokenization.setMetadata(getMetadata(toolName));

    // POS Tags — two tools so readers can be tested on tool selection.
    addTagging(tokenization, "POS", "ToolOne tags", new String[] { "N", "N", "V", "V" });
    addTagging(tokenization, "POS", "ToolTwo tags", new String[] { "n", "n", "v", "v" });

    // Lemmas
    addTagging(tokenization, "LEMMA", "ToolOne tags", new String[] { "VIC", "PRE", "SAY", "JUM" });
    addTagging(tokenization, "LEMMA", "ToolTwo tags", new String[] { "vic", "pre", "say", "jum" });

    // Chunks
    addTagging(tokenization, "CHUNK", "ToolOne tags", new String[] { "BN", "IN", "BV", "IV" });
    addTagging(tokenization, "CHUNK", "ToolTwo tags", new String[] { "bn", "in", "bv", "iv" });

    // Dependency Parse
    addDepParse(tokenization, "ToolOne", new int[] { 1, 2, 3, -1 });
    addDepParse(tokenization, "ToolTwo", new int[] { -1, 0, 1, 2 });

    // Constituency Parse
    addParse(tokenization, "ToolOne", "SYMBOL");
    addParse(tokenization, "ToolTwo", "symbol");

    // Sentence
    Sentence sentence = new Sentence();
    sentence.setTokenization(tokenization);
    sentence.setUuid(getUUID());
    TextSpan sentenceSpan = new TextSpan();
    sentenceSpan.setStart(0);
    sentenceSpan.setEnding(18);
    sentence.setTextSpan(sentenceSpan);

    // Section
    Section section = new Section();
    section.addToSentenceList(sentence);
    section.setKind("SectionKind");
    section.setUuid(getUUID());
    TextSpan sectionSpan = new TextSpan();
    sectionSpan.setStart(0);
    sectionSpan.setEnding(18);
    // BUG FIX: the span was constructed but never attached to the section.
    section.setTextSpan(sectionSpan);
    comm.addToSectionList(section);

    // EntityMentionSet
    addEntityMentionSet(comm, tokenization, "ToolOne", "PER");
    addEntityMentionSet(comm, tokenization, "ToolTwo", "per");

    // SituationMentionSet
    addSituationMentionSet(comm, tokenization, "ToolOne", "NEAR", 0);
    addSituationMentionSet(comm, tokenization, "ToolTwo", "near", 1);

    return comm;
}
 
开发者ID:mgormley,项目名称:pacaya-nlp,代码行数:78,代码来源:ConcreteReaderTest.java

示例9: ValidatableTokenTagging

import edu.jhu.hlt.concrete.TokenizationKind; //导入依赖的package包/类
/**
 * Wraps a {@link TokenTagging} together with its parent {@link Tokenization},
 * precomputing the token-index lists and maxima on both sides so the tagging's
 * indices can later be validated against the parent's tokens.
 *
 * @param tagging the token tagging to validate
 * @param parent the tokenization the tagging refers to; must be of kind TOKEN_LIST
 * @throws IllegalArgumentException if the parent is not a TOKEN_LIST tokenization
 */
public ValidatableTokenTagging(TokenTagging tagging, Tokenization parent) {
  this.tagging = tagging;
  this.parent = parent;

  // TODO: only accept correct Tokenization
  TokenizationKind kind = parent.getKind();
  switch (kind) {
  case TOKEN_LIST:
    // Collect every token index from the parent and track the largest one seen.
    List<Integer> parentIndices = new ArrayList<Integer>();
    int maxParentIdx = -1;
    for (Token t : parent.getTokenList().getTokenList()) {
      int idx = t.getTokenIndex();
      parentIndices.add(idx);
      maxParentIdx = Math.max(maxParentIdx, idx);
    }

    this.maxTokenIdx = maxParentIdx;
    this.tokIndices = parentIndices;
    break;
  default:
    throw new IllegalArgumentException("Validating of tokenization type: " + parent.getKind() + " not supported.");
  }

  // Mirror the same bookkeeping for the tagging's own tagged tokens.
  List<TaggedToken> taggedTokens = this.tagging.getTaggedTokenList();
  this.tokenTaggings = taggedTokens;
  if (taggedTokens.isEmpty()) {
    this.ttIndices = new ArrayList<>();
    this.maxTTIndex = -1;
  } else {
    List<Integer> taggedIndices = new ArrayList<Integer>();
    int maxTaggedIdx = -1;
    for (TaggedToken tt : taggedTokens) {
      int idx = tt.getTokenIndex();
      maxTaggedIdx = Math.max(maxTaggedIdx, idx);
      taggedIndices.add(idx);
    }

    this.ttIndices = taggedIndices;
    this.maxTTIndex = maxTaggedIdx;
  }
}
 
开发者ID:hltcoe,项目名称:concrete-java,代码行数:49,代码来源:ValidatableTokenTagging.java


注:本文中的edu.jhu.hlt.concrete.TokenizationKind类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。