This page collects typical usage examples of the Java class edu.jhu.hlt.concrete.TokenizationKind. If you are unsure what TokenizationKind is for or how to use it, the curated examples below may help.
TokenizationKind belongs to the edu.jhu.hlt.concrete package. Nine code examples are shown, ordered by popularity.
Example 1: generateConcreteTokenization
import edu.jhu.hlt.concrete.TokenizationKind; // import the required package/class
/**
* Generate a {@link Tokenization} object from an array of tokens, an array of
* character offsets, and the starting position of the text (i.e., the index of
* the first character of the text).
*
* @param tokens
* - an array of token strings
* @param offsets
* - an array of character offsets, one per token, relative to startPos
* @param startPos
* - starting character position of the text
* @return a {@link Tokenization} of kind {@link TokenizationKind#TOKEN_LIST} with one {@link Token} per input string
*/
public static Tokenization generateConcreteTokenization(String[] tokens, int[] offsets, int startPos) {
Tokenization tkz = new Tokenization();
tkz.setKind(TokenizationKind.TOKEN_LIST);
tkz.setMetadata(new AnnotationMetadata(tiftMetadata));
tkz.setUuid(UUIDFactory.newUUID());
TokenList tl = new TokenList();
// Note: we use token index as token id.
for (int tokenId = 0; tokenId < tokens.length; ++tokenId) {
String token = tokens[tokenId];
int start = startPos + offsets[tokenId];
int end = start + token.length();
TextSpan ts = new TextSpan(start, end);
Token tokenObj = new Token();
tokenObj.setTextSpan(ts).setText(token).setTokenIndex(tokenId);
tl.addToTokenList(tokenObj);
}
tkz.setTokenList(tl);
return tkz;
}
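A minimal usage sketch for the helper above (the token strings and offsets are invented for illustration):
// Hypothetical input: the sentence "hello world", tokenized on whitespace.
String[] tokens = new String[] { "hello", "world" };
int[] offsets = new int[] { 0, 6 }; // offset of each token within the sentence
Tokenization tkz = generateConcreteTokenization(tokens, offsets, 0);
// Each Token's TextSpan covers [startPos + offset, startPos + offset + token length).
assert tkz.getTokenList().getTokenListSize() == 2;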
Example 2: coreLabelToTokenization
import edu.jhu.hlt.concrete.TokenizationKind; // import the required package/class
private Tokenization coreLabelToTokenization(int cOffset) throws AnalyticException, ConcreteException {
Tokenization tkz = new TokenizationFactory(this.gen).create();
tkz.setKind(TokenizationKind.TOKEN_LIST);
List<Token> tlist = new ArrayList<>();
tkz.setTokenList(new TokenList(tlist));
AnnotationMetadata md = AnnotationMetadataFactory.fromCurrentLocalTime().setTool("Stanford CoreNLP PTB");
tkz.setMetadata(md);
return this.coreLabelToTokenization(cOffset, tkz);
}
Example 3: isValid
import edu.jhu.hlt.concrete.TokenizationKind; // import the required package/class
/**
* Check:
* <ol>
* <li>UUID is valid</li>
* <li>Metadata is set</li>
* <li>Metadata is valid</li>
* <li>TokenizationKind is set</li>
* </ol>
*
* <ul>
* <li>If TokenizationKind == Lattice, check Lattice exists and List does not</li>
* <li>If TokenizationKind == List, check List exists and Lattice does not; validate List[Token]</li>
* </ul>
*/
/* (non-Javadoc)
* @see edu.jhu.hlt.concrete.validation.AbstractAnnotation#isValid()
*/
@Override
public boolean isValid() {
boolean basics = this.validateUUID(this.annotation.getUuid())
&& this.printStatus("Metadata must be set", this.annotation.isSetMetadata())
&& this.printStatus("Metadata must be valid", new ValidatableMetadata(this.annotation.getMetadata()).isValid())
&& this.printStatus("TokenizationKind must be set.", this.annotation.isSetKind());
if (!basics)
return false;
else {
boolean validByType = true;
if (this.annotation.getKind() == TokenizationKind.TOKEN_LATTICE)
validByType = this.printStatus("Kind == LATTICE, so lattice must be set, AND list must NOT be set.", this.annotation.isSetLattice() && !this.annotation.isSetTokenList());
else {
validByType = this.printStatus("Kind == LIST, so list must be set, AND list must NOT be set.", this.annotation.isSetTokenList() && !this.annotation.isSetLattice())
&& this.printStatus("TokenList must not be empty.", this.annotation.getTokenList().getTokenListSize() > 0)
&& this.printStatus("TokenList must be valid.", this.validateTokenList());
if (validByType) {
validByType = this.validateTokenTaggings();
}
}
return validByType;
}
}
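To see the kind/payload consistency check fail, one can feed the validator a deliberately inconsistent Tokenization. A minimal sketch, assuming this isValid() belongs to a wrapper class constructed from the annotation (named ValidatableTokenization here; the class name is an assumption):
// Sketch: a TOKEN_LIST Tokenization whose token list was never set.
Tokenization bad = new Tokenization();
bad.setUuid(UUIDFactory.newUUID());
bad.setMetadata(AnnotationMetadataFactory.fromCurrentLocalTime().setTool("demo"));
bad.setKind(TokenizationKind.TOKEN_LIST);
// Kind says LIST, but no TokenList is set, so validation should report false.
boolean ok = new ValidatableTokenization(bad).isValid();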
Example 4: testChinese1
import edu.jhu.hlt.concrete.TokenizationKind; // import the required package/class
@Test
public void testChinese1() throws Exception {
Communication chineseComm = this.cf.communication().setText(chineseText1);
AnnotationMetadata md = new AnnotationMetadata().setTool(
"concrete-stanford:test").setTimestamp(
System.currentTimeMillis() / 1000);
chineseComm.setMetadata(md);
Section section = new Section()
.setUuid(UUIDFactory.newUUID())
.setTextSpan(
new TextSpan().setStart(0).setEnding(chineseText1.length()))
.setKind("</TEXT>");
chineseComm.addToSectionList(section);
Sentence sentence = new Sentence().setUuid(UUIDFactory.newUUID())
.setTextSpan(
new TextSpan().setStart(0).setEnding(chineseText1.length()));
section.addToSentenceList(sentence);
Tokenization tokenization = new Tokenization()
.setUuid(UUIDFactory.newUUID()).setMetadata(md)
.setKind(TokenizationKind.TOKEN_LIST);
TokenList tokenList = new TokenList();
int tokId = 0;
int tokenStart = 0, tokenEnd = 0;
for (String tokenStr : chineseText1.split(" +")) {
tokenEnd += tokenStr.length();
Token token = new Token().setTokenIndex(tokId++).setText(tokenStr)
.setTextSpan(new TextSpan().setStart(tokenStart).setEnding(tokenEnd));
tokenStart = tokenEnd + 1;
tokenEnd = tokenStart;
tokenList.addToTokenList(token);
}
tokenization.setTokenTaggingList(new ArrayList<>());
tokenization.setTokenList(tokenList);
sentence.setTokenization(tokenization);
assertTrue(new CommunicationValidator(chineseComm).validate());
TokenizedCommunication tc = new CachedTokenizationCommunication(chineseComm);
TokenizedCommunication wDepParse = new ConcreteStanfordPreCorefAnalytic(PipelineLanguage.CHINESE).annotate(tc);
List<Section> sectList = tc.getSections();
sectList.forEach(s -> {
LOGGER.info("Got kind: {}", s.getKind());
LOGGER.info("Got text span: {}", new SuperTextSpan(s.getTextSpan(), chineseComm).getText());
s.getSentenceList().forEach(st -> {
LOGGER.info("Got sentence: {}", new SuperTextSpan(st.getTextSpan(), chineseComm).getText());
Tokenization tkz = st.getTokenization();
tkz.getTokenList().getTokenList().forEach(tok -> {
LOGGER.info("Got token text: {}", new SuperTextSpan(tok.getTextSpan(), chineseComm).getText());
});
tkz.getTokenTaggingList().forEach(tt -> {
LOGGER.info("Got TT: {} [kind: {}]", tt.getUuid().getUuidString(), tt.getTaggingType());
});
});
});
// List<Tokenization> ntkzList = wDepParse.getTokenizations();
// Tokenization ntkz = ntkzList.get(0);
// parse disabled by default
// assertTrue(ntkz.isSetParseList());
// assertEquals(1, ntkz.getParseListSize());
}
Example 5: getAnnoSentence
import edu.jhu.hlt.concrete.TokenizationKind; // import the required package/class
private AnnoSentence getAnnoSentence(Tokenization tokenization) {
TokenizationKind kind = tokenization.getKind();
if (kind != TokenizationKind.TOKEN_LIST) {
throw new IllegalArgumentException("tokens must be of kind TOKEN_LIST: " + kind);
}
AnnoSentence as = new AnnoSentence();
// Words
List<String> words = new ArrayList<String>();
TokenList tl = tokenization.getTokenList();
for (Token tok : tl.getTokenList()) {
words.add(tok.getText());
}
as.setWords(words);
// POS tags, Lemmas, and Chunks.
TokenTagging posTags = ConcreteUtils.getFirstXTagsWithName(tokenization, TagTypes.POS.name(), prm.posTool);
TokenTagging cposTags = ConcreteUtils.getFirstXTagsWithName(tokenization, TagTypes.POS.name(), prm.cposTool);
TokenTagging lemmas = ConcreteUtils.getFirstXTagsWithName(tokenization, TagTypes.LEMMA.name(), prm.lemmaTool);
TokenTagging chunks = ConcreteUtils.getFirstXTagsWithName(tokenization, "CHUNK", prm.chunkTool);
as.setPosTags(getTagging(posTags));
as.setCposTags(getTagging(cposTags));
as.setLemmas(getTagging(lemmas));
as.setChunks(getTagging(chunks));
// Dependency Parse
if (tokenization.isSetDependencyParseList()) {
int numWords = words.size();
log.trace("Reading dependency parse with name {}", prm.depParseTool);
DependencyParse depParse = ConcreteUtils.getFirstDependencyParseWithName(tokenization, prm.depParseTool);
Pair<int[], List<String>> pair = getParentsDeprels(depParse, numWords);
if (pair != null) {
as.setParents(pair.get1());
as.setDeprels(pair.get2());
}
}
// Constituency Parse
if (tokenization.isSetParseList()) {
NaryTree tree = getParse(ConcreteUtils.getFirstParseWithName(tokenization, prm.parseTool));
as.setNaryTree(tree);
}
return as;
}
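For reference, a sketch of what a lookup like ConcreteUtils.getFirstXTagsWithName presumably does (an assumption about that helper, not its actual source): scan the Tokenization's TokenTagging list for the first entry whose tagging type matches, optionally filtering by tool name.
// Hypothetical re-implementation of the tagging lookup, for illustration only.
static TokenTagging findFirstTagging(Tokenization tkz, String taggingType, String toolName) {
  if (!tkz.isSetTokenTaggingList())
    return null;
  for (TokenTagging tt : tkz.getTokenTaggingList()) {
    if (!taggingType.equals(tt.getTaggingType()))
      continue;
    if (toolName == null || toolName.equals(tt.getMetadata().getTool()))
      return tt;
  }
  return null;
}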
Example 6: ingestText
import edu.jhu.hlt.concrete.TokenizationKind; // import the required package/class
/**
* @return a Concrete {@link Communication} containing the tokens of the text,
*         split into sections on newlines (blank lines are skipped) and
*         tokenized on whitespace; token separators are normalized to single
*         spaces in the reconstructed communication text
*/
public static Communication ingestText(String text, String commId, String commTool, String tokTool) {
AnnotationMetadata commMetadata = new AnnotationMetadata();
commMetadata.setTimestamp(System.currentTimeMillis());
commMetadata.setTool(commTool);
AnnotationMetadata tokenizationMetadata = new AnnotationMetadata();
tokenizationMetadata.setTimestamp(System.currentTimeMillis());
tokenizationMetadata.setTool(tokTool);
Communication comm = new Communication();
comm.setId(commId);
comm.setType("corpus");
comm.setMetadata(commMetadata);
List<Section> sections = new ArrayList<>();
List<String> sentenceStrs = new ArrayList<>();
int nchars = 0;
for (String sentStr : text.trim().split("\\n")) {
sentStr = sentStr.trim();
if (sentStr.length() == 0) { continue; }
List<String> tokenStrs = new ArrayList<>();
List<Token> tokList = new ArrayList<>();
int tokIndex = 0;
for (String tokStr : sentStr.split("\\s+")) {
tokStr = tokStr.trim();
Token newTok = new Token();
newTok.setTokenIndex(tokIndex);
newTok.setTextSpan(new TextSpan(nchars, nchars + tokStr.length()));
newTok.setText(tokStr);
tokList.add(newTok);
tokenStrs.add(tokStr);
tokIndex += 1;
nchars += tokStr.length() + 1; // +1 because of space or newline
}
Tokenization tok = new Tokenization(UUIDFactory.newUUID(), tokenizationMetadata, TokenizationKind.TOKEN_LIST);
Section newSection = new Section(UUIDFactory.newUUID(), "section");
Sentence sent = new Sentence(UUIDFactory.newUUID());
tok.setTokenList(new TokenList(tokList));
sent.setTokenization(tok);
newSection.addToSentenceList(sent);
sections.add(newSection);
sentenceStrs.add(String.join(" ", tokenStrs));
}
comm.setSectionList(sections);
comm.setText(String.join("\n", sentenceStrs));
return comm;
}
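A quick usage sketch for the ingester above (input text and identifiers are invented):
Communication comm = ingestText("the cat sat\non the mat", "doc-1", "demo-ingester", "demo-tokenizer");
// Two sections (one per line), one sentence each; comm.getText() reconstructs the input.
for (Section sec : comm.getSectionList()) {
  for (Sentence sent : sec.getSentenceList()) {
    TokenList tl = sent.getTokenization().getTokenList();
    System.out.println(tl.getTokenListSize() + " tokens");
  }
}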
Example 7: createSimpleCommunication
import edu.jhu.hlt.concrete.TokenizationKind; // import the required package/class
public static Communication createSimpleCommunication() throws Exception {
Communication comm = new Communication();
comm.setId("Gore-y Landing");
comm.setText("vice pres says jump");
// 0123456789012345678
comm.setType("Test");
comm.setUuid(getUUID());
AnnotationMetadata commMetadata = new AnnotationMetadata();
commMetadata.setTimestamp(System.currentTimeMillis());
commMetadata.setTool("TestTool");
comm.setMetadata(commMetadata);
Tokenization tokenization = new Tokenization();
tokenization.setKind(TokenizationKind.TOKEN_LIST);
tokenization.setUuid(getUUID());
List<Token> listOfTokens = new ArrayList<Token>();
String[] tokens = new String[] { "vice", "pres", "says", "jump" };
for (int i = 0; i < tokens.length; i++) {
Token token = new Token();
token.setText(tokens[i]);
token.setTokenIndex(i);
listOfTokens.add(i, token);
}
TokenList tokenList = new TokenList();
tokenList.setTokenList(listOfTokens);
tokenization.setTokenList(tokenList);
AnnotationMetadata tokenizationMetadata = new AnnotationMetadata();
tokenizationMetadata.setTimestamp(System.currentTimeMillis());
tokenizationMetadata.setTool("TestTool");
tokenization.setMetadata(tokenizationMetadata);
Sentence sentence = new Sentence();
sentence.setTokenization(tokenization);
sentence.setUuid(getUUID());
TextSpan sentenceSpan = new TextSpan();
sentenceSpan.setStart(0);
sentenceSpan.setEnding(18);
sentence.setTextSpan(sentenceSpan);
Section section = new Section();
section.addToSentenceList(sentence);
section.setKind("SectionKind");
section.setUuid(getUUID());
TextSpan sectionSpan = new TextSpan();
sectionSpan.setStart(0);
sectionSpan.setEnding(18);
section.setTextSpan(sectionSpan);
comm.addToSectionList(section);
return comm;
}
Example 8: createSimpleCommunication
import edu.jhu.hlt.concrete.TokenizationKind; // import the required package/class
public static Communication createSimpleCommunication() throws Exception {
Communication comm = new Communication();
comm.setId("Gore-y Landing");
comm.setText("vice pres says jump");
// 0123456789012345678
comm.setType("Test");
comm.setUuid(getUUID());
String toolName = "TestTool";
comm.setMetadata(getMetadata(toolName));
// Tokens
Tokenization tokenization = new Tokenization();
tokenization.setKind(TokenizationKind.TOKEN_LIST);
tokenization.setUuid(getUUID());
List<Token> listOfTokens = new ArrayList<Token>();
String[] tokens = new String[] { "vice", "pres", "says", "jump" };
for (int i = 0; i < tokens.length; i++) {
Token token = new Token();
token.setText(tokens[i]);
token.setTokenIndex(i);
listOfTokens.add(i, token);
}
TokenList tokenList = new TokenList();
tokenList.setTokenList(listOfTokens);
tokenization.setTokenList(tokenList);
tokenization.setMetadata(getMetadata(toolName));
// POS Tags
addTagging(tokenization, "POS", "ToolOne tags", new String[] { "N", "N", "V", "V" });
addTagging(tokenization, "POS", "ToolTwo tags", new String[] { "n", "n", "v", "v" });
// Lemmas
addTagging(tokenization, "LEMMA", "ToolOne tags", new String[] { "VIC", "PRE", "SAY", "JUM" });
addTagging(tokenization, "LEMMA", "ToolTwo tags", new String[] { "vic", "pre", "say", "jum" });
// Chunks
addTagging(tokenization, "CHUNK", "ToolOne tags", new String[] { "BN", "IN", "BV", "IV" });
addTagging(tokenization, "CHUNK", "ToolTwo tags", new String[] { "bn", "in", "bv", "iv" });
// Dependency Parse
addDepParse(tokenization, "ToolOne", new int[] { 1, 2, 3, -1 });
addDepParse(tokenization, "ToolTwo", new int[] { -1, 0, 1, 2 });
// Constituency Parse
addParse(tokenization, "ToolOne", "SYMBOL");
addParse(tokenization, "ToolTwo", "symbol");
// Sentence
Sentence sentence = new Sentence();
sentence.setTokenization(tokenization);
sentence.setUuid(getUUID());
TextSpan sentenceSpan = new TextSpan();
sentenceSpan.setStart(0);
sentenceSpan.setEnding(18);
sentence.setTextSpan(sentenceSpan);
// Section
Section section = new Section();
section.addToSentenceList(sentence);
section.setKind("SectionKind");
section.setUuid(getUUID());
TextSpan sectionSpan = new TextSpan();
sectionSpan.setStart(0);
sectionSpan.setEnding(18);
section.setTextSpan(sectionSpan);
comm.addToSectionList(section);
// EntityMentionSet
addEntityMentionSet(comm, tokenization, "ToolOne", "PER");
addEntityMentionSet(comm, tokenization, "ToolTwo", "per");
// SituationMentionSet
addSituationMentionSet(comm, tokenization, "ToolOne", "NEAR", 0);
addSituationMentionSet(comm, tokenization, "ToolTwo", "near", 1);
return comm;
}
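The addTagging helper is not shown in this snippet; a minimal sketch of what it could look like (the signature and body are assumptions, reconstructed from how the taggings are read back in example 5):
// Hypothetical helper: attach a TokenTagging of the given type to a Tokenization.
static void addTagging(Tokenization tkz, String taggingType, String toolName, String[] tags) {
  TokenTagging tt = new TokenTagging();
  tt.setUuid(getUUID());
  tt.setTaggingType(taggingType);
  tt.setMetadata(getMetadata(toolName));
  for (int i = 0; i < tags.length; i++) {
    TaggedToken tagged = new TaggedToken();
    tagged.setTokenIndex(i);
    tagged.setTag(tags[i]);
    tt.addToTaggedTokenList(tagged);
  }
  tkz.addToTokenTaggingList(tt);
}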
Example 9: ValidatableTokenTagging
import edu.jhu.hlt.concrete.TokenizationKind; // import the required package/class
/**
 * Wrap a {@link TokenTagging} together with its parent {@link Tokenization},
 * caching the token indices of both for later validation.
 */
public ValidatableTokenTagging(TokenTagging tagging, Tokenization parent) {
this.tagging = tagging;
this.parent = parent;
// TODO: only accept correct Tokenization
TokenizationKind kind = parent.getKind();
switch (kind) {
case TOKEN_LIST:
TokenList tok = parent.getTokenList();
List<Token> tokList = tok.getTokenList();
List<Integer> tokIndicesList = new ArrayList<Integer>();
int tmpIdx = -1;
for (Token t : tokList) {
final int tidx = t.getTokenIndex();
tokIndicesList.add(tidx);
if (tmpIdx < tidx)
tmpIdx = tidx;
}
this.maxTokenIdx = tmpIdx;
this.tokIndices = tokIndicesList;
break;
default:
throw new IllegalArgumentException("Validating of tokenization type: " + parent.getKind() + " not supported.");
}
List<TaggedToken> ttList = this.tagging.getTaggedTokenList();
this.tokenTaggings = ttList;
if (ttList.size() > 0) {
this.ttIndices = new ArrayList<Integer>();
int tmpMaxIdx = -1;
for (TaggedToken tt : ttList) {
int ttIndex = tt.getTokenIndex();
if (tmpMaxIdx < ttIndex)
tmpMaxIdx = ttIndex;
this.ttIndices.add(tt.getTokenIndex());
}
this.maxTTIndex = tmpMaxIdx;
} else {
this.ttIndices = new ArrayList<>();
this.maxTTIndex = -1;
}
}
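The indices cached above support a containment check along these lines (a sketch of the likely validation step, not the class's actual isValid()):
// Sketch: every tagged index must refer to an existing token index.
boolean indicesValid = this.maxTTIndex <= this.maxTokenIdx
    && this.tokIndices.containsAll(this.ttIndices);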