本文整理匯總了Java中com.twitter.penguin.korean.TwitterKoreanProcessorJava類的典型用法代碼示例。如果您正苦於以下問題：Java TwitterKoreanProcessorJava類的具體用法？Java TwitterKoreanProcessorJava怎麼用？Java TwitterKoreanProcessorJava使用的例子？那麼，這裡精選的類代碼示例或許可以為您提供幫助。
TwitterKoreanProcessorJava類屬於com.twitter.penguin.korean包，在下文中一共展示了TwitterKoreanProcessorJava類的4個代碼示例，這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚，您的評價將有助於系統推薦出更棒的Java代碼示例。
示例1: incrementToken
import com.twitter.penguin.korean.TwitterKoreanProcessorJava; //導入依賴的package包/類
/**
 * Emits the next extracted phrase, one per call.
 *
 * <p>On the first call the whole {@code input} reader is drained into memory and the text is
 * passed through normalize, tokenize, stem and extractPhrases; the result is cached in
 * {@code tokenList}. Every call then publishes the entry at {@code currentIndex} through
 * {@code setAttributes} with a position increment of 1.
 *
 * @return {@code true} if an entry was emitted, {@code false} once the cached list is exhausted
 * @throws IOException propagated from reading {@code input}
 */
@Override
public final boolean incrementToken() throws IOException {
    clearAttributes();

    if (isFirst) {
        // Drain the reader completely; a negative read count signals end of input.
        final StringBuilder text = new StringBuilder();
        final char[] chunk = new char[IO_BUFFER_SIZE];
        int charsRead = input.read(chunk, 0, chunk.length);
        while (charsRead >= 0) {
            text.append(chunk, 0, charsRead);
            charsRead = input.read(chunk, 0, chunk.length);
        }

        final CharSequence normalized = TwitterKoreanProcessorJava.normalize(text);
        final Seq<KoreanTokenizer.KoreanToken> rawTokens = TwitterKoreanProcessorJava.tokenize(normalized);
        final Seq<KoreanTokenizer.KoreanToken> stemmedTokens = TwitterKoreanProcessorJava.stem(rawTokens);
        // NOTE(review): the two boolean flags are passed as (true, false); their meaning is
        // defined by the library and is not visible here - confirm against its documentation.
        tokenList = TwitterKoreanProcessorJava.extractPhrases(stemmedTokens, true, false);
        isFirst = false;
    }

    if (currentIndex >= tokenList.size()) {
        return false; // No more tokens
    }

    setAttributes(tokenList.get(currentIndex));
    positionAtt.setPositionIncrement(1);
    currentIndex++;
    return true;
}
示例2: incrementToken
import com.twitter.penguin.korean.TwitterKoreanProcessorJava; //導入依賴的package包/類
/**
 * Emits the next stemmed token, one per call.
 *
 * <p>On the first call the whole {@code input} reader is drained into memory and the text is
 * passed through normalize, tokenize and stem; the stemmed sequence is converted with
 * {@code tokensToJavaKoreanTokenList} and cached in {@code tokenList}. Every call then publishes
 * the entry at {@code currentIndex} through {@code setAttributes} with a position increment of 1.
 *
 * @return {@code true} if an entry was emitted, {@code false} once the cached list is exhausted
 * @throws IOException propagated from reading {@code input}
 */
@Override
public final boolean incrementToken() throws IOException {
    clearAttributes();

    if (isFirst) {
        // Drain the reader completely; a negative read count signals end of input.
        final StringBuilder text = new StringBuilder();
        final char[] chunk = new char[IO_BUFFER_SIZE];
        int charsRead = input.read(chunk, 0, chunk.length);
        while (charsRead >= 0) {
            text.append(chunk, 0, charsRead);
            charsRead = input.read(chunk, 0, chunk.length);
        }

        final CharSequence normalized = TwitterKoreanProcessorJava.normalize(text);
        final Seq<KoreanTokenizer.KoreanToken> rawTokens = TwitterKoreanProcessorJava.tokenize(normalized);
        final Seq<KoreanTokenizer.KoreanToken> stemmedTokens = TwitterKoreanProcessorJava.stem(rawTokens);
        tokenList = TwitterKoreanProcessorJava.tokensToJavaKoreanTokenList(stemmedTokens);
        isFirst = false;
    }

    if (currentIndex >= tokenList.size()) {
        return false; // No more tokens
    }

    setAttributes(tokenList.get(currentIndex));
    positionAtt.setPositionIncrement(1);
    currentIndex++;
    return true;
}
示例3: KoreanTokenizer
import com.twitter.penguin.korean.TwitterKoreanProcessorJava; //導入依賴的package包/類
/**
 * Tokenizes the given string and stores the text of every resulting token.
 *
 * <p>The input is passed to {@code TwitterKoreanProcessorJava.tokenize} as-is (no normalization
 * step is applied here). The token texts are collected into {@code tokenList}, and
 * {@code tokenIter} is set to an iterator over that list.
 *
 * @param toTokenize the text to split into tokens
 */
public KoreanTokenizer(String toTokenize) {
    // NOTE(review): the original author left an open question on whether to normalize first.
    Seq<com.twitter.penguin.korean.tokenizer.KoreanTokenizer.KoreanToken> parsed =
            TwitterKoreanProcessorJava.tokenize(toTokenize);

    tokenList = new ArrayList<String>();
    for (KoreanTokenJava token : TwitterKoreanProcessorJava.tokensToJavaKoreanTokenList(parsed)) {
        tokenList.add(token.getText());
    }

    tokenIter = tokenList.iterator();
}
示例4: main
import com.twitter.penguin.korean.TwitterKoreanProcessorJava; //導入依賴的package包/類
/**
 * Demo entry point: normalizes, tokenizes and stems a fixed Korean sentence, printing the
 * result of each stage to standard output.
 *
 * <p>Three lines are printed: the normalized text, the stemmed tokens as a string list, and a
 * list of simplified token labels derived from each token's {@code toString()} form.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    final String text = "대숲 어쩌죠...? 짝사랑을 시작하게 된 거 같아요ㅜㅜㅜㅠㅠㅠㅠㅠ";

    // Normalize
    final CharSequence normalized = TwitterKoreanProcessorJava.normalize(text);
    System.out.println(normalized);

    // Tokenize
    final Seq<KoreanTokenizer.KoreanToken> tokens = TwitterKoreanProcessorJava.tokenize(normalized);

    // Stemming
    final Seq<KoreanTokenizer.KoreanToken> stemmed = TwitterKoreanProcessorJava.stem(tokens);
    System.out.println(TwitterKoreanProcessorJava.tokensToJavaStringList(stemmed));

    // Build simplified labels: drop every colon and digit from the token's toString() form,
    // then remove the " , " separator that is left behind. Note that this also strips colons
    // and digits that occur inside the token text itself.
    final List<KoreanTokenJava> javaTokens = TwitterKoreanProcessorJava.tokensToJavaKoreanTokenList(stemmed);
    final List<String> labels = new LinkedList<String>();
    for (KoreanTokenJava token : javaTokens) {
        final String label = token.toString()
                .replace(":", "")
                .replaceAll("[0-9]", "")
                .replace(" , ", "");
        labels.add(label);
    }
    System.out.println(labels);

    // Phrase extraction via TwitterKoreanProcessorJava.extractPhrases(tokens, true, true)
    // is intentionally left disabled in this demo.
}