This article collects typical usage examples of the Java class org.deeplearning4j.text.sentenceiterator.CollectionSentenceIterator. If you are wondering what CollectionSentenceIterator is, what it is used for, or how to use it, the examples below should help.
CollectionSentenceIterator belongs to the org.deeplearning4j.text.sentenceiterator package. Nine code examples are shown below, ordered by popularity by default.
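Before the individual examples, here is a minimal, self-contained sketch of what the class does: it adapts any in-memory Collection<String> to the SentenceIterator interface. The sketch uses only the constructors and iteration methods that appear in the examples below, plus reset() from the SentenceIterator interface; treat it as an illustration, not official documentation.

import java.util.Arrays;
import java.util.List;
import org.deeplearning4j.text.sentenceiterator.CollectionSentenceIterator;
import org.deeplearning4j.text.sentenceiterator.SentenceIterator;

public class CollectionSentenceIteratorDemo {
    public static void main(String[] args) {
        List<String> sentences = Arrays.asList("First sentence.", "Second sentence.");
        // wrap the in-memory collection as a SentenceIterator
        SentenceIterator iter = new CollectionSentenceIterator(sentences);
        while (iter.hasNext()) {
            System.out.println(iter.nextSentence());
        }
        iter.reset(); // rewind so the collection can be iterated again
    }
}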
Example 1: getBagOfWordsWithCounts
import org.deeplearning4j.text.sentenceiterator.CollectionSentenceIterator; // import the required package/class

private static LinkedHashMap<String, Integer> getBagOfWordsWithCounts(Language language) {
    HashMap<String, Integer> bagOfWords = new HashMap<>();
    List<String> sentences = getSentencesFromLanguage(language);
    SentenceIterator iter = new CollectionSentenceIterator(new Pan15SentencePreProcessor(), sentences);
    // count raw word occurrences, skipping empty tokens and lone punctuation marks
    while (iter.hasNext()) {
        String sentence = iter.nextSentence();
        for (String word : sentence.split("\\s+")) {
            word = normalize(word);
            if (Objects.equals(word, "") || (word.length() == 1 && word.matches("\\p{Punct}"))) continue;
            bagOfWords.put(word, bagOfWords.getOrDefault(word, 0) + 1);
        }
    }
    // keep only the VEC_LENGTH most frequent words, in descending order of count
    LinkedHashMap<String, Integer> sorted = new LinkedHashMap<>();
    final int[] count = {0};
    bagOfWords.entrySet().stream()
            .sorted(Map.Entry.comparingByValue(Collections.reverseOrder()))
            .forEach(entry -> {
                if (count[0] < VEC_LENGTH) sorted.put(entry.getKey(), entry.getValue());
                count[0]++;
            });
    return sorted;
}
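A side note on the tail of Example 1: the count[0] counter array keeps iterating even after VEC_LENGTH entries have been collected. Assuming Java 8+, the same top-VEC_LENGTH truncation can be written with Stream.limit and Collectors.toMap, avoiding the external counter:

LinkedHashMap<String, Integer> sorted = bagOfWords.entrySet().stream()
        .sorted(Map.Entry.comparingByValue(Collections.reverseOrder()))
        .limit(VEC_LENGTH)
        .collect(Collectors.toMap(
                Map.Entry::getKey, Map.Entry::getValue,
                (a, b) -> a,              // merge function; duplicate keys cannot occur here
                LinkedHashMap::new));     // LinkedHashMap preserves the sorted order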
Example 2: testParallelFlag2
import org.deeplearning4j.text.sentenceiterator.CollectionSentenceIterator; // import the required package/class

@Test(expected = ND4JIllegalStateException.class)
public void testParallelFlag2() throws Exception {
    val collection = new ArrayList<String>();
    collection.add("First string");
    collection.add("Second string");
    collection.add("Third string");
    collection.add("");
    collection.add("Fifth string");
    // collection.add("caboom");

    // ExplodingTokenizerFactory is a DL4J test helper that throws during tokenization;
    // with parallel tokenization disabled, the exception must surface synchronously
    val vectorizer = new TfidfVectorizer.Builder()
            .allowParallelTokenization(false)
            .setIterator(new CollectionSentenceIterator(collection))
            .setTokenizerFactory(new ExplodingTokenizerFactory(8, -1))
            .build();

    vectorizer.buildVocab();

    log.info("Fitting vectorizer...");
    vectorizer.fit();
}
Example 3: testParallelFlag3
import org.deeplearning4j.text.sentenceiterator.CollectionSentenceIterator; // import the required package/class

@Test(expected = ND4JIllegalStateException.class)
public void testParallelFlag3() throws Exception {
    val collection = new ArrayList<String>();
    collection.add("First string");
    collection.add("Second string");
    collection.add("Third string");
    collection.add("");
    collection.add("Fifth string");
    collection.add("Long long long string");
    collection.add("Sixth string");

    // same scenario as Example 2, with the failing tokenizer factory configured differently
    val vectorizer = new TfidfVectorizer.Builder()
            .allowParallelTokenization(false)
            .setIterator(new CollectionSentenceIterator(collection))
            .setTokenizerFactory(new ExplodingTokenizerFactory(-1, 4))
            .build();

    vectorizer.buildVocab();

    log.info("Fitting vectorizer...");
    vectorizer.fit();
}
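Both tests assert that a failure inside the tokenizer surfaces as an ND4JIllegalStateException when allowParallelTokenization(false) is set. ExplodingTokenizerFactory is a helper from the DL4J test suite, and its exact constructor semantics are not shown here. To reproduce the idea without it, a rough stand-in could wrap DefaultTokenizerFactory and fail after a fixed number of sentences; the class name and failure rule below are assumptions, not the real helper:

import java.util.concurrent.atomic.AtomicInteger;
import org.deeplearning4j.text.tokenization.tokenizer.Tokenizer;
import org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory;
import org.nd4j.linalg.exception.ND4JIllegalStateException;

public class ThrowingTokenizerFactory extends DefaultTokenizerFactory {
    private final int failAfter;
    private final AtomicInteger seen = new AtomicInteger();

    public ThrowingTokenizerFactory(int failAfter) {
        this.failAfter = failAfter;
    }

    @Override
    public Tokenizer create(String toTokenize) {
        // blow up deterministically once enough sentences have been tokenized
        if (seen.incrementAndGet() >= failAfter) {
            throw new ND4JIllegalStateException("Exploded on sentence #" + seen.get());
        }
        return super.create(toTokenize);
    }
}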
Example 4: getBinaryBoWVector
import org.deeplearning4j.text.sentenceiterator.CollectionSentenceIterator; // import the required package/class

public INDArray getBinaryBoWVector(String sentence, Language language) {
    LinkedList<String> keys = getBagOfWords(language);
    SentenceIterator iter = new CollectionSentenceIterator(new Pan15SentencePreProcessor(),
            Collections.singletonList(sentence));
    // run the single sentence through the PAN15 preprocessor
    sentence = iter.nextSentence();
    INDArray featureVector = Nd4j.zeros(1, VEC_LENGTH);
    for (String word : sentence.split("\\s+")) {
        word = normalize(word);
        int col = keys.indexOf(word);
        // mark the presence of every known bag-of-words term
        if (col > -1) featureVector.putScalar(0, col, 1);
    }
    return featureVector;
}
Example 5: getBoWVector
import org.deeplearning4j.text.sentenceiterator.CollectionSentenceIterator; // import the required package/class

public INDArray getBoWVector(String sentence, Language language) {
    LinkedList<String> keys = getBagOfWords(language);
    SentenceIterator iter = new CollectionSentenceIterator(new Pan15SentencePreProcessor(),
            Collections.singletonList(sentence));
    sentence = iter.nextSentence();
    INDArray featureVector = Nd4j.zeros(1, VEC_LENGTH);
    for (String word : sentence.split("\\s+")) {
        word = normalize(word);
        int col = keys.indexOf(word);
        // increment the column for every occurrence of a known term
        if (col > -1)
            featureVector.putScalar(0, col, featureVector.getColumn(col).getInt(0) + 1);
    }
    // scale the raw counts by the vector length
    featureVector.divi(VEC_LENGTH);
    return featureVector;
}
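Taken together, Examples 4 and 5 produce a binary presence vector and a scaled count vector with the same [1, VEC_LENGTH] column layout. A hypothetical call site (the extractor variable and its populated bag of words are assumptions, not part of the original examples):

INDArray binary = extractor.getBinaryBoWVector("the quick brown fox", Language.ENGLISH);
INDArray counts = extractor.getBoWVector("the quick brown fox", Language.ENGLISH);
// both share the same column-per-word layout, so they can be compared or concatenated
System.out.println(java.util.Arrays.toString(binary.shape()));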
Example 6: getWord2Vec
import org.deeplearning4j.text.sentenceiterator.CollectionSentenceIterator; // import the required package/class

private void getWord2Vec() {
    t.setTokenPreProcessor(new CommonPreprocessor());
    // train, save, and cache one Word2Vec model per language
    for (Language language : languages.keySet()) {
        List<String> sentences = getSentencesFromLanguage(language);
        SentenceIterator iter = new CollectionSentenceIterator(PREPROCESSOR, sentences);
        Word2Vec vec = new Word2Vec.Builder()
                .elementsLearningAlgorithm(learningAlgorithm)
                .minWordFrequency(6)
                .iterations(15)
                .layerSize(VEC_LENGTH)
                .seed(42)
                .windowSize(5)
                .iterate(iter)
                .tokenizerFactory(t)
                .build();
        vec.fit();
        saveModel(vec, language);
        languageWord2VecMap.put(language, vec);
    }
}
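Once getWord2Vec() has filled languageWord2VecMap, each language's model can be queried on its own. A hypothetical lookup, reusing the wordsNearest call that Examples 7 and 8 demonstrate:

Word2Vec englishVec = languageWord2VecMap.get(Language.ENGLISH);
// ten nearest neighbours of "home" in the English embedding space
System.out.println(englishVec.wordsNearest("home", 10));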
Example 7: main
import org.deeplearning4j.text.sentenceiterator.CollectionSentenceIterator; // import the required package/class

public static void main(String[] args) throws Exception {
    // copy train.tsv out of the classpath so it can be read as a plain file
    ClassPathResource r = new ClassPathResource("/train.tsv");
    if (r.exists()) {
        InputStream is = r.getInputStream();
        BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(new File("train.tsv")));
        IOUtils.copy(is, bos);
        bos.flush();
        bos.close();
        is.close();
    }

    SentenceIterator docIter = new CollectionSentenceIterator(
            new SentenceToPhraseMapper(new File("train.tsv")).sentences());
    TokenizerFactory factory = new DefaultTokenizerFactory();
    Word2Vec vec = new Word2Vec.Builder().iterate(docIter)
            .tokenizerFactory(factory).batchSize(10000)
            .learningRate(2.5e-2).sampling(5).learningRateDecayWords(10000)
            .iterations(3).minWordFrequency(1)
            .layerSize(300).windowSize(5).build();
    vec.fit();

    // dump the learned vocabulary
    FileUtils.writeLines(new File("vocab.csv"), vec.getCache().words());

    // sanity-check the embedding with nearest-neighbour queries
    String word = "amusing";
    String otherWord = "turd";
    System.out.println("Words nearest " + word + " " + vec.wordsNearest(word, 10));
    System.out.println("Words nearest " + otherWord + " " + vec.wordsNearest(otherWord, 10));

    // project the vocabulary to 2-D with t-SNE
    Tsne t = new Tsne.Builder()
            .setMaxIter(100).stopLyingIteration(20).build();
    vec.getCache().plotVocab(t);
}
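Example 7 persists only the vocabulary (vocab.csv), not the vectors themselves. If you also want the embeddings on disk, DL4J's WordVectorSerializer is the usual route; the call below assumes a DL4J version that still ships the writeWordVectors(Word2Vec, String) overload:

// persist the learned vectors next to the vocabulary dump
WordVectorSerializer.writeWordVectors(vec, "vectors.txt");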
Example 8: shouldLoadAndCreateSameWord2Vec
import org.deeplearning4j.text.sentenceiterator.CollectionSentenceIterator; // import the required package/class

@Test
public void shouldLoadAndCreateSameWord2Vec() {
    // given
    Pan15Parser parser = new Pan15Parser();
    HashMap<String, Pan15Author> english = parser.parseCSVCorpus().get(Language.ENGLISH);
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());
    // flatten every author's documents into one sentence list
    List<String> englishSentences = english.values().stream()
            .map(Author::getDocuments)
            .flatMap(List::stream)
            .collect(Collectors.toList());
    SentenceIterator englishIter = new CollectionSentenceIterator(new Pan15SentencePreProcessor(), englishSentences);

    // when
    Word2Vec englishVec = new Word2Vec.Builder()
            .minWordFrequency(6)
            .iterations(15)
            .layerSize(250)
            .seed(42)
            .windowSize(5)
            .iterate(englishIter)
            .tokenizerFactory(t)
            .build();
    englishVec.fit();

    // load previously saved models trained with three different learning algorithms
    Word2Vec loadedEnglishVec1 = new Pan15Word2Vec(new SkipGram<>()).readModelFromFile(Language.ENGLISH);
    Word2Vec loadedEnglishVec2 = new Pan15Word2Vec(new CBOW<>()).readModelFromFile(Language.ENGLISH);
    Word2Vec loadedEnglishVec3 = new Pan15Word2Vec(new GloVe<>()).readModelFromFile(Language.ENGLISH);
    loadedEnglishVec1.setTokenizerFactory(t);
    loadedEnglishVec1.setSentenceIterator(englishIter);
    loadedEnglishVec2.setTokenizerFactory(t);
    loadedEnglishVec2.setSentenceIterator(englishIter);
    loadedEnglishVec3.setTokenizerFactory(t);
    loadedEnglishVec3.setSentenceIterator(englishIter);

    // then
    Assert.assertNotNull(loadedEnglishVec1);
    System.out.println(englishVec.wordsNearest("home", 15));
    System.out.println(loadedEnglishVec1.wordsNearest("home", 15));
    System.out.println(loadedEnglishVec2.wordsNearest("home", 15));
    System.out.println(loadedEnglishVec3.wordsNearest("home", 15));
}
Example 9: main
import org.deeplearning4j.text.sentenceiterator.CollectionSentenceIterator; // import the required package/class

public static void main(String[] args) throws Exception {
    SentenceIterator docIter = new CollectionSentenceIterator(
            new SentenceToPhraseMapper(new ClassPathResource("/train.tsv").getFile()).sentences());
    TokenizerFactory factory = new DefaultTokenizerFactory();
    // train word vectors over the phrase collection
    Word2Vec vec = new Word2Vec.Builder().iterate(docIter).tokenizerFactory(factory).batchSize(100000)
            .learningRate(2.5e-2).iterations(1)
            .layerSize(100).windowSize(5).build();
    vec.fit();

    // pretrain a deep belief network to reconstruct the embedding matrix
    NeuralNetConfiguration conf = new NeuralNetConfiguration.Builder().nIn(vec.getLayerSize()).nOut(vec.getLayerSize())
            .hiddenUnit(RBM.HiddenUnit.RECTIFIED).visibleUnit(RBM.VisibleUnit.GAUSSIAN).momentum(0.5f)
            .iterations(10).learningRate(1e-6f).build();
    InMemoryLookupCache l = (InMemoryLookupCache) vec.getCache();
    DBN d = new DBN.Builder()
            .configure(conf).hiddenLayerSizes(new int[]{250, 100, 2})
            .build();
    DataSet dPretrain = new DataSet(l.getSyn0(), l.getSyn0());
    DataSetIterator dPretrainIter = new ListDataSetIterator(dPretrain.asList(), 1000);
    while (dPretrainIter.hasNext()) {
        d.pretrain(dPretrainIter.next().getFeatureMatrix(), 1, 1e-6f, 10);
    }
    // d.pretrain(l.getSyn0(), 1, 1e-3f, 1000);

    // fine-tune with semantic hashing on the same data
    d.getOutputLayer().conf().setLossFunction(LossFunctions.LossFunction.RMSE_XENT);
    SemanticHashing s = new SemanticHashing.Builder().withEncoder(d)
            .build();
    d = null;
    dPretrainIter.reset();
    while (dPretrainIter.hasNext()) {
        s.fit(dPretrainIter.next());
    }

    // project the reconstructed embeddings to 2-D with t-SNE and plot the vocabulary
    Tsne t = new Tsne.Builder()
            .setMaxIter(100).stopLyingIteration(20).build();
    INDArray output = s.reconstruct(l.getSyn0(), 4);
    l.getSyn0().data().flush();
    l.getSyn1().data().flush();
    s = null;
    System.out.println(Arrays.toString(output.shape()));
    t.plot(output, 2, new ArrayList<>(vec.getCache().words()));
    vec.getCache().plotVocab(t);
}
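Note that Example 9 targets a very early deeplearning4j API: DBN, RBM, SemanticHashing, and InMemoryLookupCache were removed in later releases, so this listing is mainly useful as a historical illustration of feeding an embedding matrix trained via CollectionSentenceIterator into a downstream network.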