This article collects and summarizes typical usage examples of the Java class org.deeplearning4j.text.documentiterator.FileLabelAwareIterator. If you are wondering what FileLabelAwareIterator does, how to use it, or where to find examples of it in use, the curated class examples below may help.
The FileLabelAwareIterator class belongs to the org.deeplearning4j.text.documentiterator package. Six code examples of the class are presented below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Java code examples.
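For orientation before the examples: FileLabelAwareIterator scans one or more source folders, treating each subfolder name as a document label and each file inside that subfolder as one document carrying the label. A minimal sketch of standalone iteration, with hypothetical folder and file names:

// Expected layout (hypothetical):
//   corpus/finance/doc1.txt  -> label "finance"
//   corpus/science/doc2.txt  -> label "science"
FileLabelAwareIterator it = new FileLabelAwareIterator.Builder()
        .addSourceFolder(new File("corpus"))
        .build();
while (it.hasNextDocument()) {
    LabelledDocument doc = it.nextDocument();
    System.out.println(doc.getLabels() + " -> " + doc.getContent().length() + " chars");
}
System.out.println("All labels: " + it.getLabelsSource().getLabels());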
Example 1: makeParagraphVectors
import org.deeplearning4j.text.documentiterator.FileLabelAwareIterator; //import the required package/class
void makeParagraphVectors() throws Exception {
    // build an iterator for our dataset
    File dir = TYPE_LEARNING_DIR;
    dir.mkdirs();
    iterator = new FileLabelAwareIterator.Builder()
            .addSourceFolder(new File(dir, "corpus"))
            .build();
    tokenizerFactory = new DefaultTokenizerFactory();
    tokenizerFactory.setTokenPreProcessor(new CommonPreprocessor());
    // ParagraphVectors training configuration
    paragraphVectors = new ParagraphVectors.Builder()
            .learningRate(0.025)
            .minLearningRate(0.001)
            .batchSize(1000)
            .epochs(5)
            .iterate(iterator)
            .trainWordVectors(true)
            .tokenizerFactory(tokenizerFactory)
            .build();
    // start model training
    paragraphVectors.fit();
}
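After fit() completes, the trained model is usually persisted for later inference. A small follow-up sketch, assuming the library's WordVectorSerializer utility and a hypothetical file name:

// save and restore the trained ParagraphVectors (file name is hypothetical)
WordVectorSerializer.writeParagraphVectors(paragraphVectors, new File("paravec_model.zip"));
ParagraphVectors restored = WordVectorSerializer.readParagraphVectors(new File("paravec_model.zip"));
restored.setTokenizerFactory(tokenizerFactory); // tokenizers aren't serialized, so re-attach before inference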
Example 2: checkUnlabeledData
import org.deeplearning4j.text.documentiterator.FileLabelAwareIterator; //import the required package/class
void checkUnlabeledData() throws FileNotFoundException {
    /*
     At this point we assume that we have a trained model, and we can check
     which categories our unlabeled documents fall into.
     So we'll start loading the unlabeled documents and checking them.
    */
    ClassPathResource unClassifiedResource = new ClassPathResource("paravec/unlabeled");
    FileLabelAwareIterator unClassifiedIterator = new FileLabelAwareIterator.Builder()
            .addSourceFolder(unClassifiedResource.getFile())
            .build();
    /*
     Now we'll iterate over the unlabeled data and check which labels each document could be assigned.
     Please note: in many domains it's normal for one document to fall into several labels at once,
     with a different "weight" for each.
    */
    MeansBuilder meansBuilder = new MeansBuilder(
            (InMemoryLookupTable<VocabWord>) paragraphVectors.getLookupTable(),
            tokenizerFactory);
    LabelSeeker seeker = new LabelSeeker(iterator.getLabelsSource().getLabels(),
            (InMemoryLookupTable<VocabWord>) paragraphVectors.getLookupTable());
    while (unClassifiedIterator.hasNextDocument()) {
        LabelledDocument document = unClassifiedIterator.nextDocument();
        INDArray documentAsCentroid = meansBuilder.documentAsVector(document);
        List<Pair<String, Double>> scores = seeker.getScores(documentAsCentroid);
        /*
         Please note: document.getLabel() is used here just to show which document we're looking at,
         as a substitute for printing the whole document name.
         So the labels on these documents are used like titles,
         just to verify that our classification was done properly.
        */
        log.info("Document '" + document.getLabel() + "' falls into the following categories: ");
        for (Pair<String, Double> score : scores) {
            log.info("    " + score.getFirst() + ": " + score.getSecond());
        }
    }
}
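MeansBuilder and LabelSeeker here are helper classes from the DL4J examples repository rather than the core library. If their source isn't at hand, this is a simplified sketch of what a MeansBuilder-style documentAsVector could do, assuming it averages the vectors of in-vocabulary tokens (the method shape is an assumption, not the helper's exact code):

private INDArray documentAsVector(LabelledDocument document,
                                  InMemoryLookupTable<VocabWord> lookupTable,
                                  TokenizerFactory tokenizerFactory) {
    List<String> tokens = tokenizerFactory.create(document.getContent()).getTokens();
    List<INDArray> vectors = new ArrayList<>();
    for (String token : tokens) {
        if (lookupTable.getVocab().containsWord(token)) { // skip out-of-vocabulary tokens
            vectors.add(lookupTable.vector(token));
        }
    }
    // the document centroid is the element-wise mean of its word vectors
    // (assumes at least one token is in the vocabulary)
    return Nd4j.vstack(vectors.toArray(new INDArray[0])).mean(0);
}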
Example 3: checkUnlabelledData
import org.deeplearning4j.text.documentiterator.FileLabelAwareIterator; //import the required package/class
private void checkUnlabelledData(Word2Vec paragraphVectors, LabelAwareIterator iterator, TokenizerFactory tokenizerFactory) throws FileNotFoundException {
    ClassPathResource unClassifiedResource = new ClassPathResource("papers/unlabeled");
    FileLabelAwareIterator unClassifiedIterator = new FileLabelAwareIterator.Builder()
            .addSourceFolder(unClassifiedResource.getFile())
            .build();
    MeansBuilder meansBuilder = new MeansBuilder(
            (InMemoryLookupTable<VocabWord>) paragraphVectors.getLookupTable(),
            tokenizerFactory);
    LabelSeeker seeker = new LabelSeeker(iterator.getLabelsSource().getLabels(),
            (InMemoryLookupTable<VocabWord>) paragraphVectors.getLookupTable());
    System.out.println(paragraphVectors + " classification results");
    double correct = 0;
    double size = 0;
    while (unClassifiedIterator.hasNextDocument()) {
        LabelledDocument document = unClassifiedIterator.nextDocument();
        INDArray documentAsCentroid = meansBuilder.documentAsVector(document);
        List<Pair<String, Double>> scores = seeker.getScores(documentAsCentroid);
        // pick the label with the highest score
        double max = Double.NEGATIVE_INFINITY;
        String cat = null;
        for (Pair<String, Double> p : scores) {
            if (p.getSecond() > max) {
                max = p.getSecond();
                cat = p.getFirst();
            }
        }
        // count a hit when the predicted label matches one of the document's true labels
        if (document.getLabels().contains(cat)) {
            correct++;
        }
        size++;
    }
    System.out.println("accuracy: " + (correct / size));
}
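LabelSeeker's scoring, in turn, can be pictured as cosine similarity between the document centroid and each label's own vector; labels live in the lookup table like ordinary vocabulary entries. A hedged sketch of that idea, not the helper's exact code:

private List<Pair<String, Double>> getScores(INDArray documentVector,
                                             List<String> labels,
                                             InMemoryLookupTable<VocabWord> lookupTable) {
    List<Pair<String, Double>> result = new ArrayList<>();
    for (String label : labels) {
        INDArray labelVector = lookupTable.vector(label); // each label has a learned vector
        result.add(new Pair<>(label, Transforms.cosineSim(documentVector, labelVector)));
    }
    return result;
}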
Example 4: testConsumeOnNonEqualVocabs
import org.deeplearning4j.text.documentiterator.FileLabelAwareIterator; //import the required package/class
@Test
public void testConsumeOnNonEqualVocabs() throws Exception {
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());
    AbstractCache<VocabWord> cacheSource = new AbstractCache.Builder<VocabWord>().build();
    ClassPathResource resource = new ClassPathResource("big/raw_sentences.txt");
    BasicLineIterator underlyingIterator = new BasicLineIterator(resource.getFile());
    SentenceTransformer transformer =
            new SentenceTransformer.Builder().iterator(underlyingIterator).tokenizerFactory(t).build();
    AbstractSequenceIterator<VocabWord> sequenceIterator =
            new AbstractSequenceIterator.Builder<>(transformer).build();
    VocabConstructor<VocabWord> vocabConstructor = new VocabConstructor.Builder<VocabWord>()
            .addSource(sequenceIterator, 1).setTargetVocabCache(cacheSource).build();
    vocabConstructor.buildJointVocabulary(false, true);
    assertEquals(244, cacheSource.numWords());
    InMemoryLookupTable<VocabWord> mem1 =
            (InMemoryLookupTable<VocabWord>) new InMemoryLookupTable.Builder<VocabWord>().vectorLength(100)
                    .cache(cacheSource).build();
    mem1.resetWeights(true);
    AbstractCache<VocabWord> cacheTarget = new AbstractCache.Builder<VocabWord>().build();
    FileLabelAwareIterator labelAwareIterator = new FileLabelAwareIterator.Builder()
            .addSourceFolder(new ClassPathResource("/paravec/labeled").getFile()).build();
    transformer = new SentenceTransformer.Builder().iterator(labelAwareIterator).tokenizerFactory(t).build();
    sequenceIterator = new AbstractSequenceIterator.Builder<>(transformer).build();
    VocabConstructor<VocabWord> vocabTransfer = new VocabConstructor.Builder<VocabWord>()
            .addSource(sequenceIterator, 1).setTargetVocabCache(cacheTarget).build();
    vocabTransfer.buildMergedVocabulary(cacheSource, true);
    // the +3 accounts for the 3 additional entries in the target VocabCache: the labels
    assertEquals(cacheSource.numWords() + 3, cacheTarget.numWords());
    InMemoryLookupTable<VocabWord> mem2 =
            (InMemoryLookupTable<VocabWord>) new InMemoryLookupTable.Builder<VocabWord>().vectorLength(100)
                    .cache(cacheTarget).seed(18).build();
    mem2.resetWeights(true);
    assertNotEquals(mem1.vector("day"), mem2.vector("day"));
    // consume() copies the weights of shared vocab entries from mem1 into mem2
    mem2.consume(mem1);
    assertEquals(mem1.vector("day"), mem2.vector("day"));
    assertTrue(mem1.syn0.rows() < mem2.syn0.rows());
    assertEquals(mem1.syn0.rows() + 3, mem2.syn0.rows());
}
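In practice this consume() pattern is what allows pre-trained word vectors to be transferred into a target table whose vocabulary additionally contains label entries: rows for shared words are overwritten with the source weights (hence vector("day") becomes equal), while the three extra label rows keep their own initialization.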
Example 5: testParagraphVectorsReducedLabels1
import org.deeplearning4j.text.documentiterator.FileLabelAwareIterator; //import the required package/class
/**
 * This test is not indicative; there's no need to run it on Travis.
 * Use it manually, and only for problem detection.
 *
 * @throws Exception
 */
@Test
@Ignore
public void testParagraphVectorsReducedLabels1() throws Exception {
    ClassPathResource resource = new ClassPathResource("/labeled");
    File file = resource.getFile();
    LabelAwareIterator iter = new FileLabelAwareIterator.Builder().addSourceFolder(file).build();
    TokenizerFactory t = new DefaultTokenizerFactory();
    /*
     Please note: the text corpus is REALLY small, so some kind of "results" can only be obtained
     with a HIGH number of epochs, such as 30. But there's no reason to keep it that high here.
    */
    ParagraphVectors vec = new ParagraphVectors.Builder().minWordFrequency(1).epochs(3).layerSize(100)
            .stopWords(new ArrayList<String>()).windowSize(5).iterate(iter).tokenizerFactory(t).build();
    vec.fit();
    //WordVectorSerializer.writeWordVectors(vec, "vectors.txt");
    INDArray w1 = vec.lookupTable().vector("I");
    INDArray w2 = vec.lookupTable().vector("am");
    INDArray w3 = vec.lookupTable().vector("sad.");
    // build the document centroid as the mean of its word vectors
    INDArray words = Nd4j.create(3, vec.lookupTable().layerSize());
    words.putRow(0, w1);
    words.putRow(1, w2);
    words.putRow(2, w3);
    INDArray mean = words.isMatrix() ? words.mean(0) : words;
    log.info("Mean: " + Arrays.toString(mean.dup().data().asDouble()));
    log.info("Array: " + Arrays.toString(vec.lookupTable().vector("negative").dup().data().asDouble()));
    // compare the centroid against each label's vector via cosine similarity
    double simN = Transforms.cosineSim(mean, vec.lookupTable().vector("negative"));
    log.info("Similarity negative: " + simN);
    double simP = Transforms.cosineSim(mean, vec.lookupTable().vector("neutral"));
    log.info("Similarity neutral: " + simP);
    double simV = Transforms.cosineSim(mean, vec.lookupTable().vector("positive"));
    log.info("Similarity positive: " + simV);
}
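For comparison, the manual centroid above predates the higher-level inference API. Assuming a ParagraphVectors version that exposes inferVector(String), which runs a short inference pass for unseen text rather than averaging lookup-table rows, the same check could be sketched as:

// hedged: inferVector availability depends on the ParagraphVectors version in use
INDArray inferred = vec.inferVector("I am sad.");
log.info("Similarity negative (inferred): "
        + Transforms.cosineSim(inferred, vec.lookupTable().vector("negative")));

The scores will differ from the centroid-based ones, since inference optimizes a fresh document vector instead of averaging word vectors.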
Example 6: testMergedVocabWithLabels1
import org.deeplearning4j.text.documentiterator.FileLabelAwareIterator; //import the required package/class
@Test
public void testMergedVocabWithLabels1() throws Exception {
    // tokenizer factory declared locally so the snippet is self-contained
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());
    AbstractCache<VocabWord> cacheSource = new AbstractCache.Builder<VocabWord>().build();
    AbstractCache<VocabWord> cacheTarget = new AbstractCache.Builder<VocabWord>().build();
    ClassPathResource resource = new ClassPathResource("big/raw_sentences.txt");
    BasicLineIterator underlyingIterator = new BasicLineIterator(resource.getFile());
    SentenceTransformer transformer =
            new SentenceTransformer.Builder().iterator(underlyingIterator).tokenizerFactory(t).build();
    AbstractSequenceIterator<VocabWord> sequenceIterator =
            new AbstractSequenceIterator.Builder<>(transformer).build();
    VocabConstructor<VocabWord> vocabConstructor = new VocabConstructor.Builder<VocabWord>()
            .addSource(sequenceIterator, 1).setTargetVocabCache(cacheSource).build();
    vocabConstructor.buildJointVocabulary(false, true);
    int sourceSize = cacheSource.numWords();
    log.info("Source Vocab size: " + sourceSize);
    FileLabelAwareIterator labelAwareIterator = new FileLabelAwareIterator.Builder()
            .addSourceFolder(new ClassPathResource("/paravec/labeled").getFile()).build();
    transformer = new SentenceTransformer.Builder().iterator(labelAwareIterator).tokenizerFactory(t).build();
    sequenceIterator = new AbstractSequenceIterator.Builder<>(transformer).build();
    VocabConstructor<VocabWord> vocabTransfer = new VocabConstructor.Builder<VocabWord>()
            .addSource(sequenceIterator, 1).setTargetVocabCache(cacheTarget).build();
    vocabTransfer.buildMergedVocabulary(cacheSource, true);
    // the +3 accounts for the 3 additional entries in the target VocabCache: the labels
    assertEquals(sourceSize + 3, cacheTarget.numWords());
    // now we check index equality for the transferred elements
    assertEquals(cacheSource.wordAtIndex(17), cacheTarget.wordAtIndex(17));
    assertEquals(cacheSource.wordAtIndex(45), cacheTarget.wordAtIndex(45));
    assertEquals(cacheSource.wordAtIndex(89), cacheTarget.wordAtIndex(89));
    // we check that the newly added labels received indexes beyond the source vocab's index space;
    // "> sourceSize - 1" is equivalent to ">= sourceSize", since indexes are zero-based while sourceSize is a count
    assertTrue(cacheTarget.indexOf("Zfinance") > sourceSize - 1);
    assertTrue(cacheTarget.indexOf("Zscience") > sourceSize - 1);
    assertTrue(cacheTarget.indexOf("Zhealth") > sourceSize - 1);
}