This article collects typical usage examples of the Java class org.deeplearning4j.text.sentenceiterator.SentenceIterator. If you are wondering what SentenceIterator is for, how to use it, or want to see it in real code, the curated examples below should help.
The SentenceIterator class belongs to the org.deeplearning4j.text.sentenceiterator package. Fifteen code examples of the class are shown below, sorted by popularity by default.
Example 1: testFindNamesFromText
import org.deeplearning4j.text.sentenceiterator.SentenceIterator; // import the required package/class
@Ignore
@Test
public void testFindNamesFromText() throws IOException {
    SentenceIterator iter = new BasicLineIterator("src/test/resources/chineseName.txt");
    log.info("load is right!");
    TokenizerFactory tokenizerFactory = new ChineseTokenizerFactory();
    //tokenizerFactory.setTokenPreProcessor(new ChineseTokenizer());

    // Generates word vectors from the dataset stored in the resources folder
    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(2).iterations(5).layerSize(100).seed(42)
            .learningRate(0.1).windowSize(20).iterate(iter).tokenizerFactory(tokenizerFactory).build();
    vec.fit();
    WordVectorSerializer.writeWordVectors(vec, new File("src/test/resources/chineseNameWordVector.txt"));

    // Train a model that can find names in news text (.txt files), using the generated word vectors
    // WordVectors wordVectors;
    // Test whether the model can find names in unknown text
}
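Once the vectors have been written out, they can be reloaded for querying. Below is a minimal sketch, assuming the text-format vector file written above and DL4J's WordVectorSerializer API (the query token is a placeholder, not part of the original example):

import java.io.File;
import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer;
import org.deeplearning4j.models.word2vec.Word2Vec;

public class ReloadVectorsSketch {
    public static void main(String[] args) throws Exception {
        // readWord2VecModel handles both text and binary vector formats
        Word2Vec vec = WordVectorSerializer
                .readWord2VecModel(new File("src/test/resources/chineseNameWordVector.txt"));
        // Find the 5 nearest neighbours of a token from the corpus ("李" is a placeholder)
        System.out.println(vec.wordsNearest("李", 5));
    }
}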
Example 2: testWord2VecPlot
import org.deeplearning4j.text.sentenceiterator.SentenceIterator; // import the required package/class
@Test
public void testWord2VecPlot() throws Exception {
    File inputFile = new ClassPathResource("/big/raw_sentences.txt").getFile();
    SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(2).batchSize(1000).learningRate(0.025)
            .layerSize(100).seed(42).sampling(0).negativeSample(0).windowSize(5)
            .modelUtils(new BasicModelUtils<VocabWord>()).useAdaGrad(false).iterate(iter).workers(10)
            .tokenizerFactory(t).build();
    vec.fit();

    // UiConnectionInfo connectionInfo = UiServer.getInstance().getConnectionInfo();
    // vec.getLookupTable().plotVocab(100, connectionInfo);
    Thread.sleep(10000000000L);

    fail("Not implemented");
}
Example 3: main
import org.deeplearning4j.text.sentenceiterator.SentenceIterator; // import the required package/class
public static void main(String[] args) throws Exception {
    // Path to the text file
    String filePath = "c:/raw_sentences.txt";

    log.info("Load & Vectorize Sentences....");
    // Strip leading and trailing whitespace from each line
    SentenceIterator iter = UimaSentenceIterator.createWithPath(filePath);
    // Split each line on whitespace to get words
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    InMemoryLookupCache cache = new InMemoryLookupCache();
    WeightLookupTable table = new InMemoryLookupTable.Builder()
            .vectorLength(100)
            .useAdaGrad(false)
            .cache(cache)
            .lr(0.025f).build();

    log.info("Building model....");
    Word2Vec vec = new Word2Vec.Builder()
            .minWordFrequency(5).iterations(1)
            .layerSize(100).lookupTable(table)
            .stopWords(new ArrayList<String>())
            .vocabCache(cache).seed(42)
            .windowSize(5).iterate(iter).tokenizerFactory(t).build();

    log.info("Fitting Word2Vec model....");
    vec.fit();

    log.info("Writing word vectors to text file....");
    // Write word vectors to file
    WordVectorSerializer.writeWordVectors(vec, "word2vec.txt");

    log.info("Closest Words:");
    Collection<String> lst = vec.wordsNearest("man", 5);
    System.out.println(lst);
    double cosSim = vec.similarity("cruise", "voyage");
    System.out.println(cosSim);
}
Example 4: w2vBuilder
import org.deeplearning4j.text.sentenceiterator.SentenceIterator; // import the required package/class
public static Word2Vec w2vBuilder(SentenceIterator iter, TokenizerFactory t) {
    return new Word2Vec.Builder()
            .seed(12345)
            .iterate(iter)
            .tokenizerFactory(t)
            .batchSize(1000)
            .allowParallelTokenization(true) // enable parallel tokenization
            .epochs(1) // number of epochs (full passes over the training corpus)
            .iterations(3) // number of iterations per mini-batch during training
            .elementsLearningAlgorithm(new SkipGram<>()) // use the SkipGram model; for CBOW: new CBOW<>()
            .minWordFrequency(50) // discard words that appear fewer times than this threshold
            .windowSize(5) // maximum skip length between words
            .learningRate(0.05) // starting learning rate
            .minLearningRate(5e-4) // learning rate will not decay below this threshold
            .negativeSample(10) // number of negative examples
            // threshold for word occurrence: words that appear with higher
            // frequency will be randomly down-sampled
            .sampling(1e-5)
            .useHierarchicSoftmax(true) // use hierarchical softmax
            .layerSize(300) // dimensionality of the word vectors
            .workers(8) // number of threads
            .build();
}
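A short usage sketch for this builder, assuming a plain-text corpus with one sentence per line and DL4J's standard line iterator and tokenizer (the corpus path is a placeholder, and the main method is assumed to live in the same class as w2vBuilder):

import org.deeplearning4j.models.word2vec.Word2Vec;
import org.deeplearning4j.text.sentenceiterator.BasicLineIterator;
import org.deeplearning4j.text.sentenceiterator.SentenceIterator;
import org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor;
import org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory;
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory;

public static void main(String[] args) throws Exception {
    // One sentence per line; "corpus.txt" is a placeholder path
    SentenceIterator iter = new BasicLineIterator("corpus.txt");
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    Word2Vec vec = w2vBuilder(iter, t); // the builder method from Example 4
    vec.fit(); // train the model

    System.out.println(vec.wordsNearest("day", 10));
}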
Example 5: getBagOfWordsWithCounts
import org.deeplearning4j.text.sentenceiterator.SentenceIterator; // import the required package/class
private static LinkedHashMap<String, Integer> getBagOfWordsWithCounts(Language language) {
    HashMap<String, Integer> bagOfWords = new HashMap<>();
    List<String> sentences = getSentencesFromLanguage(language);
    SentenceIterator iter = new CollectionSentenceIterator(new Pan15SentencePreProcessor(), sentences);
    while (iter.hasNext()) {
        String sentence = iter.nextSentence();
        for (String word : sentence.split("\\s+")) {
            word = normalize(word);
            // Skip empty strings and single punctuation characters
            if (Objects.equals(word, "") || (word.length() == 1 && word.matches("\\p{Punct}"))) continue;
            bagOfWords.put(word, bagOfWords.getOrDefault(word, 0) + 1);
        }
    }
    // Keep only the VEC_LENGTH most frequent words, in descending order of count
    LinkedHashMap<String, Integer> sorted = new LinkedHashMap<>();
    final int[] count = {0};
    bagOfWords.entrySet().stream()
            .sorted(Map.Entry.comparingByValue(Collections.reverseOrder())).forEach(
                    entry -> {
                        if (count[0] < VEC_LENGTH) sorted.put(entry.getKey(), entry.getValue());
                        count[0]++;
                    }
            );
    return sorted;
}
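The truncation above can also be expressed with Stream.limit instead of the counter array. A self-contained sketch of the same top-N counting idea (the sentence list and VEC_LENGTH value here are stand-ins for the original helpers):

import java.util.*;
import java.util.stream.Collectors;

public class BagOfWordsSketch {
    static final int VEC_LENGTH = 3; // placeholder vocabulary size

    public static void main(String[] args) {
        List<String> sentences = Arrays.asList("the cat sat", "the cat ran", "a dog ran");

        // Count word occurrences across all sentences
        Map<String, Integer> bagOfWords = new HashMap<>();
        for (String sentence : sentences)
            for (String word : sentence.toLowerCase().split("\\s+"))
                bagOfWords.merge(word, 1, Integer::sum);

        // Sort by count descending and keep the top VEC_LENGTH entries
        LinkedHashMap<String, Integer> sorted = bagOfWords.entrySet().stream()
                .sorted(Map.Entry.<String, Integer>comparingByValue(Comparator.reverseOrder()))
                .limit(VEC_LENGTH)
                .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue,
                        (a, b) -> a, LinkedHashMap::new));

        System.out.println(sorted); // e.g. {the=2, cat=2, ran=2}
    }
}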
Example 6: testWord2VecMultiEpoch
import org.deeplearning4j.text.sentenceiterator.SentenceIterator; // import the required package/class
@Test
public void testWord2VecMultiEpoch() throws Exception {
    SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(1).iterations(5).learningRate(0.025).layerSize(150)
            .seed(42).sampling(0).negativeSample(0).useHierarchicSoftmax(true).windowSize(5).epochs(3)
            .modelUtils(new BasicModelUtils<VocabWord>()).useAdaGrad(false).iterate(iter).workers(8)
            .tokenizerFactory(t).elementsLearningAlgorithm(new CBOW<VocabWord>()).build();
    vec.fit();

    Collection<String> lst = vec.wordsNearest("day", 10);
    log.info(Arrays.toString(lst.toArray()));
    // assertEquals(10, lst.size());

    double sim = vec.similarity("day", "night");
    log.info("Day/night similarity: " + sim);

    assertTrue(lst.contains("week"));
    assertTrue(lst.contains("night"));
    assertTrue(lst.contains("year"));
}
Example 7: testWord2VecGoogleModelUptraining
import org.deeplearning4j.text.sentenceiterator.SentenceIterator; // import the required package/class
@Ignore
@Test
public void testWord2VecGoogleModelUptraining() throws Exception {
    long time1 = System.currentTimeMillis();
    Word2Vec vec = WordVectorSerializer.readWord2VecModel(
            new File("C:\\Users\\raver\\Downloads\\GoogleNews-vectors-negative300.bin.gz"), false);
    long time2 = System.currentTimeMillis();
    log.info("Model loaded in {} msec", time2 - time1);

    SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
    // Split each line on whitespace to get words
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    vec.setTokenizerFactory(t);
    vec.setSentenceIterator(iter);
    vec.getConfiguration().setUseHierarchicSoftmax(false);
    vec.getConfiguration().setNegative(5.0);
    vec.setElementsLearningAlgorithm(new CBOW<VocabWord>());

    vec.fit();
}
Example 8: before
import org.deeplearning4j.text.sentenceiterator.SentenceIterator; // import the required package/class
@Before
public void before() throws Exception {
    if (vec == null) {
        ClassPathResource resource = new ClassPathResource("/labeled/");
        File file = resource.getFile();
        SentenceIterator iter = UimaSentenceIterator.createWithPath(file.getAbsolutePath());
        new File("cache.ser").delete();
        TokenizerFactory t = new UimaTokenizerFactory();
        vec = new Word2Vec.Builder().minWordFrequency(1).iterations(5).layerSize(100)
                .stopWords(new ArrayList<String>()).useUnknown(true).windowSize(5).iterate(iter)
                .tokenizerFactory(t).build();
        vec.fit();
    }
}
Example 9: testVocab
import org.deeplearning4j.text.sentenceiterator.SentenceIterator; // import the required package/class
@Test
public void testVocab() throws Exception {
    File inputFile = new ClassPathResource("big/raw_sentences.txt").getFile();
    SentenceIterator iter = new BasicLineIterator(inputFile);

    Set<String> set = new HashSet<>();
    int lines = 0;
    int cnt = 0;
    while (iter.hasNext()) {
        Tokenizer tok = t.create(iter.nextSentence());
        for (String token : tok.getTokens()) {
            // Skip null, empty, and whitespace-only tokens
            if (token == null || token.isEmpty() || token.trim().isEmpty())
                continue;
            cnt++;
            set.add(token); // Set.add is a no-op for duplicates
        }
        lines++;
    }

    log.info("Total number of tokens: [" + cnt + "], lines: [" + lines + "], set size: [" + set.size() + "]");
    log.info("Set:\n" + set);
}
Example 10: hasNext
import org.deeplearning4j.text.sentenceiterator.SentenceIterator; // import the required package/class
@Test
public void hasNext() throws Exception {
    SentenceIterator iterator = new BasicLineIterator(new ClassPathResource("/big/raw_sentences.txt").getFile());
    SentenceTransformer transformer = new SentenceTransformer.Builder().iterator(iterator).allowMultithreading(true)
            .tokenizerFactory(factory).build();
    Iterator<Sequence<VocabWord>> iter = transformer.iterator();

    int cnt = 0;
    Sequence<VocabWord> sequence = null;
    while (iter.hasNext()) {
        sequence = iter.next();
        assertNotEquals("Failed on [" + cnt + "] iteration", null, sequence);
        assertNotEquals("Failed on [" + cnt + "] iteration", 0, sequence.size());
        cnt++;
    }

    // log.info("Last element: {}", sequence.asLabels());
    assertEquals(97162, cnt);
}
Example 11: nextDocument
import org.deeplearning4j.text.sentenceiterator.SentenceIterator; // import the required package/class
@Test
public void nextDocument() throws Exception {
    SentenceIterator sentence = new BasicLineIterator(new ClassPathResource("/big/raw_sentences.txt").getFile());
    BasicLabelAwareIterator backed = new BasicLabelAwareIterator.Builder(sentence).build();

    int cnt = 0;
    while (backed.hasNextDocument()) {
        backed.nextDocument();
        cnt++;
    }
    assertEquals(97162, cnt);

    backed.reset();

    AsyncLabelAwareIterator iterator = new AsyncLabelAwareIterator(backed, 64);
    cnt = 0;
    while (iterator.hasNext()) {
        iterator.next();
        cnt++;
        if (cnt == 10)
            iterator.reset();
    }
    // resetting after 10 documents replays the full corpus: 10 + 97162 = 97172
    assertEquals(97172, cnt);
}
Example 12: testHasNextDocument1
import org.deeplearning4j.text.sentenceiterator.SentenceIterator; // import the required package/class
@Test
public void testHasNextDocument1() throws Exception {
    File inputFile = new ClassPathResource("/big/raw_sentences.txt").getFile();
    SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
    BasicLabelAwareIterator iterator = new BasicLabelAwareIterator.Builder(iter).setLabelTemplate("DOCZ_").build();

    int cnt = 0;
    while (iterator.hasNextDocument()) {
        iterator.nextDocument();
        cnt++;
    }
    assertEquals(97162, cnt);

    LabelsSource generator = iterator.getLabelsSource();
    assertEquals(97162, generator.getLabels().size());
    assertEquals("DOCZ_0", generator.getLabels().get(0));
}
Example 13: trainParagraghVecModel
import org.deeplearning4j.text.sentenceiterator.SentenceIterator; // import the required package/class
public void trainParagraghVecModel(String locationToSave) throws FileNotFoundException {
    ClassPathResource resource = new ClassPathResource("/paragraphVectors/paragraphVectorTraining.txt");
    File file = resource.getFile();
    SentenceIterator iter = new BasicLineIterator(file);

    AbstractCache<VocabWord> cache = new AbstractCache<VocabWord>();

    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    /*
     * If you don't have a LabelAwareIterator handy, you can use a synchronized label generator;
     * it will label each document/sequence/line with its own label.
     * If you already have a LabelAwareIterator, you can provide it instead to use your in-house labels.
     */
    LabelsSource source = new LabelsSource("DOC_");

    ParagraphVectors vec = new ParagraphVectors.Builder()
            .minWordFrequency(1)
            .iterations(100)
            .epochs(1)
            .layerSize(50)
            .learningRate(0.02)
            .labelsSource(source)
            .windowSize(5)
            .iterate(iter)
            .trainWordVectors(true)
            .vocabCache(cache)
            .tokenizerFactory(t)
            .sampling(0)
            .build();
    vec.fit();

    WordVectorSerializer.writeParagraphVectors(vec, locationToSave);
}
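After training, the saved model can be restored and used to infer vectors for unseen text. A minimal sketch, assuming the file written by the method above and DL4J's WordVectorSerializer/ParagraphVectors API (the save path is a placeholder):

import java.io.File;
import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer;
import org.deeplearning4j.models.paragraphvectors.ParagraphVectors;
import org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor;
import org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory;
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory;
import org.nd4j.linalg.api.ndarray.INDArray;

public class LoadParagraphVectorsSketch {
    public static void main(String[] args) throws Exception {
        // "paragraphVectors.zip" is a placeholder for the locationToSave used above
        ParagraphVectors vec = WordVectorSerializer.readParagraphVectors(new File("paragraphVectors.zip"));

        // inferVector needs a tokenizer factory set on the restored model
        TokenizerFactory t = new DefaultTokenizerFactory();
        t.setTokenPreProcessor(new CommonPreprocessor());
        vec.setTokenizerFactory(t);

        // Infer a vector for a document that was not in the training corpus
        INDArray vector = vec.inferVector("This is an unseen document");
        System.out.println(vector);
    }
}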
Example 14: w2vBuilder4SmallCorpus
import org.deeplearning4j.text.sentenceiterator.SentenceIterator; // import the required package/class
@SuppressWarnings("unused")
public static Word2Vec w2vBuilder4SmallCorpus(SentenceIterator iter, TokenizerFactory t) {
    return new Word2Vec.Builder()
            .minWordFrequency(5)
            .iterations(1)
            .layerSize(100)
            .seed(42)
            .windowSize(5)
            .iterate(iter)
            .tokenizerFactory(t)
            .learningRate(0.025)
            .minLearningRate(1e-3)
            .build();
}
Example 15: initiliazeVectors
import org.deeplearning4j.text.sentenceiterator.SentenceIterator; // import the required package/class
@Override
void initiliazeVectors(Instances instances) {
    SentenceIterator iter = new WekaInstanceSentenceIterator(instances, this.textIndex - 1);

    // set the token preprocessor on the tokenizer
    this.tokenizerFactory.setTokenPreProcessor(this.preprocessor);

    // initialize stop words
    this.stopWordsHandler.initialize();

    // build the GloVe model
    this.vec =
            new Glove.Builder()
                    .iterate(iter)
                    .tokenizerFactory(this.tokenizerFactory)
                    .alpha(this.alpha)
                    .learningRate(this.learningRate)
                    .epochs(this.epochs)
                    .layerSize(this.layerSize)
                    .minLearningRate(this.minLearningRate)
                    .minWordFrequency(this.minWordFrequency)
                    .stopWords(this.stopWordsHandler.getStopList())
                    .useAdaGrad(this.useAdaGrad)
                    .windowSize(this.windowSize)
                    .xMax(this.xMax)
                    .batchSize(this.batchSize)
                    .shuffle(this.shuffle)
                    .symmetric(this.symmetric)
                    .build();

    // fit the model
    this.vec.fit();
}