

Java SentenceIterator Class Code Examples

This article collects typical usage examples of the Java class org.deeplearning4j.text.sentenceiterator.SentenceIterator. If you are wondering what SentenceIterator is for, or how to use it in practice, the curated code examples below may help.


The SentenceIterator class belongs to the org.deeplearning4j.text.sentenceiterator package. Fifteen code examples of the class are shown below, sorted by popularity by default. Before the individual examples, a minimal usage sketch illustrates the core iteration pattern they all share.
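The sketch below is not taken from any of the listed projects; it is a minimal, self-contained illustration of the hasNext()/nextSentence()/reset() pattern used throughout the examples, assuming a plain-text corpus with one sentence per line ("corpus.txt" is a placeholder path).

import org.deeplearning4j.text.sentenceiterator.BasicLineIterator;
import org.deeplearning4j.text.sentenceiterator.SentenceIterator;

public class SentenceIteratorSketch {
    public static void main(String[] args) throws Exception {
        // BasicLineIterator treats every line of the file as one sentence.
        // "corpus.txt" is a placeholder; point it at any UTF-8 text file.
        SentenceIterator iter = new BasicLineIterator("corpus.txt");
        while (iter.hasNext()) {
            String sentence = iter.nextSentence();
            System.out.println(sentence);
        }
        // reset() rewinds the iterator so the same corpus can be consumed again,
        // e.g. by a Word2Vec training run.
        iter.reset();
    }
}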

Example 1: testFindNamesFromText

import org.deeplearning4j.text.sentenceiterator.SentenceIterator; // import the required package/class
@Ignore
@Test
public void testFindNamesFromText() throws IOException {
    SentenceIterator iter = new BasicLineIterator("src/test/resources/chineseName.txt");

    log.info("load is right!");
    TokenizerFactory tokenizerFactory = new ChineseTokenizerFactory();
    //tokenizerFactory.setTokenPreProcessor(new ChineseTokenizer());

    //Generates a word-vector from the dataset stored in resources folder
    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(2).iterations(5).layerSize(100).seed(42)
                    .learningRate(0.1).windowSize(20).iterate(iter).tokenizerFactory(tokenizerFactory).build();
    vec.fit();
    WordVectorSerializer.writeWordVectors(vec, new File("src/test/resources/chineseNameWordVector.txt"));

    // Trains a model that can extract names from news text (.txt files), using the word vectors generated above
    // WordVectors wordVectors;

    // Tests the model: checks whether it can find names in previously unseen text

}
 
Developer: deeplearning4j, Project: deeplearning4j, Lines: 22, Source: ChineseTokenizerTest.java
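The evaluation step in the test above is left as a TODO. As a hedged sketch (not part of the original test), the vectors written to chineseNameWordVector.txt could be reloaded and queried with APIs that appear in other examples on this page; the query token is a placeholder and would need to be a name that actually occurs in the training corpus.

// Hedged sketch: reload the vectors written by the test above and inspect neighbours.
Word2Vec loaded = WordVectorSerializer.readWord2VecModel(
                new File("src/test/resources/chineseNameWordVector.txt"));
Collection<String> neighbours = loaded.wordsNearest("placeholderName", 10); // placeholder query token
System.out.println(neighbours);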

Example 2: testWord2VecPlot

import org.deeplearning4j.text.sentenceiterator.SentenceIterator; // import the required package/class
@Test
public void testWord2VecPlot() throws Exception {
    File inputFile = new ClassPathResource("/big/raw_sentences.txt").getFile();
    SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());

    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(2).batchSize(1000).learningRate(0.025)
                    .layerSize(100).seed(42).sampling(0).negativeSample(0).windowSize(5)
                    .modelUtils(new BasicModelUtils<VocabWord>()).useAdaGrad(false).iterate(iter).workers(10)
                    .tokenizerFactory(t).build();

    vec.fit();

    //        UiConnectionInfo connectionInfo = UiServer.getInstance().getConnectionInfo();

    //        vec.getLookupTable().plotVocab(100, connectionInfo);

    Thread.sleep(10000000000L);
    fail("Not implemented");
}
 
Developer: deeplearning4j, Project: deeplearning4j, Lines: 23, Source: ManualTests.java

Example 3: main

import org.deeplearning4j.text.sentenceiterator.SentenceIterator; // import the required package/class
public static void main(String[] args) throws Exception {

        // Gets Path to Text file
        String filePath = "c:/raw_sentences.txt";

        log.info("Load & Vectorize Sentences....");
        // Strip white space before and after for each line
        SentenceIterator iter = UimaSentenceIterator.createWithPath(filePath);
        // Split on white spaces in the line to get words
        TokenizerFactory t = new DefaultTokenizerFactory();
        t.setTokenPreProcessor(new CommonPreprocessor());

        InMemoryLookupCache cache = new InMemoryLookupCache();
        WeightLookupTable table = new InMemoryLookupTable.Builder()
                .vectorLength(100)
                .useAdaGrad(false)
                .cache(cache)
                .lr(0.025f).build();

        log.info("Building model....");
        Word2Vec vec = new Word2Vec.Builder()
                .minWordFrequency(5).iterations(1)
                .layerSize(100).lookupTable(table)
                .stopWords(new ArrayList<String>())
                .vocabCache(cache).seed(42)
                .windowSize(5).iterate(iter).tokenizerFactory(t).build();

        log.info("Fitting Word2Vec model....");
        vec.fit();

        log.info("Writing word vectors to text file....");
        // Write the word vectors to a text file
        WordVectorSerializer.writeWordVectors(vec, "word2vec.txt");

        log.info("Closest Words:");
        Collection<String> lst = vec.wordsNearest("man", 5); 
        System.out.println(lst);
        double cosSim = vec.similarity("cruise", "voyage");
        System.out.println(cosSim);
    }
 
Developer: PacktPublishing, Project: Java-Data-Science-Cookbook, Lines: 41, Source: Word2VecRawTextExample.java

Example 4: w2vBuilder

import org.deeplearning4j.text.sentenceiterator.SentenceIterator; // import the required package/class
public static Word2Vec w2vBuilder(SentenceIterator iter, TokenizerFactory t) {
	return new Word2Vec.Builder()
			.seed(12345)
			.iterate(iter)
			.tokenizerFactory(t)
			.batchSize(1000)
			.allowParallelTokenization(true) // enable parallel tokenization
			.epochs(1) //  number of epochs (iterations over whole training corpus) for training
			.iterations(3) // number of iterations done for each mini-batch during training
			.elementsLearningAlgorithm(new SkipGram<>()) // use SkipGram Model. If CBOW: new CBOW<>()
			.minWordFrequency(50) // discard words that appear fewer times than this threshold
			.windowSize(5) // set max skip length between words
			.learningRate(0.05) // the starting learning rate
			.minLearningRate(5e-4) // the learning rate will not drop below this threshold
			.negativeSample(10) // number of negative examples
			// set threshold for occurrence of words. Those that appear with higher frequency will be
			// randomly down-sampled
			.sampling(1e-5)
			.useHierarchicSoftmax(true) // use hierarchical softmax
			.layerSize(300) // size of word vectors
			.workers(8) // number of threads
			.build();
}
 
Developer: IsaacChanghau, Project: Word2VecfJava, Lines: 24, Source: Word2VecTrainer.java
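A hedged usage sketch for w2vBuilder follows (it is not part of the original project); the corpus path is a placeholder, and the helper classes are the same ones used in the other examples on this page.

// Hedged usage sketch for w2vBuilder; "corpus.txt" is a placeholder path.
SentenceIterator iter = new BasicLineIterator("corpus.txt");
TokenizerFactory t = new DefaultTokenizerFactory();
t.setTokenPreProcessor(new CommonPreprocessor());

Word2Vec vec = w2vBuilder(iter, t);   // build the configured model
vec.fit();                            // train on the corpus
WordVectorSerializer.writeWordVectors(vec, "word2vec.txt"); // persist the vectors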

Example 5: getBagOfWordsWithCounts

import org.deeplearning4j.text.sentenceiterator.SentenceIterator; // import the required package/class
private static LinkedHashMap<String, Integer> getBagOfWordsWithCounts(Language language) {
    HashMap<String, Integer> bagOfWords = new HashMap<>();
    List<String> sentences = getSentencesFromLanguage(language);
    SentenceIterator iter = new CollectionSentenceIterator(new Pan15SentencePreProcessor(), sentences);
    while(iter.hasNext()) {
        String sentence = iter.nextSentence();
        for(String word : sentence.split("\\s+")) {
            word =  normalize(word);
            if (Objects.equals(word, "") || (word.length() == 1 && word.matches("\\p{Punct}"))) continue;
            bagOfWords.put(word, bagOfWords.getOrDefault(word, 0) + 1);
        }
    }
    LinkedHashMap<String, Integer> sorted = new LinkedHashMap<>();
    final int[] count = {0};
    bagOfWords.entrySet().stream()
            .sorted(Map.Entry.comparingByValue(Collections.reverseOrder())).forEach(
            entry -> {
                if (count[0] < VEC_LENGTH) sorted.put(entry.getKey(), entry.getValue());
                count[0]++;
            }
    );
    return sorted;
}
 
Developer: madeleine789, Project: dl4j-apr, Lines: 24, Source: Pan15BagOfWords.java

Example 6: testWord2VecMultiEpoch

import org.deeplearning4j.text.sentenceiterator.SentenceIterator; // import the required package/class
@Test
public void testWord2VecMultiEpoch() throws Exception {
    SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());

    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(1).iterations(5).learningRate(0.025).layerSize(150)
                    .seed(42).sampling(0).negativeSample(0).useHierarchicSoftmax(true).windowSize(5).epochs(3)
                    .modelUtils(new BasicModelUtils<VocabWord>()).useAdaGrad(false).iterate(iter).workers(8)
                    .tokenizerFactory(t).elementsLearningAlgorithm(new CBOW<VocabWord>()).build();

    vec.fit();

    Collection<String> lst = vec.wordsNearest("day", 10);
    log.info(Arrays.toString(lst.toArray()));

    //   assertEquals(10, lst.size());

    double sim = vec.similarity("day", "night");
    log.info("Day/night similarity: " + sim);

    assertTrue(lst.contains("week"));
    assertTrue(lst.contains("night"));
    assertTrue(lst.contains("year"));
}
 
Developer: deeplearning4j, Project: deeplearning4j, Lines: 27, Source: Word2VecTests.java

Example 7: testWord2VecGoogleModelUptraining

import org.deeplearning4j.text.sentenceiterator.SentenceIterator; // import the required package/class
@Ignore
@Test
public void testWord2VecGoogleModelUptraining() throws Exception {
    long time1 = System.currentTimeMillis();
    Word2Vec vec = WordVectorSerializer.readWord2VecModel(
                    new File("C:\\Users\\raver\\Downloads\\GoogleNews-vectors-negative300.bin.gz"), false);
    long time2 = System.currentTimeMillis();
    log.info("Model loaded in {} msec", time2 - time1);
    SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
    // Split on white spaces in the line to get words
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    vec.setTokenizerFactory(t);
    vec.setSentenceIterator(iter);
    vec.getConfiguration().setUseHierarchicSoftmax(false);
    vec.getConfiguration().setNegative(5.0);
    vec.setElementsLearningAlgorithm(new CBOW<VocabWord>());

    vec.fit();
}
 
Developer: deeplearning4j, Project: deeplearning4j, Lines: 22, Source: Word2VecTests.java

Example 8: before

import org.deeplearning4j.text.sentenceiterator.SentenceIterator; // import the required package/class
@Before
public void before() throws Exception {
    if (vec == null) {
        ClassPathResource resource = new ClassPathResource("/labeled/");
        File file = resource.getFile();
        SentenceIterator iter = UimaSentenceIterator.createWithPath(file.getAbsolutePath());
        new File("cache.ser").delete();

        TokenizerFactory t = new UimaTokenizerFactory();

        vec = new Word2Vec.Builder().minWordFrequency(1).iterations(5).layerSize(100)
                        .stopWords(new ArrayList<String>()).useUnknown(true).windowSize(5).iterate(iter)
                        .tokenizerFactory(t).build();
        vec.fit();

    }
}
 
Developer: deeplearning4j, Project: deeplearning4j, Lines: 18, Source: Word2VecIteratorTest.java

Example 9: testVocab

import org.deeplearning4j.text.sentenceiterator.SentenceIterator; // import the required package/class
@Test
public void testVocab() throws Exception {
    File inputFile = new ClassPathResource("big/raw_sentences.txt").getFile();
    SentenceIterator iter = new BasicLineIterator(inputFile);

    Set<String> set = new HashSet<>();
    int lines = 0;
    int cnt = 0;
    while (iter.hasNext()) {
        Tokenizer tok = t.create(iter.nextSentence());
        for (String token : tok.getTokens()) {
            if (token == null || token.isEmpty() || token.trim().isEmpty())
                continue;
            cnt++;

            if (!set.contains(token))
                set.add(token);
        }

        lines++;
    }

    log.info("Total number of tokens: [" + cnt + "], lines: [" + lines + "], set size: [" + set.size() + "]");
    log.info("Set:\n" + set);
}
 
Developer: deeplearning4j, Project: deeplearning4j, Lines: 26, Source: VocabConstructorTest.java

Example 10: hasNext

import org.deeplearning4j.text.sentenceiterator.SentenceIterator; // import the required package/class
@Test
public void hasNext() throws Exception {
    SentenceIterator iterator = new BasicLineIterator(new ClassPathResource("/big/raw_sentences.txt").getFile());

    SentenceTransformer transformer = new SentenceTransformer.Builder().iterator(iterator).allowMultithreading(true)
                    .tokenizerFactory(factory).build();

    Iterator<Sequence<VocabWord>> iter = transformer.iterator();
    int cnt = 0;
    Sequence<VocabWord> sequence = null;
    while (iter.hasNext()) {
        sequence = iter.next();
        assertNotEquals("Failed on [" + cnt + "] iteration", null, sequence);
        assertNotEquals("Failed on [" + cnt + "] iteration", 0, sequence.size());
        cnt++;
    }

    //   log.info("Last element: {}", sequence.asLabels());

    assertEquals(97162, cnt);
}
 
Developer: deeplearning4j, Project: deeplearning4j, Lines: 22, Source: ParallelTransformerIteratorTest.java

Example 11: nextDocument

import org.deeplearning4j.text.sentenceiterator.SentenceIterator; // import the required package/class
@Test
public void nextDocument() throws Exception {
    SentenceIterator sentence = new BasicLineIterator(new ClassPathResource("/big/raw_sentences.txt").getFile());
    BasicLabelAwareIterator backed = new BasicLabelAwareIterator.Builder(sentence).build();

    int cnt = 0;
    while (backed.hasNextDocument()) {
        backed.nextDocument();
        cnt++;
    }
    assertEquals(97162, cnt);

    backed.reset();

    AsyncLabelAwareIterator iterator = new AsyncLabelAwareIterator(backed, 64);
    cnt = 0;
    while (iterator.hasNext()) {
        iterator.next();
        cnt++;

        if (cnt == 10)
            iterator.reset();
    }
    assertEquals(97172, cnt);
}
 
Developer: deeplearning4j, Project: deeplearning4j, Lines: 26, Source: AsyncLabelAwareIteratorTest.java

Example 12: testHasNextDocument1

import org.deeplearning4j.text.sentenceiterator.SentenceIterator; // import the required package/class
@Test
public void testHasNextDocument1() throws Exception {

    File inputFile = new ClassPathResource("/big/raw_sentences.txt").getFile();
    SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());

    BasicLabelAwareIterator iterator = new BasicLabelAwareIterator.Builder(iter).setLabelTemplate("DOCZ_").build();

    int cnt = 0;
    while (iterator.hasNextDocument()) {
        iterator.nextDocument();
        cnt++;
    }

    assertEquals(97162, cnt);

    LabelsSource generator = iterator.getLabelsSource();

    assertEquals(97162, generator.getLabels().size());
    assertEquals("DOCZ_0", generator.getLabels().get(0));
}
 
Developer: deeplearning4j, Project: deeplearning4j, Lines: 22, Source: BasicLabelAwareIteratorTest.java

Example 13: trainParagraghVecModel

import org.deeplearning4j.text.sentenceiterator.SentenceIterator; // import the required package/class
public void trainParagraghVecModel(String locationToSave) throws FileNotFoundException {
    ClassPathResource resource = new ClassPathResource("/paragraphVectors/paragraphVectorTraining.txt");
    File file = resource.getFile();
    SentenceIterator iter = new BasicLineIterator(file);
    AbstractCache<VocabWord> cache = new AbstractCache<VocabWord>();
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());
    /*
         If you don't have a LabelAwareIterator handy, you can use this synchronized label generator;
         it will label each document/sequence/line with its own label.
         If you do have a LabelAwareIterator ready, you can provide it instead for your in-house labels.
    */
    LabelsSource source = new LabelsSource("DOC_");

    ParagraphVectors vec = new ParagraphVectors.Builder()
            .minWordFrequency(1)
            .iterations(100)
            .epochs(1)
            .layerSize(50)
            .learningRate(0.02)
            .labelsSource(source)
            .windowSize(5)
            .iterate(iter)
            .trainWordVectors(true)
            .vocabCache(cache)
            .tokenizerFactory(t)
            .sampling(0)
            .build();

    vec.fit();

    WordVectorSerializer.writeParagraphVectors(vec, locationToSave);
}
 
Developer: gizemsogancioglu, Project: biosses, Lines: 34, Source: SentenceVectorsBasedSimilarity.java
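As a follow-up, here is a hedged sketch (not part of the original project) of how the model written to locationToSave could be reloaded and queried; the labels follow the "DOC_" template produced by the LabelsSource above, and which labels exist depends on the training file.

// Hedged sketch: reload the saved model and compare two document labels.
ParagraphVectors restored = WordVectorSerializer.readParagraphVectors(new File(locationToSave));
double docSim = restored.similarity("DOC_0", "DOC_1"); // labels generated by the "DOC_" template
System.out.println("Similarity between DOC_0 and DOC_1: " + docSim);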

Example 14: w2vBuilder4SmallCorpus

import org.deeplearning4j.text.sentenceiterator.SentenceIterator; // import the required package/class
@SuppressWarnings("unused")
public static Word2Vec w2vBuilder4SmallCorpus(SentenceIterator iter, TokenizerFactory t) {
	return new Word2Vec.Builder()
			.minWordFrequency(5)
			.iterations(1)
			.layerSize(100)
			.seed(42)
			.windowSize(5)
			.iterate(iter)
			.tokenizerFactory(t)
			.learningRate(0.025)
			.minLearningRate(1e-3)
			.build();
}
 
Developer: IsaacChanghau, Project: Word2VecfJava, Lines: 15, Source: Word2VecTrainer.java

Example 15: initiliazeVectors

import org.deeplearning4j.text.sentenceiterator.SentenceIterator; // import the required package/class
@Override
void initiliazeVectors(Instances instances) {
  SentenceIterator iter = new WekaInstanceSentenceIterator(instances, this.textIndex - 1);

  // sets the tokenizer
  this.tokenizerFactory.setTokenPreProcessor(this.preprocessor);

  // initializes stopwords
  this.stopWordsHandler.initialize();

  // Building model
  this.vec =
      new Glove.Builder()
          .iterate(iter)
          .tokenizerFactory(this.tokenizerFactory)
          .alpha(this.alpha)
          .learningRate(this.learningRate)
          .epochs(this.epochs)
          .layerSize(this.layerSize)
          .minLearningRate(this.minLearningRate)
          .minWordFrequency(this.minWordFrequency)
          .stopWords(this.stopWordsHandler.getStopList())
          .useAdaGrad(this.useAdaGrad)
          .windowSize(this.windowSize)
          .workers(this.workers)
          .xMax(this.xMax)
          .batchSize(this.batchSize)
          .shuffle(this.shuffle)
          .symmetric(this.symmetric)
          .build();

  // fit model
  this.vec.fit();
}
 
Developer: Waikato, Project: wekaDeeplearning4j, Lines: 36, Source: Dl4jStringToGlove.java


Note: The org.deeplearning4j.text.sentenceiterator.SentenceIterator class examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets were selected from open-source projects contributed by their respective authors, and the source code copyright belongs to those original authors. For distribution and use, please refer to the corresponding project's license; do not repost without permission.