This page collects typical usage examples of the Java class org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor. If you are unsure what CommonPreprocessor is for or how to use it, the curated examples below should help.
CommonPreprocessor belongs to the org.deeplearning4j.text.tokenization.tokenizer.preprocessor package. The following 15 code examples show the class in use, sorted by popularity by default.
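Before the examples, it may help to see what CommonPreprocessor itself does: it implements the TokenPreProcess interface and normalizes each token by stripping punctuation and lower-casing. A minimal sketch (the expected outputs in the comments are assumptions based on that behavior):

import org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor;

public class CommonPreprocessorDemo {
    public static void main(String[] args) {
        CommonPreprocessor pp = new CommonPreprocessor();
        // each token has punctuation stripped and is lower-cased
        System.out.println(pp.preProcess("Hello,"));  // expected: hello
        System.out.println(pp.preProcess("WORLD!"));  // expected: world
    }
}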
Example 1: testWord2VecPlot
import org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor; // import the required package/class
@Test
public void testWord2VecPlot() throws Exception {
File inputFile = new ClassPathResource("/big/raw_sentences.txt").getFile();
SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
TokenizerFactory t = new DefaultTokenizerFactory();
t.setTokenPreProcessor(new CommonPreprocessor());
Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(2).batchSize(1000).learningRate(0.025)
.layerSize(100).seed(42).sampling(0).negativeSample(0).windowSize(5)
.modelUtils(new BasicModelUtils<VocabWord>()).useAdaGrad(false).iterate(iter).workers(10)
.tokenizerFactory(t).build();
vec.fit();
// UiConnectionInfo connectionInfo = UiServer.getInstance().getConnectionInfo();
// vec.getLookupTable().plotVocab(100, connectionInfo);
Thread.sleep(10000000000L); // keep the process alive so the (commented-out) UI plot could be inspected manually
fail("Not implemented");
}
Example 2: main
import org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor; // import the required package/class
public static void main(String[] args) throws Exception {
// Path to the input text file
String filePath = "c:/raw_sentences.txt";
log.info("Load & Vectorize Sentences....");
// Strip white space before and after for each line
SentenceIterator iter = UimaSentenceIterator.createWithPath(filePath);
// Split on white spaces in the line to get words
TokenizerFactory t = new DefaultTokenizerFactory();
t.setTokenPreProcessor(new CommonPreprocessor());
InMemoryLookupCache cache = new InMemoryLookupCache();
WeightLookupTable table = new InMemoryLookupTable.Builder()
.vectorLength(100)
.useAdaGrad(false)
.cache(cache)
.lr(0.025f).build();
log.info("Building model....");
Word2Vec vec = new Word2Vec.Builder()
.minWordFrequency(5).iterations(1)
.layerSize(100).lookupTable(table)
.stopWords(new ArrayList<String>())
.vocabCache(cache).seed(42)
.windowSize(5).iterate(iter).tokenizerFactory(t).build();
log.info("Fitting Word2Vec model....");
vec.fit();
log.info("Writing word vectors to text file....");
// Write the word vectors to a text file
WordVectorSerializer.writeWordVectors(vec, "word2vec.txt");
log.info("Closest Words:");
Collection<String> lst = vec.wordsNearest("man", 5);
System.out.println(lst);
double cosSim = vec.similarity("cruise", "voyage");
System.out.println(cosSim);
}
Example 3: SentimentExampleIterator
import org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor; // import the required package/class
/**
* @param dataDirectory the directory of the IMDB review data set
* @param wordVectors WordVectors object
* @param batchSize Size of each minibatch for training
* @param truncateLength If reviews exceed this length (in tokens), they are truncated to it
* @param train If true: return the training data. If false: return the testing data.
*/
public SentimentExampleIterator(String dataDirectory, WordVectors wordVectors, int batchSize, int truncateLength, boolean train) throws IOException {
this.batchSize = batchSize;
this.vectorSize = wordVectors.getWordVector(wordVectors.vocab().wordAtIndex(0)).length;
File p = new File(FilenameUtils.concat(dataDirectory, "aclImdb/" + (train ? "train" : "test") + "/pos/") + "/");
File n = new File(FilenameUtils.concat(dataDirectory, "aclImdb/" + (train ? "train" : "test") + "/neg/") + "/");
positiveFiles = p.listFiles();
negativeFiles = n.listFiles();
this.wordVectors = wordVectors;
this.truncateLength = truncateLength;
tokenizerFactory = new DefaultTokenizerFactory();
tokenizerFactory.setTokenPreProcessor(new CommonPreprocessor());
}
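A usage sketch for this constructor follows; the data directory, pretrained-vector file, batch size (64), and truncation length (256) are illustrative assumptions, not values from the original example:

// Hypothetical usage: load pretrained word vectors, then build train and test iterators
WordVectors wordVectors = WordVectorSerializer.readWord2VecModel(new File("/path/to/GoogleNews-vectors-negative300.bin.gz"));
SentimentExampleIterator trainIter = new SentimentExampleIterator("/path/to/data", wordVectors, 64, 256, true);
SentimentExampleIterator testIter = new SentimentExampleIterator("/path/to/data", wordVectors, 64, 256, false);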
Example 4: makeParagraphVectors
import org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor; // import the required package/class
void makeParagraphVectors() throws Exception {
// build an iterator over our dataset
File dir = TYPE_LEARNING_DIR;
dir.mkdirs();
iterator = new FileLabelAwareIterator.Builder()
.addSourceFolder(new File(dir, "corpus"))
.build();
tokenizerFactory = new DefaultTokenizerFactory();
tokenizerFactory.setTokenPreProcessor(new CommonPreprocessor());
// ParagraphVectors training configuration
paragraphVectors = new ParagraphVectors.Builder()
.learningRate(0.025)
.minLearningRate(0.001)
.batchSize(1000)
.epochs(5)
.iterate(iterator)
.trainWordVectors(true)
.tokenizerFactory(tokenizerFactory)
.build();
// Start model training
paragraphVectors.fit();
}
Example 5: testWord2VecMultiEpoch
import org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor; // import the required package/class
@Test
public void testWord2VecMultiEpoch() throws Exception {
SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
TokenizerFactory t = new DefaultTokenizerFactory();
t.setTokenPreProcessor(new CommonPreprocessor());
Word2Vec vec = new Word2Vec.Builder().minWordFrequency(1).iterations(5).learningRate(0.025).layerSize(150)
.seed(42).sampling(0).negativeSample(0).useHierarchicSoftmax(true).windowSize(5).epochs(3)
.modelUtils(new BasicModelUtils<VocabWord>()).useAdaGrad(false).iterate(iter).workers(8)
.tokenizerFactory(t).elementsLearningAlgorithm(new CBOW<VocabWord>()).build();
vec.fit();
Collection<String> lst = vec.wordsNearest("day", 10);
log.info(Arrays.toString(lst.toArray()));
// assertEquals(10, lst.size());
double sim = vec.similarity("day", "night");
log.info("Day/night similarity: " + sim);
assertTrue(lst.contains("week"));
assertTrue(lst.contains("night"));
assertTrue(lst.contains("year"));
}
Example 6: testWord2VecGoogleModelUptraining
import org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor; // import the required package/class
@Ignore
@Test
public void testWord2VecGoogleModelUptraining() throws Exception {
long time1 = System.currentTimeMillis();
Word2Vec vec = WordVectorSerializer.readWord2VecModel(
new File("C:\\Users\\raver\\Downloads\\GoogleNews-vectors-negative300.bin.gz"), false);
long time2 = System.currentTimeMillis();
log.info("Model loaded in {} msec", time2 - time1);
SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
// Split on white spaces in the line to get words
TokenizerFactory t = new DefaultTokenizerFactory();
t.setTokenPreProcessor(new CommonPreprocessor());
vec.setTokenizerFactory(t);
vec.setSentenceIterator(iter);
vec.getConfiguration().setUseHierarchicSoftmax(false);
vec.getConfiguration().setNegative(5.0);
vec.setElementsLearningAlgorithm(new CBOW<VocabWord>());
vec.fit();
}
Example 7: testGoogleModelForInference
import org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor; // import the required package/class
@Ignore
@Test
public void testGoogleModelForInference() throws Exception {
WordVectors googleVectors = WordVectorSerializer.loadGoogleModelNonNormalized(
new File("/ext/GoogleNews-vectors-negative300.bin.gz"), true, false);
TokenizerFactory t = new DefaultTokenizerFactory();
t.setTokenPreProcessor(new CommonPreprocessor());
ParagraphVectors pv =
new ParagraphVectors.Builder().tokenizerFactory(t).iterations(10).useHierarchicSoftmax(false)
.trainWordVectors(false).iterations(10).useExistingWordVectors(googleVectors)
.negativeSample(10).sequenceLearningAlgorithm(new DM<VocabWord>()).build();
INDArray vec1 = pv.inferVector("This text is pretty awesome");
INDArray vec2 = pv.inferVector("Fantastic process of crazy things happening inside just for history purposes");
log.info("vec1/vec2: {}", Transforms.cosineSim(vec1, vec2));
}
Example 8: loadParagraphVectors
import org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor; // import the required package/class
private static ParagraphVectors loadParagraphVectors() {
ParagraphVectors paragraphVectors = null;
try {
paragraphVectors = WordVectorSerializer.readParagraphVectors(PARAGRAPHVECTORMODELPATH);
TokenizerFactory t = new DefaultTokenizerFactory();
t.setTokenPreProcessor(new CommonPreprocessor());
paragraphVectors.setTokenizerFactory(t);
paragraphVectors.getConfiguration().setIterations(10); // note: iterations controls how many passes are run per inferred document; lower it to speed up inference
} catch (IOException e) {
e.printStackTrace();
}
return paragraphVectors;
}
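Once restored, the model can embed unseen text with inferVector (the same call Example 7 uses); a hypothetical follow-up:

ParagraphVectors pv = loadParagraphVectors();
INDArray docVector = pv.inferVector("some unseen document text"); // hypothetical input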
Example 9: trainParagraghVecModel
import org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor; // import the required package/class
public void trainParagraghVecModel(String locationToSave) throws FileNotFoundException {
ClassPathResource resource = new ClassPathResource("/paragraphVectors/paragraphVectorTraining.txt");
File file = resource.getFile();
SentenceIterator iter = new BasicLineIterator(file);
AbstractCache<VocabWord> cache = new AbstractCache<VocabWord>();
TokenizerFactory t = new DefaultTokenizerFactory();
t.setTokenPreProcessor(new CommonPreprocessor());
/*
If you don't have a LabelAwareIterator handy, you can use this synchronized labels generator;
it will label each document/sequence/line with its own generated label.
But if you do have a LabelAwareIterator ready, you can provide it to use your in-house labels.
*/
LabelsSource source = new LabelsSource("DOC_");
ParagraphVectors vec = new ParagraphVectors.Builder()
.minWordFrequency(1)
.iterations(100)
.epochs(1)
.layerSize(50)
.learningRate(0.02)
.labelsSource(source)
.windowSize(5)
.iterate(iter)
.trainWordVectors(true)
.vocabCache(cache)
.tokenizerFactory(t)
.sampling(0)
.build();
vec.fit();
WordVectorSerializer.writeParagraphVectors(vec, locationToSave);
}
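The saved model can later be restored the same way Example 8 does:

ParagraphVectors restored = WordVectorSerializer.readParagraphVectors(locationToSave);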
Example 10: getWord2Vec
import org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor; // import the required package/class
private void getWord2Vec() {
t.setTokenPreProcessor(new CommonPreprocessor());
for (Language language: languages.keySet()) {
List<String> sentences = getSentencesFromLanguage(language);
SentenceIterator iter = new CollectionSentenceIterator(PREPROCESSOR, sentences);
Word2Vec vec = new Word2Vec.Builder().elementsLearningAlgorithm(learningAlgorithm)
.minWordFrequency(6)
.iterations(15)
.layerSize(VEC_LENGTH)
.seed(42)
.windowSize(5)
.iterate(iter)
.tokenizerFactory(t)
.build();
vec.fit();
saveModel(vec, language);
languageWord2VecMap.put(language, vec);
}
}
Example 11: testWord2VecAdaGrad
import org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor; // import the required package/class
@Test
public void testWord2VecAdaGrad() throws Exception {
SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
TokenizerFactory t = new DefaultTokenizerFactory();
t.setTokenPreProcessor(new CommonPreprocessor());
Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(5).learningRate(0.025).layerSize(100)
.seed(42).batchSize(13500).sampling(0).negativeSample(0)
//.epochs(10)
.windowSize(5).modelUtils(new BasicModelUtils<VocabWord>()).useAdaGrad(false)
.useHierarchicSoftmax(true).iterate(iter).workers(4).tokenizerFactory(t).build();
vec.fit();
Collection<String> lst = vec.wordsNearest("day", 10);
log.info(Arrays.toString(lst.toArray()));
// assertEquals(10, lst.size());
double sim = vec.similarity("day", "night");
log.info("Day/night similarity: " + sim);
assertTrue(lst.contains("week"));
assertTrue(lst.contains("night"));
assertTrue(lst.contains("year"));
}
Example 12: testWord2VecCBOW
import org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor; // import the required package/class
@Test
public void testWord2VecCBOW() throws Exception {
SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
TokenizerFactory t = new DefaultTokenizerFactory();
t.setTokenPreProcessor(new CommonPreprocessor());
Word2Vec vec = new Word2Vec.Builder().minWordFrequency(1).iterations(5).learningRate(0.025).layerSize(150)
.seed(42).sampling(0).negativeSample(0).useHierarchicSoftmax(true).windowSize(5)
.modelUtils(new BasicModelUtils<VocabWord>()).useAdaGrad(false).iterate(iter).workers(8)
.tokenizerFactory(t).elementsLearningAlgorithm(new CBOW<VocabWord>()).build();
vec.fit();
Collection<String> lst = vec.wordsNearest("day", 10);
log.info(Arrays.toString(lst.toArray()));
// assertEquals(10, lst.size());
double sim = vec.similarity("day", "night");
log.info("Day/night similarity: " + sim);
assertTrue(lst.contains("week"));
assertTrue(lst.contains("night"));
assertTrue(lst.contains("year"));
assertTrue(sim > 0.65f);
}
Example 13: main
import org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor; // import the required package/class
public static void main(String[] args) throws Exception {
ClassPathResource srcFile = new ClassPathResource("/raw_sentences.txt");
File file = srcFile.getFile();
SentenceIterator iter = new BasicLineIterator(file);
TokenizerFactory tFact = new DefaultTokenizerFactory();
tFact.setTokenPreProcessor(new CommonPreprocessor());
LabelsSource labelFormat = new LabelsSource("LINE_");
ParagraphVectors vec = new ParagraphVectors.Builder()
.minWordFrequency(1)
.iterations(5)
.epochs(1)
.layerSize(100)
.learningRate(0.025)
.labelsSource(labelFormat)
.windowSize(5)
.iterate(iter)
.trainWordVectors(false)
.tokenizerFactory(tFact)
.sampling(0)
.build();
vec.fit();
double similar1 = vec.similarity("LINE_9835", "LINE_12492");
out.println("Comparing lines 9836 & 12493 ('This is my house .'/'This is my world .') Similarity = " + similar1);
double similar2 = vec.similarity("LINE_3720", "LINE_16392");
out.println("Comparing lines 3721 & 16393 ('This is my way .'/'This is my work .') Similarity = " + similar2);
double similar3 = vec.similarity("LINE_6347", "LINE_3720");
out.println("Comparing lines 6348 & 3721 ('This is my case .'/'This is my way .') Similarity = " + similar3);
double dissimilar1 = vec.similarity("LINE_3720", "LINE_9852");
out.println("Comparing lines 3721 & 9853 ('This is my way .'/'We now have one .') Similarity = " + dissimilar1);
double dissimilar2 = vec.similarity("LINE_3720", "LINE_3719");
out.println("Comparing lines 3721 & 3720 ('This is my way .'/'At first he says no .') Similarity = " + dissimilar2);
}
Author: PacktPublishing | Project: Machine-Learning-End-to-Endguide-for-Java-developers | Lines: 46 | Source: ClassifyBySimilarity.java
Example 14: main
import org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor; // import the required package/class
public static void main(String[] args) throws Exception {
ClassPathResource resource = new ClassPathResource("paravec/labeled");
iter = new FileLabelAwareIterator.Builder()
.addSourceFolder(resource.getFile())
.build();
tFact = new DefaultTokenizerFactory();
tFact.setTokenPreProcessor(new CommonPreprocessor());
pVect = new ParagraphVectors.Builder()
.learningRate(0.025)
.minLearningRate(0.001)
.batchSize(1000)
.epochs(20)
.iterate(iter)
.trainWordVectors(true)
.tokenizerFactory(tFact)
.build();
pVect.fit();
ClassPathResource unlabeledText = new ClassPathResource("paravec/unlabeled");
FileLabelAwareIterator unlabeledIter = new FileLabelAwareIterator.Builder()
.addSourceFolder(unlabeledText.getFile())
.build();
MeansBuilder mBuilder = new MeansBuilder(
(InMemoryLookupTable<VocabWord>) pVect.getLookupTable(),
tFact);
LabelSeeker lSeeker = new LabelSeeker(iter.getLabelsSource().getLabels(),
(InMemoryLookupTable<VocabWord>) pVect.getLookupTable());
while (unlabeledIter.hasNextDocument()) {
LabelledDocument doc = unlabeledIter.nextDocument();
INDArray docCentroid = mBuilder.documentAsVector(doc);
List<Pair<String, Double>> scores = lSeeker.getScores(docCentroid);
out.println("Document '" + doc.getLabel() + "' falls into the following categories: ");
for (Pair<String, Double> score : scores) {
out.println(" " + score.getFirst() + ": " + score.getSecond());
}
}
}
Author: PacktPublishing | Project: Machine-Learning-End-to-Endguide-for-Java-developers | Lines: 49 | Source: ParagraphVectorsClassifierExample.java
Example 15: getWordEmbeddings
import org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor; // import the required package/class
public List<double[]> getWordEmbeddings(String sentence, Language language) {
t.setTokenPreProcessor(new CommonPreprocessor());
List<String> tokens = t.create(sentence).getTokens();
double[] tfidf = new double[tokens.size()]; // note: these tf-idf weights are computed but never applied below (see the sketch after this example)
for (int i = 0; i < tfidf.length; i++) {
tfidf[i] = Utils.tfIdf(sentence, getSentencesFromLanguage(language), tokens.get(i));
}
Word2Vec loadedVec = languageWord2VecMap.get(language);
return tokens.stream().map(loadedVec::getWordVector).collect(Collectors.toList());
}
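The tf-idf weights computed above are never used; a minimal sketch of one way they could be applied, forming a tf-idf-weighted average of the word vectors as a single document embedding. This is an assumption about the intended design, not code from the original project:

// Hypothetical continuation inside getWordEmbeddings:
double[] docVec = new double[VEC_LENGTH];        // VEC_LENGTH is the layer size used in Example 10
for (int i = 0; i < tokens.size(); i++) {
    double[] wv = loadedVec.getWordVector(tokens.get(i));
    if (wv == null) continue;                    // skip out-of-vocabulary tokens
    for (int j = 0; j < wv.length; j++)
        docVec[j] += tfidf[i] * wv[j];           // weight each word vector by its tf-idf score
}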