This page collects typical usage examples of the Java class org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor. If you are unsure what CommonPreprocessor is for or how to use it, the curated examples below should help.
CommonPreprocessor belongs to the org.deeplearning4j.text.tokenization.tokenizer.preprocessor package. The following 15 code examples show the class in use, sorted by popularity by default.
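Before the examples, it may help to see what CommonPreprocessor itself does: it implements the TokenPreProcess interface and normalizes each token by stripping punctuation and lower-casing. A minimal sketch (the expected outputs in the comments are assumptions based on that behavior):

import org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor;

public class CommonPreprocessorDemo {
    public static void main(String[] args) {
        CommonPreprocessor pp = new CommonPreprocessor();
        // each token has punctuation stripped and is lower-cased
        System.out.println(pp.preProcess("Hello,"));  // expected: hello
        System.out.println(pp.preProcess("WORLD!"));  // expected: world
    }
}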
Example 1: testWord2VecPlot
import org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor; // import the required package/class
@Test
public void testWord2VecPlot() throws Exception {
File inputFile = new ClassPathResource("/big/raw_sentences.txt").getFile();
SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
TokenizerFactory t = new DefaultTokenizerFactory();
t.setTokenPreProcessor(new CommonPreprocessor());
Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(2).batchSize(1000).learningRate(0.025)
.layerSize(100).seed(42).sampling(0).negativeSample(0).windowSize(5)
.modelUtils(new BasicModelUtils<VocabWord>()).useAdaGrad(false).iterate(iter).workers(10)
.tokenizerFactory(t).build();
vec.fit();
// UiConnectionInfo connectionInfo = UiServer.getInstance().getConnectionInfo();
// vec.getLookupTable().plotVocab(100, connectionInfo);
Thread.sleep(10000000000L); // keep the process alive so the (commented-out) UI plot could be inspected manually
fail("Not implemented");
}
Example 2: main
import org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor; // import the required package/class
public static void main(String[] args) throws Exception {
// Path to the input text file
String filePath = "c:/raw_sentences.txt";
log.info("Load & Vectorize Sentences....");
// Strip white space before and after for each line
SentenceIterator iter = UimaSentenceIterator.createWithPath(filePath);
// Split on white spaces in the line to get words
TokenizerFactory t = new DefaultTokenizerFactory();
t.setTokenPreProcessor(new CommonPreprocessor());
InMemoryLookupCache cache = new InMemoryLookupCache();
WeightLookupTable table = new InMemoryLookupTable.Builder()
.vectorLength(100)
.useAdaGrad(false)
.cache(cache)
.lr(0.025f).build();
log.info("Building model....");
Word2Vec vec = new Word2Vec.Builder()
.minWordFrequency(5).iterations(1)
.layerSize(100).lookupTable(table)
.stopWords(new ArrayList<String>())
.vocabCache(cache).seed(42)
.windowSize(5).iterate(iter).tokenizerFactory(t).build();
log.info("Fitting Word2Vec model....");
vec.fit();
log.info("Writing word vectors to text file....");
// Write the word vectors to a text file
WordVectorSerializer.writeWordVectors(vec, "word2vec.txt");
log.info("Closest Words:");
Collection<String> lst = vec.wordsNearest("man", 5);
System.out.println(lst);
double cosSim = vec.similarity("cruise", "voyage");
System.out.println(cosSim);
}
Example 3: SentimentExampleIterator
import org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor; // import the required package/class
/**
* @param dataDirectory the directory of the IMDB review data set
* @param wordVectors WordVectors object
* @param batchSize Size of each minibatch for training
* @param truncateLength If reviews exceed this length (in tokens), they are truncated to it
* @param train If true: return the training data. If false: return the testing data.
*/
public SentimentExampleIterator(String dataDirectory, WordVectors wordVectors, int batchSize, int truncateLength, boolean train) throws IOException {
this.batchSize = batchSize;
this.vectorSize = wordVectors.getWordVector(wordVectors.vocab().wordAtIndex(0)).length;
File p = new File(FilenameUtils.concat(dataDirectory, "aclImdb/" + (train ? "train" : "test") + "/pos/") + "/");
File n = new File(FilenameUtils.concat(dataDirectory, "aclImdb/" + (train ? "train" : "test") + "/neg/") + "/");
positiveFiles = p.listFiles();
negativeFiles = n.listFiles();
this.wordVectors = wordVectors;
this.truncateLength = truncateLength;
tokenizerFactory = new DefaultTokenizerFactory();
tokenizerFactory.setTokenPreProcessor(new CommonPreprocessor());
}
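A usage sketch for this constructor follows; the data directory, pretrained-vector file, batch size (64), and truncation length (256) are illustrative assumptions, not values from the original example:

// Hypothetical usage: load pretrained word vectors, then build train and test iterators
WordVectors wordVectors = WordVectorSerializer.readWord2VecModel(new File("/path/to/GoogleNews-vectors-negative300.bin.gz"));
SentimentExampleIterator trainIter = new SentimentExampleIterator("/path/to/data", wordVectors, 64, 256, true);
SentimentExampleIterator testIter = new SentimentExampleIterator("/path/to/data", wordVectors, 64, 256, false);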
Example 4: makeParagraphVectors
import org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor; // import the required package/class
void makeParagraphVectors() throws Exception {
// build an iterator over our dataset
File dir = TYPE_LEARNING_DIR;
dir.mkdirs();
iterator = new FileLabelAwareIterator.Builder()
.addSourceFolder(new File(dir, "corpus"))
.build();
tokenizerFactory = new DefaultTokenizerFactory();
tokenizerFactory.setTokenPreProcessor(new CommonPreprocessor());
// ParagraphVectors training configuration
paragraphVectors = new ParagraphVectors.Builder()
.learningRate(0.025)
.minLearningRate(0.001)
.batchSize(1000)
.epochs(5)
.iterate(iterator)
.trainWordVectors(true)
.tokenizerFactory(tokenizerFactory)
.build();
// Start model training
paragraphVectors.fit();
}
Example 5: testWord2VecMultiEpoch
import org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor; // import the required package/class
@Test
public void testWord2VecMultiEpoch() throws Exception {
SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
TokenizerFactory t = new DefaultTokenizerFactory();
t.setTokenPreProcessor(new CommonPreprocessor());
Word2Vec vec = new Word2Vec.Builder().minWordFrequency(1).iterations(5).learningRate(0.025).layerSize(150)
.seed(42).sampling(0).negativeSample(0).useHierarchicSoftmax(true).windowSize(5).epochs(3)
.modelUtils(new BasicModelUtils<VocabWord>()).useAdaGrad(false).iterate(iter).workers(8)
.tokenizerFactory(t).elementsLearningAlgorithm(new CBOW<VocabWord>()).build();
vec.fit();
Collection<String> lst = vec.wordsNearest("day", 10);
log.info(Arrays.toString(lst.toArray()));
// assertEquals(10, lst.size());
double sim = vec.similarity("day", "night");
log.info("Day/night similarity: " + sim);
assertTrue(lst.contains("week"));
assertTrue(lst.contains("night"));
assertTrue(lst.contains("year"));
}
Example 6: testWord2VecGoogleModelUptraining
import org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor; // import the required package/class
@Ignore
@Test
public void testWord2VecGoogleModelUptraining() throws Exception {
long time1 = System.currentTimeMillis();
Word2Vec vec = WordVectorSerializer.readWord2VecModel(
new File("C:\\Users\\raver\\Downloads\\GoogleNews-vectors-negative300.bin.gz"), false);
long time2 = System.currentTimeMillis();
log.info("Model loaded in {} msec", time2 - time1);
SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
// Split on white spaces in the line to get words
TokenizerFactory t = new DefaultTokenizerFactory();
t.setTokenPreProcessor(new CommonPreprocessor());
vec.setTokenizerFactory(t);
vec.setSentenceIterator(iter);
vec.getConfiguration().setUseHierarchicSoftmax(false);
vec.getConfiguration().setNegative(5.0);
vec.setElementsLearningAlgorithm(new CBOW<VocabWord>());
vec.fit();
}
Example 7: testGoogleModelForInference
import org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor; // import the required package/class
@Ignore
@Test
public void testGoogleModelForInference() throws Exception {
WordVectors googleVectors = WordVectorSerializer.loadGoogleModelNonNormalized(
new File("/ext/GoogleNews-vectors-negative300.bin.gz"), true, false);
TokenizerFactory t = new DefaultTokenizerFactory();
t.setTokenPreProcessor(new CommonPreprocessor());
ParagraphVectors pv =
new ParagraphVectors.Builder().tokenizerFactory(t).iterations(10).useHierarchicSoftmax(false)
.trainWordVectors(false).iterations(10).useExistingWordVectors(googleVectors)
.negativeSample(10).sequenceLearningAlgorithm(new DM<VocabWord>()).build();
INDArray vec1 = pv.inferVector("This text is pretty awesome");
INDArray vec2 = pv.inferVector("Fantastic process of crazy things happening inside just for history purposes");
log.info("vec1/vec2: {}", Transforms.cosineSim(vec1, vec2));
}
Example 8: loadParagraphVectors
import org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor; // import the required package/class
private static ParagraphVectors loadParagraphVectors() {
ParagraphVectors paragraphVectors = null;
try {
paragraphVectors = WordVectorSerializer.readParagraphVectors(PARAGRAPHVECTORMODELPATH);
TokenizerFactory t = new DefaultTokenizerFactory();
t.setTokenPreProcessor(new CommonPreprocessor());
paragraphVectors.setTokenizerFactory(t);
paragraphVectors.getConfiguration().setIterations(10); // note: iterations controls how many passes are run per inferred document; lower it to speed up inference
} catch (IOException e) {
e.printStackTrace();
}
return paragraphVectors;
}
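Once restored, the model can embed unseen text with inferVector (the same call Example 7 uses); a hypothetical follow-up:

ParagraphVectors pv = loadParagraphVectors();
INDArray docVector = pv.inferVector("some unseen document text"); // hypothetical input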
Example 9: trainParagraghVecModel
import org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor; // import the required package/class
public void trainParagraghVecModel(String locationToSave) throws FileNotFoundException {
ClassPathResource resource = new ClassPathResource("/paragraphVectors/paragraphVectorTraining.txt");
File file = resource.getFile();
SentenceIterator iter = new BasicLineIterator(file);
AbstractCache<VocabWord> cache = new AbstractCache<VocabWord>();
TokenizerFactory t = new DefaultTokenizerFactory();
t.setTokenPreProcessor(new CommonPreprocessor());
/*
If you don't have a LabelAwareIterator handy, you can use this synchronized labels generator;
it will label each document/sequence/line with its own generated label.
But if you do have a LabelAwareIterator ready, you can provide it to use your in-house labels.
*/
LabelsSource source = new LabelsSource("DOC_");
ParagraphVectors vec = new ParagraphVectors.Builder()
.minWordFrequency(1)
.iterations(100)
.epochs(1)
.layerSize(50)
.learningRate(0.02)
.labelsSource(source)
.windowSize(5)
.iterate(iter)
.trainWordVectors(true)
.vocabCache(cache)
.tokenizerFactory(t)
.sampling(0)
.build();
vec.fit();
WordVectorSerializer.writeParagraphVectors(vec, locationToSave);
}
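The saved model can later be restored the same way Example 8 does:

ParagraphVectors restored = WordVectorSerializer.readParagraphVectors(locationToSave);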
Example 10: getWord2Vec
import org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor; // import the required package/class
private void getWord2Vec() {
t.setTokenPreProcessor(new CommonPreprocessor());
for (Language language: languages.keySet()) {
List<String> sentences = getSentencesFromLanguage(language);
SentenceIterator iter = new CollectionSentenceIterator(PREPROCESSOR, sentences);
Word2Vec vec = new Word2Vec.Builder().elementsLearningAlgorithm(learningAlgorithm)
.minWordFrequency(6)
.iterations(15)
.layerSize(VEC_LENGTH)
.seed(42)
.windowSize(5)
.iterate(iter)
.tokenizerFactory(t)
.build();
vec.fit();
saveModel(vec, language);
languageWord2VecMap.put(language, vec);
}
}
Example 11: testWord2VecAdaGrad
import org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor; // import the required package/class
@Test
public void testWord2VecAdaGrad() throws Exception {
SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
TokenizerFactory t = new DefaultTokenizerFactory();
t.setTokenPreProcessor(new CommonPreprocessor());
Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(5).learningRate(0.025).layerSize(100)
.seed(42).batchSize(13500).sampling(0).negativeSample(0)
//.epochs(10)
.windowSize(5).modelUtils(new BasicModelUtils<VocabWord>()).useAdaGrad(false)
.useHierarchicSoftmax(true).iterate(iter).workers(4).tokenizerFactory(t).build();
vec.fit();
Collection<String> lst = vec.wordsNearest("day", 10);
log.info(Arrays.toString(lst.toArray()));
// assertEquals(10, lst.size());
double sim = vec.similarity("day", "night");
log.info("Day/night similarity: " + sim);
assertTrue(lst.contains("week"));
assertTrue(lst.contains("night"));
assertTrue(lst.contains("year"));
}
Example 12: testWord2VecCBOW
import org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor; // import the required package/class
@Test
public void testWord2VecCBOW() throws Exception {
SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
TokenizerFactory t = new DefaultTokenizerFactory();
t.setTokenPreProcessor(new CommonPreprocessor());
Word2Vec vec = new Word2Vec.Builder().minWordFrequency(1).iterations(5).learningRate(0.025).layerSize(150)
.seed(42).sampling(0).negativeSample(0).useHierarchicSoftmax(true).windowSize(5)
.modelUtils(new BasicModelUtils<VocabWord>()).useAdaGrad(false).iterate(iter).workers(8)
.tokenizerFactory(t).elementsLearningAlgorithm(new CBOW<VocabWord>()).build();
vec.fit();
Collection<String> lst = vec.wordsNearest("day", 10);
log.info(Arrays.toString(lst.toArray()));
// assertEquals(10, lst.size());
double sim = vec.similarity("day", "night");
log.info("Day/night similarity: " + sim);
assertTrue(lst.contains("week"));
assertTrue(lst.contains("night"));
assertTrue(lst.contains("year"));
assertTrue(sim > 0.65f);
}
Example 13: main
import org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor; // import the required package/class
public static void main(String[] args) throws Exception {
ClassPathResource srcFile = new ClassPathResource("/raw_sentences.txt");
File file = srcFile.getFile();
SentenceIterator iter = new BasicLineIterator(file);
TokenizerFactory tFact = new DefaultTokenizerFactory();
tFact.setTokenPreProcessor(new CommonPreprocessor());
LabelsSource labelFormat = new LabelsSource("LINE_");
ParagraphVectors vec = new ParagraphVectors.Builder()
.minWordFrequency(1)
.iterations(5)
.epochs(1)
.layerSize(100)
.learningRate(0.025)
.labelsSource(labelFormat)
.windowSize(5)
.iterate(iter)
.trainWordVectors(false)
.tokenizerFactory(tFact)
.sampling(0)
.build();
vec.fit();
double similar1 = vec.similarity("LINE_9835", "LINE_12492");
out.println("Comparing lines 9836 & 12493 ('This is my house .'/'This is my world .') Similarity = " + similar1);
double similar2 = vec.similarity("LINE_3720", "LINE_16392");
out.println("Comparing lines 3721 & 16393 ('This is my way .'/'This is my work .') Similarity = " + similar2);
double similar3 = vec.similarity("LINE_6347", "LINE_3720");
out.println("Comparing lines 6348 & 3721 ('This is my case .'/'This is my way .') Similarity = " + similar3);
double dissimilar1 = vec.similarity("LINE_3720", "LINE_9852");
out.println("Comparing lines 3721 & 9853 ('This is my way .'/'We now have one .') Similarity = " + dissimilar1);
double dissimilar2 = vec.similarity("LINE_3720", "LINE_3719");
out.println("Comparing lines 3721 & 3720 ('This is my way .'/'At first he says no .') Similarity = " + dissimilar2);
}
Author: PacktPublishing | Project: Machine-Learning-End-to-Endguide-for-Java-developers | Lines: 46 | Source: ClassifyBySimilarity.java
Example 14: main
import org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor; // import the required package/class
public static void main(String[] args) throws Exception {
ClassPathResource resource = new ClassPathResource("paravec/labeled");
iter = new FileLabelAwareIterator.Builder()
.addSourceFolder(resource.getFile())
.build();
tFact = new DefaultTokenizerFactory();
tFact.setTokenPreProcessor(new CommonPreprocessor());
pVect = new ParagraphVectors.Builder()
.learningRate(0.025)
.minLearningRate(0.001)
.batchSize(1000)
.epochs(20)
.iterate(iter)
.trainWordVectors(true)
.tokenizerFactory(tFact)
.build();
pVect.fit();
ClassPathResource unlabeledText = new ClassPathResource("paravec/unlabeled");
FileLabelAwareIterator unlabeledIter = new FileLabelAwareIterator.Builder()
.addSourceFolder(unlabeledText.getFile())
.build();
MeansBuilder mBuilder = new MeansBuilder(
(InMemoryLookupTable<VocabWord>) pVect.getLookupTable(),
tFact);
LabelSeeker lSeeker = new LabelSeeker(iter.getLabelsSource().getLabels(),
(InMemoryLookupTable<VocabWord>) pVect.getLookupTable());
while (unlabeledIter.hasNextDocument()) {
LabelledDocument doc = unlabeledIter.nextDocument();
INDArray docCentroid = mBuilder.documentAsVector(doc);
List<Pair<String, Double>> scores = lSeeker.getScores(docCentroid);
out.println("Document '" + doc.getLabel() + "' falls into the following categories: ");
for (Pair<String, Double> score : scores) {
out.println(" " + score.getFirst() + ": " + score.getSecond());
}
}
}
Author: PacktPublishing | Project: Machine-Learning-End-to-Endguide-for-Java-developers | Lines: 49 | Source: ParagraphVectorsClassifierExample.java
Example 15: getWordEmbeddings
import org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor; // import the required package/class
public List<double[]> getWordEmbeddings(String sentence, Language language) {
t.setTokenPreProcessor(new CommonPreprocessor());
List<String> tokens = t.create(sentence).getTokens();
double[] tfidf = new double[tokens.size()]; // note: these tf-idf weights are computed but never applied below (see the sketch after this example)
for (int i = 0; i < tfidf.length; i++) {
tfidf[i] = Utils.tfIdf(sentence, getSentencesFromLanguage(language), tokens.get(i));
}
Word2Vec loadedVec = languageWord2VecMap.get(language);
return tokens.stream().map(loadedVec::getWordVector).collect(Collectors.toList());
}
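The tf-idf weights computed above are never used; a minimal sketch of one way they could be applied, forming a tf-idf-weighted average of the word vectors as a single document embedding. This is an assumption about the intended design, not code from the original project:

// Hypothetical continuation inside getWordEmbeddings:
double[] docVec = new double[VEC_LENGTH];        // VEC_LENGTH is the layer size used in Example 10
for (int i = 0; i < tokens.size(); i++) {
    double[] wv = loadedVec.getWordVector(tokens.get(i));
    if (wv == null) continue;                    // skip out-of-vocabulary tokens
    for (int j = 0; j < wv.length; j++)
        docVec[j] += tfidf[i] * wv[j];           // weight each word vector by its tf-idf score
}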