本文整理汇总了Java中water.fvec.RebalanceDataSet类的典型用法代码示例。如果您正苦于以下问题:Java RebalanceDataSet类的具体用法?Java RebalanceDataSet怎么用?Java RebalanceDataSet使用的例子?那么, 这里精选的类代码示例或许可以为您提供帮助。
RebalanceDataSet类属于water.fvec包,在下文中一共展示了RebalanceDataSet类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: reBalance
import water.fvec.RebalanceDataSet; //导入依赖的package包/类
/**
* Rebalance a frame for load balancing
* @param fr Input frame
* @param local whether to only create enough chunks to max out all cores on one node only
* @return Frame that has potentially more chunks
*/
/**
 * Rebalance a frame for load balancing.
 *
 * Targets roughly 4 chunks per core (on one node, or across the whole cloud),
 * capped by the number of rows. When reproducibility is requested, the data is
 * forced into a single chunk so work is single-threaded.
 *
 * @param fr    Input frame
 * @param local whether to only create enough chunks to max out all cores on one node only
 * @param name  base name used to build the key of the rebalanced frame
 * @return Frame that has potentially more chunks
 */
private Frame reBalance(final Frame fr, boolean local, final String name) {
  final int coreTarget = 4 * H2O.NUMCPUS * (local ? 1 : H2O.CLOUD.size());
  int chunks = (int) Math.min(coreTarget, fr.numRows());
  final int current = fr.anyVec().nChunks();
  if (current > chunks && !_parms._reproducible) {
    // Already split finely enough — nothing to do.
    Log.info("Dataset already contains " + current + " chunks. No need to rebalance.");
    return fr;
  }
  if (_parms._reproducible) {
    Log.warn("Reproducibility enforced - using only 1 thread - can be slow.");
    chunks = 1;
  }
  if (!_parms._quiet_mode)
    Log.info("ReBalancing dataset into (at least) " + chunks + " chunks.");
  final Key newKey = Key.make(name + ".chunks" + chunks);
  final RebalanceDataSet task = new RebalanceDataSet(fr, newKey, chunks);
  H2O.submitTask(task);
  task.join();
  final Frame rebalanced = DKV.get(newKey).get();
  _delete_me.add(rebalanced); // register for cleanup by the caller's teardown
  return rebalanced;
}
示例2: reBalance
import water.fvec.RebalanceDataSet; //导入依赖的package包/类
/**
* Rebalance a frame for load balancing
* @param fr Input frame
* @param local whether to only create enough chunks to max out all cores on one node only
* @return Frame that has potentially more chunks
*/
/**
 * Rebalance a frame for load balancing.
 *
 * Aims for roughly 4 chunks per core (one node, or the full cloud), capped by
 * the row count. With reproducibility enabled the frame collapses to 1 chunk.
 *
 * @param fr    Input frame
 * @param local whether to only create enough chunks to max out all cores on one node only
 * @return Frame that has potentially more chunks
 */
private Frame reBalance(final Frame fr, boolean local) {
  final int coreTarget = 4 * H2O.NUMCPUS * (local ? 1 : H2O.CLOUD.size());
  int chunks = (int) Math.min(coreTarget, fr.numRows());
  final int current = fr.anyVec().nChunks();
  if (current > chunks && !reproducible) {
    // Frame is already split at least as finely as we want.
    Log.info("Dataset already contains " + current + " chunks. No need to rebalance.");
    return fr;
  }
  if (reproducible) {
    Log.warn("Reproducibility enforced - using only 1 thread - can be slow.");
    chunks = 1;
  }
  if (!quiet_mode)
    Log.info("ReBalancing dataset into (at least) " + chunks + " chunks.");
  // Derive a system key from the source frame's key when available.
  final String balancedName = fr._key != null ? (fr._key.toString() + ".balanced") : Key.rand();
  final Key newKey = Key.makeSystem(balancedName);
  final RebalanceDataSet task = new RebalanceDataSet(fr, newKey, chunks);
  H2O.submitTask(task);
  task.join();
  return UKV.get(newKey);
}
示例3: testTranspose
import water.fvec.RebalanceDataSet; //导入依赖的package包/类
/**
 * Parse a small GLM dataset, rebalance it to 64 chunks, transpose it with
 * {@code DMatrix.transpose}, and verify element-wise that tr[i][j] == fr[j][i].
 */
@Test
public void testTranspose(){
  Futures fs = new Futures();
  Key parsed = Key.make("prostate_parsed");
  Key modelKey = Key.make("prostate_model"); // NOTE(review): modelKey, model and f look unused in this test — confirm
  GLMModel model = null;
  File f = TestUtil.find_test_file("smalldata/glm_test/prostate_cat_replaced.csv");
  Frame fr = getFrameForFile(parsed, "smalldata/glm_test/prostate_cat_replaced.csv");
  fr.remove("RACE").remove(fs); // drop the RACE column before transposing
  Key k = Key.make("rebalanced");
  // Rebalance into 64 chunks so the transpose is exercised across many chunks.
  H2O.submitTask(new RebalanceDataSet(fr, k, 64)).join();
  fr.delete(); // delete the original; continue with the rebalanced copy
  fr = DKV.get(k).get();
  Frame tr = DMatrix.transpose(fr);
  tr.reloadVecs();
  // Every element of the transpose must match the source (within float tolerance).
  for(int i = 0; i < fr.numRows(); ++i)
    for(int j = 0; j < fr.numCols(); ++j)
      assertEquals(fr.vec(j).at(i),tr.vec(i).at(j),1e-4);
  // Clean up both frames; vecs of the transpose are removed individually.
  fr.delete();
  for(Vec v:tr.vecs())
    v.remove(fs);
  fs.blockForPending();
//  checkLeakedKeys();
}
示例4: reBalanceFrames
import water.fvec.RebalanceDataSet; //导入依赖的package包/类
/**
 * Rebalance every Frame currently in the distributed store into roughly
 * 4 chunks per core across the cloud (capped by row count), skipping frames
 * whose key indicates they are already the product of a rebalance.
 * Failures for individual frames are logged and do not abort the sweep.
 */
public void reBalanceFrames () {
  final Key [] keySet = H2O.KeySnapshot.globalSnapshot().keys();
  for (Key key : keySet) {
    final Value val = DKV.get(key);
    if (val == null || !val.isFrame()) continue; // only frames are rebalanced
    final Frame fr = val.get();
    // Skip frames that were themselves produced by a rebalance.
    if (!fr._key.toString().contains("balanced")) {
      final int splits = Math.min((int)fr.numRows(), 4*H2O.NUMCPUS*H2O.CLOUD.size());
      final String name = fr._key.toString() + ".rebalanced";
      Log.info("Load balancing frame under key '" + fr._key.toString() + "' into " + splits + " splits.");
      try {
        final Key frHexBalanced = Key.make(name);
        new RebalanceDataSet(fr, frHexBalanced, splits).invoke(); // run synchronously
      } catch(Exception ex) {
        // BUG FIX: was Log.err(ex.getMessage()) — getMessage() can be null and
        // drops the stack trace; log the exception itself instead.
        Log.err(ex);
      }
    }
  }
}
示例5: testPubDev928
import water.fvec.RebalanceDataSet; //导入依赖的package包/类
/** Load simple dataset, rebalance to a number of chunks > number of rows, and run deep learning */
@Test public void testPubDev928() {
  // Create rebalanced dataset
  Key rebalancedKey = Key.make("rebalanced");
  NFSFileVec nfs = NFSFileVec.make(find_test_file("smalldata/logreg/prostate.csv"));
  Frame fr = ParseDataset.parse(Key.make(), nfs._key);
  // Request one more chunk than there are rows, which guarantees at least one empty chunk.
  RebalanceDataSet rb = new RebalanceDataSet(fr, rebalancedKey, (int)(fr.numRows()+1));
  H2O.submitTask(rb);
  rb.join();
  Frame rebalanced = DKV.get(rebalancedKey).get();
  // Assert that there is at least one 0-len chunk
  assertZeroLengthChunk("Rebalanced dataset should contain at least one 0-len chunk!", rebalanced.anyVec());
  DeepLearningModel dlModel = null;
  try {
    // Launch Deep Learning — training must tolerate the empty chunk (PUBDEV-928).
    DeepLearningParameters dlParams = new DeepLearningParameters();
    dlParams._train = rebalancedKey;
    dlParams._epochs = 5;
    dlParams._response_column = "CAPSULE";
    dlModel = new DeepLearning(dlParams).trainModel().get();
  } finally {
    // Always clean up keys, even when training throws.
    fr.delete();
    rebalanced.delete();
    if (dlModel != null) dlModel.delete();
  }
}
示例6: testChunks
import water.fvec.RebalanceDataSet; //导入依赖的package包/类
/**
 * Verify the Aggregator produces the same number of exemplars regardless of
 * the chunk layout of the training frame: build a baseline model, then
 * rebalance the data into several chunk counts and compare exemplar counts.
 */
@Test public void testChunks() {
  Frame frame = parse_test_file("smalldata/covtype/covtype.20k.data");
  AggregatorModel.AggregatorParameters parms = new AggregatorModel.AggregatorParameters();
  parms._train = frame._key;
  parms._target_num_exemplars = 137;
  parms._rel_tol_num_exemplars = 0.05;
  long start = System.currentTimeMillis();
  AggregatorModel agg = new Aggregator(parms).trainModel().get(); // 0.418
  System.out.println("AggregatorModel finished in: " + (System.currentTimeMillis() - start)/1000. + " seconds");
  agg.checkConsistency();
  Frame output = agg._output._output_frame.get();
  checkNumExemplars(agg);
  output.remove();
  agg.remove();
  for (int i : new int[]{1,2,5,10,50,100}) {
    Key key = Key.make();
    RebalanceDataSet rb = new RebalanceDataSet(frame, key, i);
    H2O.submitTask(rb);
    rb.join();
    Frame rebalanced = DKV.get(key).get();
    parms = new AggregatorModel.AggregatorParameters();
    // NOTE(review): trains on the original frame, not `rebalanced` — confirm
    // whether parms._train = rebalanced._key was intended here.
    parms._train = frame._key;
    parms._target_num_exemplars = 137;
    parms._rel_tol_num_exemplars = 0.05;
    start = System.currentTimeMillis();
    AggregatorModel agg2 = new Aggregator(parms).trainModel().get(); // 0.373 0.504 0.357 0.454 0.368 0.355
    System.out.println("AggregatorModel finished in: " + (System.currentTimeMillis() - start)/1000. + " seconds");
    agg2.checkConsistency();
    Log.info("Number of exemplars for " + i + " chunks: " + agg2._exemplars.length);
    rebalanced.delete();
    // Exemplar count must not depend on the chunk layout.
    // (Idiomatic assertEquals instead of assertTrue(Math.abs(a-b) == 0).)
    Assert.assertEquals(agg._exemplars.length, agg2._exemplars.length);
    output = agg2._output._output_frame.get();
    output.remove();
    // BUG FIX: was checkNumExemplars(agg) — must validate the model built in
    // THIS iteration, not the already-removed baseline model.
    checkNumExemplars(agg2);
    agg2.remove();
  }
  frame.delete();
}
示例7: testPubDev928
import water.fvec.RebalanceDataSet; //导入依赖的package包/类
/** Load simple dataset, rebalance to a number of chunks > number of rows, and run deep learning */
@Test public void testPubDev928() {
// Create rebalanced dataset
Key rebalancedKey = Key.make("rebalanced");
NFSFileVec nfs = TestUtil.makeNfsFileVec("smalldata/logreg/prostate.csv");
Frame fr = ParseDataset.parse(Key.make(), nfs._key);
RebalanceDataSet rb = new RebalanceDataSet(fr, rebalancedKey, (int)(fr.numRows()+1));
H2O.submitTask(rb);
rb.join();
Frame rebalanced = DKV.get(rebalancedKey).get();
// Assert that there is at least one 0-len chunk
assertZeroLengthChunk("Rebalanced dataset should contain at least one 0-len chunk!", rebalanced.anyVec());
DeepLearningModel dlModel = null;
try {
// Launch Deep Learning
DeepLearningParameters dlParams = new DeepLearningParameters();
dlParams._train = rebalancedKey;
dlParams._epochs = 5;
dlParams._response_column = "CAPSULE";
dlModel = new DeepLearning(dlParams).trainModel().get();
} finally {
fr.delete();
rebalanced.delete();
if (dlModel != null) dlModel.delete();
}
}
示例8: serve
import water.fvec.RebalanceDataSet; //导入依赖的package包/类
/**
 * Handle the rebalance request: validate the source frame and requested chunk
 * count, run a synchronous RebalanceDataSet into the target key, and report
 * success or wrap any failure in an error response.
 */
@Override public RequestBuilders.Response serve() {
  if( source==null ) throw new IllegalArgumentException("Missing frame to rebalance!");
  try {
    final long rows = source.numRows();
    // More chunks than rows is impossible to honor.
    if (chunks > rows)
      throw new IllegalArgumentException("Cannot create more than " + rows + " chunks.");
    if (after == null)
      after = Key.make(source._key.toString() + ".balanced"); // default destination key
    final RebalanceDataSet task = new RebalanceDataSet(source, after, chunks);
    H2O.submitTask(task);
    task.join();
    return RequestBuilders.Response.done(this);
  } catch (Throwable t) {
    return RequestBuilders.Response.error(t);
  }
}
示例9: testReprodubility
import water.fvec.RebalanceDataSet; //导入依赖的package包/类
@Test public void testReprodubility() {
Frame tfr=null;
final int N = 5;
double[] mses = new double[N];
Scope.enter();
try {
// Load data, hack frames
tfr = parse_test_file("smalldata/covtype/covtype.20k.data");
// rebalance to 256 chunks
Key dest = Key.make("df.rebalanced.hex");
RebalanceDataSet rb = new RebalanceDataSet(tfr, dest, 256);
H2O.submitTask(rb);
rb.join();
tfr.delete();
tfr = DKV.get(dest).get();
// Scope.track(tfr.replace(54, tfr.vecs()[54].toCategoricalVec())._key);
// DKV.put(tfr);
for (int i=0; i<N; ++i) {
GBMModel.GBMParameters parms = new GBMModel.GBMParameters();
parms._train = tfr._key;
parms._response_column = "C55";
parms._nbins = 1000;
parms._ntrees = 1;
parms._max_depth = 8;
parms._learn_rate = 0.1f;
parms._min_rows = 10;
// parms._distribution = Family.multinomial;
parms._distribution = Distribution.Family.gaussian;
// Build a first model; all remaining models should be equal
GBM job = new GBM(parms);
GBMModel gbm = job.trainModel().get();
assertEquals(gbm._output._ntrees, parms._ntrees);
mses[i] = gbm._output._scored_train[gbm._output._scored_train.length-1]._mse;
job.remove();
gbm.delete();
}
} finally{
if (tfr != null) tfr.remove();
}
Scope.exit();
for( double mse : mses ) assertEquals(mse, mses[0], 1e-15);
}
示例10: testReprodubilityAirline
import water.fvec.RebalanceDataSet; //导入依赖的package包/类
@Test public void testReprodubilityAirline() {
Frame tfr=null;
final int N = 1;
double[] mses = new double[N];
Scope.enter();
try {
// Load data, hack frames
tfr = parse_test_file("./smalldata/airlines/allyears2k_headers.zip");
// rebalance to fixed number of chunks
Key dest = Key.make("df.rebalanced.hex");
RebalanceDataSet rb = new RebalanceDataSet(tfr, dest, 256);
H2O.submitTask(rb);
rb.join();
tfr.delete();
tfr = DKV.get(dest).get();
// Scope.track(tfr.replace(54, tfr.vecs()[54].toCategoricalVec())._key);
// DKV.put(tfr);
for (String s : new String[]{
"DepTime", "ArrTime", "ActualElapsedTime",
"AirTime", "ArrDelay", "DepDelay", "Cancelled",
"CancellationCode", "CarrierDelay", "WeatherDelay",
"NASDelay", "SecurityDelay", "LateAircraftDelay", "IsArrDelayed"
}) {
tfr.remove(s).remove();
}
DKV.put(tfr);
for (int i=0; i<N; ++i) {
GBMModel.GBMParameters parms = new GBMModel.GBMParameters();
parms._train = tfr._key;
parms._response_column = "IsDepDelayed";
parms._nbins = 10;
parms._nbins_cats = 500;
parms._ntrees = 7;
parms._max_depth = 5;
parms._min_rows = 10;
parms._distribution = Distribution.Family.bernoulli;
parms._balance_classes = true;
parms._seed = 0;
// Build a first model; all remaining models should be equal
GBM job = new GBM(parms);
GBMModel gbm = job.trainModel().get();
assertEquals(gbm._output._ntrees, parms._ntrees);
mses[i] = gbm._output._scored_train[gbm._output._scored_train.length-1]._mse;
job.remove();
gbm.delete();
}
} finally {
if (tfr != null) tfr.remove();
}
Scope.exit();
for( double mse : mses )
assertEquals(0.21979375165014595, mse, 1e-8); //check for the same result on 1 nodes and 5 nodes (will only work with enough chunks), mse, 1e-8); //check for the same result on 1 nodes and 5 nodes (will only work with enough chunks)
}
示例11: testReprodubilityAirlineSingleNode
import water.fvec.RebalanceDataSet; //导入依赖的package包/类
@Test public void testReprodubilityAirlineSingleNode() {
Frame tfr=null;
final int N = 1;
double[] mses = new double[N];
Scope.enter();
try {
// Load data, hack frames
tfr = parse_test_file("./smalldata/airlines/allyears2k_headers.zip");
// rebalance to fixed number of chunks
Key dest = Key.make("df.rebalanced.hex");
RebalanceDataSet rb = new RebalanceDataSet(tfr, dest, 256);
H2O.submitTask(rb);
rb.join();
tfr.delete();
tfr = DKV.get(dest).get();
// Scope.track(tfr.replace(54, tfr.vecs()[54].toCategoricalVec())._key);
// DKV.put(tfr);
for (String s : new String[]{
"DepTime", "ArrTime", "ActualElapsedTime",
"AirTime", "ArrDelay", "DepDelay", "Cancelled",
"CancellationCode", "CarrierDelay", "WeatherDelay",
"NASDelay", "SecurityDelay", "LateAircraftDelay", "IsArrDelayed"
}) {
tfr.remove(s).remove();
}
DKV.put(tfr);
for (int i=0; i<N; ++i) {
GBMModel.GBMParameters parms = new GBMModel.GBMParameters();
parms._train = tfr._key;
parms._response_column = "IsDepDelayed";
parms._nbins = 10;
parms._nbins_cats = 500;
parms._ntrees = 7;
parms._max_depth = 5;
parms._min_rows = 10;
parms._distribution = Distribution.Family.bernoulli;
parms._balance_classes = true;
parms._seed = 0;
parms._build_tree_one_node = true;
// Build a first model; all remaining models should be equal
GBM job = new GBM(parms);
GBMModel gbm = job.trainModel().get();
assertEquals(gbm._output._ntrees, parms._ntrees);
mses[i] = gbm._output._scored_train[gbm._output._scored_train.length-1]._mse;
job.remove();
gbm.delete();
}
} finally {
if (tfr != null) tfr.remove();
}
Scope.exit();
for( double mse : mses )
assertEquals(0.21979375165014595, mse, 1e-8); //check for the same result on 1 nodes and 5 nodes (will only work with enough chunks)
}
示例12: testReproducibility
import water.fvec.RebalanceDataSet; //导入依赖的package包/类
@Test public void testReproducibility() {
Frame tfr=null;
final int N = 5;
double[] mses = new double[N];
Scope.enter();
try {
// Load data, hack frames
tfr = parse_test_file("smalldata/covtype/covtype.20k.data");
// rebalance to 256 chunks
Key dest = Key.make("df.rebalanced.hex");
RebalanceDataSet rb = new RebalanceDataSet(tfr, dest, 256);
H2O.submitTask(rb);
rb.join();
tfr.delete();
tfr = DKV.get(dest).get();
// Scope.track(tfr.replace(54, tfr.vecs()[54].toCategoricalVec())._key);
// DKV.put(tfr);
for (int i=0; i<N; ++i) {
DRFModel.DRFParameters parms = new DRFModel.DRFParameters();
parms._train = tfr._key;
parms._response_column = "C55";
parms._nbins = 1000;
parms._ntrees = 1;
parms._max_depth = 8;
parms._mtries = -1;
parms._min_rows = 10;
parms._seed = 1234;
// Build a first model; all remaining models should be equal
DRF job = new DRF(parms);
DRFModel drf = job.trainModel().get();
assertEquals(drf._output._ntrees, parms._ntrees);
mses[i] = drf._output._scored_train[drf._output._scored_train.length-1]._mse;
job.remove();
drf.delete();
}
} finally{
if (tfr != null) tfr.remove();
}
Scope.exit();
for (int i=0; i<mses.length; ++i) {
Log.info("trial: " + i + " -> MSE: " + mses[i]);
}
for(double mse : mses)
assertEquals(mse, mses[0], 1e-15);
}
示例13: testReproducibilityAirline
import water.fvec.RebalanceDataSet; //导入依赖的package包/类
@Test public void testReproducibilityAirline() {
Frame tfr=null;
final int N = 1;
double[] mses = new double[N];
Scope.enter();
try {
// Load data, hack frames
tfr = parse_test_file("./smalldata/airlines/allyears2k_headers.zip");
// rebalance to fixed number of chunks
Key dest = Key.make("df.rebalanced.hex");
RebalanceDataSet rb = new RebalanceDataSet(tfr, dest, 256);
H2O.submitTask(rb);
rb.join();
tfr.delete();
tfr = DKV.get(dest).get();
// Scope.track(tfr.replace(54, tfr.vecs()[54].toCategoricalVec())._key);
// DKV.put(tfr);
for (String s : new String[]{
"DepTime", "ArrTime", "ActualElapsedTime",
"AirTime", "ArrDelay", "DepDelay", "Cancelled",
"CancellationCode", "CarrierDelay", "WeatherDelay",
"NASDelay", "SecurityDelay", "LateAircraftDelay", "IsArrDelayed"
}) {
tfr.remove(s).remove();
}
DKV.put(tfr);
for (int i=0; i<N; ++i) {
DRFModel.DRFParameters parms = new DRFModel.DRFParameters();
parms._train = tfr._key;
parms._response_column = "IsDepDelayed";
parms._nbins = 10;
parms._nbins_cats = 1024;
parms._ntrees = 7;
parms._max_depth = 10;
parms._binomial_double_trees = false;
parms._mtries = -1;
parms._min_rows = 1;
parms._sample_rate = 0.632f; // Simulated sampling with replacement
parms._balance_classes = true;
parms._seed = (1L<<32)|2;
// Build a first model; all remaining models should be equal
DRF job = new DRF(parms);
DRFModel drf = job.trainModel().get();
assertEquals(drf._output._ntrees, parms._ntrees);
mses[i] = drf._output._training_metrics.mse();
job.remove();
drf.delete();
}
} finally{
if (tfr != null) tfr.remove();
}
Scope.exit();
for (int i=0; i<mses.length; ++i) {
Log.info("trial: " + i + " -> MSE: " + mses[i]);
}
for (int i=0; i<mses.length; ++i) {
assertEquals(0.2148575516521361, mses[i], 1e-4); //check for the same result on 1 nodes and 5 nodes
}
}
示例14: testReprodubilityAirline
import water.fvec.RebalanceDataSet; // import the dependent package/class
/**
 * GBM reproducibility on the airline data (h2o-3 DistributionFamily API):
 * rebalance to a fixed 256 chunks, strip leakage columns, train N times with
 * a fixed seed, and check every training MSE against a known-good value.
 */
@Test public void testReprodubilityAirline() {
  Frame tfr=null;
  final int N = 5; // number of identical training runs to compare
  double[] mses = new double[N];
  Scope.enter();
  try {
    // Load data, hack frames
    tfr = parse_test_file("./smalldata/airlines/allyears2k_headers.zip");
    // rebalance to fixed number of chunks
    Key dest = Key.make("df.rebalanced.hex");
    RebalanceDataSet rb = new RebalanceDataSet(tfr, dest, 256);
    H2O.submitTask(rb);
    rb.join();
    tfr.delete(); // drop the original; continue with the rebalanced copy
    tfr = DKV.get(dest).get();
//    Scope.track(tfr.replace(54, tfr.vecs()[54].toCategoricalVec())._key);
//    DKV.put(tfr);
    // Remove columns that leak the response or are otherwise excluded.
    for (String s : new String[]{
        "DepTime", "ArrTime", "ActualElapsedTime",
        "AirTime", "ArrDelay", "DepDelay", "Cancelled",
        "CancellationCode", "CarrierDelay", "WeatherDelay",
        "NASDelay", "SecurityDelay", "LateAircraftDelay", "IsArrDelayed"
    }) {
      tfr.remove(s).remove();
    }
    DKV.put(tfr); // publish the modified frame back to the store
    for (int i=0; i<N; ++i) {
      GBMModel.GBMParameters parms = new GBMModel.GBMParameters();
      parms._train = tfr._key;
      parms._response_column = "IsDepDelayed";
      parms._nbins = 10;
      parms._nbins_cats = 500;
      parms._ntrees = 7;
      parms._max_depth = 5;
      parms._min_rows = 10;
      parms._distribution = DistributionFamily.bernoulli;
      parms._balance_classes = true;
      parms._seed = 0; // fixed seed for reproducibility
      // Build a first model; all remaining models should be equal
      GBMModel gbm = new GBM(parms).trainModel().get();
      assertEquals(gbm._output._ntrees, parms._ntrees);
      mses[i] = gbm._output._scored_train[gbm._output._scored_train.length-1]._mse;
      gbm.delete();
    }
  } finally {
    if (tfr != null) tfr.remove();
  }
  Scope.exit();
  System.out.println("MSEs start");
  for(double d:mses)
    System.out.println(d);
  System.out.println("MSEs End");
  System.out.flush();
  for( double mse : mses )
    assertEquals(0.21694215729861027, mse, 1e-8); //check for the same result on 1 nodes and 5 nodes (will only work with enough chunks)
}
示例15: testReprodubilityAirlineSingleNode
import water.fvec.RebalanceDataSet; // import the dependent package/class
/**
 * Same scenario as the airline GBM reproducibility test, but with tree
 * building forced onto a single node (_build_tree_one_node = true); all N
 * runs must reproduce the same golden training MSE.
 */
@Test public void testReprodubilityAirlineSingleNode() {
  Frame tfr=null;
  final int N = 10; // number of identical training runs to compare
  double[] mses = new double[N];
  Scope.enter();
  try {
    // Load data, hack frames
    tfr = parse_test_file("./smalldata/airlines/allyears2k_headers.zip");
    // rebalance to fixed number of chunks
    Key dest = Key.make("df.rebalanced.hex");
    RebalanceDataSet rb = new RebalanceDataSet(tfr, dest, 256);
    H2O.submitTask(rb);
    rb.join();
    tfr.delete(); // drop the original; continue with the rebalanced copy
    tfr = DKV.get(dest).get();
//    Scope.track(tfr.replace(54, tfr.vecs()[54].toCategoricalVec())._key);
//    DKV.put(tfr);
    // Remove columns that leak the response or are otherwise excluded.
    for (String s : new String[]{
        "DepTime", "ArrTime", "ActualElapsedTime",
        "AirTime", "ArrDelay", "DepDelay", "Cancelled",
        "CancellationCode", "CarrierDelay", "WeatherDelay",
        "NASDelay", "SecurityDelay", "LateAircraftDelay", "IsArrDelayed"
    }) {
      tfr.remove(s).remove();
    }
    DKV.put(tfr); // publish the modified frame back to the store
    for (int i=0; i<N; ++i) {
      GBMModel.GBMParameters parms = new GBMModel.GBMParameters();
      parms._train = tfr._key;
      parms._response_column = "IsDepDelayed";
      parms._nbins = 10;
      parms._nbins_cats = 500;
      parms._ntrees = 7;
      parms._max_depth = 5;
      parms._min_rows = 10;
      parms._distribution = DistributionFamily.bernoulli;
      parms._balance_classes = true;
      parms._seed = 0; // fixed seed for reproducibility
      parms._build_tree_one_node = true; // the single-node variant under test
      // Build a first model; all remaining models should be equal
      GBMModel gbm = new GBM(parms).trainModel().get();
      assertEquals(gbm._output._ntrees, parms._ntrees);
      mses[i] = gbm._output._scored_train[gbm._output._scored_train.length-1]._mse;
      gbm.delete();
    }
  } finally {
    if (tfr != null) tfr.remove();
  }
  Scope.exit();
  System.out.println("MSE");
  for(double d:mses)
    System.out.println(d);
  for( double mse : mses )
    assertEquals(0.21694215729861027, mse, 1e-8); //check for the same result on 1 nodes and 5 nodes (will only work with enough chunks)
}