本文整理汇总了 Python 中 tests.pyunit_utils.locate 函数的典型用法代码示例。如果您正苦于以下问题：Python locate 函数的具体用法？Python locate 怎么用？Python locate 使用的例子？那么恭喜您，这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了 locate 函数的 15 个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的 Python 代码示例。
示例1: wide_dataset_large
def wide_dataset_large():
    """Fit a binomial GLM on the Arcene data and check its validation AUC.

    Label values of -1 are recoded to 0 and every other value to 1; the
    recoded label column is then stacked in front of the feature columns
    before the data is handed to H2O.
    """

    def _arcene_frame(labels_file, features_file):
        # Column 0 of the resulting frame is the recoded response.
        labels = np.genfromtxt(pyunit_utils.locate(labels_file), delimiter=' ')
        labels = np.where(labels == -1, 0, 1)
        features = np.genfromtxt(pyunit_utils.locate(features_file), delimiter=' ')
        return h2o.H2OFrame(np.column_stack((labels, features)).tolist())

    print("Reading in Arcene training data for binomial modeling.")
    train_frame = _arcene_frame("smalldata/arcene/arcene_train_labels.labels",
                                "smalldata/arcene/arcene_train.data")

    print("Run model on 3250 columns of Arcene with strong rules off.")
    predictors = train_frame[1:3250]
    response = train_frame[0].asfactor()
    model = h2o.glm(x=predictors, y=response, family="binomial",
                    lambda_search=False, alpha=[1])

    print("Test model on validation set.")
    valid_frame = _arcene_frame("smalldata/arcene/arcene_valid_labels.labels",
                                "smalldata/arcene/arcene_valid.data")
    # The prediction frame itself is not inspected; the call only has to succeed.
    model.predict(valid_frame)

    print("Check performance of predictions.")
    performance = model.model_performance(valid_frame)

    print("Check that prediction AUC better than guessing (0.5).")
    assert performance.auc() > 0.5, "predictions should be better then pure chance"
示例2: fiftycatRF
def fiftycatRF():
    """Train a random forest on 50_cattest_train.csv and score 50_cattest_test.csv.

    According to the original notes the training file only contains the
    categories cat1..cat45 while the test file contains cat1..cat50, so
    scoring meets categorical levels that were never seen during training.
    """
    train_path = pyunit_utils.locate("smalldata/gbm_test/50_cattest_train.csv")
    train = h2o.import_file(path=train_path)
    # The response has to be categorical for a classification forest.
    train["y"] = train["y"].asfactor()

    # DRF settings: 50 trees, depth 20, 500 bins.
    model = h2o.random_forest(x=train[["x1", "x2"]], y=train["y"],
                              ntrees=50, max_depth=20, nbins=500)

    test_path = pyunit_utils.locate("smalldata/gbm_test/50_cattest_test.csv")
    test = h2o.import_file(path=test_path)

    # Score the test set and peek at the first predictions.
    scored = model.predict(test)
    scored.head()

    # Report the performance metrics and the confusion matrix.
    metrics = model.model_performance(test)
    metrics.show()
    print(metrics.confusion_matrix())
示例3: wide_dataset_large
def wide_dataset_large():
    """Fit a binomial GLM estimator on the Arcene data and check validation AUC.

    Label values of -1 are recoded to 0 and every other value to 1. The frame
    is built column-wise: the label list first, followed by one list per
    feature column (the feature matrix is transposed for that purpose).

    Raises:
        AssertionError: if the validation AUC is not above 0.5.
    """

    def _arcene_frame(labels_file, features_file):
        # Returns an H2OFrame whose column 0 is the recoded response.
        labels = np.genfromtxt(pyunit_utils.locate(labels_file), delimiter=' ')
        labels = np.where(labels == -1, 0, 1)
        features = np.genfromtxt(pyunit_utils.locate(features_file), delimiter=' ')
        feature_columns = np.transpose(features).tolist()
        return h2o.H2OFrame.fromPython([labels.tolist()] + feature_columns)

    print("Reading in Arcene training data for binomial modeling.")
    trainData = _arcene_frame("smalldata/arcene/arcene_train_labels.labels",
                              "smalldata/arcene/arcene_train.data")
    # Binomial family needs a categorical response.
    trainData[0] = trainData[0].asfactor()

    print("Run model on 3250 columns of Arcene with strong rules off.")
    model = H2OGeneralizedLinearEstimator(family="binomial", lambda_search=False, alpha=1)
    # Pass a concrete list of column indices: on Python 3 a bare range() is a
    # lazy object, and the other tests in this file materialise it as well.
    model.train(x=list(range(1, 3250)), y=0, training_frame=trainData)

    print("Test model on validation set.")
    validData = _arcene_frame("smalldata/arcene/arcene_valid_labels.labels",
                              "smalldata/arcene/arcene_valid.data")
    # The prediction frame itself is not inspected; the call only has to succeed.
    model.predict(validData)

    print("Check performance of predictions.")
    performance = model.model_performance(validData)

    print("Check that prediction AUC better than guessing (0.5).")
    assert performance.auc() > 0.5, "predictions should be better then pure chance"
示例4: user
def user():
    """Walk through basic H2OFrame usage on the first four iris columns.

    Exercises indexing, slicing, arithmetic between frames and constants,
    column means, a lookup with a misspelt column name (expected to fail),
    and construction of a small frame from an ordered mapping.
    """
    a = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))[0:4]
    a.head()
    print(a[0].names)  # Column header
    print(a[2, 0])  # column 0, row 2 value
    print(a[2, "sepal_len"])  # Column 0, row 2 value
    (a[0] + 2).show()  # Add 2 to every element; broadcast a constant
    (a[0] + a[1]).show()  # Add 2 columns; broadcast parallel add
    sum(a).show()
    print(a["sepal_len"].mean())

    print()
    print("Rows 50 through 77 in the `sepal_len` column")
    a[50:78, "sepal_len"].show()  # print out rows 50 thru 77 inclusive
    print()

    a["sepal_len"].show()
    print(a[50:78, ["sepal_len", "sepal_wid"]].show())
    a.show()

    print("The column means: ")
    print(a.mean())
    print()

    try:
        print(a["Sepal_len"].dim)  # Error, misspelt column name
    except Exception:
        pass  # Expected error

    b = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))[0:4]
    c = a + b
    # "d" is intentionally never read: per the notes below it exists to keep
    # the expression behind "c" referenced after "c" itself is dropped.
    d = c + c + sum(a)
    e = c + a + 1
    e.show()
    # Note that "d=c+..." keeps the internal C expressions alive, until "d" goes
    # out of scope even as we nuke "c"
    c.show()
    c = None
    # Internal "ExprNode(c=a+b)" not dead!

    print(1 + (a[0] + b[1]).mean())

    import collections

    # Build the OrderedDict from ordered pairs. Feeding it a dict literal
    # would inherit whatever order the plain dict has, which is only
    # guaranteed to be insertion order on Python 3.7+.
    c = h2o.H2OFrame(collections.OrderedDict([("A", [1, 2, 3]), ("B", [4, 5, 6])]))
    c.show()

    c.describe()
    c.head()

    c[0].show()
    print(c[1, 0])
    c[0:2, 0].show()

    sliced = a[0:51, 0]
    sliced.show()
示例5: checkpoint_new_category_in_predictor
def checkpoint_new_category_in_predictor():
    """Check checkpoint restarts when a categorical predictor gains new levels.

    Two models are chained through a checkpoint on the setosa/versicolor data.
    Continuing the first model on the virginica data must fail, while scoring
    the virginica data with the second model is attempted afterwards.
    """
    setosa_versicolor_1 = h2o.upload_file(pyunit_utils.locate("smalldata/iris/setosa_versicolor.csv"))
    setosa_versicolor_2 = h2o.upload_file(pyunit_utils.locate("smalldata/iris/setosa_versicolor.csv"))
    virginica = h2o.upload_file(pyunit_utils.locate("smalldata/iris/virginica.csv"))
    predictor_cols = [0, 1, 2, 4]
    response_col = 3

    print("checkpoint_new_category_in_predictor-1")
    base_model = H2ODeepLearningEstimator(epochs=100)
    base_model.train(x=predictor_cols, y=response_col, training_frame=setosa_versicolor_1)

    continued_model = H2ODeepLearningEstimator(epochs=200, checkpoint=base_model.model_id)
    continued_model.train(x=predictor_cols, y=response_col, training_frame=setosa_versicolor_2)

    print("checkpoint_new_category_in_predictor-2")
    # Resuming from the checkpoint with an expanded categorical predictor
    # domain is expected to be rejected.
    try:
        rejected_model = H2ODeepLearningEstimator(epochs=200, checkpoint=base_model.model_id)
        rejected_model.train(x=predictor_cols, y=response_col, training_frame=virginica)
    except EnvironmentError:
        pass
    else:
        assert False, "Expected continued model-building to fail with new categories introduced in predictor"

    print("checkpoint_new_category_in_predictor-3")
    # Scoring observations with the expanded categorical predictor domain
    # is exercised on the successfully continued model.
    continued_model.predict(virginica)

    print("checkpoint_new_category_in_predictor-4")
示例6: xgboost_insurance_gaussian_small
def xgboost_insurance_gaussian_small():
    """Compare 2-tree and 10-tree gaussian XGBoost models on the insurance data.

    Each model must return one prediction per test row, and the model with
    fewer trees must have the higher mean squared error.
    """
    assert H2OXGBoostEstimator.available()

    training_frame = h2o.import_file(pyunit_utils.locate("smalldata/testng/insurance_train1.csv"))
    test_frame = h2o.import_file(pyunit_utils.locate("smalldata/testng/insurance_validation1.csv"))
    x = ['Age', 'District']
    y = 'Claims'

    def _fit_and_check(tree_count):
        # Train one model and verify the prediction row count before returning it.
        estimator = H2OXGBoostEstimator(training_frame=training_frame, learn_rate=0.7,
                                        booster='gbtree', seed=1, ntrees=tree_count,
                                        distribution='gaussian')
        estimator.train(x=x, y=y, training_frame=training_frame)
        scored = estimator.predict(test_frame)
        assert scored.nrows == test_frame.nrows
        return estimator

    small_model = _fit_and_check(2)
    large_model = _fit_and_check(10)

    # Fewer trees should leave a larger training error.
    assert small_model.mse() > large_model.mse()
示例7: table_check
def table_check():
    """Exercise H2OFrame.table() on the prostate, iris and cars datasets.

    Prints dense and non-dense two-column tables, checks the iris class
    counts, compares one- and two-argument table forms, and cross-checks
    the counts of one cars column against a Counter built locally.
    """
    prostate = h2o.import_file(path=pyunit_utils.locate("smalldata/prostate/prostate.csv"))
    for column_pair in (['AGE', 'RACE'], ['RACE', 'AGE']):
        for dense_flag in (True, False):
            print(prostate[column_pair].table(dense=dense_flag).head().as_data_frame(True))

    iris = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris.csv"))

    # Single column: every one of the three classes must be counted 50 times.
    class_counts = iris["C5"].table()
    for row in range(3):
        assert class_counts[row, 1] == 50, \
            "Expected 50 of {0}, but got {1}".format(class_counts[row, 0], class_counts[row, 1])

    # Two columns, passing the second column as an argument.
    dense_table = iris["C1"].table(iris["C5"])
    wide_table = iris["C1"].table(iris["C5"], dense=False)

    # Both layouts must report the same count for C1 == 5 / Iris-setosa.
    wide_value = wide_table[wide_table['C1'] == 5, 'Iris-setosa']
    dense_value = dense_table[(dense_table['C1'] == 5) & (dense_table['C5'] == 'Iris-setosa'), 'Counts']
    assert (wide_value == dense_value).all()

    # The argument form must agree with tabulating a two-column frame.
    assert (dense_table == iris[["C1", "C5"]].table()).all()
    assert (wide_table == iris[["C1", "C5"]].table(dense=False)).all()

    cars = h2o.import_file(path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    # Row 0 of the exported data is skipped (presumably the header row).
    exported = cars[2].table().as_data_frame()
    counts = {key: int(value) for key, value in dict(exported[1:]).items()}
    expected = Counter(itertools.chain(*cars[2].as_data_frame()[1:]))
    assert counts == expected, "Expected {} for table counts but got {}".format(expected, counts)
示例8: smallcat_gbm
def smallcat_gbm():
    """Train one-stump GBMs with H2O and scikit-learn on alphabet_cattest.csv.

    Per the original notes, the training set has 26 categories A..Z where
    A, C, E, G, ... perfectly predict y = 1 and B, D, F, H, ... perfectly
    predict y = 0.
    """

    def _letter_code(token):
        # np.loadtxt hands converters bytes or str depending on the numpy
        # version and encoding setting, so accept both instead of always
        # calling .decode().
        text = token.decode() if isinstance(token, bytes) else token
        # The letter sits between double quotes in the CSV; map it to its code point.
        return ord(text.split("\"")[1])

    data_path = pyunit_utils.locate("smalldata/gbm_test/alphabet_cattest.csv")

    alphabet = h2o.import_file(path=data_path)
    alphabet["y"] = alphabet["y"].asfactor()

    # Prepare data for scikit use: column 0 is the encoded letter, column 1 the response.
    trainData = np.loadtxt(data_path, delimiter=',', skiprows=1, converters={0: _letter_code})
    trainDataResponse = trainData[:, 1]
    trainDataFeatures = trainData[:, 0]

    # Train H2O GBM Model:
    gbm_h2o = H2OGradientBoostingEstimator(distribution="bernoulli",
                                           ntrees=1,
                                           max_depth=1,
                                           nbins=100)
    gbm_h2o.train(x="X", y="y", training_frame=alphabet)
    gbm_h2o.show()

    # Train scikit GBM Model with the matching single-stump settings:
    gbm_sci = ensemble.GradientBoostingClassifier(n_estimators=1, max_depth=1, max_features=None)
    # scikit-learn expects a 2-D feature matrix, hence the added axis.
    gbm_sci.fit(trainDataFeatures[:, np.newaxis], trainDataResponse)
示例9: glrm_catagorical_bug_fix
def glrm_catagorical_bug_fix():
    """Train a rank-4 GLRM on one airlines file and predict on the other.

    NOTE(review): the model is fitted on AirlinesTest and scored on
    AirlinesTrain; this looks swapped but may be deliberate - confirm.
    """
    fit_frame = h2o.import_file(pyunit_utils.locate("smalldata/airlines/AirlinesTest.csv.zip"))
    score_frame = h2o.import_file(pyunit_utils.locate("smalldata/airlines/AirlinesTrain.csv.zip"))

    low_rank_model = H2OGeneralizedLowRankEstimator(k=4)
    # Every column of the fitting frame takes part in the decomposition.
    low_rank_model.train(x=fit_frame.names, training_frame=fit_frame)

    print(low_rank_model.predict(score_frame))
示例10: dim_checks
def dim_checks():
    """Compare H2OFrame dimensions with numpy for the prostate dataset.

    Checks the full frame, a single-column slice, and that applying the
    `&` operator to the slice leaves the row count unchanged.
    """
    csv_path = pyunit_utils.locate("smalldata/logreg/prostate.csv")
    frame = h2o.import_file(path=csv_path)
    # skiprows=1 skips the first line of the CSV when loading with numpy.
    matrix = np.loadtxt(pyunit_utils.locate("smalldata/logreg/prostate.csv"), delimiter=',', skiprows=1)

    frame_rows, frame_cols = frame.dim
    matrix_rows, matrix_cols = matrix.shape
    print('The dimensions of h2o frame is: {0} x {1}'.format(frame_rows, frame_cols))
    print('The dimensions of numpy array is: {0} x {1}'.format(matrix_rows, matrix_cols))
    assert [frame_rows, frame_cols] == [matrix_rows, matrix_cols], "expected equal number of columns and rows"

    # Slice out one column on both sides and compare dimensions again.
    frame_column = frame[4]
    matrix_column = matrix[:, 4]
    frame_rows, frame_cols = frame_column.dim
    matrix_rows = matrix_column.shape[0]
    print('The dimensions of h2o column slice is: {0} x {1}'.format(frame_rows, frame_cols))
    print('The dimensions of numpy array column slice is: {0} x 1'.format(matrix_rows))
    assert [frame_rows, frame_cols] == [matrix_rows, 1], "expected equal number of columns and rows"

    # An element-wise operator must not change the number of rows.
    masked_column = frame_column & 5
    assert masked_column.nrow == frame_rows, "expected the number of rows to remain unchanged"
示例11: link_functions_gaussian
def link_functions_gaussian():
    """Compare H2O and statsmodels gaussian GLMs with the identity link.

    Both models are fitted on the prostate data with GLEASON as the response,
    and the ratio residual deviance / null deviance of the H2O model must not
    exceed the statsmodels ratio by 0.01 or more.

    Raises:
        AssertionError: if the H2O deviance ratio is worse by 0.01 or more.
    """
    print("Read in prostate data.")
    archive_path = pyunit_utils.locate("smalldata/prostate/prostate_complete.csv.zip")
    h2o_data = h2o.import_file(path=archive_path)
    h2o_data.head()

    # Close the archive and the member handle once the CSV is parsed, and use
    # `.values` because DataFrame.as_matrix() was removed in pandas 1.0.
    with zipfile.ZipFile(archive_path) as archive:
        with archive.open("prostate_complete.csv") as csv_member:
            sm_data = pd.read_csv(csv_member).values
    sm_data_response = sm_data[:, 9]
    sm_data_features = sm_data[:, 1:9]

    print("Testing for family: GAUSSIAN")
    print("Set variables for h2o.")
    myY = "GLEASON"
    myX = ["ID", "AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]

    print("Create models with canonical link: IDENTITY")
    h2o_model = H2OGeneralizedLinearEstimator(family="gaussian", link="identity", alpha=0.5, Lambda=0)
    h2o_model.train(x=myX, y=myY, training_frame=h2o_data)
    sm_model = sm.GLM(endog=sm_data_response, exog=sm_data_features,
                      family=sm.families.Gaussian(sm.families.links.identity)).fit()

    print("Compare model deviances for link function identity")
    # Normalise each residual deviance by its null deviance so the two
    # libraries are compared on the same scale.
    h2o_deviance = old_div(h2o_model.residual_deviance(), h2o_model.null_deviance())
    sm_deviance = old_div(sm_model.deviance, sm_model.null_deviance)
    assert h2o_deviance - sm_deviance < 0.01, "expected h2o to have an equivalent or better deviance measures"
示例12: fiftycatGBM
def fiftycatGBM():
    """Train a bernoulli GBM on 50_cattest_train.csv and score 50_cattest_test.csv.

    According to the original notes the training file only contains the
    categories cat1..cat45 while the test file contains cat1..cat50, so
    scoring meets categorical levels that were never seen during training.
    """
    train_path = pyunit_utils.locate("smalldata/gbm_test/50_cattest_train.csv")
    train = h2o.import_file(path=train_path)
    # The response has to be categorical for the bernoulli distribution.
    train["y"] = train["y"].asfactor()

    # GBM settings: 10 trees, depth 5, 20 bins.
    model = h2o.gbm(x=train[["x1", "x2"]], y=train["y"], distribution="bernoulli",
                    ntrees=10, max_depth=5, nbins=20)
    model.show()

    test_path = pyunit_utils.locate("smalldata/gbm_test/50_cattest_test.csv")
    test = h2o.import_file(path=test_path)

    # Score the test set and display the predictions.
    scored = model.predict(test)
    scored.show()

    # The confusion matrix and AUC are computed but not checked any further.
    metrics = model.model_performance(test)
    metrics.confusion_matrix()
    metrics.auc()
示例13: xgboost_milsongs_gaussian_medium
def xgboost_milsongs_gaussian_medium():
    """Compare 2-tree and 10-tree gaussian XGBoost models on the milsongs data.

    Column 0 is the response and all remaining columns are predictors. Each
    model must return one prediction per test row, and the model with fewer
    trees must have the higher mean squared error.
    """
    assert H2OXGBoostEstimator.available()

    # Import big dataset to ensure run across multiple nodes
    training_frame = h2o.import_file(pyunit_utils.locate("bigdata/laptop/milsongs/milsongs-train.csv.gz"))
    test_frame = h2o.import_file(pyunit_utils.locate("bigdata/laptop/milsongs/milsongs-test.csv.gz"))
    x = list(range(1, training_frame.ncol))
    y = 0

    def _fit_and_check(tree_count):
        # Train one model and verify the prediction row count before returning it.
        estimator = H2OXGBoostEstimator(training_frame=training_frame, learn_rate=0.3,
                                        booster='gbtree', seed=1, ntrees=tree_count,
                                        distribution='gaussian')
        estimator.train(x=x, y=y, training_frame=training_frame)
        scored = estimator.predict(test_frame)
        assert scored.nrows == test_frame.nrows
        return estimator

    small_model = _fit_and_check(2)
    large_model = _fit_and_check(10)

    # Fewer trees should leave a larger training error.
    assert small_model.mse() > large_model.mse()
示例14: export_file
def export_file():
    """Export GLM predictions to CSV and check they round-trip through pandas.

    A binomial GLM is trained on a random split of the prostate data, its
    predictions on the remaining rows are exported under a random file name
    inside the "results" directory, and the file is read back and compared
    with the predictions pulled directly from H2O.
    """
    prostate = h2o.upload_file(pyunit_utils.locate("smalldata/prostate/prostate.csv"))
    # These columns are converted to categoricals before modelling.
    for column in (1, 3, 4, 5, 8):
        prostate[column] = prostate[column].asfactor()

    # Random split: rows with a draw above 0.2 train, the rest are scored.
    draw = prostate.runif()
    train_rows = prostate[draw > 0.2, :]
    score_rows = prostate[draw <= 0.2, :]

    glm = H2OGeneralizedLinearEstimator(family="binomial")
    glm.train(x=list(range(2, prostate.ncol)), y=1, training_frame=train_rows)
    predicted = glm.predict(score_rows)

    # Six random uppercase letters/digits keep the file name unlikely to clash.
    symbols = string.ascii_uppercase + string.digits
    prefix = "".join(random.choice(symbols) for _ in range(6))
    target = pyunit_utils.locate("results") + "/" + prefix + "_prediction.csv"
    h2o.export_file(predicted, target)

    from_disk = pd.read_csv(target)
    print(from_disk.head())
    from_h2o = predicted.as_data_frame(True)
    print(from_h2o.head())

    # The exported file and the in-memory predictions must be identical.
    assert_frame_equal(from_disk, from_h2o)
示例15: anomaly
def anomaly():
    """Run the autoencoder anomaly-detection workflow on the MNIST files.

    Trains an autoencoder on the first 784 columns, computes the per-row
    reconstruction error of the test set, and reconstructs the test set
    through the network.
    """
    print("Deep Learning Anomaly Detection MNIST")

    train = h2o.import_file(pyunit_utils.locate("bigdata/laptop/mnist/train.csv.gz"))
    test = h2o.import_file(pyunit_utils.locate("bigdata/laptop/mnist/test.csv.gz"))

    # Unsupervised: keep only columns 0..783, which drops the response
    # column (digit: 0-9) at index 784.
    pixel_columns = list(range(0, 784))
    train = train[pixel_columns]
    test = test[pixel_columns]

    # 1) LEARN WHAT'S NORMAL
    # Train an unsupervised Deep Learning autoencoder on the training frame.
    autoencoder = H2OAutoEncoderEstimator(activation="Tanh", hidden=[2], l1=1e-5,
                                          ignore_const_cols=False, epochs=1)
    autoencoder.train(x=pixel_columns, training_frame=train)

    # 2) DETECT OUTLIERS
    # anomaly() computes the per-row reconstruction error for the test set
    # (passing it through the autoencoder and computing the MSE for each row).
    autoencoder.anomaly(test)

    # 3) VISUALIZE OUTLIERS
    # Convert the test data into its autoencoded representation by
    # propagating it through the narrow neural net.
    autoencoder.predict(test)