This article collects and summarizes typical usage examples of the _load function from the Python module utils.pkl_utils. If you are wondering what exactly _load does, how to call it, or where to find worked examples, the hand-picked code samples below should help.
The following presents 15 code examples of the _load function, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code samples.
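The utils.pkl_utils module itself is not reproduced on this page. In every example below, _load(path) reads a pickled Python object back from disk, and its counterpart _save(path, obj) (used in several examples) writes one out. A minimal sketch of what such a helper module typically looks like, assuming nothing beyond the standard library, is:

import pickle

def _load(fname):
    # deserialize and return the object stored in the pickle file at fname
    with open(fname, "rb") as f:
        return pickle.load(f)

def _save(fname, data, protocol=pickle.HIGHEST_PROTOCOL):
    # serialize data to fname with the given pickle protocol
    with open(fname, "wb") as f:
        pickle.dump(data, f, protocol)

Under that assumption, the examples that follow simply use _load to pull preprocessed DataFrames, dictionaries, trained models, and cross-validation splits back from disk.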
Example 1: process
def process(lang, pivot):
    print("[%s]: process for language %s" % (time_utils._timestamp(), lang))
    linkDict = pkl_utils._load(config.ILL_DICT["%s2%s" % (lang, pivot)])
    templateDict = pkl_utils._load(config.TEMPLATE2ARTICLE[lang])
    articleDict = pkl_utils._load(config.ARTICLE2TEMPLATE[pivot])
    mapping = pd.read_csv(config.EXISTING_MAPPING_OUTPUT[pivot], index_col="template")
    template1 = []; template2 = []
    article1 = []; article2 = []; ontology = []
    for template in templateDict:
        articles = templateDict[template]
        for article in articles:
            if article in linkDict:
                tmp = linkDict[article]
                template1.append(template)
                article1.append(article)
                article2.append(tmp)
                if tmp in articleDict:
                    templateList = articleDict[tmp]
                else:
                    templateList = []
                c = ""
                t = ""
                for Template in templateList:
                    if Template in mapping.index:
                        c = mapping.at[Template, "ontology"]
                        t = Template
                template2.append(t)
                ontology.append(c)
    data = {"template1": template1, "article1": article1, "template2": template2,
            "article2": article2, "ontology": ontology}
    df = pd.DataFrame(data)
    df.to_csv(config.ENTITY_MATRIX["%s2%s" % (lang, pivot)], index=False)
    print("[%s]: processing complete" % time_utils._timestamp())
Example 2: main
def main():
    logname = "generate_feature_group_relevance_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    dfTrain = dfAll.iloc[:TRAIN_SIZE].copy()
    ## run python3 splitter.py first
    split = pkl_utils._load("%s/splits_level1.pkl"%config.SPLIT_DIR)
    n_iter = len(split)
    ## for cv
    for i in range(n_iter):
        trainInd, validInd = split[i][0], split[i][1]
        dfTrain2 = dfTrain.iloc[trainInd].copy()
        sub_feature_dir = "%s/Run%d" % (config.FEAT_DIR, i+1)
        obs_fields = ["search_term", "product_title"][1:]
        aggregation_mode = ["mean", "std", "max", "min", "median", "size"]
        param_list = [dfAll["id"], dfTrain2, aggregation_mode]
        sf = StandaloneFeatureWrapper(GroupRelevance, dfAll, obs_fields, param_list, sub_feature_dir, logger)
        sf.go()
    ## for all
    sub_feature_dir = "%s/All" % (config.FEAT_DIR)
    obs_fields = ["search_term", "product_title"][1:]
    aggregation_mode = ["mean", "std", "max", "min", "median", "size"]
    param_list = [dfAll["id"], dfTrain, aggregation_mode]
    sf = StandaloneFeatureWrapper(GroupRelevance, dfAll, obs_fields, param_list, sub_feature_dir, logger)
    sf.go()
Example 3: main
def main(options):
    lang = options.lang
    p = options.parse
    t = options.train
    ncomp = options.ncomp
    me = options.me
    fin = options.fin
    fout = options.fout
    if p:
        parse(lang)
    if t:
        cmd = "python run_hole.py --fin %s --fout %s --test-all 50 --nb 100 --me %d \
               --margin 0.2 --lr 0.1 --ncomp %d" % (lang, config.HOLE_OUTPUT[lang], me, ncomp)
        os.system(cmd)
    hole = pkl_utils._load(config.HOLE_OUTPUT[lang])
    data_dict = pkl_utils._load(config.DATA_DICT[lang])
    model = hole["model"]
    entityDict = {y: x for x, y in enumerate(data_dict["entities"])}
    predicateDict = {y: x for x, y in enumerate(data_dict["relations"])}
    df = pd.read_csv(fin, names=["s", "p", "o"])
    df["s"] = df["s"].map(entityDict)
    df["p"] = df["p"].map(predicateDict)
    df["o"] = df["o"].map(entityDict)
    scores = model._scores(list(df["s"]), list(df["p"]), list(df["o"]))
    pd.DataFrame(scores).to_csv(fout, index=False, header=False)
Example 4: main
def main():
    dfTrain = pd.read_csv(config.TRAIN_DATA, encoding="ISO-8859-1")
    dfTest = pd.read_csv(config.TEST_DATA, encoding="ISO-8859-1")
    # splits for level1
    splitter = HomedepotSplitter(dfTrain=dfTrain,
                                 dfTest=dfTest,
                                 n_iter=config.N_RUNS,
                                 random_state=config.RANDOM_SEED,
                                 verbose=True,
                                 plot=True,
                                 # tune these params to get a close distribution
                                 split_param=[0.5, 0.25, 0.5],
                                 )
    splitter.split()
    splitter.save("%s/splits_level1.pkl"%config.SPLIT_DIR)
    splits_level1 = splitter.splits
    ## splits for level2
    splits_level1 = pkl_utils._load("%s/splits_level1.pkl"%config.SPLIT_DIR)
    splits_level2 = [0]*config.N_RUNS
    for run, (trainInd, validInd) in enumerate(splits_level1):
        dfValid = dfTrain.iloc[validInd].copy()
        splitter2 = HomedepotSplitter(dfTrain=dfValid,
                                      dfTest=dfTest,
                                      n_iter=1,
                                      random_state=run,
                                      verbose=True,
                                      # tune these params to get a close distribution
                                      split_param=[0.5, 0.15, 0.6])
        splitter2.split()
        splits_level2[run] = splitter2.splits[0]
    pkl_utils._save("%s/splits_level2.pkl"%config.SPLIT_DIR, splits_level2)
    ## splits for level3
    splits_level2 = pkl_utils._load("%s/splits_level2.pkl"%config.SPLIT_DIR)
    splits_level3 = [0]*config.N_RUNS
    for run, (trainInd, validInd) in enumerate(splits_level2):
        dfValid = dfTrain.iloc[validInd].copy()
        splitter3 = HomedepotSplitter(dfTrain=dfValid,
                                      dfTest=dfTest,
                                      n_iter=1,
                                      random_state=run,
                                      verbose=True,
                                      # tune these params to get a close distribution
                                      split_param=[0.5, 0.15, 0.7])
        splitter3.split()
        splits_level3[run] = splitter3.splits[0]
    pkl_utils._save("%s/splits_level3.pkl"%config.SPLIT_DIR, splits_level3)
Example 5: run_count
def run_count():
    logname = "generate_feature_first_last_ngram_count_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    generators = [
        FirstIntersectCount_Ngram,
        LastIntersectCount_Ngram,
        FirstIntersectRatio_Ngram,
        LastIntersectRatio_Ngram,
    ]
    obs_fields_list = []
    target_fields_list = []
    ## query in document
    obs_fields_list.append( ["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"] )
    target_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"] )
    ## document in query
    obs_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"] )
    target_fields_list.append( ["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"] )
    ngrams = [1,2,3,12,123][:3]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            for ngram in ngrams:
                param_list = [ngram]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                pf.go()
Example 6: main
def main():
    logname = "generate_feature_intersect_position_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    generators = [
        IntersectPosition_Ngram,
        IntersectNormPosition_Ngram,
    ]
    obs_fields_list = []
    target_fields_list = []
    ## query in document
    obs_fields_list.append( ["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"] )
    target_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"][1:2] )
    ## document in query
    obs_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"][1:2] )
    target_fields_list.append( ["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"] )
    ngrams = [1,2,3,12,123][:3]
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            for ngram in ngrams:
                param_list = [ngram, aggregation_mode]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                pf.go()
Example 7: run_tsne_lsa_ngram
def run_tsne_lsa_ngram():
    logname = "generate_feature_tsne_lsa_ngram_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    dfAll.drop(["product_attribute_list"], inplace=True, axis=1)
    generators = [TSNE_LSA_Word_Ngram, TSNE_LSA_Char_Ngram]
    ngrams_list = [[1,2,3], [2,3,4,5]]
    ngrams_list = [[1,2,3], [4]]
    obs_fields = ["search_term", "search_term_alt", "search_term_auto_corrected", "product_title", "product_description"]
    for generator, ngrams in zip(generators, ngrams_list):
        for ngram in ngrams:
            param_list = [ngram, config.SVD_DIM, config.SVD_N_ITER]
            sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger, force_corr=True)
            sf.go()
    generators = [TSNE_LSA_Word_Ngram_Pair]
    ngrams = [1, 2]
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append( ["search_term", "search_term_alt", "search_term_auto_corrected"] )
    target_fields_list.append( ["product_title", "product_description"] )
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for ngram in ngrams:
            for generator in generators:
                param_list = [ngram, config.SVD_DIM, config.SVD_N_ITER]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger, force_corr=True)
                pf.go()
Example 8: main
def main():
    logname = "generate_feature_doc2vec_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    #### NOTE: use data BEFORE STEMMING
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED)
    doc2vec_model_dirs = []
    model_prefixes = []
    ## doc2vec model trained with Homedepot dataset: brand/color/obs/title/description
    doc2vec_model_dirs.append( config.DOC2VEC_MODEL_DIR + "/Homedepot-doc2vec-D%d-min_count%d.model"%(config.EMBEDDING_DIM, config.EMBEDDING_MIN_COUNT) )
    model_prefixes.append( "Homedepot" )
    for doc2vec_model_dir, model_prefix in zip(doc2vec_model_dirs, model_prefixes):
        ## load model
        try:
            if ".bin" in doc2vec_model_dir:
                doc2vec_model = gensim.models.Doc2Vec.load_word2vec_format(doc2vec_model_dir, binary=True)
            elif ".txt" in doc2vec_model_dir:
                doc2vec_model = gensim.models.Doc2Vec.load_word2vec_format(doc2vec_model_dir, binary=False)
            else:
                doc2vec_model = gensim.models.Doc2Vec.load(doc2vec_model_dir)
            doc2vec_model_sent_label = pkl_utils._load(doc2vec_model_dir+".sent_label")
        except:
            continue
        # ## standalone (not used in model building)
        # obs_fields = ["search_term", "search_term_alt", "product_title", "product_description", "product_attribute"]
        # generator = Doc2Vec_Vector
        # param_list = [doc2vec_model, doc2vec_model_sent_label, model_prefix]
        # sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger)
        # sf.go()
        ## pairwise
        generators = [
            Doc2Vec_CosineSim,
            Doc2Vec_RMSE,
            Doc2Vec_Vdiff,
        ]
        obs_fields_list = []
        target_fields_list = []
        obs_fields_list.append( ["search_term", "search_term_alt"] )
        target_fields_list.append( ["product_title", "product_description", "product_attribute", "product_brand", "product_color"] )
        for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
            for generator in generators:
                param_list = [doc2vec_model, doc2vec_model_sent_label, model_prefix]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                pf.go()
Example 9: main
def main():
    logname = "generate_feature_query_quality_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    obs_corpus = []
    query_suffix = []
    # raw
    dfAll = pkl_utils._load(config.ALL_DATA_RAW)
    obs_corpus.append(dfAll["search_term"].values)
    query_suffix.append("raw")
    # after processing
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED)
    obs_corpus.append(dfAll["search_term"].values)
    query_suffix.append("lemmatized")
    # after extracting product_name in search_term
    obs_corpus.append(dfAll["search_term_product_name"].values)
    query_suffix.append("product_name")
    if "search_term_auto_corrected" in dfAll.columns:
        # after auto correction
        obs_corpus.append(dfAll["search_term_auto_corrected"].values)
        query_suffix.append("corrected")
    # after stemming
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    obs_corpus.append(dfAll["search_term"].values)
    query_suffix.append("stemmed")
    y_train = dfAll["relevance"].values[:TRAIN_SIZE]
    for i in range(len(query_suffix)-1):
        for j in range(i+1, len(query_suffix)):
            ext = QueryQuality(obs_corpus[i], obs_corpus[j])
            x = ext.transform()
            dim = 1
            fname = "%s_%s_x_%s_%dD"%(ext._get_feat_name(), query_suffix[i], query_suffix[j], dim)
            pkl_utils._save(os.path.join(config.FEAT_DIR, fname+config.FEAT_FILE_SUFFIX), x)
            corr = np_utils._corr(x[:TRAIN_SIZE], y_train)
            logger.info("%s (%dD): corr = %.6f"%(fname, dim, corr))
    # raw
    dfAll = pkl_utils._load(config.ALL_DATA_RAW)
    obs_fields = ["search_term"]
    param_list = []
    sf = StandaloneFeatureWrapper(IsInGoogleDict, dfAll, obs_fields, param_list, config.FEAT_DIR, logger)
    sf.go()
Example 10: main
def main(which):
    logname = "generate_feature_stat_cooc_tfidf_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    generators = []
    for w in which.split(","):
        if w == "tf":
            generators.append( StatCoocTF_Ngram )
        elif w == "norm_tf":
            generators.append( StatCoocNormTF_Ngram )
        elif w == "tfidf":
            generators.append( StatCoocTFIDF_Ngram )
        elif w == "norm_tfidf":
            generators.append( StatCoocNormTFIDF_Ngram )
        elif w == "bm25":
            generators.append( StatCoocBM25_Ngram )
    obs_fields_list = []
    target_fields_list = []
    ## query in document
    obs_fields_list.append( ["search_term", "search_term_alt", "search_term_auto_corrected"] )
    target_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"] )
    ## document in query
    obs_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"] )
    target_fields_list.append( ["search_term", "search_term_alt", "search_term_auto_corrected"] )
    ngrams = [1,2,3,12,123][:3]
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            for ngram in ngrams:
                param_list = [ngram, aggregation_mode]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                pf.go()
    obs_fields_list = []
    target_fields_list = []
    ## query in document
    obs_fields_list.append( ["search_term_product_name"] )
    target_fields_list.append( ["product_title_product_name"] )
    ngrams = [1,2]
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            for ngram in ngrams:
                if ngram == 2:
                    # since product_name is of length 2, it makes no difference
                    # for various aggregation as there is only one item
                    param_list = [ngram, "mean"]
                else:
                    param_list = [ngram, aggregation_mode]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                pf.go()
Example 11: factorize
def factorize(lang="en"):
    X = pkl_utils._load(config.TENSOR[lang])
    entityDict = pkl_utils._load(config.ENTITY[lang])
    typeDict = pkl_utils._load(config.TYPE[lang])
    entry = pkl_utils._load(config.TYPE_MATRIX[lang])
    t2e = {typeDict[t]: entityDict[t] for t in typeDict}
    _log.info("Data has been loaded")
    N, M = X[0].shape[0], len(X)
    _log.info('Datasize: %d x %d x %d' % (N, N, M))
    FOLDS = 5
    IDX = list(range(N))
    shuffle(IDX)
    fsz = int(N/FOLDS)
    offset = 0
    tid = t2e[typeDict["http://dbpedia.org/ontology/Person"]]
    GROUND_TRUTH = X[-1][:, tid]
    AUC = np.zeros(FOLDS)
    for f in range(FOLDS):
        idx = set(IDX[offset:offset+fsz])
        offset += fsz
        _log.info('Fold %d' % f)
        T = [x.copy() for x in X[:-1]]
        rows = []
        cols = []
        data = []
        for x, y in zip(entry[0], entry[1]):
            if (x in idx) and (y == tid):
                continue
            rows.append(x)
            cols.append(y)
            data.append(1)
        T.append(spsp.csr_matrix((data, (rows, cols)), (N, N)))
        _log.info('Construction complete')
        P = predict_rescal_als(T, tid)
        precision, recall, _ = precision_recall_curve(GROUND_TRUTH, P)
        AUC[f] = auc(precision, recall)
        _log.info('AUC: %f' % AUC[f])
    _log.info('AUC-PR Test Mean / Std: %f / %f' % (AUC.mean(), AUC.std()))
Example 12: run_compression_distance
def run_compression_distance():
    logname = "generate_feature_compression_distance_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append( ["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"] )
    target_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"] )
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        param_list = []
        pf = PairwiseFeatureWrapper(CompressionDistance, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
        pf.go()
Example 13: main
def main():
    logname = "generate_feature_basic_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    ## basic
    generators = [DocId, DocLen, DocFreq, DocEntropy, DigitCount, DigitRatio]
    obs_fields = ["search_term", "product_title", "product_description",
                  "product_attribute", "product_brand", "product_color"]
    for generator in generators:
        param_list = []
        sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger)
        sf.go()
    ## for product_uid
    generators = [DocIdEcho, DocFreq, ProductUidDummy1, ProductUidDummy2, ProductUidDummy3]
    obs_fields = ["product_uid"]
    for generator in generators:
        param_list = []
        sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger)
        sf.go()
    ## unique count
    generators = [UniqueCount_Ngram, UniqueRatio_Ngram]
    obs_fields = ["search_term", "product_title", "product_description",
                  "product_attribute", "product_brand", "product_color"]
    ngrams = [1,2,3]
    for generator in generators:
        for ngram in ngrams:
            param_list = [ngram]
            sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger)
            sf.go()
    ## for product_attribute_list
    generators = [
        AttrCount,
        AttrBulletCount,
        AttrBulletRatio,
        AttrNonBulletCount,
        AttrNonBulletRatio,
        AttrHasProductHeight,
        AttrHasProductWidth,
        AttrHasProductLength,
        AttrHasProductDepth,
        AttrHasIndoorOutdoor,
    ]
    obs_fields = ["product_attribute_list"]
    for generator in generators:
        param_list = []
        sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger)
        sf.go()
Example 14: main
def main():
    logname = "generate_feature_group_distance_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    dfTrain = dfAll.iloc[:TRAIN_SIZE].copy()
    ## run python3 splitter.py first
    split = pkl_utils._load("%s/splits_level1.pkl"%config.SPLIT_DIR)
    n_iter = len(split)
    relevances_complete = [1, 1.25, 1.33, 1.5, 1.67, 1.75, 2, 2.25, 2.33, 2.5, 2.67, 2.75, 3]
    relevances = [1, 1.33, 1.67, 2, 2.33, 2.67, 3]
    ngrams = [1]
    obs_fields = ["search_term"]
    target_fields = ["product_title", "product_description"]
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    ## for cv
    for i in range(n_iter):
        trainInd, validInd = split[i][0], split[i][1]
        dfTrain2 = dfTrain.iloc[trainInd].copy()
        sub_feature_dir = "%s/Run%d" % (config.FEAT_DIR, i+1)
        for target_field in target_fields:
            for relevance in relevances:
                for ngram in ngrams:
                    param_list = [dfAll["id"], dfTrain2, target_field, relevance, ngram, aggregation_mode]
                    pf = PairwiseFeatureWrapper(GroupRelevance_Ngram_Jaccard, dfAll, obs_fields, [target_field], param_list, sub_feature_dir, logger)
                    pf.go()
    ## for all
    sub_feature_dir = "%s/All" % (config.FEAT_DIR)
    for target_field in target_fields:
        for relevance in relevances:
            for ngram in ngrams:
                param_list = [dfAll["id"], dfTrain, target_field, relevance, ngram, aggregation_mode]
                pf = PairwiseFeatureWrapper(GroupRelevance_Ngram_Jaccard, dfAll, obs_fields, [target_field], param_list, sub_feature_dir, logger)
                pf.go()
Example 15: main
def main():
    fnames = [
        "LSA100_Word_Unigram_Pair_search_term_x_product_title_100D",
        "LSA100_Word_Bigram_Pair_search_term_x_product_title_100D",
        "LSA100_Word_Obs_Unigram_Target_Unigram_Cooc_search_term_x_product_title_100D",
        "LSA100_Word_Obs_Unigram_Target_Bigram_Cooc_search_term_x_product_title_100D",
    ]
    fnames = [os.path.join(config.FEAT_DIR, fname+".pkl") for fname in fnames]
    for fname in fnames:
        f = pkl_utils._load(fname)
        columns = ["LSA%d"%(i+1) for i in range(f.shape[1])]
        pd.DataFrame(f, columns=columns).to_csv(fname[:-4]+".csv", index=False)