当前位置: 首页>>代码示例>>Python>>正文


Python pkl_utils._load函数代码示例

本文整理汇总了Python中utils.pkl_utils._load函数的典型用法代码示例。如果您正苦于以下问题:Python _load函数的具体用法?Python _load怎么用?Python _load使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。


在下文中一共展示了_load函数的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: process

def process(lang, pivot):
	print "[%s]: process for language %s" % (time_utils._timestamp(), lang)
	linkDict = pkl_utils._load(config.ILL_DICT["%s2%s" % (lang, pivot)])
	templateDict = pkl_utils._load(config.TEMPLATE2ARTICLE[lang])
	articleDict = pkl_utils._load(config.ARTICLE2TEMPLATE[pivot])
	mapping = pd.read_csv(config.EXISTING_MAPPING_OUTPUT[pivot], index_col="template")
	template1 = []; template2 = []
	article1 = []; article2 = []; ontology = []
	for template in templateDict:
		articles = templateDict[template]
		for article in articles:
			if article in linkDict:
				tmp = linkDict[article]
				template1.append(template)
				article1.append(article)
				article2.append(tmp)
				if tmp in articleDict:
					templateList = articleDict[tmp]
				else:
					templateList = []
				c = ""
				t = ""
				for Template in templateList:
					if Template in mapping.index:
						c = mapping.at[Template, "ontology"]
						t = Template
				template2.append(t)
				ontology.append(c)

	data = {"template1":template1, "article1":article1, "template2":template2, \
			"article2":article2, "ontology":ontology}
	df = pd.DataFrame(data)
	df.to_csv(config.ENTITY_MATRIX["%s2%s" % (lang, pivot)], index=False)
	print "[%s]: processing complete" % time_utils._timestamp()
开发者ID:dbpedia,项目名称:mappings-autogeneration,代码行数:34,代码来源:parse.py

示例2: main

def main():
    logname = "generate_feature_group_relevance_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    dfTrain = dfAll.iloc[:TRAIN_SIZE].copy()

    ## run python3 splitter.py first
    split = pkl_utils._load("%s/splits_level1.pkl"%config.SPLIT_DIR)
    n_iter = len(split)

    ## for cv
    for i in range(n_iter):
        trainInd, validInd = split[i][0], split[i][1]
        dfTrain2 = dfTrain.iloc[trainInd].copy()
        sub_feature_dir = "%s/Run%d" % (config.FEAT_DIR, i+1)

        obs_fields = ["search_term", "product_title"][1:]
        aggregation_mode = ["mean", "std", "max", "min", "median", "size"]
        param_list = [dfAll["id"], dfTrain2, aggregation_mode]
        sf = StandaloneFeatureWrapper(GroupRelevance, dfAll, obs_fields, param_list, sub_feature_dir, logger)
        sf.go()

    ## for all
    sub_feature_dir = "%s/All" % (config.FEAT_DIR)
    obs_fields = ["search_term", "product_title"][1:]
    aggregation_mode = ["mean", "std", "max", "min", "median", "size"]
    param_list = [dfAll["id"], dfTrain, aggregation_mode]
    sf = StandaloneFeatureWrapper(GroupRelevance, dfAll, obs_fields, param_list, sub_feature_dir, logger)
    sf.go()
开发者ID:Anhmike,项目名称:Kaggle_HomeDepot,代码行数:29,代码来源:feature_group_relevance.py

示例3: main

def main(options):
	lang = options.lang
	p = options.parse
	t = options.train
	ncomp = options.ncomp
	me = options.me
	fin = options.fin
	fout = options.fout

	if p:
		parse(lang)
	if t:
		cmd = "python run_hole.py --fin %s --fout %s --test-all 50 --nb 100 --me %d \
			--margin 0.2 --lr 0.1 --ncomp %d" % (lang, config.HOLE_OUTPUT[lang], me, ncomp)
		os.system(cmd)
	
	hole = pkl_utils._load(config.HOLE_OUTPUT[lang])
	data_dict = pkl_utils._load(config.DATA_DICT[lang])
	model = hole["model"]
	entityDict = { y:x for x, y in enumerate(data_dict["entities"])}
	predicateDict = { y:x for x, y in enumerate(data_dict["relations"])}
	df = pd.read_csv(fin, names=["s", "p", "o"])
	df["s"] = df["s"].map(entityDict)
	df["p"] = df["p"].map(predicateDict)
	df["o"] = df["o"].map(entityDict)
	scores = model._scores(list(df["s"]), list(df["p"]), list(df["o"]))
	pd.DataFrame(scores).to_csv(fout, index=False, header=False)
开发者ID:dbpedia,项目名称:mappings-autogeneration,代码行数:27,代码来源:hole.py

示例4: main

def main():
    
    dfTrain = pd.read_csv(config.TRAIN_DATA, encoding="ISO-8859-1")
    dfTest = pd.read_csv(config.TEST_DATA, encoding="ISO-8859-1")


    # splits for level1
    splitter = HomedepotSplitter(dfTrain=dfTrain, 
                                dfTest=dfTest, 
                                n_iter=config.N_RUNS, 
                                random_state=config.RANDOM_SEED, 
                                verbose=True,
                                plot=True,
                                # tune these params to get a close distribution
                                split_param=[0.5, 0.25, 0.5],
                                )
    splitter.split()
    splitter.save("%s/splits_level1.pkl"%config.SPLIT_DIR)
    splits_level1 = splitter.splits


    ## splits for level2
    splits_level1 = pkl_utils._load("%s/splits_level1.pkl"%config.SPLIT_DIR)
    splits_level2 = [0]*config.N_RUNS
    for run, (trainInd, validInd) in enumerate(splits_level1):
        dfValid = dfTrain.iloc[validInd].copy()
        splitter2 = HomedepotSplitter(dfTrain=dfValid, 
                                    dfTest=dfTest, 
                                    n_iter=1, 
                                    random_state=run, 
                                    verbose=True,
                                    # tune these params to get a close distribution
                                    split_param=[0.5, 0.15, 0.6])
        splitter2.split()
        splits_level2[run] = splitter2.splits[0]
    pkl_utils._save("%s/splits_level2.pkl"%config.SPLIT_DIR, splits_level2)


    ## splits for level3
    splits_level2 = pkl_utils._load("%s/splits_level2.pkl"%config.SPLIT_DIR)
    splits_level3 = [0]*config.N_RUNS
    for run, (trainInd, validInd) in enumerate(splits_level2):
        dfValid = dfTrain.iloc[validInd].copy()
        splitter3 = HomedepotSplitter(dfTrain=dfValid, 
                                    dfTest=dfTest, 
                                    n_iter=1, 
                                    random_state=run, 
                                    verbose=True,
                                    # tune these params to get a close distribution
                                    split_param=[0.5, 0.15, 0.7])
        splitter3.split()
        splits_level3[run] = splitter3.splits[0]
    pkl_utils._save("%s/splits_level3.pkl"%config.SPLIT_DIR, splits_level3)
开发者ID:Anhmike,项目名称:Kaggle_HomeDepot,代码行数:53,代码来源:splitter.py

示例5: run_count

def run_count():
    logname = "generate_feature_first_last_ngram_count_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    generators = [
        FirstIntersectCount_Ngram, 
        LastIntersectCount_Ngram, 
        FirstIntersectRatio_Ngram, 
        LastIntersectRatio_Ngram, 
    ]

    obs_fields_list = []
    target_fields_list = []
    ## query in document
    obs_fields_list.append( ["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"] )
    target_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"] )
    ## document in query
    obs_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"] )
    target_fields_list.append( ["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"] )
    ngrams = [1,2,3,12,123][:3]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            for ngram in ngrams:
                param_list = [ngram]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                pf.go()
开发者ID:amsqr,项目名称:Kaggle_HomeDepot,代码行数:27,代码来源:feature_first_last_ngram.py

示例6: main

def main():
    logname = "generate_feature_intersect_position_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    generators = [
        IntersectPosition_Ngram, 
        IntersectNormPosition_Ngram, 
    ]
    obs_fields_list = []
    target_fields_list = []
    ## query in document
    obs_fields_list.append( ["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"] )
    target_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"][1:2] )
    ## document in query
    obs_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"][1:2] )
    target_fields_list.append( ["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"] )
    ngrams = [1,2,3,12,123][:3]
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            for ngram in ngrams:
                param_list = [ngram, aggregation_mode]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                pf.go()
开发者ID:amsqr,项目名称:Kaggle_HomeDepot,代码行数:25,代码来源:feature_intersect_position.py

示例7: run_tsne_lsa_ngram

def run_tsne_lsa_ngram():
    logname = "generate_feature_tsne_lsa_ngram_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    dfAll.drop(["product_attribute_list"], inplace=True, axis=1)

    generators = [TSNE_LSA_Word_Ngram, TSNE_LSA_Char_Ngram]
    ngrams_list = [[1,2,3], [2,3,4,5]]
    ngrams_list = [[1,2,3], [4]]
    obs_fields = ["search_term", "search_term_alt", "search_term_auto_corrected", "product_title", "product_description"]
    for generator,ngrams in zip(generators, ngrams_list):
        for ngram in ngrams:
            param_list = [ngram, config.SVD_DIM, config.SVD_N_ITER]
            sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger, force_corr=True)
            sf.go()

    generators = [TSNE_LSA_Word_Ngram_Pair]
    ngrams = [1, 2]
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append( ["search_term", "search_term_alt", "search_term_auto_corrected"] )
    target_fields_list.append( ["product_title", "product_description"] )
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for ngram in ngrams:
            for generator in generators:
                param_list = [ngram, config.SVD_DIM, config.SVD_N_ITER]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger, force_corr=True)
                pf.go()
开发者ID:amsqr,项目名称:Kaggle_HomeDepot,代码行数:28,代码来源:feature_vector_space.py

示例8: main

def main():
    logname = "generate_feature_doc2vec_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    #### NOTE: use data BEFORE STEMMING
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED)

    doc2vec_model_dirs = []
    model_prefixes = []
    ## doc2vec model trained with Homedepot dataset: brand/color/obs/title/description
    doc2vec_model_dirs.append( config.DOC2VEC_MODEL_DIR + "/Homedepot-doc2vec-D%d-min_count%d.model"%(config.EMBEDDING_DIM, config.EMBEDDING_MIN_COUNT) )
    model_prefixes.append( "Homedepot" )
    for doc2vec_model_dir, model_prefix in zip(doc2vec_model_dirs, model_prefixes):
        ## load model
        try:
            if ".bin" in doc2vec_model_dir:
                doc2vec_model = gensim.models.Doc2Vec.load_word2vec_format(doc2vec_model_dir, binary=True)
            if ".txt" in doc2vec_model_dir:
                doc2vec_model = gensim.models.Doc2Vec.load_word2vec_format(doc2vec_model_dir, binary=False)
            else:
                doc2vec_model = gensim.models.Doc2Vec.load(doc2vec_model_dir)
                doc2vec_model_sent_label = pkl_utils._load(doc2vec_model_dir+".sent_label")
        except:
            continue

        # ## standalone (not used in model building)
        # obs_fields = ["search_term", "search_term_alt", "product_title", "product_description", "product_attribute"]
        # generator = Doc2Vec_Vector
        # param_list = [doc2vec_model, doc2vec_model_sent_label, model_prefix]
        # sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger)
        # sf.go()

        ## pairwise
        generators = [
            Doc2Vec_CosineSim, 
            Doc2Vec_RMSE, 
            Doc2Vec_Vdiff,
        ]
        obs_fields_list = []
        target_fields_list = []
        obs_fields_list.append( ["search_term", "search_term_alt"] )
        target_fields_list.append( ["product_title", "product_description", "product_attribute", "product_brand", "product_color"] )
        for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
            for generator in generators:
                param_list = [doc2vec_model, doc2vec_model_sent_label, model_prefix]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                pf.go()
开发者ID:MrSnark,项目名称:Kaggle_HomeDepot,代码行数:46,代码来源:feature_doc2vec.py

示例9: main

def main():
    logname = "generate_feature_query_quality_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)

    obs_corpus = []
    query_suffix = []
    # raw
    dfAll = pkl_utils._load(config.ALL_DATA_RAW)
    obs_corpus.append(dfAll["search_term"].values)
    query_suffix.append("raw")
    # after processing    
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED)
    obs_corpus.append(dfAll["search_term"].values)
    query_suffix.append("lemmatized")
    # after extracting product_name in search_term
    obs_corpus.append(dfAll["search_term_product_name"].values)
    query_suffix.append("product_name")
    if "search_term_auto_corrected" in dfAll.columns:
        # after auto correction
        obs_corpus.append(dfAll["search_term_auto_corrected"].values)
        query_suffix.append("corrected")  
    # after stemming
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    obs_corpus.append(dfAll["search_term"].values)
    query_suffix.append("stemmed")

    y_train = dfAll["relevance"].values[:TRAIN_SIZE]
    for i in range(len(query_suffix)-1):
        for j in range(i+1, len(query_suffix)):
            ext = QueryQuality(obs_corpus[i], obs_corpus[j])
            x = ext.transform()
            dim = 1
            fname = "%s_%s_x_%s_%dD"%(ext._get_feat_name(), query_suffix[i], query_suffix[j], dim)
            pkl_utils._save(os.path.join(config.FEAT_DIR, fname+config.FEAT_FILE_SUFFIX), x)
            corr = np_utils._corr(x[:TRAIN_SIZE], y_train)
            logger.info("%s (%dD): corr = %.6f"%(fname, dim, corr))

    # raw
    dfAll = pkl_utils._load(config.ALL_DATA_RAW)
    obs_fields = ["search_term"]
    param_list = []
    sf = StandaloneFeatureWrapper(IsInGoogleDict, dfAll, obs_fields, param_list, config.FEAT_DIR, logger)
    sf.go()
开发者ID:MrSnark,项目名称:Kaggle_HomeDepot,代码行数:43,代码来源:feature_query_quality.py

示例10: main

def main(which):
    logname = "generate_feature_stat_cooc_tfidf_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    generators = []
    for w in which.split(","):
        if w == "tf":
            generators.append( StatCoocTF_Ngram )
        elif w == "norm_tf":
            generators.append( StatCoocNormTF_Ngram )
        elif w == "tfidf":
            generators.append( StatCoocTFIDF_Ngram )
        elif w == "norm_tfidf":
            generators.append( StatCoocNormTFIDF_Ngram )
        elif w == "bm25":
            generators.append( StatCoocBM25_Ngram )


    obs_fields_list = []
    target_fields_list = []
    ## query in document
    obs_fields_list.append( ["search_term", "search_term_alt", "search_term_auto_corrected"] )
    target_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"] )
    ## document in query
    obs_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"] )
    target_fields_list.append( ["search_term", "search_term_alt", "search_term_auto_corrected"] )
    ngrams = [1,2,3,12,123][:3]
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            for ngram in ngrams:
                param_list = [ngram, aggregation_mode]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                pf.go()


    obs_fields_list = []
    target_fields_list = []
    ## query in document
    obs_fields_list.append( ["search_term_product_name"] )
    target_fields_list.append( ["product_title_product_name"] )
    ngrams = [1,2]
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            for ngram in ngrams:
                if ngram == 2:
                    # since product_name is of length 2, it makes no difference 
                    # for various aggregation as there is only one item
                    param_list = [ngram, "mean"]
                else:
                    param_list = [ngram, aggregation_mode]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                pf.go()
开发者ID:amsqr,项目名称:Kaggle_HomeDepot,代码行数:55,代码来源:feature_stat_cooc_tfidf.py

示例11: factorize

def factorize(lang="en"):
	X = pkl_utils._load(config.TENSOR[lang])
	entityDict = pkl_utils._load(config.ENTITY[lang])
	typeDict = pkl_utils._load(config.TYPE[lang])
	entry = pkl_utils._load(config.TYPE_MATRIX[lang])
	t2e = {typeDict[t]:entityDict[t] for t in typeDict}
	_log.info("Data has been loaded")
	N, M = X[0].shape[0], len(X)
	_log.info('Datasize: %d x %d x %d' % (N, N, M))

	FOLDS = 5
	IDX = list(range(N))
	shuffle(IDX)
	fsz = int(N/FOLDS)
	offset = 0
	tid = t2e[typeDict["http://dbpedia.org/ontology/Person"]]
	GROUND_TRUTH = X[-1][:, tid]
	AUC = np.zeros(FOLDS)
	for f in range(FOLDS):
		idx = set(IDX[offset:offset+fsz])
		offset += fsz
		_log.info('Fold %d' % f)
		T = [x.copy() for x in X[:-1]]
		rows = []
		cols = []
		data = []
		for x,y in zip(entry[0], entry[1]):
			if (x in idx) and (y == tid):
				continue
			rows.append(x)
			cols.append(y)
			data.append(1)
		T.append(spsp.csr_matrix((data, (rows, cols)), (N, N)))
		_log.info('Construction complete')
		P = predict_rescal_als(T, tid)
		precision, recall, _ = precision_recall_curve(GROUND_TRUTH, P)
		AUC[f] = auc(precision, recall)
		_log.info('AUC: %f' % AUC[f])
	
	_log.info('AUC-PR Test Mean / Std: %f / %f' % (AUC.mean(), AUC.std()))
开发者ID:hackur,项目名称:mappings-autogeneration,代码行数:40,代码来源:tf.py

示例12: run_compression_distance

def run_compression_distance():
    logname = "generate_feature_compression_distance_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append( ["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"] )
    target_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"] )
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        param_list = []
        pf = PairwiseFeatureWrapper(CompressionDistance, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
        pf.go()
开发者ID:amsqr,项目名称:Kaggle_HomeDepot,代码行数:13,代码来源:feature_distance.py

示例13: main

def main():
    logname = "generate_feature_basic_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    ## basic
    generators = [DocId, DocLen, DocFreq, DocEntropy, DigitCount, DigitRatio]
    obs_fields = ["search_term", "product_title", "product_description", 
                "product_attribute", "product_brand", "product_color"]
    for generator in generators:
        param_list = []
        sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger)
        sf.go()

    ## for product_uid
    generators = [DocIdEcho, DocFreq, ProductUidDummy1, ProductUidDummy2, ProductUidDummy3]
    obs_fields = ["product_uid"]
    for generator in generators:
        param_list = []
        sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger)
        sf.go()

    ## unique count
    generators = [UniqueCount_Ngram, UniqueRatio_Ngram]
    obs_fields = ["search_term", "product_title", "product_description", 
    "product_attribute", "product_brand", "product_color"]
    ngrams = [1,2,3]
    for generator in generators:
        for ngram in ngrams:
            param_list = [ngram]
            sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger)
            sf.go()

    ## for product_attribute_list
    generators = [
        AttrCount, 
        AttrBulletCount, 
        AttrBulletRatio, 
        AttrNonBulletCount, 
        AttrNonBulletRatio,
        AttrHasProductHeight,
        AttrHasProductWidth,
        AttrHasProductLength,
        AttrHasProductDepth,
        AttrHasIndoorOutdoor,
    ]
    obs_fields = ["product_attribute_list"]
    for generator in generators:
        param_list = []
        sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger)
        sf.go()
开发者ID:amsqr,项目名称:Kaggle_HomeDepot,代码行数:51,代码来源:feature_basic.py

示例14: main

def main():
    logname = "generate_feature_group_distance_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    dfTrain = dfAll.iloc[:TRAIN_SIZE].copy()

    ## run python3 splitter.py first
    split = pkl_utils._load("%s/splits_level1.pkl"%config.SPLIT_DIR)
    n_iter = len(split)

    relevances_complete = [1, 1.25, 1.33, 1.5, 1.67, 1.75, 2, 2.25, 2.33, 2.5, 2.67, 2.75, 3]
    relevances = [1, 1.33, 1.67, 2, 2.33, 2.67, 3]
    ngrams = [1]
    obs_fields = ["search_term"]
    target_fields = ["product_title", "product_description"]
    aggregation_mode = ["mean", "std", "max", "min", "median"]

    ## for cv
    for i in range(n_iter):
        trainInd, validInd = split[i][0], split[i][1]
        dfTrain2 = dfTrain.iloc[trainInd].copy()
        sub_feature_dir = "%s/Run%d" % (config.FEAT_DIR, i+1)

        for target_field in target_fields:
            for relevance in relevances:
                for ngram in ngrams:
                    param_list = [dfAll["id"], dfTrain2, target_field, relevance, ngram, aggregation_mode]
                    pf = PairwiseFeatureWrapper(GroupRelevance_Ngram_Jaccard, dfAll, obs_fields, [target_field], param_list, sub_feature_dir, logger)
                    pf.go()

    ## for all
    sub_feature_dir = "%s/All" % (config.FEAT_DIR)
    for target_field in target_fields:
        for relevance in relevances:
            for ngram in ngrams:
                param_list = [dfAll["id"], dfTrain, target_field, relevance, ngram, aggregation_mode]
                pf = PairwiseFeatureWrapper(GroupRelevance_Ngram_Jaccard, dfAll, obs_fields, [target_field], param_list, sub_feature_dir, logger)
                pf.go()
开发者ID:MrSnark,项目名称:Kaggle_HomeDepot,代码行数:38,代码来源:feature_group_distance.py

示例15: main

def main():
    fnames = [
        "LSA100_Word_Unigram_Pair_search_term_x_product_title_100D",
        "LSA100_Word_Bigram_Pair_search_term_x_product_title_100D",
        "LSA100_Word_Obs_Unigram_Target_Unigram_Cooc_search_term_x_product_title_100D",
        "LSA100_Word_Obs_Unigram_Target_Bigram_Cooc_search_term_x_product_title_100D",
    ]

    fnames = [os.path.join(config.FEAT_DIR, fname+".pkl") for fname in fnames]

    for fname in fnames:
        f = pkl_utils._load(fname)
        columns = ["LSA%d"%(i+1) for i in range(f.shape[1])]
        pd.DataFrame(f, columns=columns).to_csv(fname[:-4]+".csv", index=False)
开发者ID:Anhmike,项目名称:Kaggle_HomeDepot,代码行数:14,代码来源:convert_pkl_lsa_to_csv_lsa.py


注:本文中的utils.pkl_utils._load函数示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。