

Python FeatureHasher.fit_transform Method Code Examples

This article collects typical usage examples of the Python method sklearn.feature_extraction.FeatureHasher.fit_transform, gathered from open-source projects. If you are wondering what FeatureHasher.fit_transform does and how to use it in practice, the curated examples below should help. You can also explore further usage examples of sklearn.feature_extraction.FeatureHasher, the class this method belongs to.


Eight code examples of the FeatureHasher.fit_transform method are shown below, ordered by popularity by default.
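Before the collected examples, here is a minimal, self-contained sketch of the method's basic usage (the feature names and values are illustrative toy data):

from sklearn.feature_extraction import FeatureHasher

# Two samples, each a mapping of feature name -> value (toy data).
samples = [
    {"dog": 1, "cat": 2, "elephant": 4},
    {"dog": 2, "run": 5},
]

hasher = FeatureHasher(n_features=10, input_type="dict")
X = hasher.fit_transform(samples)  # returns a scipy.sparse matrix
print(X.shape)      # (2, 10)
print(X.toarray())  # signed hashing: some entries may be negative

Because the hasher keeps no vocabulary, fit is a no-op and fit_transform is equivalent to transform.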

Example 1: hash

# Required import: from sklearn.feature_extraction import FeatureHasher [as alias]
# Or: from sklearn.feature_extraction.FeatureHasher import fit_transform [as alias]
def hash(data, labels, new_dimension):
    # Note: also requires `import time`; the function name shadows the
    # built-in hash(), kept as in the source.
    print("start hashing trick...")
    # Convert each row of features into a {column_index: value} dict.
    dictList = list()
    if hasattr(data, "indices"):  # scipy sparse input: densify first
        data = data.toarray()
        indices = range(len(data[0]))
        for item in data:
            row = dict()
            for index, value in zip(indices, item):
                if value != 0:  # keep only non-zero entries
                    row[str(index)] = value
            dictList.append(row)
    else:
        # Use a list (not a map iterator) so it can be re-zipped for every row.
        indices = [str(i) for i in range(len(data[0]))]
        for row in data:
            dictList.append(dict(zip(indices, row)))

    start = time.time()
    hasher = FeatureHasher(n_features=new_dimension)  # input_type='dict' is the default
    reduced = hasher.fit_transform(dictList).toarray()
    end = time.time()
    return (reduced, end - start)
Author: sebastian-alfers, Project: master-thesis, Lines: 30, Source: dimensionality_reduction.py
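A hypothetical call to the helper above (toy dense input; assumes import time, import numpy as np, and the FeatureHasher import are in place):

import numpy as np

data = np.array([[1.0, 0.0, 3.0],
                 [0.0, 2.0, 0.0]])
reduced, seconds = hash(data, labels=None, new_dimension=4)
print(reduced.shape)  # (2, 4)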

Example 2: hash_array

# Required import: from sklearn.feature_extraction import FeatureHasher [as alias]
# Or: from sklearn.feature_extraction.FeatureHasher import fit_transform [as alias]
def hash_array(feature_dict, feature_num):
    if feature_num == 1:
        # A single feature: just reshape into a column vector (needs numpy as np).
        x_new = np.asarray(feature_dict)
        x_h = x_new.reshape(len(feature_dict), 1)
    else:
        # non_negative was deprecated in scikit-learn 0.19 and removed in 0.21;
        # on newer versions drop it (see the note after this example).
        hasher = FeatureHasher(n_features=feature_num, non_negative=True, input_type='dict')
        X_new = hasher.fit_transform(feature_dict)
        x_h = X_new.toarray()
        # Alternative without hashing:
        # vec = DictVectorizer()
        # x_h = vec.fit_transform(feature_dict).toarray()
    return x_h
Author: Nikhil112, Project: CTR_prediction, Lines: 15, Source: nn_CTR.py
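On scikit-learn >= 0.21 the non_negative argument no longer exists. A rough modern equivalent, sketched under the assumption that the goal was simply non-negative outputs for a downstream estimator, is to disable sign alternation and take absolute values:

import numpy as np
from sklearn.feature_extraction import FeatureHasher

hasher = FeatureHasher(n_features=8, input_type='dict', alternative_sign=False)
X = hasher.fit_transform([{'a': 1, 'b': 2}, {'b': 3, 'c': 1}]).toarray()
X = np.abs(X)  # mirrors the old non_negative=True, which took absolute values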

Example 3: MultinomialNB

# Required import: from sklearn.feature_extraction import FeatureHasher [as alias]
# Or: from sklearn.feature_extraction.FeatureHasher import fit_transform [as alias]
    labels = data_frame['Category']
    pd_frame = data_frame['PdDistrict']
    resolution = data_frame['Resolution']
    data_frame.drop(['Category'], inplace=True, axis=1)
    #training_data = pd.concat([pd_frame, resolution], axis=1)
    # as_matrix() was removed in pandas 1.0; on newer pandas use
    # data_frame[['Dates', 'DayOfWeek', 'Address']].to_numpy() instead.
    training_data = data_frame.as_matrix(['Dates', 'DayOfWeek', 'Address'])
    testing_data = data_frame_test.as_matrix(['Dates', 'DayOfWeek', 'Address'])

    # alpha=0 disables smoothing; scikit-learn warns and clips it to a tiny value.
    gnb = MultinomialNB(alpha=0)
    #gnb = LinearSVC()

    print('Made it till here-1')
    # FeatureHasher is stateless, so hashing train and test separately still
    # yields consistent columns; non_negative requires scikit-learn < 0.21.
    fh = FeatureHasher(input_type='string', non_negative=True)
    X = fh.fit_transform(training_data)
    X_test = fh.fit_transform(testing_data)

    print('Made it till here-2')
    print(training_data.shape)

    #print(X.toarray())
    print('Made it till here-3')

    gnb_model = gnb.fit(X, labels)
    y_pred = gnb_model.predict(X_test)

    print(len(y_pred))

    #for actual, predicted in zip(labels, y_pred):
Author: jainprateek, Project: MachineLearning, Lines: 32, Source: crime_prediction_classification.py
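The back-to-back fit_transform calls above are safe because FeatureHasher keeps no fitted state; a minimal standalone sketch of that property:

from sklearn.feature_extraction import FeatureHasher

fh = FeatureHasher(n_features=8, input_type='string')
X_train = fh.fit_transform([['a', 'b'], ['b', 'c']])
X_test = fh.transform([['a', 'c']])  # same hashed columns, no refitting needed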

Example 4: cluster

# Required import: from sklearn.feature_extraction import FeatureHasher [as alias]
# Or: from sklearn.feature_extraction.FeatureHasher import fit_transform [as alias]
    def cluster(self, dataset):
        """
        Clusters the provided data into the number of clusters
        set by self.numberOfClusters.

        dataset: dict where
            dataset['data'] = data in a 2d array
            dataset['labels'] = a label for each row

        Returns
        -------
        A dict with 'labels' (a list of station names per cluster)
        and 'data' (a deduplicated list of artists per cluster).
        """
        outputlabels = []  # the set of stations per cluster
        outputdata = []  # list of sets of artists per cluster
        finaloutputdata = []

        # Note: with input_type="string", each sample must be an iterable of
        # strings; a bare artist-name string is hashed character by character.
        hasher = FeatureHasher(input_type="string")
        transformer = TfidfTransformer()
        km = KMeans(n_clusters=self.numberOfClusters, init="k-means++", max_iter=10, n_init=1, verbose=0)

        # Edit the dataset so that it contains only the artist name and not
        # the artist popularity.
        artistdataset = dataset["data"]

        newartistdataset = []
        for i in range(0, len(artistdataset)):
            if len(artistdataset[i]) != 0:
                newartistdataset.append(artistdataset[i][0][0])

        print("clustering " + str(len(artistdataset)) + " artists")

        # If the number of artists is not enough, get more artists here.
        if len(artistdataset) < self.maximumArtistsToCluster:
            print("we need more artists to cluster")
            self.getMoreArtists(artistdataset)

        datacounts = hasher.fit_transform(newartistdataset)
        # tfidfcounts = transformer.fit_transform(datacounts)

        # tf-idf weighting disabled because it was too slow
        # km.fit(tfidfcounts)
        km.fit(datacounts)

        labeleddata = km.labels_

        # Initialise one output bucket per cluster.
        for i in range(0, len(set(labeleddata))):
            outputlabels.append([])
            outputdata.append([])

        # Add each row's label and data to its cluster's bucket.
        for i in range(0, len(labeleddata)):
            currentcluster = labeleddata[i]
            outputlabels[currentcluster].append(dataset["labels"][i])
            outputdata[currentcluster].append(dataset["data"][i])

        # Change the artist lists to deduplicated artist sets.
        for item in outputdata:
            listofartists = []

            for artistlist in item:
                for artist in artistlist:
                    listofartists.append(artist)

            finaloutputdata.append(list(set(listofartists)))

        return {"labels": outputlabels, "data": finaloutputdata}
Author: JimiLab, Project: meuse_backend, Lines: 73, Source: cluster_module.py
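A standalone sketch (toy artist name, not from the project) of the input_type="string" subtlety flagged in the comment above:

from sklearn.feature_extraction import FeatureHasher

h = FeatureHasher(n_features=8, input_type="string")
per_char = h.fit_transform(["Radiohead"])    # bare string: hashed character by character
per_name = h.fit_transform([["Radiohead"]])  # list-wrapped: hashed as one whole token
print(per_char.toarray())
print(per_name.toarray())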

Example 5: print

# Required import: from sklearn.feature_extraction import FeatureHasher [as alias]
# Or: from sklearn.feature_extraction.FeatureHasher import fit_transform [as alias]
gen_onehot_features = pd.get_dummies(poke_df['Generation'])
gen_effect_features = gen_onehot_features.iloc[:,:-1]
gen_effect_features.loc[np.all(gen_effect_features == 0, axis=1)] = -1.
pd.concat([poke_df[['Name', 'Generation']], gen_effect_features], axis=1).iloc[4:10]


# ## Feature Hashing scheme

# In[19]:

unique_genres = np.unique(vg_df[['Genre']])
print("Total game genres:", len(unique_genres))
print(unique_genres)


# In[20]:

from sklearn.feature_extraction import FeatureHasher

fh = FeatureHasher(n_features=6, input_type='string')
# Each genre value is a bare string, so it is hashed character by character;
# wrap values in lists (e.g. [[g] for g in vg_df['Genre']]) to hash whole genres.
hashed_features = fh.fit_transform(vg_df['Genre'])
hashed_features = hashed_features.toarray()
pd.concat([vg_df[['Name', 'Genre']], pd.DataFrame(hashed_features)], axis=1).iloc[1:7]


# In[21]:

fh.get_params()

Author: Zoery, Project: practical-machine-learning-with-python, Lines: 30, Source: feature_engineering_categorical.py
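With only six output columns, distinct tokens inevitably collide; a small standalone sketch of that effect (toy genre names, not from the book):

from sklearn.feature_extraction import FeatureHasher

fh_small = FeatureHasher(n_features=2, input_type='string')
rows = fh_small.fit_transform([['Action'], ['Adventure'], ['Puzzle']])
# Three tokens in two columns: at least two tokens share a column, and the
# signed hash means colliding tokens can partially cancel each other.
print(rows.toarray())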

Example 6: main

# Required import: from sklearn.feature_extraction import FeatureHasher [as alias]
# Or: from sklearn.feature_extraction.FeatureHasher import fit_transform [as alias]
def main():
    storting_csv = sys.argv[1]
    annotations_path = sys.argv[2]

    loc = os.path.dirname(os.path.abspath(__file__))
    stopwords = [w for w
                 in codecs.open(os.path.join(loc, 'stop.txt'),
                                'r', 'utf8').read().split()
                 if not w.startswith('|')]

    csv_reader = csv.DictReader(open(storting_csv))

    examples = []

    #v = DictVectorizer(sparse=False)
    v = FeatureHasher()  # default input_type='dict' consumes each Example's feature dict

    print('Reading speeches and extracting features...')
    for speech in csv_reader:
        if speech['title'] == 'Representant':
            sys.stdout.write(speech['id'])
            sys.stdout.write("\b" * len(speech['id']))
            metadata = {}
            for name in csv_reader.fieldnames:
                if name != 'text':
                    metadata[name] = speech[name]

            label = metadata['party_id']
            # Example and parse_conll are project-local helpers (not shown here).
            example = Example(label, metadata=metadata)

            annotations = codecs.open(os.path.join(annotations_path,
                                                    '%s.tsv' % (speech['id'])),
                                                                'r',
                                                                'utf8').read()

            sentlengths = []
            for sentence in parse_conll(annotations):
                sentlengths.append(float(len(sentence)))
                for token in sentence:
                    if token[1] not in stopwords:
                        #example.add_feature('#token:' + token[1])
                        example.add_feature('#lemma-pos:%s-%s' % (token[2], token[3]))

            average_sent_length = sum(sentlengths) / len(sentlengths)
            # Note: embedding the value in the feature name yields a distinct
            # hashed feature per distinct average, as in the original.
            example.add_feature('#avg-s-length:%s' % (average_sent_length))
            examples.append(example)

    print()
    print('Done!')
    print('Vectorizing...')
    X = v.fit_transform([e.features for e in examples])
    print('Done!')
    print('Tfidf weighting...')
    t = TfidfTransformer()
    X = t.fit_transform(X)
    print('Done!')

    print('Binning vectors...')
    parties = {}
    for e, x in zip(examples, X):
        if e.label not in parties:
            parties[e.label] = {}
        year = int(e.metadata['date'].split('-')[0])
        if year not in parties[e.label]:
            parties[e.label][year] = []
        parties[e.label][year].append(x)
    print('Done!')

    # for p in parties:
    #     print sorted(parties[p].keys())

    results = {}

    for p in tqdm(parties, desc='Computing similarities:'):
        results[p] = {}
        for year in tqdm(parties[p], desc=p):
            results[p][year] = []
            for i, x in enumerate(tqdm(parties[p][year], desc=str(year))):
                for j, y in enumerate(parties[p][year]):
                    if j != i:
                        # cosine_similarity comes from sklearn.metrics.pairwise
                        score = cosine_similarity(x, y)[0][0]
                        results[p][year].append(score)
    print('Done!')

    print('Saving results...')
    na_counter = 0
    for p in results:
        if not p:
            out = open('na_%s' % (na_counter) + '.out', 'w')
            na_counter += 1
        else:
            out = open(p + '.out', 'w')
        years = sorted(results[p].keys())
        for y in years:
            try:
                avg = sum(results[p][y]) / len(results[p][y])
            except ZeroDivisionError:
                avg = 0
            out.write("%s\t%s\n" % (y, avg))
        out.close()
#......... some code omitted .........
Author: emanlapponi, Project: storting, Lines: 103, Source: sim_test.py
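The Example class above is project-local and not shown; a minimal stand-in consistent with how the snippet uses it (an assumption, not the project's actual code) could be:

from collections import Counter

class Example:
    # Hypothetical stand-in: a label, optional metadata, and a
    # feature -> count mapping that FeatureHasher's default
    # input_type='dict' consumes directly.
    def __init__(self, label, metadata=None):
        self.label = label
        self.metadata = metadata or {}
        self.features = Counter()

    def add_feature(self, name):
        self.features[name] += 1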

Example 7: len

# Required import: from sklearn.feature_extraction import FeatureHasher [as alias]
# Or: from sklearn.feature_extraction.FeatureHasher import fit_transform [as alias]
        feature["FromTimestamp"] = click[1]
        feature["ToTimestamp"] = 0
        feature["ItemId"] = click[2]
        feature["Category"] = click[3]
        feature["Price"] = 0
        feature["Quantitiy"] = 0
        X.append(feature)
    sys.stderr.write("\rProgress:%.2f%%" % (100. * i / len(clicks)))

# make dictvect
print "make dict vect"
v = DictVectorizer()
X_dict_sparse = v.fit_transform(X)
X_dict = [zip(map(str, row.indices), row.data) for row in X_dict_sparse]

# Feature Hashing
print "Feature Hashing"
n_features = 2**24
hasher = FeatureHasher(n_features=n_features, input_type='pair')
X_hash_sparse = hasher.fit_transform(X_dict)
X_hash = [zip(row.indices, row.data) for row in X_hash_sparse]

# make libsvm data
with open("./data/yoochoose-train.dat", "w") as f:
    for val, features in zip(c, X_hash):
        features_list = []
        for feature in features:
            features_list.append(str(feature[0]) + ":" + str(feature[1]))
        features_line = " ".join(features_list)
        f.write(str(val)+" "+features_line+"\n")
Author: eliethesaiyan, Project: Recsys2015, Lines: 32, Source: make_dataset.py
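A minimal standalone illustration of the 'pair' input type used above (the feature names are toy assumptions):

from sklearn.feature_extraction import FeatureHasher

hasher = FeatureHasher(n_features=16, input_type='pair')
X = hasher.fit_transform([[('ItemId', 3.0), ('Price', 9.99)],
                          [('Category', 1.0)]])
print(X.shape)  # (2, 16)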

Example 8: read_training_file

# Required import: from sklearn.feature_extraction import FeatureHasher [as alias]
# Or: from sklearn.feature_extraction.FeatureHasher import fit_transform [as alias]
if __name__ == '__main__':

    data_frame = read_training_file('/Users/prateek.jain/work/datasets/kaggle-competition/sf-crime/train.csv')

    labels = data_frame['Category']
    pd_frame = data_frame['PdDistrict']
    resolution = data_frame['Resolution']
    data_frame.drop(['Category'], inplace=True, axis=1)
    training_data = pd.concat([pd_frame, resolution], axis=1)
    # The line below overwrites the concat above, as in the original source.
    # as_matrix() was removed in pandas 1.0; use .to_numpy() on newer pandas.
    training_data = data_frame.as_matrix(['PdDistrict', 'Address'])
    regr = linear_model.LinearRegression()
    #gnb = LinearSVC()

    print('Made it till here-1')
    fh = FeatureHasher(input_type='string', non_negative=True)
    X = fh.fit_transform(training_data)

    # Hashing the string labels into a regression target is unusual; see the
    # note after this example. non_negative requires scikit-learn < 0.21.
    fhy = FeatureHasher(input_type='string', non_negative=True)
    Y = fhy.fit_transform(labels)

    knn_prediction = regr.fit(X, Y)
    print(regr.coef_)
    prediction = regr.predict(X)
    # Caution: scoring the model against its own predictions is circular and
    # reports a perfect R^2.
    print(regr.score(X, prediction))
    print('Made it till here-2')
    print(prediction)

    #print(X.toarray())
    #print('Made it till here-3')
Author: jainprateek, Project: MachineLearning, Lines: 32, Source: crime_prediction.py
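As an aside (not part of the original project): the conventional way to encode a categorical target like these crime categories is LabelEncoder rather than hashing:

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y = le.fit_transform(labels)  # one integer per category, invertible via le.inverse_transform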


Note: The sklearn.feature_extraction.FeatureHasher.fit_transform examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from community-contributed open-source projects, and the copyright of each fragment remains with its original author; consult the corresponding project's license before distributing or reusing the code, and do not republish without permission.