This article collects typical usage examples of the Python method sklearn.feature_extraction.FeatureHasher.fit_transform. If you have been wondering what FeatureHasher.fit_transform does and how to use it, the curated code examples below should help. You can also read more about the containing class, sklearn.feature_extraction.FeatureHasher.
Below are 8 code examples of FeatureHasher.fit_transform, ordered by popularity by default.
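As a quick orientation before the examples: FeatureHasher maps feature names to column indices with a hash function (the "hashing trick"), so unlike DictVectorizer it keeps no vocabulary and needs no fitting. A minimal sketch with invented sample dicts:

from sklearn.feature_extraction import FeatureHasher

hasher = FeatureHasher(n_features=16, input_type='dict')
samples = [{'city': 'Oslo', 'temp': 12.0}, {'city': 'Bergen', 'temp': 9.5}]
X = hasher.fit_transform(samples)  # scipy.sparse matrix of shape (2, 16)
print(X.shape)

String values such as 'Oslo' are hashed as one-hot features of the form "city=Oslo", while numeric values are kept as the hashed feature's value.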
Example 1: hash
import time

from sklearn.feature_extraction import FeatureHasher

def hash(data, labels, new_dimension):
    print("start hashing trick...")
    # Convert each row of features into a dict {column index as str: value}.
    dictList = []
    if hasattr(data, "indices"):  # scipy sparse input
        data = data.toarray()
        indices = range(len(data[0]))
        for item in data:
            # Keep only the non-zero entries of the row.
            row = {str(index): value
                   for index, value in zip(indices, item) if value != 0}
            dictList.append(row)
    else:
        indices = list(map(str, range(len(data[0]))))
        for row in data:
            dictList.append(dict(zip(indices, row)))
    start = time.time()
    hasher = FeatureHasher(n_features=new_dimension)  # input_type='dict' is the default
    reduced = hasher.fit_transform(dictList).toarray()
    end = time.time()
    return (reduced, end - start)
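A usage sketch with invented data, reducing 10 dense columns to 4 hashed ones (note that this function name shadows Python's built-in hash within its module):

import numpy as np

data = np.random.rand(5, 10)          # hypothetical dense input
reduced, seconds = hash(data, None, 4)
print(reduced.shape)                  # (5, 4)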
Example 2: hash_array
import numpy as np

from sklearn.feature_extraction import FeatureHasher

def hash_array(feature_dict, feature_num):
    if feature_num == 1:
        # A single feature needs no hashing; just reshape into a column vector.
        x_new = np.asarray(feature_dict)
        x_h = x_new.reshape(len(feature_dict), 1)
    else:
        # non_negative was removed in scikit-learn 0.21; on newer versions
        # alternate_sign=False is the closest replacement.
        hasher = FeatureHasher(n_features=feature_num, non_negative=True, input_type='dict')
        X_new = hasher.fit_transform(feature_dict)
        x_h = X_new.toarray()
        # Exact (collision-free) alternative:
        # DictVectorizer().fit_transform(feature_dict).toarray()
    return x_h
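A usage sketch (the feature dicts are invented): each sample is a dict mapping feature name to value, and the result is a dense array with feature_num columns:

samples = [{'color': 1.0, 'size': 2.0}, {'color': 3.0}]
X = hash_array(samples, feature_num=8)
print(X.shape)  # (2, 8)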
Example 3: MultinomialNB
from sklearn.feature_extraction import FeatureHasher
from sklearn.naive_bayes import MultinomialNB

# data_frame and data_frame_test are pandas DataFrames loaded elsewhere.
labels = data_frame['Category']
pd_frame = data_frame['PdDistrict']
resolution = data_frame['Resolution']
data_frame.drop(['Category'], inplace=True, axis=1)
#training_data = pd.concat([pd_frame, resolution], axis=1)
# DataFrame.as_matrix was removed in pandas 1.0; use .values instead.
training_data = data_frame[['Dates', 'DayOfWeek', 'Address']].values
testing_data = data_frame_test[['Dates', 'DayOfWeek', 'Address']].values
gnb = MultinomialNB(alpha=0)  # alpha=0 disables smoothing (sklearn warns about it)
#gnb = LinearSVC()
print('Made it till here-1')
# non_negative keeps hashed values >= 0, as MultinomialNB requires; it was
# removed in scikit-learn 0.21 (use alternate_sign=False on newer versions).
fh = FeatureHasher(input_type='string', non_negative=True)
X = fh.fit_transform(training_data)
X_test = fh.fit_transform(testing_data)
print('Made it till here-2')
print(training_data.shape)
print('Made it till here-3')
gnb_model = gnb.fit(X, labels)
y_pred = gnb_model.predict(X_test)
print(len(y_pred))
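One detail worth noting: FeatureHasher is stateless (its fit is a no-op), so calling fit_transform on the test set leaks nothing and returns exactly what transform would; transform still states the intent more clearly:

X_test = fh.transform(testing_data)  # identical result, clearer intent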
Example 4: cluster
# Requires: from sklearn.cluster import KMeans
#           from sklearn.feature_extraction import FeatureHasher
#           from sklearn.feature_extraction.text import TfidfTransformer
def cluster(self, dataset):
    """
    Clusters the data provided into the number of
    clusters set by self.numberOfClusters.

    dataset: dict where
        dataset['data'] = data in a 2d array
        dataset['labels'] = labels for each array

    Returns
    -------
    a list of clusters, where a cluster is a
    list of station names
    """
    outputlabels = []  # the set of stations per cluster
    outputdata = []    # list of sets of artists per cluster
    finaloutputdata = []
    hasher = FeatureHasher(input_type="string")
    transformer = TfidfTransformer()
    km = KMeans(n_clusters=self.numberOfClusters, init="k-means++",
                max_iter=10, n_init=1, verbose=0)
    # Keep only the artist name, dropping the artist popularity.
    artistdataset = dataset["data"]
    newartistdataset = []
    for i in range(len(artistdataset)):
        if len(artistdataset[i]) != 0:
            newartistdataset.append(artistdataset[i][0][0])
    print("clustering " + str(len(artistdataset)) + " artists")
    # If the number of artists is not enough, fetch more artists here.
    if len(artistdataset) < self.maximumArtistsToCluster:
        print("we need more artists to cluster")
        self.getMoreArtists(artistdataset)
    datacounts = hasher.fit_transform(newartistdataset)
    # tf-idf weighting disabled because it was too slow:
    # km.fit(transformer.fit_transform(datacounts))
    km.fit(datacounts)
    labeleddata = km.labels_
    # Initialize one bucket per cluster.
    for i in range(len(set(labeleddata))):
        outputlabels.append([])
        outputdata.append([])
    # Route each sample's label and data into its cluster bucket.
    for i in range(len(labeleddata)):
        currentcluster = labeleddata[i]
        outputlabels[currentcluster].append(dataset["labels"][i])
        outputdata[currentcluster].append(dataset["data"][i])
    # Flatten each cluster's artist lists into a deduplicated list.
    for item in outputdata:
        listofartists = []
        for artistlist in item:
            for artist in artistlist:
                listofartists.append(artist)
        finaloutputdata.append(list(set(listofartists)))
    return {"labels": outputlabels, "data": finaloutputdata}
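A caveat that applies here: with input_type="string", FeatureHasher expects each sample to be an iterable of strings, and a plain Python str iterates character by character, so the call above hashes the individual characters of each artist name. To hash each full name as a single token, wrap it in a list; a sketch assuming the same newartistdataset:

datacounts = hasher.fit_transform([[name] for name in newartistdataset])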
Example 5: print
import numpy as np
import pandas as pd

from sklearn.feature_extraction import FeatureHasher

# poke_df and vg_df are pandas DataFrames loaded earlier in the source notebook.

# Effect coding: dummy variables where the dropped category becomes a row of -1s.
gen_onehot_features = pd.get_dummies(poke_df['Generation'])
gen_effect_features = gen_onehot_features.iloc[:, :-1]
gen_effect_features.loc[np.all(gen_effect_features == 0, axis=1)] = -1.
pd.concat([poke_df[['Name', 'Generation']], gen_effect_features], axis=1).iloc[4:10]

# Feature hashing scheme
unique_genres = np.unique(vg_df[['Genre']])
print("Total game genres:", len(unique_genres))
print(unique_genres)

fh = FeatureHasher(n_features=6, input_type='string')
hashed_features = fh.fit_transform(vg_df['Genre'])
hashed_features = hashed_features.toarray()
pd.concat([vg_df[['Name', 'Genre']], pd.DataFrame(hashed_features)], axis=1).iloc[1:7]

fh.get_params()
Author: Zoery · Project: practical-machine-learning-with-python · Lines: 30 · Source file: feature_engineering_categorical.py
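With n_features=6 and more than six distinct genres, collisions are guaranteed by the pigeonhole principle; the alternating sign FeatureHasher applies lets colliding features partially cancel rather than accumulate. A sketch for inspecting which buckets each genre lands in when hashed as a whole token (note the list wrapping, since a bare string would be hashed character by character):

for genre in unique_genres:
    columns = fh.transform([[genre]]).nonzero()[1]
    print(genre, '->', columns)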
Example 6: main
import codecs
import csv
import os
import sys

from sklearn.feature_extraction import FeatureHasher
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

# Example and parse_conll are project-local helpers.

def main():
    storting_csv = sys.argv[1]
    annotations_path = sys.argv[2]
    loc = os.path.dirname(os.path.abspath(__file__))
    stopwords = [w for w
                 in codecs.open(os.path.join(loc, 'stop.txt'),
                                'r', 'utf8').read().split()
                 if not w.startswith('|')]
    csv_reader = csv.DictReader(open(storting_csv))
    examples = []
    #v = DictVectorizer(sparse=False)
    v = FeatureHasher()
    print('Reading speeches and extracting features...')
    for speech in csv_reader:
        if speech['title'] == 'Representant':
            sys.stdout.write(speech['id'])
            sys.stdout.write("\b" * len(speech['id']))
            metadata = {}
            for name in csv_reader.fieldnames:
                if name != 'text':
                    metadata[name] = speech[name]
            label = metadata['party_id']
            example = Example(label, metadata=metadata)
            annotations = codecs.open(os.path.join(annotations_path,
                                                   '%s.tsv' % (speech['id'])),
                                      'r',
                                      'utf8').read()
            sentlengths = []
            for sentence in parse_conll(annotations):
                sentlengths.append(float(len(sentence)))
                for token in sentence:
                    if token[1] not in stopwords:
                        #example.add_feature('#token:' + token[1])
                        example.add_feature('#lemma-pos:%s-%s' % (token[2], token[3]))
            average_sent_length = sum(sentlengths) / len(sentlengths)
            example.add_feature('#avg-s-length:%s' % (average_sent_length))
            examples.append(example)
    print()
    print('Done!')
    print('Vectorizing...')
    X = v.fit_transform([e.features for e in examples])
    print('Done!')
    print('Tfidf weighting...')
    t = TfidfTransformer()
    X = t.fit_transform(X)
    print('Done!')
    print('Binning vectors...')
    parties = {}
    for e, x in zip(examples, X):
        if e.label not in parties:
            parties[e.label] = {}
        year = int(e.metadata['date'].split('-')[0])
        if year not in parties[e.label]:
            parties[e.label][year] = []
        parties[e.label][year].append(x)
    print('Done!')
    results = {}
    for p in tqdm(parties, desc='Computing similarities:'):
        results[p] = {}
        for year in tqdm(parties[p], desc=p):
            results[p][year] = []
            for i, x in enumerate(tqdm(parties[p][year], desc=str(year))):
                for j, y in enumerate(parties[p][year]):
                    if j != i:
                        score = cosine_similarity(x, y)[0][0]
                        results[p][year].append(score)
    print('Done!')
    print('Saving results...')
    na_counter = 0
    for p in results:
        if not p:
            out = open('na_%s' % (na_counter) + '.out', 'w')
            na_counter += 1
        else:
            out = open(p + '.out', 'w')
        years = sorted(results[p].keys())
        for y in years:
            try:
                avg = sum(results[p][y]) / len(results[p][y])
            except ZeroDivisionError:
                avg = 0
            out.write("%s\t%s\n" % (y, avg))
        out.close()
#......... part of the code omitted here .........
Example 7: len
feature["FromTimestamp"] = click[1]
feature["ToTimestamp"] = 0
feature["ItemId"] = click[2]
feature["Category"] = click[3]
feature["Price"] = 0
feature["Quantitiy"] = 0
X.append(feature)
sys.stderr.write("\rProgress:%.2f%%" % (100. * i / len(clicks)))
# make dictvect
print "make dict vect"
v = DictVectorizer()
X_dict_sparse = v.fit_transform(X)
X_dict = [zip(map(str, row.indices), row.data) for row in X_dict_sparse]
# Feature Hashing
print "Feature Hashing"
n_features = 2**24
hasher = FeatureHasher(n_features=n_features, input_type='pair')
X_hash_sparse = hasher.fit_transform(X_dict)
X_hash = [zip(row.indices, row.data) for row in X_hash_sparse]
# make libsvm data
with open("./data/yoochoose-train.dat", "w") as f:
for val, features in zip(c, X_hash):
features_list = []
for feature in features:
features_list.append(str(feature[0]) + ":" + str(feature[1]))
features_line = " ".join(features_list)
f.write(str(val)+" "+features_line+"\n")
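Since input_type='dict' is FeatureHasher's default and accepts raw feature dicts directly, the DictVectorizer round-trip could arguably be skipped; a sketch, assuming the same X list of dicts as above (the hashed column positions will differ, since feature names rather than DictVectorizer index strings get hashed):

hasher = FeatureHasher(n_features=2**24)   # input_type='dict' by default
X_hash_sparse = hasher.fit_transform(X)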
Example 8: read_training_file
import pandas as pd

from sklearn import linear_model
from sklearn.feature_extraction import FeatureHasher

# read_training_file is a project-local helper that returns a pandas DataFrame.

if __name__ == '__main__':
    data_frame = read_training_file('/Users/prateek.jain/work/datasets/kaggle-competition/sf-crime/train.csv')
    labels = data_frame['Category']
    pd_frame = data_frame['PdDistrict']
    resolution = data_frame['Resolution']
    data_frame.drop(['Category'], inplace=True, axis=1)
    training_data = pd.concat([pd_frame, resolution], axis=1)
    # Overwrites the line above; DataFrame.as_matrix was removed in
    # pandas 1.0, so .values is used instead.
    training_data = data_frame[['PdDistrict', 'Address']].values
    regr = linear_model.LinearRegression()
    #gnb = LinearSVC()
    print('Made it till here-1')
    # non_negative was removed in scikit-learn 0.21 (use alternate_sign=False).
    fh = FeatureHasher(input_type='string', non_negative=True)
    X = fh.fit_transform(training_data)
    fhy = FeatureHasher(input_type='string', non_negative=True)
    Y = fhy.fit_transform(labels)
    knn_prediction = regr.fit(X, Y)  # fit returns the estimator itself
    print(regr.coef_)
    prediction = regr.predict(X)
    print(regr.score(X, prediction))
    print('Made it till here-2')
    print(prediction)
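Hashing the target is unusual: regressing onto hashed labels has no meaningful inverse mapping, and score(X, prediction) evaluates predictions against themselves. For a categorical target like Category, the conventional route is a label encoding plus a classifier; a sketch, reusing X from the hasher above:

from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y = le.fit_transform(labels)               # one integer per crime category
clf = MultinomialNB().fit(X, y)
decoded = le.inverse_transform(clf.predict(X))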