This article collects typical usage examples of the Python class sklearn.feature_extraction.FeatureHasher. If you have been wondering what FeatureHasher is for, or how to use it in practice, the hand-picked class code examples below may help.
Fifteen code examples of the FeatureHasher class are shown below, sorted by popularity by default.
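As a quick orientation before the examples, here is a minimal sketch of the typical FeatureHasher workflow; the feature names, values and n_features setting below are made up purely for illustration:

from sklearn.feature_extraction import FeatureHasher

# Each sample is a dict of {feature name: numeric value}; FeatureHasher maps
# each name to a column index with a hash function, so no vocabulary is stored.
hasher = FeatureHasher(n_features=2 ** 10, input_type='dict')
X = hasher.transform([{'dog': 1, 'cat': 2}, {'dog': 2, 'run': 5}])
print(X.shape)  # (2, 1024), returned as a scipy.sparse matrix

With input_type='pair' the hasher accepts iterables of (feature, value) tuples, and with input_type='string' it accepts raw strings; most of the examples below use one of these variants.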
Example 1: hash
def hash(data, labels, new_dimension):
    print "start hashing trick..."
    # convert each feature row into a dict of {column index as string: value}
    dictList = list()
    if hasattr(data, "indices"):
        # sparse input: densify, then keep only the non-zero entries per row
        data = data.toarray()
        indices = range(len(data[0]))
        for item in data:
            zipped = zip(indices, item)
            row = dict()
            for index, value in zipped:
                if value != 0:
                    row[str(index)] = value
            dictList.append(row)
    else:
        indices = map(str, range(len(data[0])))
        for row in data:
            dictList.append(dict(zip(indices, row)))

    start = time.time()
    hasher = FeatureHasher(n_features=new_dimension)  # default input_type='dict'
    reduced = hasher.fit_transform(dictList).toarray()
    end = time.time()
    return (reduced, end - start)
Example 2: process_records
def process_records(records, fields, target, textmodel=None):
    tokenize = CountVectorizer().build_analyzer()

    input = None
    X = None
    y_labels = []
    for i, record in enumerate(records):
        nums = []
        strs = []
        y_labels.append(record.get(target))

        for field in fields:
            if is_number(record.get(field)):
                nums.append(record[field])
            else:
                strs.append(str(record.get(field) or "").lower())

        if strs:
            if input is None:
                input = StringIO.StringIO()
            print >> input, " ".join(tokenize(" ".join(strs)))
        if nums:
            if X is None:
                X = sp.lil_matrix((len(records), len(nums)))
            X[i] = np.array(nums, dtype=np.float64)

    if input is not None:
        if X is not None:
            X_2 = X.tocsr()
        else:
            X_2 = None

        if isinstance(textmodel, basestring):
            if textmodel == 'lsi':
                corpus = TextCorpus(input)
                textmodel = LsiModel(corpus, chunksize=1000)
            elif textmodel == 'tfidf':
                corpus = TextCorpus(input)
                textmodel = TfidfModel(corpus)
            elif textmodel == 'hashing':
                textmodel = None
                hasher = FeatureHasher(n_features=2 ** 18, input_type="string")
                input.seek(0)
                X = hasher.transform(tokenize(line.strip()) for line in input)

        if textmodel:
            num_terms = len(textmodel.id2word or getattr(textmodel, 'dfs', []))
            X = corpus2csc(textmodel[corpus], num_terms).transpose()

        if X_2 is not None:
            # print >> sys.stderr, "X SHAPE:", X.shape
            # print >> sys.stderr, "X_2 SHAPE:", X_2.shape
            X = sp.hstack([X, X_2], format='csr')
    elif X is not None:
        textmodel = None
        X = X.tocsr()

    print >> sys.stderr, "X SHAPE:", X.shape

    return X, y_labels, textmodel
Example 3: io
def io():
    hv = FeatureHasher()
    target = []
    train_int = []
    train_label = []
    for iline in dio.io():
        iline = iline.strip().split(',')
        t = int(iline[0])
        int_fs = map(lambda i: numpy.NaN if not i else int(i), iline[1:14])
        label_fs = [k for k in iline[14:]]
        #label_fs = ",".join(iline[14:])
        # print int_fs, label_fs
        target.append(t)
        train_int.append(int_fs)
        train_label.append({k: 1 for k in label_fs if k})

    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    train_int = imp.fit_transform(train_int)

    scaler = preprocessing.StandardScaler().fit(train_int)
    train_int = scaler.transform(train_int)

    train_int = csr_matrix(train_int)
    train_label = hv.transform(train_label)
    train = hstack((train_int, train_label))

    return target, train
Example 4: __init__
class QClassifierImpl:
    """
    A wrapper for a question classifier.
    """

    def __init__(self, train_data_path, pred_qs=None):
        """
        Constructor
        """
        logging.basicConfig(level=logging.DEBUG,
                            format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
                            datefmt='%a, %d %b %Y %H:%M:%S',
                            filename='qclassifier.log',
                            filemode='w')
        reload(sys)
        sys.setdefaultencoding('utf8')
        self.clf = None
        self.path = train_data_path
        self.pred_qs = pred_qs
        self.extractor = FeatureExtractor()
        self.features = None
        self.labels = None
        self.vectorizer = None
        self.cate = ['Person', 'Number', 'Location', 'Other']

    def train(self):
        """
        Train using all of the given data.
        """
        self.extractor.load(path=self.path)
        self.features = self.extractor.extract_features()
        self.labels = self.extractor.get_labels()
        self.clf = QClassifier(questions=self.extractor.questions)
        assert(len(self.labels) == len(self.features))

        X = self.features
        Y = self.labels
        self.vectorizer = FeatureHasher(input_type='string', non_negative=True)
        X = self.vectorizer.transform(X)
        Y = asarray(Y)

        logging.info('start training')
        self.clf.train(X, Y)
        logging.info('done')

    def get_type(self, question):
        """
        Get the type of a given question.
        """
        if not self.features or not self.labels:
            logging.error('You need to train the model first!')
            return None
        if not question:
            logging.error('Question should not be None')
            return None
        f = [self.extractor.extract_features_aux(question)]
        f = self.vectorizer.transform(f)
        return self.cate[self.clf.predict(f)[0]]
Example 5: HashSarca
class HashSarca(Sarcalingua):

    def __init__(self, nbits=20, model=SGDClassifier(alpha=1e-5, penalty="l1", loss="modified_huber")):
        self.featureExtractor = FeatureHasher(pow(2, nbits), input_type="pair")
        self.classifier = model
        self.outEncoder = LabelEncoder()
        self.drop_outs = set((u"#sarcasm", u"#sarcastic", u"#ironic", u"#irony",
                              u"#sarcasme", u"#sarcastique", u"#ironie", u"#ironique",
                              u"uncyclopedia", u"wikipedia"))

    def extractFeatures(self, clean_text):
        return self.featureExtractor.transform((token_pattern.finditer(clean_text),))

    def corpusToDataset(self, chunkIterator, column_label, HTML=False, **args):
        def prepare(raw_text):
            tokens = token_pattern.findall(self.sanitize(raw_text, HTML))
            if random.random() < 0.5:  # we delete the drop-outs half the time
                tokens = [tok for tok in tokens if tok not in self.drop_outs]
            try:
                alpha = 1. / len(tokens)  # 1./(1+log(len(tokens)))
                return ((tok.lower(), alpha) for tok in tokens)
            except ZeroDivisionError:
                return tuple()

        for chunk in chunkIterator:
            X = self.featureExtractor.transform(imap(prepare, chunk.text))
            y = np.array(self.outEncoder.fit_transform(chunk[column_label]))
            yield X, y
            gc.collect()
Example 6: to_ffm
def to_ffm(df, outfile, ycol, num_columns=[]):
    df = df.copy()

    one_based = True
    hasher = FeatureHasher(input_type='string', non_negative=True)
    bs = 2 ** 10

    value_pattern = u'%d:%d:%.16g'
    line_pattern = u'%d %s\n'

    with open(outfile, 'w') as out:
        pb = progressbar.ProgressBar(maxval=(df.shape[0] + bs + 1) // bs).start()
        for i in xrange((df.shape[0] + bs + 1) // bs):
            pb.update(i)
            s = slice(i * bs, (i + 1) * bs)

            if ycol in df.columns:
                Xh = np.asarray(df.iloc[s].drop([ycol], axis=1).drop(num_columns, axis=1).astype('str'))
                Xv = np.asarray(df.iloc[s][num_columns].astype('float'))
                y = df.iloc[s][ycol].values.astype('int')
            else:
                Xh = np.asarray(df.iloc[s].drop(num_columns, axis=1).astype('str'))
                Xv = np.asarray(df.iloc[s][num_columns].astype('float'))
                y = np.zeros((bs,))

            Xt = scipy.sparse.hstack([Xv, hasher.transform(Xh)]).tocsr()

            for j in xrange(Xt.shape[0]):
                span = slice(Xt.indptr[j], Xt.indptr[j + 1])
                row = zip(range(len(Xt.indices[span])), Xt.indices[span], Xt.data[span])
                st = " ".join(value_pattern % (j + one_based, fe + one_based, x)
                              for j, fe, x in row if np.isnan(x) == False)
                feat = (y[j], st)
                out.write((line_pattern % feat).encode('ascii'))
        pb.finish()
Example 7: load_conll
def load_conll(f, features, n_features=(2 ** 16), split=False):
    """Load CoNLL file, extract features on the tokens and hash them.

    Parameters
    ----------
    f : {string, file-like}
        Input file.

    features : callable
        Feature extraction function. Must take a list of tokens (see below)
        and an index into this list.

    n_features : integer, optional
        Number of columns in the output.

    split : boolean, default=False
        Whether to split lines on whitespace beyond what is needed to parse
        out the labels. This is useful for CoNLL files that have extra columns
        containing information like part of speech tags.
    """
    fh = FeatureHasher(n_features=n_features, input_type="string")
    labels = []
    lengths = []

    with _open(f) as f:
        raw_X = _conll_sequences(f, features, labels, lengths, split)
        X = fh.transform(raw_X)

    return X, np.asarray(labels), np.asarray(lengths, dtype=np.int32)
Example 8: test_hash_empty_input
def test_hash_empty_input():
    n_features = 16
    raw_X = [[], (), iter(range(0))]

    h = FeatureHasher(n_features=n_features, input_type="string")
    X = h.transform(raw_X)

    assert_array_equal(X.A, np.zeros((len(raw_X), n_features)))
Example 9: ner
def ner(tokens):
    """Baseline NER tagger for Dutch, based on the CoNLL'02 dataset."""
    global _model
    X = [_features(tokens, i) for i in range(len(tokens))]
    hasher = FeatureHasher(2 ** 16, input_type="string")
    return zip(tokens, _model.predict(hasher.transform(X)))
Example 10: test_feature_hasher_pairs
def test_feature_hasher_pairs():
    raw_X = (d.iteritems() for d in [{"foo": 1, "bar": 2},
                                     {"baz": 3, "quux": 4, "foo": -1}])
    h = FeatureHasher(n_features=16, input_type="pair")
    x1, x2 = h.transform(raw_X).toarray()
    x1_nz = sorted(np.abs(x1[x1 != 0]))
    x2_nz = sorted(np.abs(x2[x2 != 0]))
    assert_equal([1, 2], x1_nz)
    assert_equal([1, 3, 4], x2_nz)
Example 11: hash
def hash(mat, num_features):
    """
    hashing trick
    """
    hasher = FeatureHasher(n_features=num_features, non_negative=True)
    X = hasher.transform(mat)
    X = X.toarray()
    return X
Example 12: test_feature_hasher_dicts
def test_feature_hasher_dicts():
    h = FeatureHasher(n_features=16)
    assert_equal("dict", h.input_type)

    raw_X = [{"dada": 42, "tzara": 37}, {"gaga": 17}]
    X1 = FeatureHasher(n_features=16).transform(raw_X)
    gen = (d.iteritems() for d in raw_X)
    X2 = FeatureHasher(n_features=16, input_type="pair").transform(gen)
    assert_array_equal(X1.toarray(), X2.toarray())
Example 13: load_seq2seq
def load_seq2seq(f, features, n_features=(2 ** 16)):
    fh = FeatureHasher(n_features=n_features, input_type="string")
    labels = []
    lengths = []

    with _open(f) as f:
        raw_X = _sequences(f, features, labels, lengths)
        X = fh.transform(raw_X)

    return X, np.asarray(labels), np.asarray(lengths, dtype=np.int32)
Example 14: encode_titles
def encode_titles(titles, num_features=2**14):
    '''
    Encode titles, given as strings, as numerical feature vectors using
    the 'hashing trick'.

    The size of the feature vector can be specified with the
    num_features parameter.
    '''
    myHasher = FeatureHasher(input_type='string',
                             n_features=num_features,
                             non_negative=True)
    featureMatrix = myHasher.transform(titles)
    return featureMatrix, myHasher
Example 15: hash_features
def hash_features(features, arm_ids, use_id=True):
    n_features = np.shape(features)[1]
    feature_names = [str(x) for x in np.arange(n_features)]
    all_features = []
    for arm_id, feature_set in zip(arm_ids, features):
        temp_features = zip(feature_names, feature_set)
        if use_id == True:
            temp_features.append(("id_" + str(arm_id), 1))
        all_features.append(temp_features)
    f = FeatureHasher(input_type='pair')
    return f.transform(all_features)