This article collects typical usage examples of the Python method sklearn.feature_extraction.FeatureHasher.transform. If you have been wondering what FeatureHasher.transform does, how to call it, and what real code that uses it looks like, the curated examples below should help; for more detail, see the documentation of the class it belongs to, sklearn.feature_extraction.FeatureHasher.
The 15 code examples below are ordered by popularity by default. Note that most of them date from the Python 2 era and older scikit-learn releases: they use print statements, iteritems/imap/xrange, and the non_negative constructor argument, which was removed in scikit-learn 0.21.
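Before diving in, here is a minimal sketch of the API all of these examples revolve around (the width n_features=2 ** 10 is an arbitrary choice for the demo):
from sklearn.feature_extraction import FeatureHasher

# Each sample is an iterable of string tokens; transform() hashes them
# into a fixed-width scipy.sparse CSR matrix with no vocabulary to fit.
hasher = FeatureHasher(n_features=2 ** 10, input_type="string")
X = hasher.transform([["cat", "dog"], ["dog", "fish", "fish"]])
print(X.shape)  # (2, 1024)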
Example 1: __init__
# Required import: from sklearn.feature_extraction import FeatureHasher [as alias]
# Or: from sklearn.feature_extraction.FeatureHasher import transform [as alias]
class QClassifierImpl:
"""
    A wrapper for a question classifier
"""
def __init__(self, train_data_path, pred_qs = None):
"""
Constructor
"""
logging.basicConfig(level = logging.DEBUG,
format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
datefmt='%a, %d %b %Y %H:%M:%S',
filename='qclassifier.log',
filemode='w')
reload(sys)
sys.setdefaultencoding('utf8')
self.clf = None
self.path = train_data_path
self.pred_qs = pred_qs
self.extractor = FeatureExtractor()
self.features = None
self.labels = None
self.vectorizer = None
self.cate = ['Person', 'Number', 'Location', 'Other']
def train(self):
"""
        Train using all of the given data
"""
self.extractor.load(path = self.path)
self.features = self.extractor.extract_features()
self.labels = self.extractor.get_labels()
self.clf = QClassifier(questions = self.extractor.questions)
assert(len(self.labels) == len(self.features))
X = self.features
Y = self.labels
self.vectorizer = FeatureHasher(input_type = 'string', non_negative = True)
X = self.vectorizer.transform(X)
Y = asarray(Y)
logging.info('start training')
self.clf.train(X, Y)
logging.info('done')
def get_type(self, question):
"""
        Get the type of a given question
"""
if not self.features or not self.labels:
logging.error('You need to train model first!')
return None
if not question:
logging.error('Question should not be None')
return None
f = [self.extractor.extract_features_aux(question)]
f = self.vectorizer.transform(f)
# print self.clf.predict(f)
return self.cate[self.clf.predict(f)[0]]
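A side note on this example: non_negative=True was deprecated in scikit-learn 0.19 and removed in 0.21, so the constructor call above fails on current releases. A rough equivalent, sketched under that assumption, is to hash first and strip the signs afterwards:
import numpy as np
from sklearn.feature_extraction import FeatureHasher

vectorizer = FeatureHasher(input_type="string")
X = vectorizer.transform([["what", "city", "is", "this"]])
X.data = np.abs(X.data)  # emulate the old non_negative=True behaviour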
Example 2: HashSarca
# Required import: from sklearn.feature_extraction import FeatureHasher [as alias]
# Or: from sklearn.feature_extraction.FeatureHasher import transform [as alias]
class HashSarca(Sarcalingua):
def __init__(self, nbits=20, model=SGDClassifier(alpha=1e-5, penalty="l1", loss="modified_huber")):
self.featureExtractor = FeatureHasher(pow(2,nbits), input_type="pair")
self.classifier = model
self.outEncoder = LabelEncoder()
self.drop_outs = set(( u"#sarcasm", u"#sarcastic", u"#ironic", u"#irony",
u"#sarcasme", u"#sarcastique", u"#ironie", u"#ironique",
u"uncyclopedia", u"wikipedia"))
def extractFeatures(self, clean_text):
return self.featureExtractor.transform( (token_pattern.finditer(clean_text),) )
def corpusToDataset(self, chunkIterator, column_label, HTML=False, **args):
def prepare(raw_text):
tokens = token_pattern.findall(self.sanitize(raw_text, HTML))
if random.random() < 0.5: # we delete the drop-outs half the time
tokens = [tok for tok in tokens if tok not in self.drop_outs]
try:
alpha = 1./len(tokens) #1./(1+log(len(tokens)))
return ((tok.lower(), alpha) for tok in tokens)
except ZeroDivisionError:
return tuple()
for chunk in chunkIterator:
X = self.featureExtractor.transform(imap(prepare, chunk.text))
y = np.array(self.outEncoder.fit_transform(chunk[column_label]))
yield X,y
gc.collect()
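For reference, a minimal sketch of the input_type="pair" contract this class relies on: each sample is an iterable of (feature, value) tuples, mirroring the (token, alpha) pairs produced by prepare() above. The tokens and weights here are made up:
from sklearn.feature_extraction import FeatureHasher

hasher = FeatureHasher(n_features=2 ** 20, input_type="pair")
X = hasher.transform([[("good", 0.5), ("movie", 0.5)]])  # one sample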
Example 3: process_records
# Required import: from sklearn.feature_extraction import FeatureHasher [as alias]
# Or: from sklearn.feature_extraction.FeatureHasher import transform [as alias]
def process_records(records, fields, target, textmodel=None):
tokenize = CountVectorizer().build_analyzer()
input = None
X = None
y_labels = []
for i, record in enumerate(records):
nums = []
strs = []
y_labels.append(record.get(target))
for field in fields:
if is_number(record.get(field)):
nums.append(record[field])
else:
strs.append(str(record.get(field) or "").lower())
if strs:
if input is None:
input = StringIO.StringIO()
print >> input, " ".join(tokenize(" ".join(strs)))
if nums:
if X is None:
X = sp.lil_matrix((len(records),len(nums)))
X[i] = np.array(nums, dtype=np.float64)
if input is not None:
if X is not None:
X_2 = X.tocsr()
else:
X_2 = None
if isinstance(textmodel,basestring):
if textmodel == 'lsi':
corpus = TextCorpus(input)
textmodel = LsiModel(corpus, chunksize=1000)
elif textmodel == 'tfidf':
corpus = TextCorpus(input)
textmodel = TfidfModel(corpus)
elif textmodel == 'hashing':
textmodel = None
hasher = FeatureHasher(n_features=2 ** 18, input_type="string")
input.seek(0)
X = hasher.transform(tokenize(line.strip()) for line in input)
if textmodel:
num_terms = len(textmodel.id2word or getattr(textmodel, 'dfs',[]))
X = corpus2csc(textmodel[corpus], num_terms).transpose()
if X_2 is not None:
# print >> sys.stderr, "X SHAPE:", X.shape
# print >> sys.stderr, "X_2 SHAPE:", X_2.shape
X = sp.hstack([X, X_2], format='csr')
elif X is not None:
textmodel = None
X = X.tocsr()
print >> sys.stderr, "X SHAPE:", X.shape
return X, y_labels, textmodel
Example 4: io
# Required import: from sklearn.feature_extraction import FeatureHasher [as alias]
# Or: from sklearn.feature_extraction.FeatureHasher import transform [as alias]
def io():
hv = FeatureHasher()
target = []
train_int = []
train_label = []
for iline in dio.io():
iline = iline.strip().split(',')
t = int(iline[0])
int_fs = map(lambda i: numpy.NaN if not i else int(i), iline[1:14])
label_fs = [k for k in iline[14:]]
#label_fs = ",".join(iline[14:])
# print int_fs, label_fs
target.append(t)
train_int.append(int_fs)
train_label.append({k:1 for k in label_fs if k})
# print train_int
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
train_int = imp.fit_transform(train_int)
# print train_int
scaler = preprocessing.StandardScaler().fit(train_int)
train_int = scaler.transform(train_int)
# print train_int
train_int = csr_matrix(train_int)
# print train_label
train_label = hv.transform(train_label)
train = hstack((train_int, train_label))
# print train_label
# print train
return target, train
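The hasher in this example is constructed with defaults, so input_type is "dict" and every entry of train_label is a mapping from feature name to value. A minimal sketch with made-up feature names:
from sklearn.feature_extraction import FeatureHasher

hv = FeatureHasher()  # defaults: input_type="dict", n_features=2 ** 20
X = hv.transform([{"site=a": 1, "device=b": 1}, {"site=c": 1}])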
Example 5: to_ffm
# Required import: from sklearn.feature_extraction import FeatureHasher [as alias]
# Or: from sklearn.feature_extraction.FeatureHasher import transform [as alias]
def to_ffm(df, outfile, ycol, num_columns = []):
df = df.copy()
one_based = True
hasher = FeatureHasher(input_type='string', non_negative=True)
bs = 2**10
value_pattern = u'%d:%d:%.16g'
line_pattern = u'%d %s\n'
with open(outfile, 'w') as out:
pb = progressbar.ProgressBar(maxval=(df.shape[0]+bs+1) // bs).start()
for i in xrange((df.shape[0]+bs+1) // bs):
pb.update(i)
s = slice(i*bs, (i+1)*bs)
if ycol in df.columns:
Xh = np.asarray(df.iloc[s].drop([ycol], axis=1).drop(num_columns,axis=1).astype('str'))
Xv = np.asarray(df.iloc[s][num_columns].astype('float'))
y = df.iloc[s][ycol].values.astype('int')
else:
Xh = np.asarray(df.iloc[s].drop(num_columns,axis=1).astype('str'))
Xv = np.asarray(df.iloc[s][num_columns].astype('float'))
y = np.zeros((bs,))
Xt = scipy.sparse.hstack([Xv,hasher.transform(Xh)]).tocsr()
for j in xrange(Xt.shape[0]):
span = slice(Xt.indptr[j], Xt.indptr[j+1])
row = zip(range(len(Xt.indices[span])), Xt.indices[span], Xt.data[span])
st = " ".join(value_pattern % (j + one_based, fe + one_based, x) for j, fe, x in row if np.isnan(x) == False)
feat = (y[j], st)
out.write((line_pattern % feat).encode('ascii'))
pb.finish()
Example 6: load_conll
# Required import: from sklearn.feature_extraction import FeatureHasher [as alias]
# Or: from sklearn.feature_extraction.FeatureHasher import transform [as alias]
def load_conll(f, features, n_features=(2 ** 16), split=False):
"""Load CoNLL file, extract features on the tokens and hash them.
Parameters
----------
f : {string, file-like}
Input file.
features : callable
Feature extraction function. Must take a list of tokens (see below)
and an index into this list.
n_features : integer, optional
Number of columns in the output.
split : boolean, default=False
Whether to split lines on whitespace beyond what is needed to parse
out the labels. This is useful for CoNLL files that have extra columns
containing information like part of speech tags.
"""
fh = FeatureHasher(n_features=n_features, input_type="string")
labels = []
lengths = []
with _open(f) as f:
raw_X = _conll_sequences(f, features, labels, lengths, split)
X = fh.transform(raw_X)
return X, np.asarray(labels), np.asarray(lengths, dtype=np.int32)
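A hypothetical features callable of the shape this loader expects; the feature names (word=, prev=, BOS) are illustrative, not from the source:
def features(tokens, i):
    """Yield string features for the token at position i."""
    yield "word=" + tokens[i].lower()
    if i > 0:
        yield "prev=" + tokens[i - 1].lower()
    else:
        yield "BOS"  # beginning-of-sequence marker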
Example 7: test_feature_hasher_pairs
# Required import: from sklearn.feature_extraction import FeatureHasher [as alias]
# Or: from sklearn.feature_extraction.FeatureHasher import transform [as alias]
def test_feature_hasher_pairs():
raw_X = (d.iteritems() for d in [{"foo": 1, "bar": 2}, {"baz": 3, "quux": 4, "foo": -1}])
h = FeatureHasher(n_features=16, input_type="pair")
x1, x2 = h.transform(raw_X).toarray()
x1_nz = sorted(np.abs(x1[x1 != 0]))
x2_nz = sorted(np.abs(x2[x2 != 0]))
assert_equal([1, 2], x1_nz)
assert_equal([1, 3, 4], x2_nz)
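The sorted absolute values in these assertions are deliberate: FeatureHasher applies a pseudo-random ±1 sign to each hashed feature to keep hash collisions unbiased, so individual entries can come out negative. A quick check:
import numpy as np
from sklearn.feature_extraction import FeatureHasher

h = FeatureHasher(n_features=16, input_type="pair")
x = h.transform([[("foo", 1), ("bar", 2)]]).toarray()[0]
print(sorted(np.abs(x[x != 0]).tolist()))  # [1.0, 2.0]; signs may differ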
Example 8: test_hash_empty_input
# Required import: from sklearn.feature_extraction import FeatureHasher [as alias]
# Or: from sklearn.feature_extraction.FeatureHasher import transform [as alias]
def test_hash_empty_input():
n_features = 16
raw_X = [[], (), iter(range(0))]
h = FeatureHasher(n_features=n_features, input_type="string")
X = h.transform(raw_X)
assert_array_equal(X.A, np.zeros((len(raw_X), n_features)))
Example 9: ner
# Required import: from sklearn.feature_extraction import FeatureHasher [as alias]
# Or: from sklearn.feature_extraction.FeatureHasher import transform [as alias]
def ner(tokens):
"""Baseline NER tagger for Dutch, based on the CoNLL'02 dataset."""
global _model
X = [_features(tokens, i) for i in range(len(tokens))]
hasher = FeatureHasher(2**16, input_type="string")
return zip(tokens, _model.predict(hasher.transform(X)))
Example 10: hash
# Required import: from sklearn.feature_extraction import FeatureHasher [as alias]
# Or: from sklearn.feature_extraction.FeatureHasher import transform [as alias]
def hash(mat, num_features):
"""
hashing trick
"""
hasher = FeatureHasher(n_features=num_features, non_negative=True)
X = hasher.transform(mat)
X = X.toarray()
return X
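A hedged usage sketch: with the default input_type="dict", mat must be an iterable of feature-name-to-value mappings. Note that toarray() densifies the output, which forfeits the memory savings of hashing once num_features gets large (and non_negative, as noted earlier, no longer exists in current scikit-learn):
# Two samples hashed into 8 dense columns (illustrative values).
X = hash([{"a": 1, "b": 2}, {"a": 3}], num_features=8)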
Example 11: __init__
# Required import: from sklearn.feature_extraction import FeatureHasher [as alias]
# Or: from sklearn.feature_extraction.FeatureHasher import transform [as alias]
class Model:
def __init__(self,numFeatures, learningRate, numEpochs, ppenalty="l1", mustShuffle=True):
#Init scikit models
self.FH = FeatureHasher(n_features=numFeatures, input_type='string')
self.Classifier = SGDClassifier(penalty=ppenalty, loss='log', alpha=learningRate, n_iter = numEpochs, shuffle=mustShuffle)
def train(self, gen, v=False):
i = 0
for x, y in gen: #For each batch
xHash = self.FH.transform(x) #hash trick
y = np.array(y)
## for epoch in range(numEpochs):
self.Classifier.partial_fit(xHash, y, [0,1])
i += len(x)
if v : print(str(datetime.now())[:-7] , "example:", i)
def test(self, gen, v=False):
#init target and prediction arrays
ytot = np.array([])
ptot = np.array([])
#Get prediction for each batch
i = 0
for x,y in gen:
xHash = self.FH.transform(x) #hash trick
p = self.Classifier.predict_proba(xHash)
p = p.T[1].T #Keep column corresponding to probability of class 1
#Stack target and prediction for later analysis
ytot = np.hstack((ytot, y))
ptot = np.hstack((ptot, p))
i += y.shape[0]
if v : print(str(datetime.now())[:-7] , "example:", i)
if v: print("Score:", self.score(ytot, ptot))
return (ytot, ptot)
def predictBatch(self, batch):
hashedBatch = self.FH.transform(batch)
prediction = self.Classifier.predict_proba(hashedBatch)
return prediction
def generatePrediction(self, generator):
for xBatch, idBatch in generator:
prediction = self.predictBatch(xBatch)
yield prediction, idBatch
def score(self, target, prediction):
return llfun(target, prediction)
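A hypothetical driver for this class, assuming generators that yield (list-of-token-lists, labels) batches; every name and hyperparameter below is illustrative, not from the source:
model = Model(numFeatures=2 ** 20, learningRate=1e-4, numEpochs=1)
model.train(train_batches, v=True)       # streaming partial_fit
y_true, y_prob = model.test(test_batches)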
Example 12: test_feature_hasher_pairs_with_string_values
# Required import: from sklearn.feature_extraction import FeatureHasher [as alias]
# Or: from sklearn.feature_extraction.FeatureHasher import transform [as alias]
def test_feature_hasher_pairs_with_string_values():
raw_X = (iter(d.items()) for d in [{"foo": 1, "bar": "a"},
{"baz": u"abc", "quux": 4, "foo": -1}])
h = FeatureHasher(n_features=16, input_type="pair")
x1, x2 = h.transform(raw_X).toarray()
x1_nz = sorted(np.abs(x1[x1 != 0]))
x2_nz = sorted(np.abs(x2[x2 != 0]))
assert_equal([1, 1], x1_nz)
assert_equal([1, 1, 4], x2_nz)
raw_X = (iter(d.items()) for d in [{"bax": "abc"},
{"bax": "abc"}])
x1, x2 = h.transform(raw_X).toarray()
x1_nz = np.abs(x1[x1 != 0])
x2_nz = np.abs(x2[x2 != 0])
assert_equal([1], x1_nz)
assert_equal([1], x2_nz)
assert_equal(x1, x2)
Example 13: load_seq2seq
# Required import: from sklearn.feature_extraction import FeatureHasher [as alias]
# Or: from sklearn.feature_extraction.FeatureHasher import transform [as alias]
def load_seq2seq(f, features, n_features=(2 ** 16)):
fh = FeatureHasher(n_features=n_features, input_type="string")
labels = []
lengths = []
with _open(f) as f:
raw_X = _sequences(f, features, labels, lengths)
X = fh.transform(raw_X)
return X, np.asarray(labels), np.asarray(lengths, dtype=np.int32)
Example 14: ColumnHasherTransformer
# Required import: from sklearn.feature_extraction import FeatureHasher [as alias]
# Or: from sklearn.feature_extraction.FeatureHasher import transform [as alias]
class ColumnHasherTransformer(BaseEstimator, TransformerMixin):
def __init__(self, col):
self.col = col
self.fh = FeatureHasher(n_features=1024, input_type='dict')
def fit(self, X, y=None):
return self
def transform(self, df):
return self.fh.transform(df.loc[:,self.col]\
.apply(lambda x: {x: 1}).values)
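A hypothetical usage sketch (pandas assumed): each cell of the chosen column becomes a one-entry dict, so every distinct value maps to a single indicator among the 1024 hashed columns.
import pandas as pd

df = pd.DataFrame({"color": ["red", "green", "red"]})
X = ColumnHasherTransformer("color").fit(df).transform(df)
print(X.shape)  # (3, 1024)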
Example 15: GraphemeBasedModel
# Required import: from sklearn.feature_extraction import FeatureHasher [as alias]
# Or: from sklearn.feature_extraction.FeatureHasher import transform [as alias]
class GraphemeBasedModel(DiacriticsRestorationModel):
def __init__(self, window=5, input_classes=None):
self.window = window
self.input_classes = input_classes
def train(self, corpus, classes=None, chunk_size=100000):
self.vectorizer = FeatureHasher(non_negative=True,
n_features=len(classes)*2*self.window,
input_type='pair')
self.clf = MultinomialNB()
i = 0
j = 0
X = []
Y = []
for x, y in corpus:
if x[self.window][1] in self.input_classes:
X.append(x)
Y.append(y)
i += 1
if i < chunk_size:
continue
j += 1
click.echo("Running iteration {}".format(j))
X = self.vectorizer.transform(X)
self.clf.partial_fit(X, Y, classes)
X = []
Y = []
i = 0
def restore(self, string):
corpus = []
out = ''
for x, y in string_to_grapheme_corpus(string, self.window):
if x[self.window][1] in self.input_classes:
x = self.vectorizer.transform([x])
out += self.clf.predict(x)[0]
else:
out += y
return out