This article collects typical usage examples of the Python class sklearn.feature_extraction.FeatureHasher. If you have been wondering what FeatureHasher is for, or how to use it in practice, the hand-picked class code examples below may help.
Fifteen code examples of the FeatureHasher class are shown below, sorted by popularity by default.
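As a quick orientation before the examples, here is a minimal sketch of the typical FeatureHasher workflow; the feature names, values and n_features setting below are made up purely for illustration:

from sklearn.feature_extraction import FeatureHasher

# Each sample is a dict of {feature name: numeric value}; FeatureHasher maps
# each name to a column index with a hash function, so no vocabulary is stored.
hasher = FeatureHasher(n_features=2 ** 10, input_type='dict')
X = hasher.transform([{'dog': 1, 'cat': 2}, {'dog': 2, 'run': 5}])
print(X.shape)  # (2, 1024), returned as a scipy.sparse matrix

With input_type='pair' the hasher accepts iterables of (feature, value) tuples, and with input_type='string' it accepts raw strings; most of the examples below use one of these variants.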
Example 1: hash
def hash(data, labels, new_dimension):
    print "start hashing trick..."
    # convert each feature row into a dict of {column index as string: value}
    dictList = list()
    if hasattr(data, "indices"):
        # sparse input: densify, then keep only the non-zero entries per row
        data = data.toarray()
        indices = range(len(data[0]))
        for item in data:
            zipped = zip(indices, item)
            row = dict()
            for index, value in zipped:
                if value != 0:
                    row[str(index)] = value
            dictList.append(row)
    else:
        indices = map(str, range(len(data[0])))
        for row in data:
            dictList.append(dict(zip(indices, row)))

    start = time.time()
    hasher = FeatureHasher(n_features=new_dimension)  # default input_type='dict'
    reduced = hasher.fit_transform(dictList).toarray()
    end = time.time()
    return (reduced, end - start)
Example 2: process_records
def process_records(records, fields, target, textmodel=None):
    tokenize = CountVectorizer().build_analyzer()

    input = None
    X = None
    y_labels = []
    for i, record in enumerate(records):
        nums = []
        strs = []
        y_labels.append(record.get(target))

        for field in fields:
            if is_number(record.get(field)):
                nums.append(record[field])
            else:
                strs.append(str(record.get(field) or "").lower())

        if strs:
            if input is None:
                input = StringIO.StringIO()
            print >> input, " ".join(tokenize(" ".join(strs)))
        if nums:
            if X is None:
                X = sp.lil_matrix((len(records), len(nums)))
            X[i] = np.array(nums, dtype=np.float64)

    if input is not None:
        if X is not None:
            X_2 = X.tocsr()
        else:
            X_2 = None

        if isinstance(textmodel, basestring):
            if textmodel == 'lsi':
                corpus = TextCorpus(input)
                textmodel = LsiModel(corpus, chunksize=1000)
            elif textmodel == 'tfidf':
                corpus = TextCorpus(input)
                textmodel = TfidfModel(corpus)
            elif textmodel == 'hashing':
                textmodel = None
                hasher = FeatureHasher(n_features=2 ** 18, input_type="string")
                input.seek(0)
                X = hasher.transform(tokenize(line.strip()) for line in input)

        if textmodel:
            num_terms = len(textmodel.id2word or getattr(textmodel, 'dfs', []))
            X = corpus2csc(textmodel[corpus], num_terms).transpose()

        if X_2 is not None:
            # print >> sys.stderr, "X SHAPE:", X.shape
            # print >> sys.stderr, "X_2 SHAPE:", X_2.shape
            X = sp.hstack([X, X_2], format='csr')
    elif X is not None:
        textmodel = None
        X = X.tocsr()

    print >> sys.stderr, "X SHAPE:", X.shape

    return X, y_labels, textmodel
Example 3: io
def io():
    hv = FeatureHasher()
    target = []
    train_int = []
    train_label = []
    for iline in dio.io():
        iline = iline.strip().split(',')
        t = int(iline[0])
        int_fs = map(lambda i: numpy.NaN if not i else int(i), iline[1:14])
        label_fs = [k for k in iline[14:]]
        #label_fs = ",".join(iline[14:])
        # print int_fs, label_fs
        target.append(t)
        train_int.append(int_fs)
        train_label.append({k: 1 for k in label_fs if k})

    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    train_int = imp.fit_transform(train_int)

    scaler = preprocessing.StandardScaler().fit(train_int)
    train_int = scaler.transform(train_int)

    train_int = csr_matrix(train_int)
    train_label = hv.transform(train_label)
    train = hstack((train_int, train_label))

    return target, train
Example 4: __init__
class QClassifierImpl:
    """
    A wrapper for a question classifier.
    """

    def __init__(self, train_data_path, pred_qs=None):
        """
        Constructor
        """
        logging.basicConfig(level=logging.DEBUG,
                            format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
                            datefmt='%a, %d %b %Y %H:%M:%S',
                            filename='qclassifier.log',
                            filemode='w')
        reload(sys)
        sys.setdefaultencoding('utf8')
        self.clf = None
        self.path = train_data_path
        self.pred_qs = pred_qs
        self.extractor = FeatureExtractor()
        self.features = None
        self.labels = None
        self.vectorizer = None
        self.cate = ['Person', 'Number', 'Location', 'Other']

    def train(self):
        """
        Train using all of the given data.
        """
        self.extractor.load(path=self.path)
        self.features = self.extractor.extract_features()
        self.labels = self.extractor.get_labels()
        self.clf = QClassifier(questions=self.extractor.questions)
        assert(len(self.labels) == len(self.features))

        X = self.features
        Y = self.labels
        self.vectorizer = FeatureHasher(input_type='string', non_negative=True)
        X = self.vectorizer.transform(X)
        Y = asarray(Y)

        logging.info('start training')
        self.clf.train(X, Y)
        logging.info('done')

    def get_type(self, question):
        """
        Get the type of a given question.
        """
        if not self.features or not self.labels:
            logging.error('You need to train the model first!')
            return None
        if not question:
            logging.error('Question should not be None')
            return None
        f = [self.extractor.extract_features_aux(question)]
        f = self.vectorizer.transform(f)
        return self.cate[self.clf.predict(f)[0]]
Example 5: HashSarca
class HashSarca(Sarcalingua):

    def __init__(self, nbits=20, model=SGDClassifier(alpha=1e-5, penalty="l1", loss="modified_huber")):
        self.featureExtractor = FeatureHasher(pow(2, nbits), input_type="pair")
        self.classifier = model
        self.outEncoder = LabelEncoder()
        self.drop_outs = set((u"#sarcasm", u"#sarcastic", u"#ironic", u"#irony",
                              u"#sarcasme", u"#sarcastique", u"#ironie", u"#ironique",
                              u"uncyclopedia", u"wikipedia"))

    def extractFeatures(self, clean_text):
        return self.featureExtractor.transform((token_pattern.finditer(clean_text),))

    def corpusToDataset(self, chunkIterator, column_label, HTML=False, **args):
        def prepare(raw_text):
            tokens = token_pattern.findall(self.sanitize(raw_text, HTML))
            if random.random() < 0.5:  # we delete the drop-outs half the time
                tokens = [tok for tok in tokens if tok not in self.drop_outs]
            try:
                alpha = 1. / len(tokens)  # 1./(1+log(len(tokens)))
                return ((tok.lower(), alpha) for tok in tokens)
            except ZeroDivisionError:
                return tuple()

        for chunk in chunkIterator:
            X = self.featureExtractor.transform(imap(prepare, chunk.text))
            y = np.array(self.outEncoder.fit_transform(chunk[column_label]))
            yield X, y
            gc.collect()
Example 6: to_ffm
def to_ffm(df, outfile, ycol, num_columns=[]):
    df = df.copy()

    one_based = True
    hasher = FeatureHasher(input_type='string', non_negative=True)
    bs = 2 ** 10

    value_pattern = u'%d:%d:%.16g'
    line_pattern = u'%d %s\n'

    with open(outfile, 'w') as out:
        pb = progressbar.ProgressBar(maxval=(df.shape[0] + bs + 1) // bs).start()
        for i in xrange((df.shape[0] + bs + 1) // bs):
            pb.update(i)
            s = slice(i * bs, (i + 1) * bs)

            if ycol in df.columns:
                Xh = np.asarray(df.iloc[s].drop([ycol], axis=1).drop(num_columns, axis=1).astype('str'))
                Xv = np.asarray(df.iloc[s][num_columns].astype('float'))
                y = df.iloc[s][ycol].values.astype('int')
            else:
                Xh = np.asarray(df.iloc[s].drop(num_columns, axis=1).astype('str'))
                Xv = np.asarray(df.iloc[s][num_columns].astype('float'))
                y = np.zeros((bs,))

            Xt = scipy.sparse.hstack([Xv, hasher.transform(Xh)]).tocsr()

            for j in xrange(Xt.shape[0]):
                span = slice(Xt.indptr[j], Xt.indptr[j + 1])
                row = zip(range(len(Xt.indices[span])), Xt.indices[span], Xt.data[span])
                st = " ".join(value_pattern % (j + one_based, fe + one_based, x)
                              for j, fe, x in row if np.isnan(x) == False)
                feat = (y[j], st)
                out.write((line_pattern % feat).encode('ascii'))
        pb.finish()
Example 7: load_conll
def load_conll(f, features, n_features=(2 ** 16), split=False):
    """Load CoNLL file, extract features on the tokens and hash them.

    Parameters
    ----------
    f : {string, file-like}
        Input file.

    features : callable
        Feature extraction function. Must take a list of tokens (see below)
        and an index into this list.

    n_features : integer, optional
        Number of columns in the output.

    split : boolean, default=False
        Whether to split lines on whitespace beyond what is needed to parse
        out the labels. This is useful for CoNLL files that have extra columns
        containing information like part of speech tags.
    """
    fh = FeatureHasher(n_features=n_features, input_type="string")
    labels = []
    lengths = []

    with _open(f) as f:
        raw_X = _conll_sequences(f, features, labels, lengths, split)
        X = fh.transform(raw_X)

    return X, np.asarray(labels), np.asarray(lengths, dtype=np.int32)
Example 8: test_hash_empty_input
def test_hash_empty_input():
    n_features = 16
    raw_X = [[], (), iter(range(0))]

    h = FeatureHasher(n_features=n_features, input_type="string")
    X = h.transform(raw_X)

    assert_array_equal(X.A, np.zeros((len(raw_X), n_features)))
Example 9: ner
def ner(tokens):
    """Baseline NER tagger for Dutch, based on the CoNLL'02 dataset."""
    global _model
    X = [_features(tokens, i) for i in range(len(tokens))]
    hasher = FeatureHasher(2 ** 16, input_type="string")
    return zip(tokens, _model.predict(hasher.transform(X)))
Example 10: test_feature_hasher_pairs
def test_feature_hasher_pairs():
    raw_X = (d.iteritems() for d in [{"foo": 1, "bar": 2},
                                     {"baz": 3, "quux": 4, "foo": -1}])
    h = FeatureHasher(n_features=16, input_type="pair")
    x1, x2 = h.transform(raw_X).toarray()
    x1_nz = sorted(np.abs(x1[x1 != 0]))
    x2_nz = sorted(np.abs(x2[x2 != 0]))
    assert_equal([1, 2], x1_nz)
    assert_equal([1, 3, 4], x2_nz)
Example 11: hash
def hash(mat, num_features):
    """
    hashing trick
    """
    hasher = FeatureHasher(n_features=num_features, non_negative=True)
    X = hasher.transform(mat)
    X = X.toarray()
    return X
Example 12: test_feature_hasher_dicts
def test_feature_hasher_dicts():
    h = FeatureHasher(n_features=16)
    assert_equal("dict", h.input_type)

    raw_X = [{"dada": 42, "tzara": 37}, {"gaga": 17}]
    X1 = FeatureHasher(n_features=16).transform(raw_X)
    gen = (d.iteritems() for d in raw_X)
    X2 = FeatureHasher(n_features=16, input_type="pair").transform(gen)
    assert_array_equal(X1.toarray(), X2.toarray())
Example 13: load_seq2seq
def load_seq2seq(f, features, n_features=(2 ** 16)):
    fh = FeatureHasher(n_features=n_features, input_type="string")
    labels = []
    lengths = []

    with _open(f) as f:
        raw_X = _sequences(f, features, labels, lengths)
        X = fh.transform(raw_X)

    return X, np.asarray(labels), np.asarray(lengths, dtype=np.int32)
Example 14: encode_titles
def encode_titles(titles, num_features=2**14):
    '''
    Encode titles, given as strings, as numerical feature vectors using
    the 'hashing trick'.

    The size of the feature vector can be specified with the
    num_features parameter.
    '''
    myHasher = FeatureHasher(input_type='string',
                             n_features=num_features,
                             non_negative=True)
    featureMatrix = myHasher.transform(titles)
    return featureMatrix, myHasher
Example 15: hash_features
def hash_features(features, arm_ids, use_id=True):
    n_features = np.shape(features)[1]
    feature_names = [str(x) for x in np.arange(n_features)]
    all_features = []
    for arm_id, feature_set in zip(arm_ids, features):
        temp_features = zip(feature_names, feature_set)
        if use_id == True:
            temp_features.append(("id_" + str(arm_id), 1))
        all_features.append(temp_features)
    f = FeatureHasher(input_type='pair')
    return f.transform(all_features)