

Python TfidfVectorizer.build_tokenizer Method Code Examples

This article collects typical usage examples of the Python method sklearn.feature_extraction.text.TfidfVectorizer.build_tokenizer. If you are wondering what exactly TfidfVectorizer.build_tokenizer does and how to use it, the curated code examples below may help. You can also explore further usage examples of sklearn.feature_extraction.text.TfidfVectorizer, the class this method belongs to.


The following shows 15 code examples of TfidfVectorizer.build_tokenizer, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
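Before the examples, here is a minimal sketch (not taken from any of the projects below; the sample sentence is an illustrative assumption) of what build_tokenizer provides: it returns a callable that splits raw text into tokens according to the vectorizer's token_pattern, without lowercasing or any other preprocessing.

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
# build_tokenizer() returns a plain function derived from the default token_pattern r"(?u)\b\w\w+\b"
tokenize = vectorizer.build_tokenizer()
print(tokenize("TF-IDF weighting rewards rare, informative terms."))
# expected output (illustrative): ['TF', 'IDF', 'weighting', 'rewards', 'rare', 'informative', 'terms']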

Example 1: file

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import build_tokenizer [as alias]
def file():
    cats = ["alt.atheism", "sci.electronics"]

    newsgroups_train = fetch_20newsgroups(subset="train", categories=cats)

    newsgroups_test = fetch_20newsgroups(subset="test", categories=cats)
    vectorizer = TfidfVectorizer()  # tokenize all documents and compute statistics

    vectors_train = vectorizer.fit_transform(newsgroups_train.data)
    vectors = vectorizer.transform(newsgroups_test.data)
    print vectors.shape[1]
    # f=open('test_all.txt','wb')
    for j in range(0, vectors.shape[0]):
        item_id = list()
        tokens = vectorizer.build_tokenizer()(newsgroups_test.data[j])  # extract the tokenization result
        # print tokens

        word_sort = np.argsort(-vectors[j].data)
        print "vertex " + str(j)
        for i in range(0, len(word_sort)):
            word = vectorizer.get_feature_names()[vectors[j].indices[word_sort[i]]]  # this is the tf-idf term
            for line in range(0, len(tokens)):
                if tokens[line].lower() == word:
                    item_id.append((line, word_sort[i]))

        pos_item = sorted(item_id, key=lambda jj: jj[0], reverse=True)  # extract the tf-idf terms

        word_word = np.zeros([len(word_sort), len(word_sort)])
        for p in range(0, len(pos_item)):
            if p < (len(pos_item) - 1):
                ki = word_sort[pos_item[p][1]]
                kj = word_sort[pos_item[p + 1][1]]
                word_word[ki, kj] = word_word[ki, kj] + 1
Author: yanshengli, Project: DBN_Learning, Lines: 35, Source: file_to_graph1_test.py

Example 2: create_vocab

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import build_tokenizer [as alias]
def create_vocab(train):
    init_time = time.time()
    vocab = set()
    t = TfidfVectorizer()
    tokenizer = t.build_tokenizer()
    for ex in train[0]:
        vocab.update(tokenizer(ex))
    end_time = time.time()
    print("it took " + str(end_time - init_time) + " seconds to create the vocabulary")
    return vocab
Author: Noahs-ARK, Project: ARKcat, Lines: 12, Source: models_and_data.py

Example 3: Analyzer

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import build_tokenizer [as alias]
class Analyzer(object):
    def __init__(self):
        self.tfidf = TfidfVectorizer(min_df=1, binary=False, ngram_range=(1, 3), tokenizer=Tokenizer())
        self.tokens = self.tfidf.build_tokenizer()
        self.ngram = self.tfidf.build_analyzer()

    def __call__(self, sentence):
        ret = self.ngram(sentence)
        terms = self.tokens(sentence)
        for term in terms:
            cate = term_category(term)
            if term != cate:
                ret.append(cate)
        return ret
Author: renning22, Project: cortana, Lines: 16, Source: __init__.py

Example 4: Vectorizer

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import build_tokenizer [as alias]
class Vectorizer(object):
    def __init__(self):
        self.count_vec = TfidfVectorizer(binary = True,
                                         ngram_range = (1, 3),
                                         tokenizer = Tokenizer())

        self.last_vec = CountVectorizer(binary = True, ngram_range = (1, 1), tokenizer = Tokenizer())


    def collect_last_term(self, X):
        X_last = list()
        tokens = self.last_vec.build_tokenizer()
        _logger.debug("Extracting last term for each sentence")
        for sent in X:
            X_last.append(tokens(sent)[-1])
        _logger.debug("Fitting last-term vectorizer")
        return X_last
        

    def fit(self, X, y = None):
        _logger.debug("Fitting count vectorizer")
        self.count_vec.fit(X)
        X_last = self.collect_last_term(X)
        self.last_vec.fit(X_last)
        return self

    def transform(self, X, y = None):
        #return self.count_vec.transform(X)
        _logger.debug("Doing tfidf transform")
        Xc = self.count_vec.transform(X)

        X_last = self.collect_last_term(X)
        _logger.debug("Doing last term transform")
        Xl = self.last_vec.transform(X_last)
        _logger.debug("stacking features")
        ret = sparse.hstack([Xc, Xl])
        
        tokens = self.count_vec.build_tokenizer()
        l = list()
        for sent in X:
            terms = tokens(sent)
            l.append(1 if  ("__LOCATION__" in terms and "__ORGNIZATION__" in terms) else 0)

        l = np.array(l)
        l.shape = len(l), 1
        ret = sparse.hstack([ret, l])
        _logger.debug("vectorization transform done")

        return ret
Author: luanjunyi, Project: cortana, Lines: 51, Source: train.py

Example 5: transform_cnn_data

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import build_tokenizer [as alias]
    def transform_cnn_data(self, X_raw, feat_and_param):
        feat_and_param['feats']['ngram_range'] = (1,1)
        feat_and_param['feats']['use_idf'] = False
        feat_and_param['feats']['binary'] = False

        vectorizer = TfidfVectorizer(**feat_and_param['feats'])
        vectorizer.fit(X_raw)
        tokenizer = TfidfVectorizer.build_tokenizer(vectorizer)
        X_raw_tokenized = [tokenizer(ex) for ex in X_raw]
        train_X = []
        for example in X_raw_tokenized:
            for i in range(len(example)):
                example[i] = re.sub(r"[^A-Za-z0-9(),!?\'\`]", "", example[i])
            train_X.append([vectorizer.transform(example)])
        index_to_word = {v:k for k,v in vectorizer.vocabulary_.items()}
        for key in index_to_word:
            index_to_word[key] = re.sub(r"[^A-Za-z0-9(),!?\'\`]", "", index_to_word[key])
        return train_X, index_to_word
Author: katyasyc, Project: ARKcat, Lines: 20, Source: models_and_data.py

Example 6: process_joke

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import build_tokenizer [as alias]
def process_joke(joke):
    data = {}

    # Lowercase text.
    joke.text = joke.text.lower()

    # Replace text with dict.
    stop_words = set(stopwords.words('english'))
    vectorizer = TfidfVectorizer()
    tokenizer = vectorizer.build_tokenizer()

    def tokenize_text(text, prefix=''):
        d = {}
        for term in tokenizer(text):
            if term in stop_words:
                continue
            d[prefix + term] = d.get(prefix + term, 0) + 1
        return d

    data.update(tokenize_text(joke.text, 't_'))
    data.update({('cat_' + cat): 1 for cat in joke.categories})
    data.update({('subcat_' + cat): 1 for cat in joke.subcategories})

    return data
Author: yxliang, Project: what-makes-a-good-joke, Lines: 26, Source: classify.py

Example 7: TfidfVectorizer

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import build_tokenizer [as alias]
X_test = np.array([''.join(el) for el in nyt_data[trainset_size + 1:len(nyt_data)]])
y_test = np.array([el for el in nyt_labels[trainset_size + 1:len(nyt_labels)]])

#print(X_train)

vectorizer = TfidfVectorizer(min_df=2,
                             ngram_range=(1, 2),
                             stop_words='english',
                             strip_accents='unicode',
                             norm='l2')

test_string = unicode(nyt_data[0])

print "Example string: " + test_string
print "Preprocessed string: " + vectorizer.build_preprocessor()(test_string)
print "Tokenized string:" + str(vectorizer.build_tokenizer()(test_string))
print "N-gram data string:" + str(vectorizer.build_analyzer()(test_string))


X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)






svm_classifier = LinearSVC().fit(X_train, y_train)


Author: Oregand, Project: 4THYEARPROJECT, Lines: 30, Source: DataProcessing3.py

Example 8: main

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import build_tokenizer [as alias]
def main():


    vct = TfidfVectorizer(encoding='ISO-8859-1', min_df=1, max_df=1.0, binary=False, ngram_range=(1, 1),
                          token_pattern='\\b\\w+\\b')  #, tokenizer=StemTokenizer())

    vct_analizer = vct.build_tokenizer()

    print("Start loading ...")
    # data fields: data, bow, file_names, target_names, target

    ########## NEWS GROUPS ###############
    # easy to hard. see "Less is More" paper: http://axon.cs.byu.edu/~martinez/classes/678/Presentations/Clawson.pdf
    categories = [['alt.atheism', 'talk.religion.misc'],
                  ['comp.graphics', 'comp.windows.x'],
                  ['comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware'],
                  ['rec.sport.baseball', 'sci.crypt']]

    min_size = 10  # max(10, args.fixk)

    args.fixk = None

    data, vct = load_from_file(args.train, [categories[3]], args.fixk, min_size, vct, raw=True)

    print("Data %s" % args.train)
    print("Data size %s" % len(data.train.data))


    ### SENTENCE TRANSFORMATION
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

    ## delete <br> to "." to recognize as end of sentence
    data.train.data = experiment_utils.clean_html(data.train.data)
    data.test.data = experiment_utils.clean_html(data.test.data)

    print("Train:{}, Test:{}, {}".format(len(data.train.data), len(data.test.data), data.test.target.shape[0]))
    ## Get the features of the sentence dataset

    ## create splits of data: pool, test, oracle, sentences
    expert_data = Bunch()
    train_test_data = Bunch()

    expert_data.sentence, train_test_data.pool = split_data(data.train)
    expert_data.oracle, train_test_data.test = split_data(data.test)

    data.train.data = train_test_data.pool.train.data
    data.train.target = train_test_data.pool.train.target

    data.test.data = train_test_data.test.train.data
    data.test.target = train_test_data.test.train.target

    ## convert document to matrix
    data.train.bow = vct.fit_transform(data.train.data)
    data.test.bow = vct.transform(data.test.data)

    #### EXPERT CLASSIFIER: ORACLE
    print("Training Oracle expert")

    labels, sent_train = split_data_sentences(expert_data.oracle.train, sent_detector)

    expert_data.oracle.train.data = sent_train
    expert_data.oracle.train.target = np.array(labels)
    expert_data.oracle.train.bow = vct.transform(expert_data.oracle.train.data)

    exp_clf = linear_model.LogisticRegression(penalty='l1', C=args.expert_penalty)
    exp_clf.fit(expert_data.oracle.train.bow, expert_data.oracle.train.target)


    #### EXPERT CLASSIFIER: SENTENCES
    print("Training sentence expert")
    labels, sent_train = split_data_sentences(expert_data.sentence.train, sent_detector)

    expert_data.sentence.train.data = sent_train
    expert_data.sentence.train.target = np.array(labels)
    expert_data.sentence.train.bow = vct.transform(expert_data.sentence.train.data)

    sent_clf = linear_model.LogisticRegression(penalty='l1', C=args.expert_penalty)
    sent_clf.fit(expert_data.sentence.train.bow, expert_data.sentence.train.target)

    #### TESTING THE CLASSIFERS

    test_target, test_data = split_data_sentences(data.test,sent_detector)
    test_data_bow = vct.transform(test_data)

    #pred_sent = sent_clf.predict(test_data_bow)
    pred_ora = exp_clf.predict(test_data_bow)
    y_probas = sent_clf.predict_proba(test_data_bow)
    pred_sent = sent_clf.classes_[np.argmax(y_probas, axis=1)]
    ## just based on one class probability
    # order = np.argsort(y_probas[:,0])
    order = np.argsort(y_probas.max(axis=1))
    print "ORACLE\tSENTENCE\tMAX-SENT"
    # for i in order[:500]:
    #     print pred_ora[i],pred_sent[i], y_probas[i,0], test_data[i]
    for i in order[-500:]:
        print pred_ora[i],pred_sent[i], y_probas[i,0], test_data[i]
    print "Accuracy of Sentences Classifier", metrics.accuracy_score(test_target, pred_sent)
    print "Class distribution: %s" % pred_sent.sum()
    print "Size of data: %s" % pred_sent.shape[0]
    sizes = [50, 100, 500, 1000, 2000, 3000, 4000, 20000]
#......... part of the code omitted here .........
Author: mramire8, Project: active, Lines: 103, Source: test_sent.py

Example 9: CountVectorizer

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import build_tokenizer [as alias]
        count_pos_test = count_neg_test + 1
        
label_test = test_data[:,1]
#vctr =  CountVectorizer(stop_words='english',min_df = 1)
#vctr2 = HashingVectorizer(stop_words='english') 
vctr = TfidfVectorizer(stop_words='english')  # initialising vectorizers; TF-IDF gives about 1 percent better accuracy than the other vectorizers
count_pos = 0
count_neg = 0

######################################################################################################
train = []
test = []
for i in range(len(train_data)):           #processing of the train data
    string = train_data[i,0]            
    string = vctr.build_preprocessor()(string.lower()) 
    string = vctr.build_tokenizer()(string.lower())
    train.append(' '.join(string))

for i in range(len(test_data)):            #processing of the test data  
    string = test_data[i,0]
    string = vctr.build_preprocessor()(string.lower()) 
    string = vctr.build_tokenizer()(string.lower())
    test.append(' '.join(string)) 

######################################################################################################
train_data1 = vctr.fit_transform(train).toarray() #fitting the dictionary for bag of words model using TF-IDF vectorizers
#X_test = vctr.transform(test).toarray()
y_train = np.asarray(label_train, dtype="|S6")
y_train = y_train.astype(int)
clf1 =   GradientBoostingClassifier(n_estimators = 500) #initialising classifiers
clf2 =   AdaBoostClassifier(n_estimators = 500)
Author: mohankashyap, Project: mohan, Lines: 33, Source: final_evalutionjob.py

Example 10: train_extractor_from_lines

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import build_tokenizer [as alias]
class FeatureExtractor:

	vectorizer = None
	feature_names = None
	feature_matrix = None

	def train_extractor_from_lines(self, train_lines, labels, test_lines):
		self.vectorizer = TfidfVectorizer(tokenizer=tokenizer, max_features=DISTINCT_WORDS_CNT)
		self.vectorizer.fit(train_lines + test_lines)

		pass

	def load_vectorizer(self):
		input_file = open('../models/tfidf_vectorizer.pkl', 'rb')
		self.vectorizer = pickle.load(input_file)
		input_file.close()
		pass

	def save_vectorizer(self):
		output_file = open('../models/tfidf_vectorizer.pkl', 'wb')
		pickle.dump(self.vectorizer, output_file)
		output_file.close()
		pass

	def train_extractor(self, full = False):

		if not full:
			train_lines = file2lines('../data/train_lite.csv')
			labels = file2labels('../data/train_lite.csv')
			test_lines = file2lines('../data/test_lite.csv')
		else:
			train_lines = file2lines('../data/train.csv')
			labels = file2labels('../data/train.csv')
			test_lines = file2lines('../data/test.csv')

		self.train_extractor_from_lines(train_lines, labels, test_lines)

		pass

	def lines2words(self, lines):
		self.tokenizer = self.vectorizer.build_tokenizer()

		return [self.tokenizer(line) for line in lines]

	def lines2features(self, lines, use_tense = False):
		"""
		returns DataFrame(feature_matrix, feature_name)

		['word_rainny', 'word_'sunny'],
		array([
			[1, 0.4, 0.2],
			[0.2, 1, 0.2],
		])
		"""
		self.feature_names = []
		self.feature_matrix = None

		# tf-idf features
		data = self.vectorizer.transform(lines).toarray()

		self.feature_names = self.vectorizer.get_feature_names()
		self.feature_matrix = data

		# additional features
		add_features = []
		important_words = ['sunny', 'wind', 'humid', 'hot', 'cold', 'dry', 'ice', 'rain', 'snow', 'tornado', 'storm', 'hurricane']
		important_words = ['cloud', 'cold', 'dry', 'hot', 'humid', 'hurricane', 'ice', 'rain', 'snow', 'storm', 'sunny', 'tornado', 'wind']
		self.feature_names = self.feature_names + ['impt_words:' + word for word in important_words]
		if use_tense:
			self.feature_names = self.feature_names + ['past_tense_num', 'present_tense_num']

		all_words = self.lines2words(lines)
		for words in all_words:
			# important words
			important_words_ftr = [int(word in words) for word in important_words]
			add_features.append(important_words_ftr)

			# tense
			if use_tense:
				tagz = zip(*nltk.pos_tag(nltk.word_tokenize(words)))[1]
				past_num = len([v for v in tagz if v == 'VBD'])
				present_num = len([v for v in tagz if v in ['VBP', 'VB']])

				add_features.append([past_num, present_num])
    	
		self.feature_matrix = np.hstack((self.feature_matrix, add_features))

		return DataFrame(self.feature_matrix, columns = self.feature_names)
Author: SolessChong, Project: webmining, Lines: 90, Source: v3.py

Example 11: TfidfVectorizer

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import build_tokenizer [as alias]
# -*- coding: utf-8 -*-
from gensim import corpora, models, matutils
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer

dataset=fetch_20newsgroups(categories=['alt.atheism','talk.religion.misc','sci.space']) # take only 3 categories
vect = TfidfVectorizer()
tok=vect.build_tokenizer() # tokenizes everything nicely
texts=[]
lem=WordNetLemmatizer()
lemms=[]
#for text in dataset.data:
#    for token in tok(text):
#        lemms.append(lem.lemmatize(token))
#    texts.append(lemms)
#models = models.Word2Vec(texts,size=100, window=5,min_count=5,workers=4)
#models.save('texts.dat')

model = models.Word2Vec.load('texts.dat')
#print(model['theory'])
#print(model.similarity('man','car'))
#print(model.most_similar(positive=['man'],negative=['computer']))
print model.doesnt_match("car wheel glass engine".split())
Author: ayulit, Project: karpov, Lines: 26, Source: lab7.py

Example 12: WordNetLemmatizer

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import build_tokenizer [as alias]
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        doc = str(doc)
        s = "".join(doc.split("__EOS__"))
        doc = s.translate(None, string.punctuation)
        tokens = nltk.word_tokenize(doc)
        bi = list(p1+" "+p2 for p1,p2 in nltk.bigrams(tokens))
        tokens.extend(bi)
        return [self.wnl.lemmatize(t) for t in tokens]            


if _use_TFIDF_ :
    #vectorizer = TfidfVectorizer(lowercase=True, ngram_range=(1,2), min_df=1, stop_words=stoplist, max_features=no_of_features, tokenizer=LemmaTokenizer())
    vectorizer = TfidfVectorizer(lowercase=True, ngram_range=(1,2), min_df=1, stop_words=stoplist, max_features=no_of_features)
    func_tokenizer =vectorizer.build_tokenizer()

'''
I was using two functions earlier for tokenization and data preprocessing.
Later implemented the LemmaTokenizer class for this. 
'''

def ispunct(some_string):
    return not any(char.isalnum() for char in some_string)
    

def get_tokens(s):
#   Tokenize into words in sentences. Returns list of strs
    retval = []
    sents = sent_tokenize(s)
    for sent in sents:
Author: suzinyou, Project: Image-Classification, Lines: 33, Source: featureSelector.py

Example 13: main

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import build_tokenizer [as alias]
def main():
    accuracies = defaultdict(lambda: [])

    aucs = defaultdict(lambda: [])

    x_axis = defaultdict(lambda: [])

    vct = TfidfVectorizer(encoding='ISO-8859-1', min_df=1, max_df=1.0, binary=False, ngram_range=(1, 1),
                          token_pattern='\\b\\w+\\b')#, tokenizer=StemTokenizer())
    #
    # vct = CountVectorizer(encoding='ISO-8859-1', min_df=1, max_df=1.0, binary=True, ngram_range=(1, 1),
    #                       token_pattern='\\b\\w+\\b')#, tokenizer=StemTokenizer())


    vct_analizer = vct.build_tokenizer()

    print("Start loading ...")
    # data fields: data, bow, file_names, target_names, target

    ########## NEWS GROUPS ###############
    # easy to hard. see "Less is More" paper: http://axon.cs.byu.edu/~martinez/classes/678/Presentations/Clawson.pdf
    categories = [['alt.atheism', 'talk.religion.misc'],
                  ['comp.graphics', 'comp.windows.x'],
                  ['comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware'],
                  ['rec.sport.baseball', 'sci.crypt']]

    min_size = 10 # max(10, args.fixk)

    # if args.fixk < 0:
    args.fixk = None

    data, vct = load_from_file(args.train, [categories[3]], args.fixk, min_size, vct, raw=True)

    print("Data %s" % args.train)
    print("Data size %s" % len(data.train.data))
    print ("Vectorizer: %s" % vct)
    parameters = parse_parameters_mat(args.cost_model)

    print "Cost Parameters %s" % parameters

    cost_model = set_cost_model(args.cost_function, parameters=parameters)
    print "\nCost Model: %s" % cost_model.__class__.__name__

    ### SENTENCE TRANSFORMATION
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

    ## delete <br> to "." to recognize as end of sentence
    data.train.data = clean_html(data.train.data)
    data.test.data = clean_html(data.test.data)

    # labels, sent_train = split_data_sentences(data.train, sent_detector)
    #
    # data.train.data = sent_train
    # data.train.target = np.array(labels)

    # labels, sent_train = split_data_sentences(data.test, sent_detector)
    # data.test.data = sent_train
    # data.test.target = np.array(labels)

    print("Train:{}, Test:{}, {}".format(len(data.train.data), len(data.test.data), data.test.target.shape[0]))
    ## Get the features of the sentence dataset
    # data.train.bow = vct.fit_transform(data.train.data)
    data.test.bow = vct.transform(data.test.data)


    #### EXPERT CLASSIFIER

    exp_clf = linear_model.LogisticRegression(penalty='l1', C=args.expert_penalty)
    exp_clf.fit(data.test.bow, data.test.target)
    expert = baseexpert.NeutralityExpert(exp_clf, threshold=args.neutral_threshold,
                                         cost_function=cost_model.cost_function)
    print "\nExpert: %s " % expert


    #### STUDENT CLASSIFIER
    clf = linear_model.LogisticRegression(penalty="l1", C=1)
    # clf = set_classifier(args.classifier)

    student = structured.AALStructured(model=clf, accuracy_model=None, budget=args.budget, seed=args.seed, vcn=vct,
                                       subpool=250, cost_model=cost_model)
    student.set_score_model(exp_clf)



    print "\nStudent Classifier: %s" % clf


    #### ACTIVE LEARNING SETTINGS
    step_size = args.step_size
    bootstrap_size = args.bootstrap
    evaluation_points = 200

    print ("Sentence Classification")
    t0 = time.time()
    tac = []
    tau = []

    # predition = exp_clf.predict(data.train.bow)


#......... part of the code omitted here .........
Author: mramire8, Project: active, Lines: 103, Source: sentfixk_cheat.py

Example 14: TfidfVectorizer

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import build_tokenizer [as alias]
__author__ = 'Alena'
from sklearn.datasets import fetch_20newsgroups
dataset=fetch_20newsgroups()
from sklearn.feature_extraction.text import TfidfVectorizer

vect = TfidfVectorizer()
tok=vect.build_tokenizer()
texts=[]

Y=vect.fit_transform(dataset.data)
first=Y.getcol(0)
second=Y.getcol(1)
word1=[]
for i, el in enumerate(first):
    word1.append(first._get_single_element(i,0))
word2=[]
for i, el in enumerate(second):
    word2.append(second._get_single_element(i,0))

distance=0
for i in range(len(word2)):
    distance += abs(word1[i] - word2[i])
print(distance)
Author: bazanovalyubov, Project: HW_NIS, Lines: 26, Source: Bardina+08.12.15.py

Example 15: TfidfVectorizer

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import build_tokenizer [as alias]
__author__ = '315-4'
# -*- coding: utf-8 -*-
from gensim import corpora, models, matutils
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

dataset=fetch_20newsgroups()  # dataset: 20 newsgroups

vect = TfidfVectorizer()  # converter to a TF-IDF matrix
tok = vect.build_tokenizer()  # tokenizer

texts = []
# tokenize the texts
for text in dataset.data:
    texts.append(tok(text))

# gensim enters the stage
# Convert each document (a list of words) into a bag-of-words
dictionary = corpora.Dictionary(texts)  # build the dictionary (a set of tokens)
corpus = [dictionary.doc2bow(text) for text in texts]  # corpus

new_vec = dictionary.doc2bow((tok('Hello world')))  # this is not used anywhere

# Train the LDA model
lda = models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary,\
                               num_topics=100, update_every=1, chunksize=10000, passes=1)

# print matrix V of the UEV decomposition
for item in lda.print_topics(100):
    print (item)
    
Author: ayulit, Project: karpov, Lines: 32, Source: lab4.py


Note: The examples of the sklearn.feature_extraction.text.TfidfVectorizer.build_tokenizer method in this article were compiled by 純淨天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets were selected from open-source projects contributed by various developers; copyright of the source code belongs to the original authors. For distribution and use, please refer to the license of the corresponding project. Do not reproduce without permission.