This article collects typical usage examples of the Python method sklearn.feature_extraction.text.TfidfVectorizer.build_analyzer. If you are unsure how TfidfVectorizer.build_analyzer is used in practice, the curated code samples below should help. You can also read further about its containing class, sklearn.feature_extraction.text.TfidfVectorizer.
The following presents 15 code examples of TfidfVectorizer.build_analyzer, ordered roughly by popularity.
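Before the examples, it helps to recall what build_analyzer returns: a callable that applies the vectorizer's preprocessing, tokenization, stop-word filtering and n-gram generation to a single raw document and returns the resulting list of terms. A minimal, self-contained illustration (not taken from any of the examples below):

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words='english')
analyze = vectorizer.build_analyzer()

# no fitting is needed: the analyzer only tokenizes, it never touches idf statistics
print(analyze("The quick brown fox jumps over the lazy dog"))
# ['quick', 'brown', 'fox', 'jumps', 'lazy', 'dog', 'quick brown', 'brown fox', ...]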
Example 1: learn_vocabulary
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or alternatively: from sklearn.feature_extraction.text.TfidfVectorizer import build_analyzer [as alias]
def learn_vocabulary(docs, only_noun_phrases=True):
    first_occurrence_all = []
    entropy_all = []
    #docs = [doc.decode('utf8', 'ignore') for doc in docs]
    '''
    noun_phrases = set()
    if only_noun_phrases:
        for i, doc in enumerate(docs):
            print "--extracting NP from doc", i
            #doc = doc.decode('utf8', 'ignore')
            noun_phrases.update([lemmatize(phrase) for phrase in extract_candidate_chunks(doc)])
    with open('./semeval_train_docs_noun_phrases.set', 'w') as f:
        pickle.dump(noun_phrases, f)
    '''
    print "loading pre-extracted set of noun_phrases"
    noun_phrases = set()
    with open('./semeval_train_docs_noun_phrases.set', 'r') as f:
        noun_phrases = pickle.load(f)
    vectorizer = TfidfVectorizer(decode_error='ignore', preprocessor=preprocess, ngram_range=(1, 3), tokenizer=tokenize)
    analyzer = vectorizer.build_analyzer()
    vocab = set()
    print "--learning vocabulary"
    for i, doc in enumerate(docs):
        print "--learning doc", i
        first_occurrence = {}
        entropy = {}
        phrases = analyzer(doc)  # all phrases from doc
        doc = preprocess(doc)
        doc_length = len(doc)
        chunks = get_chunks(doc)
        for i, phrase in enumerate(phrases):
            if valid_ngram(phrase, noun_phrases) and phrase not in first_occurrence:
                try:
                    # use index() so a missing phrase raises ValueError (find() would silently return -1)
                    pos = doc.index(phrase)
                except ValueError:
                    print "--phrase: '{}' not found".format(phrase)
                    continue
                # normalized position; float() avoids Python 2 integer division
                first_occurrence[phrase] = float(pos) / doc_length
                # calculate entropy
                entropy[phrase] = get_entropy(phrase, chunks)
                vocab.add(phrase)
        first_occurrence_all.append(first_occurrence)
        entropy_all.append(entropy)
    print "--size of vocabulary: ", len(vocab)
    return vocab, first_occurrence_all, entropy_all
Example 2: Analyzer
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or alternatively: from sklearn.feature_extraction.text.TfidfVectorizer import build_analyzer [as alias]
class Analyzer(object):
    def __init__(self):
        self.tfidf = TfidfVectorizer(min_df=1, binary=False, ngram_range=(1, 3), tokenizer=Tokenizer())
        self.tokens = self.tfidf.build_tokenizer()
        self.ngram = self.tfidf.build_analyzer()

    def __call__(self, sentence):
        ret = self.ngram(sentence)
        terms = self.tokens(sentence)
        for term in terms:
            cate = term_category(term)
            if term != cate:
                ret.append(cate)
        return ret
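Tokenizer() and term_category() above are project-specific helpers that are not shown. A self-contained sketch of the same pattern, using a toy term_category that collapses digit tokens into a shared category token, and showing how such a callable is then handed back to a vectorizer through the analyzer= parameter:

from sklearn.feature_extraction.text import TfidfVectorizer

# stock 1-3 gram analyzer, reused inside the custom callable
base_ngrams = TfidfVectorizer(ngram_range=(1, 3)).build_analyzer()

def term_category(term):
    # toy stand-in for the original helper: map all numbers to one token
    return 'NUM' if term.isdigit() else term

def analyzer_with_categories(sentence):
    ret = base_ngrams(sentence)
    for term in list(ret):
        cate = term_category(term)
        if cate != term:
            ret.append(cate)
    return ret

# a callable analyzer replaces the built-in preprocessing and tokenization entirely
vect = TfidfVectorizer(analyzer=analyzer_with_categories)
X = vect.fit_transform(["room 42 is free", "call me at 9"])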
Example 3: feed
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or alternatively: from sklearn.feature_extraction.text.TfidfVectorizer import build_analyzer [as alias]
def feed(param):
    values = []
    result = {}
    tweetdata = rawtweets.find()
    json_str = json_util.dumps(tweetdata)
    tweetdata = json_util.loads(json_str)
    path = os.path.dirname(os.path.realpath(__file__))
    texts = []
    for tweetlist in tweetdata:
        tweet = tweetlist["text"]
        print(tweet)
        #d = datetime.strptime(tweetlist["_id"], '%Y/%m/%d/%H')
        text = unicodedata.normalize('NFKD', tweet).encode('ascii', 'ignore').decode('utf-8')
        texts.append(text)
    vectorizer = TfidfVectorizer(
        analyzer='char',
        #token_pattern=r'[a-z]{4,}',
        #use_idf=True,
        #strip_accents='unicode',
        #sublinear_tf=False
    )
    print(len(texts))
    vectorizer.build_analyzer()
    idf = vectorizer.fit_transform(texts)
    feature_names = np.asarray(vectorizer.get_feature_names())
    #print(idf.todense().T)
    #print((idf * idf.T).A)
    #print(idf.data)
    print("len ", (feature_names))
    z = (zip(feature_names, idf.data))
    d = {}
    for t in z:
        #print(t[0], t[1])
        d[t[0]] = t[1]
    #print(d)
    return d
Example 4: train
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or alternatively: from sklearn.feature_extraction.text.TfidfVectorizer import build_analyzer [as alias]
def train(self, segments, ignore_before=4, ignore_after=4):
    '''
    This uses the 20newsgroups dataset for idf
    Parameters:
        :segments: list of strings where each string is a segment
    '''
    data = fetch_20newsgroups(subset='train').data
    stripped_data = []
    for d in data:
        lines = d.split('\n')
        if len(lines) > ignore_before + ignore_after:
            stripped_data.append('\n'.join(lines[ignore_before:-ignore_after]))
    txt = ''.join(segments)
    stripped_data.append(txt)
    # Train corpus tf-idf
    tfidf_corpus = TfidfVectorizer(stop_words='english')
    tfidf_corpus.fit(stripped_data)
    book_scores = tfidf_corpus.transform([txt])
    print 'Learned {} features CORPUS'.format(len(tfidf_corpus.get_feature_names()))
    # Train document segment-wise tf-idf
    tfidf_book = TfidfVectorizer(vocabulary=tfidf_corpus.vocabulary_)
    segment_scores = tfidf_book.fit_transform(segments)
    print 'Learned {} features BOOK'.format(len(tfidf_book.get_feature_names()))
    # Now get word scores in each segment
    final_scores = book_scores.multiply(segment_scores)
    idx_to_word = tfidf_corpus.get_feature_names()
    word_scores = []
    for i, segment_scores in enumerate(final_scores):
        scores = {}
        for j in segment_scores.indices:
            scores[idx_to_word[j]] = segment_scores[0, j]
        word_scores.append(scores)
    self.word_scores = word_scores
    self.analyze = tfidf_corpus.build_analyzer()
Example 5: keyword_extractor_tfidf
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or alternatively: from sklearn.feature_extraction.text.TfidfVectorizer import build_analyzer [as alias]
def keyword_extractor_tfidf(corpus_list, is_stop_words_allowed, n_gram_min, n_gram_max):
    if n_gram_min > n_gram_max:
        raise Exception('Invalid input n_gram_min should be <= n_gram_max')
    corpus = []
    for doc in corpus_list:
        text = ''
        for word in doc:
            text = text + ' ' + word
        corpus.append(text)
    if is_stop_words_allowed == False:
        vectorizer = TfidfVectorizer(ngram_range=(n_gram_min, n_gram_max), stop_words='english')
    else:
        vectorizer = TfidfVectorizer(ngram_range=(n_gram_min, n_gram_max))
    analyzer = vectorizer.build_analyzer()
    analyzer(corpus[0])
    features_array = vectorizer.fit_transform(corpus).toarray()
    features_transform_list = features_array.tolist()[0]
    features_dictionary = dict(zip(vectorizer.get_feature_names(), features_transform_list))
    sorted_features_dictionary = OrderedDict(sorted(features_dictionary.items(), key=itemgetter(1)))
    return sorted_features_dictionary
Example 6: sentence_tokenizer
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or alternatively: from sklearn.feature_extraction.text.TfidfVectorizer import build_analyzer [as alias]
def sentence_tokenizer(dataset_name="pascal"):
    """
    Parameters
    ----------
    dataset_name : string
        'memorability' or 'pascal' or 'clipart'

    Returns
    -------
    analyze : object
        breaks sentences into words using scikit-learn tokenizer
    vectorizer : object of class TfidfVectorizer
        see scikit-learn documentation
    """
    if dataset_name == "memorability":
        mat = scipy.io.loadmat("../../data/sentences/memorability_888_img_5_sent.mat")
        sentences = mat["memorability_sentences"]
    elif dataset_name == "pascal":
        mat = scipy.io.loadmat("../../data/sentences/pascal_1000_img_50_sent.mat")
        sentences = mat["pascal_sentences"]
    elif dataset_name == "clipart":
        mat = scipy.io.loadmat("../../data/sentences/clipart_500_img_48_sent.mat")
        sentences = mat["clipart_sentences"]

    # Build corpus
    corpus = list()
    for sent_group in sentences:
        corpus.append(" ".join([sent[0] for sent in sent_group]))

    ### Build tf-idf vectorizer ###
    # at least three letters in a word
    vectorizer = TfidfVectorizer(token_pattern="(?u)\\b\\w\\w\\w+\\b")
    vectorizer.fit(corpus)
    analyze = vectorizer.build_analyzer()
    return analyze, vectorizer
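The only non-default setting here is the token_pattern, which requires at least three word characters per token. A small standalone check of what such an analyzer keeps (the sentence is made up for illustration):

from sklearn.feature_extraction.text import TfidfVectorizer

analyze = TfidfVectorizer(token_pattern="(?u)\\b\\w\\w\\w+\\b").build_analyzer()
# one- and two-letter words are dropped by the three-\w token pattern
print(analyze("A man on an old red bike"))   # -> ['man', 'old', 'red', 'bike']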
Example 7: extract_candidates_doc
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or alternatively: from sklearn.feature_extraction.text.TfidfVectorizer import build_analyzer [as alias]
def extract_candidates_doc(doc, phrase_list, idf_vec, training_size=450):
    #vocab = set(phrase_list)
    idf_dic = {}
    #print "phrase list len", len(phrase_list)
    #print "len idf_vec", len(idf_vec)
    for i, phrase in enumerate(phrase_list):
        idf_dic[phrase] = idf_vec[i]
    noun_phrases = set()
    print "--extracting NP"
    noun_phrases = set([lemmatize(phrase) for phrase in extract_candidate_chunks(doc)])
    vectorizer = TfidfVectorizer(decode_error='ignore', preprocessor=preprocess, ngram_range=(1, 3), tokenizer=tokenize)
    analyzer = vectorizer.build_analyzer()
    phrases = list(set([phrase for phrase in analyzer(doc) if valid_ngram(phrase, noun_phrases)]))
    doc = preprocess(doc)
    #print "candidate phrases", phrases
    #tfidf = []
    #first_occurrence = []
    #entropy = []
    #length = []
    doc_len = len(doc)
    entropy = get_entropy_doc(doc, phrases)
    # get feature vectors
    features = []
    for i, phrase in enumerate(phrases):
        # float() avoids Python 2 integer division when normalizing the position
        first_occurrence = float(doc.find(phrase)) / doc_len
        tf = doc.count(phrase)
        if phrase in idf_dic:
            tfidf = tf * idf_dic[phrase]
        else:
            tfidf = tf * log10(training_size)
        feature_vec = get_feature_vector(phrase, tfidf, first_occurrence, entropy[i])
        features.append(feature_vec)
    return phrases, features
Example 8: main
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or alternatively: from sklearn.feature_extraction.text.TfidfVectorizer import build_analyzer [as alias]
def main():
    reload(sys)
    sys.setdefaultencoding('utf-8')
    pprint(LemmaTokenizer()("this is testing the stemming functionality"))
    param_grid = [
        {'C': [.125, .25, .5, 1, 10, 100, 1000]},
        {'penalty': ('l1', 'l2')}
    ]
    svm_param_grid = [
        {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
        {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
    ]
    lines = [line for line in fileinput.input()]
    sentences = map(lambda x: x.split('\t')[1], lines)
    Y = map(lambda x: int(x.split('\t')[0]), lines)
    vectorizer = TfidfVectorizer(min_df=1,
                                 tokenizer=POSTokenizer(),
                                 preprocessor=preprocess_sentence,
                                 ngram_range=(2, 2),
                                 stop_words='english')
    pipeline = Pipeline([
        ('vect', vectorizer),
        ('clf', SGDClassifier()),
    ])
    # pprint(parameters)
    # t0 = time()
    # grid_search.fit(sentences, Y)
    # print("done in %0.3fs" % (time() - t0))
    # print()
    # print("Best score: %0.3f" % grid_search.best_score_)
    X = vectorizer.fit_transform(sentences)
    num_samples = len(Y)
    num_train = int(num_samples * .8)
    print "Num training: %d" % num_train
    X_train = X[0:num_train]
    Y_train = Y[0:num_train]
    X_test = X[num_train:]
    Y_test = Y[num_train:]
    analyze = vectorizer.build_analyzer()
    for sentence in sentences[0:10]:
        print preprocess_sentence(sentence)
        print analyze(sentence)
        print "LemmaTokenizer" + str(LemmaTokenizer()(sentence))
        print StemmingTokenizer()(sentence)
    # tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    # tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    # chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)
    logistic = linear_model.LogisticRegression(C=.5, class_weight=None, dual=False,
                                               fit_intercept=True, intercept_scaling=1, max_iter=100,
                                               multi_class='ovr', penalty='l2', random_state=None,
                                               solver='liblinear', tol=0.0001, verbose=0)
    # grid_search = GridSearchCV(SVC(), svm_param_grid, n_jobs=-1, verbose=1)
    # grid_search.fit(X_train, Y_train)
    # print grid_search.score(X_test, Y_test)
    # best_parameters = grid_search.best_estimator_.get_params()
    # print best_parameters
    # grid_search = GridSearchCV(logistic, param_grid, n_jobs=-1, verbose=1)
    # grid_search.fit(X_train, Y_train)
    # print grid_search.score(X_test, Y_test)
    # best_parameters = grid_search.best_estimator_.get_params()
    # print best_parameters
    print logistic.fit(X_train, Y_train).score(X_test, Y_test)
    show_most_informative_features(vectorizer, logistic, 25)
    num_errors = 0
    feature_names = vectorizer.vocabulary_
    feature_index = inv_map = {v: k for k, v in feature_names.items()}
    y_pred = []
    for (i, x) in enumerate(X_test):
        y_hat = logistic.predict(x)
        y_pred.append(y_hat)
        if y_hat != Y_test[i]:
            num_errors += 1
            print "\n\nError predicting sentence: " + sentences[i + num_train]
            print print_features(x, feature_index)
            print "Label: " + str(Y_test[i])
    error_rate = float(num_errors) / len(Y_test)
    print "Accuracy : " + str(1 - error_rate)
Example 9: build_analyzer
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or alternatively: from sklearn.feature_extraction.text.TfidfVectorizer import build_analyzer [as alias]
def build_analyzer(self):
    analyzer = TfidfVectorizer.build_analyzer(self)
    return lambda doc: (StemmedTfidfVectorizer.english_stemmer.stem(w) for w in analyzer(doc))
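This override is only a fragment; it assumes a TfidfVectorizer subclass that exposes english_stemmer as a class attribute. A plausible reconstruction of the full class (the SnowballStemmer choice is an assumption, not confirmed by the snippet):

from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

class StemmedTfidfVectorizer(TfidfVectorizer):
    # class-level stemmer, matching the StemmedTfidfVectorizer.english_stemmer reference above
    english_stemmer = SnowballStemmer('english')

    def build_analyzer(self):
        analyzer = TfidfVectorizer.build_analyzer(self)
        return lambda doc: (StemmedTfidfVectorizer.english_stemmer.stem(w) for w in analyzer(doc))

# usage: inflected forms collapse onto a single stemmed feature
vect = StemmedTfidfVectorizer(stop_words='english')
X = vect.fit_transform(["graphics cards", "a graphic card"])
print(sorted(vect.vocabulary_))   # e.g. ['card', 'graphic']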
Example 10: open
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or alternatively: from sklearn.feature_extraction.text.TfidfVectorizer import build_analyzer [as alias]
sentences = scipy.io.loadmat('../../data/sentences/memorability_888_img_5_sent.mat')
sentences = sentences['memorability_sentences']
f = open('../../automated_specificity.txt', 'w')
sent_pairs, scores_w = list(), list()
vectorizer = TfidfVectorizer(token_pattern='(?u)\\b\\w\\w\\w+\\b')
corpus = list()

# Build corpus
for sent_group in sentences:
    corpus.append(' '.join([sent[0] for sent in sent_group]))

vectorizer.fit(corpus)
analyze = vectorizer.build_analyzer()
specificity_max, specificity_w = list(), list()
for im_idx, sentence_group in enumerate(sentences):
    similarity_max, similarity_w = list(), list()
    for (sent1, sent2) in combinations(sentence_group, 2):
        words1, words2 = analyze(sent1[0]), analyze(sent2[0])
        sent1_weights = [vectorizer.transform(sent1).toarray()[0][vectorizer.vocabulary_.get(w)] for w in words1]
        sent2_weights = [vectorizer.transform(sent2).toarray()[0][vectorizer.vocabulary_.get(w)] for w in words2]
        print >> f, [w.encode('utf-8') for w in words1]
        print >> f, [PrettyFloat(w) for w in sent1_weights]
        print >> f, [w.encode('utf-8') for w in words2]
Example 11: TfidfVectorizer
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or alternatively: from sklearn.feature_extraction.text.TfidfVectorizer import build_analyzer [as alias]
tfidf_train = tfidftransformer.fit(counts_train).transform(counts_train)
tfidf_test = tfidftransformer.fit(counts_test).transform(counts_test)
# alternatively, let the two tf-idf models share a vocabulary
# method 2: TfidfVectorizer
print '*************************\nTfidfVectorizer\n*************************'
from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer(sublinear_tf=True,
                     max_df=0.5,
                     stop_words='english')
tfidf_train_2 = tv.fit_transform(newsgroup_train.data)
tv2 = TfidfVectorizer(vocabulary=tv.vocabulary_)
tfidf_test_2 = tv2.fit_transform(newsgroups_test.data)
print "the shape of train is " + repr(tfidf_train_2.shape)
print "the shape of test is " + repr(tfidf_test_2.shape)
analyze = tv.build_analyzer()
tv.get_feature_names()  # statistical features/terms

# F1 = 2 * (precision * recall) / (precision + recall)
def calculate_result(actual, pred):
    m_precision = metrics.precision_score(actual, pred)
    m_recall = metrics.recall_score(actual, pred)
    print 'predict info:'
    print 'precision:{0:.3f}'.format(m_precision)
    print 'recall:{0:0.3f}'.format(m_recall)
    print 'f1-score:{0:.3f}'.format(metrics.f1_score(actual, pred))

# or use sklearn's ready-made feature loader, fetch_20newsgroups_vectorized
print '*************************\nfetch_20newsgroups_vectorized\n*************************'
from sklearn.datasets import fetch_20newsgroups_vectorized
tfidf_train_3 = fetch_20newsgroups_vectorized(subset='train')
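As a side note, the shared-vocabulary trick above (TfidfVectorizer(vocabulary=tv.vocabulary_)) keeps the test columns aligned with the training columns, but fitting a second vectorizer recomputes IDF from the test documents. A minimal sketch of the more common pattern, which reuses both the vocabulary and the training IDF weights by calling transform() on the already-fitted vectorizer:

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

train = fetch_20newsgroups(subset='train')
test = fetch_20newsgroups(subset='test')

tv = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
tfidf_train = tv.fit_transform(train.data)
# transform() keeps the training vocabulary and idf weights,
# so the test matrix is directly comparable to the training matrix
tfidf_test = tv.transform(test.data)
print(tfidf_train.shape, tfidf_test.shape)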
Example 12: main
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or alternatively: from sklearn.feature_extraction.text.TfidfVectorizer import build_analyzer [as alias]
def main():
    global X
    logging.info('Started')
    # pulling primary bill sponsor to match with party information
    sponsors_query = db.bills_details.find({},
        {'_id': 1, 'sponsors.leg_id': 1, 'sponsors.type': 1, 'sponsors.name': 1,
         'action_dates.signed': 1})  # able to limit number of records for testing
    sponsors = list(sponsors_query)
    bill_party = []
    # sponsors[0]['sponsors'][0]
    # Creates a list of dicts: bill database ID, passed status, legislator ID and party
    for i in range(len(sponsors)):
        bill_dbid = sponsors[i]['_id']
        leg_id = sponsors[i]['sponsors'][0]['leg_id']
        if leg_id == None:
            leg_id = 'CA0000'
            party = sponsors[i]['sponsors'][0]['name']
        else:
            party = GetParty(leg_id)
            if party == None:
                party = sponsors[i]['sponsors'][0]['name']
        if sponsors[i]['action_dates']['signed'] == None:
            bill_signed = False
        else:
            bill_signed = True
        k = ['id', 'leg_id', 'party', 'passed']
        v = [bill_dbid, leg_id, party, bill_signed]
        bill_party.append(dict(zip(k, v)))
    logging.info('populated list of sponsor and party')
    # note to self/presentation: show number of bills sponsored by non-legislators
    # graph bills by party that passed .....
    # Do I need to create/update a dictionary? This pulls MongoDB _id and texts
    # all_legtext = list(db.legtext.find({}, {'text': 1}).limit(25))
    # adds vectorized features of bigrams using function
    # for i in range(len(bill_party)):
    #     vec = GetBigramsVector(bill_party[i]['id'])
    #     bill_party[i]['vec'] = vec
    # logging.info('loaded vectorized bigrams')
    bigram_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), token_pattern=r'\b\w+\b', min_df=1)
    analyze = bigram_vectorizer.build_analyzer()
    for i in range(len(bill_party)):
        #oid = bill_party[i]['id']
        #print "Getting text for item", i, bill_party[i]['id']
        leg_text = list(db.legtext.find({'_id': bill_party[i]['id']}, {'text': 1}))[0]['text']
        raw = nltk.clean_html(leg_text)
        # bigram_vectorizer = CountVectorizer(ngram_range=(1, 2), token_pattern=r'\b\w+\b', min_df=1)
        bigram_features = analyze(raw)
        bill_party[i]['features'] = bigram_features
        bill_party[i]['raw'] = raw
        # bill_party[i]['vec'] = bigram_vectorizer.fit_transform(bigram_features).toarray()
    party_options = {'democratic': 0, 'republican': 1}
    X = bigram_vectorizer.fit_transform([x['raw'] for x in bill_party if x['party'].lower() in party_options])
    print bigram_vectorizer
    logging.info('loaded tfidf vectorized bigrams')
    # Creates numpy arrays, results = party and features = vectorized words
    # party only = democrat or republican and vectorized text
    bp_target = []
    bp_data = []
    for i in range(len(bill_party)):
        if bill_party[i]['party'].lower() in ('democratic', 'republican'):
            bp_target.append(party_options[bill_party[i]['party'].lower()])
        else:
            continue
    targets = np.array(bp_target)
    data = X.toarray()
    #====================================================================================
    # Random Forests Modeling and Plotting
    #====================================================================================
    # Parameters
    n_classes = 2
    n_estimators = 30
    plot_colors = "ryb"
    cmap = pl.cm.RdYlBu
    plot_step = 0.02  # fine step width for decision surface contours
    plot_step_coarser = 0.5  # step widths for coarse classifier guesses
    RANDOM_SEED = 9  # fix the seed on each iteration ???
    plot_idx = 1
    models = [DecisionTreeClassifier(max_depth=None),
              RandomForestClassifier(n_estimators=n_estimators),
              ExtraTreesClassifier(n_estimators=n_estimators),
              AdaBoostClassifier(DecisionTreeClassifier(max_depth=3),
                                 n_estimators=n_estimators)]
    # ......... remainder of this example omitted .........
Example 13: build_analyzer
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or alternatively: from sklearn.feature_extraction.text.TfidfVectorizer import build_analyzer [as alias]
def build_analyzer(self):
    analyzer = TfidfVectorizer.build_analyzer(self)
    english_stemmer = SnowballStemmer('english')
    return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))
Example 14: __init__
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or alternatively: from sklearn.feature_extraction.text.TfidfVectorizer import build_analyzer [as alias]
def __init__(self, n_features, voc_file):
    self.n_features = n_features
    self.voc_file = voc_file
    self.word_clusters, self.grouped_words = self.read_word_cluster(voc_file)
    tfidf = TfidfVectorizer(encoding='iso-8859-1', stop_words='english')
    self.vectorize = tfidf.build_analyzer()
Example 15: fetch_20newsgroups
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or alternatively: from sklearn.feature_extraction.text.TfidfVectorizer import build_analyzer [as alias]
data_full.append(SiteData('fb/srsplit/fullfbsearch_results_combined{i:02d}'.format(i=file_counter), categories, full_candidate_dict))
"""
data_train = fetch_20newsgroups(subset='train', categories=categories,
                                shuffle=True, random_state=42)
data_test = fetch_20newsgroups(subset='test', categories=categories,
                               shuffle=True, random_state=42)
"""
print 'data loaded'
import conversions as conv
from ersatzpg.utffile import utffile
special_terms = []
vocabulary = []
basic_vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, use_idf=False,
                                   stop_words='english')
basic_analyze = basic_vectorizer.build_analyzer()
with utffile('searchterms.csv') as f:
    for s in f:
        if s.startswith('<'):
            special_terms.append(s.strip('<>'))
        else:
            vocabulary.append(s.decode('utf-8').strip())
fb_page_data = {}
with open('fb/facebookpolsurls_bkp.csv') as f:
    csvr = csv.DictReader(f)
    for l in csvr:
        fb_page_data.update({l['url']: {'fans': l['Fan Count'].replace(',', ''), 'authentic': l['Authentic Category']}})

def analyze(s):
    d = eval(s)
    special_keys = []