This page collects typical usage examples of the Python method KaggleWord2VecUtility.KaggleWord2VecUtility.review_to_wordlist. If you have been struggling with questions such as how exactly review_to_wordlist is used or what calling it looks like in practice, the curated code examples below may help. You can also explore further usage examples of its containing class, KaggleWord2VecUtility.KaggleWord2VecUtility.
The following shows 15 code examples of KaggleWord2VecUtility.review_to_wordlist, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
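For reference, here is a minimal sketch of the method all the examples below call, modeled on the Kaggle "Bag of Words Meets Bags of Popcorn" tutorial from which this utility originates; individual repositories may differ in details, and the NLTK stop-word list is assumed to be downloaded:

import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords

class KaggleWord2VecUtility(object):

    @staticmethod
    def review_to_wordlist(review, remove_stopwords=False):
        # 1. Strip any HTML markup
        review_text = BeautifulSoup(review, "html.parser").get_text()
        # 2. Keep letters only
        review_text = re.sub("[^a-zA-Z]", " ", review_text)
        # 3. Lower-case and split into individual words
        words = review_text.lower().split()
        # 4. Optionally remove English stop words
        if remove_stopwords:
            stops = set(stopwords.words("english"))
            words = [w for w in words if w not in stops]
        # 5. Return the list of words
        return words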
Example 1: main
# Required import: from KaggleWord2VecUtility import KaggleWord2VecUtility [as alias]
# Or: from KaggleWord2VecUtility.KaggleWord2VecUtility import review_to_wordlist [as alias]
def main():
    start_time = datetime.now()
    df = pd.read_csv(os.path.join(os.path.dirname(__file__), 'data', 'labeledTrainData.tsv'), header=0, delimiter="\t", quoting=3)
    # test = pd.read_csv(os.path.join(os.path.dirname(__file__), 'data', 'testData.tsv'), header=0, delimiter="\t", quoting=3)
    train, test, y, y_test = cross_validation.train_test_split(df['review'].values, df['sentiment'].values, test_size=0.4, random_state=0)
    print "Cleaning and parsing movie reviews...\n"
    traindata = []
    for i in xrange(0, len(train)):
        traindata.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(train[i], False)))
    testdata = []
    for i in xrange(0, len(test)):
        testdata.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(test[i], False)))
    print 'vectorizing... ',
    tfv = TfidfVectorizer(min_df=3, max_features=None, strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
                          ngram_range=(1, 2), use_idf=1, smooth_idf=1, sublinear_tf=1, stop_words='english')
    X_all = traindata + testdata
    lentrain = len(traindata)
    print "fitting pipeline... ",
    tfv.fit(X_all)
    X_all = tfv.transform(X_all)
    X = X_all[:lentrain]
    X_test = X_all[lentrain:]
    model = LogisticRegression(penalty='l2', dual=True, tol=0.0001,
                               C=1, fit_intercept=True, intercept_scaling=1.0,
                               class_weight=None, random_state=None)
    print "10 Fold CV Score: ", np.mean(cross_validation.cross_val_score(model, X, y, cv=10, scoring='roc_auc'))
    print "Retrain on all training data, predicting test labels...\n"
    model.fit(X, y)
    # result = model.predict_proba(X_test)[:, 1]  # predict as probability
    result = model.predict(X_test)
    # output = pd.DataFrame(data={"id": test["id"], "sentiment": result})
    # Copy the results to a pandas dataframe holding the true sentiment and the predicted sentiment
    output = pd.DataFrame(data={"sentiment": y_test, "predict_sentiment": result})
    output['succeed'] = output['sentiment'] == output['predict_sentiment']
    groupby = output.groupby('succeed')
    print 'Result Evaluation'
    print groupby['sentiment'].agg(['count'])
    # Use pandas to write the comma-separated output file
    output.to_csv(os.path.join(os.path.dirname(__file__), 'data', 'Bag_of_Words_model_linear.csv'), index=False, quoting=3)
    print "Wrote results to Bag_of_Words_model_linear.csv"
    print datetime.now() - start_time
Example 2: tokenize
# Required import: from KaggleWord2VecUtility import KaggleWord2VecUtility [as alias]
# Or: from KaggleWord2VecUtility.KaggleWord2VecUtility import review_to_wordlist [as alias]
def tokenize(sentence, grams):
    words = KaggleWord2VecUtility.review_to_wordlist(sentence)
    tokens = []
    for gram in grams:
        for i in range(len(words) - gram + 1):
            tokens += ["_*_".join(words[i:i + gram])]
    return tokens
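For instance, assuming review_to_wordlist lower-cases and strips punctuation as in the sketch at the top of this page, a hypothetical call with unigrams and bigrams would give:

>>> tokenize("This movie was great!", grams=[1, 2])
['this', 'movie', 'was', 'great', 'this_*_movie', 'movie_*_was', 'was_*_great']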
Example 3: create_task
# Required import: from KaggleWord2VecUtility import KaggleWord2VecUtility [as alias]
# Or: from KaggleWord2VecUtility.KaggleWord2VecUtility import review_to_wordlist [as alias]
def create_task():
    if not request.json or 'id' not in request.json:
        abort(400)
    task = {
        'id': request.json['id'],
        'text': request.json['text'],
    }
    clean_test_descripciones = []
    app.logger.info('petition_classification: ' + task['text'])
    features = review_words(task['text'])
    clean_test_descripciones.append(u" ".join(
        KaggleWord2VecUtility.review_to_wordlist(features, True)))
    # Uses a chord to run two jobs and a callback after processing ends:
    # 1) a text classifier
    # 2) a profanity filter
    # 3) a callback to put it all together into a JSON response
    callback = update_remote_petition.subtask()
    chord([
        evaluate_petition.s(task['id'], clean_test_descripciones),
        catch_bad_words_in_text.s(task['text'])
    ])(callback)
    return jsonify({'id': request.json['id'],
                    'text': request.json['text']}), 201
Example 4: getCleanDescriptions
# Required import: from KaggleWord2VecUtility import KaggleWord2VecUtility [as alias]
# Or: from KaggleWord2VecUtility.KaggleWord2VecUtility import review_to_wordlist [as alias]
def getCleanDescriptions(descriptions):
    clean_descriptions = []
    local_counter = 0
    for description in descriptions["description"]:
        clean_descriptions.append(KaggleWord2VecUtility.review_to_wordlist(description, remove_stopwords=True))
        local_counter += 1
        print('Adding line: ' + str(local_counter))
    return clean_descriptions
Example 5: getCleanLabeledReviews
# Required import: from KaggleWord2VecUtility import KaggleWord2VecUtility [as alias]
# Or: from KaggleWord2VecUtility.KaggleWord2VecUtility import review_to_wordlist [as alias]
def getCleanLabeledReviews(reviews):
    clean_reviews = []
    for review in reviews["review"]:
        clean_reviews.append(KaggleWord2VecUtility.review_to_wordlist(review, True))
    # Pair each cleaned review with its id (LabeledSentence comes from gensim.models.doc2vec)
    labelized = []
    for i, id_label in enumerate(reviews["id"]):
        labelized.append(LabeledSentence(clean_reviews[i], [id_label]))
    return labelized
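The labelized output is in the shape gensim's Doc2Vec expects. A hedged usage sketch (LabeledSentence and this constructor signature belong to older gensim releases; some_review_id is a hypothetical placeholder):

from gensim.models import Doc2Vec

labeled_train = getCleanLabeledReviews(train)
model = Doc2Vec(labeled_train, size=100, window=8, min_count=5, workers=4)
vector = model.docvecs[some_review_id]  # look up the vector learned for a given review id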
Example 6: review_to_sentences
# Required import: from KaggleWord2VecUtility import KaggleWord2VecUtility [as alias]
# Or: from KaggleWord2VecUtility.KaggleWord2VecUtility import review_to_wordlist [as alias]
def review_to_sentences(review, tokenizer, remove_stopwords=False):
    # Function to split a review into parsed sentences. Returns a
    # list of sentences, where each sentence is a list of words.
    #
    # 1. Use the NLTK tokenizer to split the paragraph into sentences
    raw_sentences = tokenizer.tokenize(review.strip())
    #
    # 2. Loop over each sentence
    sentences = []
    for raw_sentence in raw_sentences:
        # If a sentence is empty, skip it
        if len(raw_sentence) > 0:
            # Otherwise, call review_to_wordlist to get a list of words
            sentences.append(KaggleWord2VecUtility.review_to_wordlist(raw_sentence, remove_stopwords))
    #
    # Return the list of sentences (each sentence is a list of words,
    # so this returns a list of lists)
    return sentences
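A typical call, following the Kaggle tutorial this helper comes from (assumes the NLTK 'punkt' model is available, e.g. after nltk.download('punkt')):

import nltk.data

tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
sentences = review_to_sentences("Great film. Terrible ending.", tokenizer)
# -> [['great', 'film'], ['terrible', 'ending']], given the cleaning sketched above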
Example 7: xrange
# Required import: from KaggleWord2VecUtility import KaggleWord2VecUtility [as alias]
# Or: from KaggleWord2VecUtility.KaggleWord2VecUtility import review_to_wordlist [as alias]
from sklearn import cross_validation

model = Word2Vec.load_word2vec_format(constants.GOOGLE_WORD2VEC, binary=True)
train = pd.read_csv(os.path.join(
    os.path.dirname(__file__), '..', 'fixtures', 'labeledTrainData.tsv'),
    header=0, delimiter="\t", quoting=csv.QUOTE_NONE)
test = pd.read_csv(os.path.join(
    os.path.dirname(__file__), '..', 'fixtures', 'testData.tsv'),
    header=0, delimiter="\t", quoting=csv.QUOTE_NONE)
y = train["sentiment"]
print "Cleaning and parsing movie reviews...\n"
traindata = []
for i in xrange(0, len(train["review"])):
    traindata.append(" ".join(
        KaggleWord2VecUtility.review_to_wordlist(train["review"][i], True)))
testdata = []
for i in xrange(0, len(test["review"])):
    testdata.append(" ".join(
        KaggleWord2VecUtility.review_to_wordlist(test["review"][i], True)))
X_all = traindata + testdata
lentrain = len(traindata)
print "fitting pipeline... ",
vectorizer = CountVectorizer(min_df=4)
vectorizer.fit(X_all)
start = time.time()
# Set "k" (num_clusters) to be 1/5th of the vocabulary size, or an
Example 8: xrange
# Required import: from KaggleWord2VecUtility import KaggleWord2VecUtility [as alias]
# Or: from KaggleWord2VecUtility.KaggleWord2VecUtility import review_to_wordlist [as alias]
from KaggleWord2VecUtility import KaggleWord2VecUtility
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import cross_validation
import pandas as pd
import numpy as np
import os  # needed for the path handling below (missing from the excerpt)

train = pd.read_csv(os.path.join(os.path.dirname(__file__), '../', 'data', 'labeledTrainData.tsv'),
                    header=0, delimiter="\t", quoting=3)
test = pd.read_csv(os.path.join(os.path.dirname(__file__), '../', 'data', 'testData.tsv'),
                   header=0, delimiter="\t", quoting=3)
y = train["sentiment"]
print "Cleaning and parsing movie reviews...\n"
traindata = []
for i in xrange(0, len(train["review"])):
    traindata.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(train["review"][i], False)))
testdata = []
for i in xrange(0, len(test["review"])):
    testdata.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(test["review"][i], False)))
print 'vectorizing... ',
tfv = TfidfVectorizer(min_df=3, max_features=None,
                      strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
                      ngram_range=(1, 2), use_idf=1, smooth_idf=1, sublinear_tf=1,
                      stop_words='english')
X_all = traindata + testdata
lentrain = len(traindata)
print "fitting pipeline... ",
tfv.fit(X_all)
X_all = tfv.transform(X_all)
Example 9: TfidfVectorizer
# Required import: from KaggleWord2VecUtility import KaggleWord2VecUtility [as alias]
# Or: from KaggleWord2VecUtility.KaggleWord2VecUtility import review_to_wordlist [as alias]
train_file = 'data/labeledTrainData.tsv'
test_file = 'data/testData.tsv'
output_file = 'data/bow_predictions.csv'
#
train = pd.read_csv( train_file, header = 0, delimiter = "\t", quoting = 3 )
test = pd.read_csv( test_file, header = 0, delimiter = "\t", quoting = 3 )
#
print "Parsing train reviews..."
clean_train_reviews = []
for review in train['review']:
    clean_train_reviews.append( " ".join( KaggleWord2VecUtility.review_to_wordlist( review )))
print "Parsing test reviews..."
clean_test_reviews = []
for review in test['review']:
    clean_test_reviews.append( " ".join( KaggleWord2VecUtility.review_to_wordlist( review )))
#
print "Vectorizing train..."
vectorizer = TfidfVectorizer( max_features = 40000, ngram_range = ( 1, 3 ),
                              sublinear_tf = True )
train_x = vectorizer.fit_transform( clean_train_reviews )
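The excerpt stops after fitting on the training text; presumably the test reviews are then transformed with the same fitted vocabulary, along the lines of this sketch (not shown on this page):

print "Vectorizing test..."
test_x = vectorizer.transform( clean_test_reviews )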
Example 10: clean_review_function
# Required import: from KaggleWord2VecUtility import KaggleWord2VecUtility [as alias]
# Or: from KaggleWord2VecUtility.KaggleWord2VecUtility import review_to_wordlist [as alias]
def clean_review_function(review):
    list_of_words = KaggleWord2VecUtility.review_to_wordlist(review, remove_stopwords=False)
    return ' '.join(list_of_words)
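A wrapper like this is typically applied column-wise with pandas. A small hedged usage sketch (the file path and column names are assumptions):

import pandas as pd

df = pd.read_csv('data/labeledTrainData.tsv', header=0, delimiter="\t", quoting=3)
df['clean_review'] = df['review'].apply(clean_review_function)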
Example 11:
# Required import: from KaggleWord2VecUtility import KaggleWord2VecUtility [as alias]
# Or: from KaggleWord2VecUtility.KaggleWord2VecUtility import review_to_wordlist [as alias]
# Initialize empty lists to hold the clean reviews and their tag labels
traindata = []
testdata = []
Y1 = []
Y2 = []
# Loop over each review key, cleaning the review text and collecting
# the tag labels found in tag_dic
for i in train:
    buf = []
    traindata.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(train[i][0], True)))
    for j in train[i][3].split():
        if j in tag_dic:
            buf.append(tag_dic[j])
    Y1.append(buf)
for i in test:
    buf = []
    testdata.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(test[i][0], True)))
    for j in test[i][3].split():
        if j in tag_dic:
            buf.append(tag_dic[j])
    Y2.append(buf)
# ****** Create a bag of words from the training set
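The trailing comment announces a bag-of-words step whose code is not shown in this excerpt; a minimal sketch of what that step might look like (parameter values are assumptions):

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(analyzer="word", max_features=5000)
X_train = vectorizer.fit_transform(traindata)  # learn the vocabulary on the training text
X_test = vectorizer.transform(testdata)        # reuse it for the test text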
Example 12: train_test_split
# Required import: from KaggleWord2VecUtility import KaggleWord2VecUtility [as alias]
# Or: from KaggleWord2VecUtility.KaggleWord2VecUtility import review_to_wordlist [as alias]
train_i, test_i = data.ix[:, 11] != -1, data.ix[:, 11] == -1
train = data.ix[train_i]
test = data.ix[test_i]
#train_i, valid_i = train_test_split( np.arange( len( train )), train_size = 0.8, random_state = 88 )
#train = train.ix[train_i]
#validation = train.ix[valid_i]
#
print "Parsing train job titles..."
clean_train_reviews = []
for title in train['abstract']:
    clean_train_reviews.append( " ".join( KaggleWord2VecUtility.review_to_wordlist(title, remove_stopwords=False)))
print "Parsing test reviews..."
clean_test_reviews = []
for title in test['abstract']:
    clean_test_reviews.append( " ".join( KaggleWord2VecUtility.review_to_wordlist(title, remove_stopwords=False)))
#print "Parsing validation reviews..."
#clean_valid_reviews = []
#for title in validation['title']:
#    clean_valid_reviews.append( " ".join( KaggleWord2VecUtility.review_to_wordlist(title, remove_stopwords=False)))
#
Example 13: xrange
# Required import: from KaggleWord2VecUtility import KaggleWord2VecUtility [as alias]
# Or: from KaggleWord2VecUtility.KaggleWord2VecUtility import review_to_wordlist [as alias]
test_pkl = 'shortened_' + test_pkl
try:
    traindata = pickle.load(open(os.path.join(base_path, 'data',
                                              train_pkl), 'r'))
    testdata = pickle.load(open(os.path.join(base_path, 'data',
                                             test_pkl), 'r'))
except IOError as e:
    if e.errno != errno.ENOENT:
        raise e
    else:
        _logger.info('cleaning and parsing movie reviews')
        traindata = []
        for i in xrange(0, len(train["review"])):
            review = KaggleWord2VecUtility.review_to_wordlist(train["review"][i],
                                                              False)
            if SHORT_REVIEW:
                review = review[:4]
            traindata.append(' '.join(review))
        testdata = []
        for i in xrange(0, len(test["review"])):
            review = KaggleWord2VecUtility.review_to_wordlist(test["review"][i],
                                                              False)
            if SHORT_REVIEW:
                review = review[:4]
            testdata.append(' '.join(review))
        pickle.dump(traindata, open(os.path.join(base_path, 'data',
                                                 train_pkl), 'w'))
        pickle.dump(testdata, open(os.path.join(base_path, 'data',
                                                test_pkl), 'w'))
Example 14: main
# Required import: from KaggleWord2VecUtility import KaggleWord2VecUtility [as alias]
# Or: from KaggleWord2VecUtility.KaggleWord2VecUtility import review_to_wordlist [as alias]
def main():
    start_time = datetime.now()
    df = pd.read_csv(os.path.join(os.path.dirname(__file__), 'data', 'labeledTrainData.tsv'), header=0, delimiter="\t", quoting=3)
    if GO_FOR_REAL:
        test = pd.read_csv(os.path.join(os.path.dirname(__file__), 'data', 'testData.tsv'), header=0, delimiter="\t", quoting=3)
        train = df['review']
        train_sentiment = df['sentiment']
        test_id = test['id'].str.replace('"', '')
        test = test['review']
    else:
        train, test, train_sentiment, test_sentiment = cross_validation.train_test_split(df['review'].values, df['sentiment'].values, test_size=0.4, random_state=0)
    print 'Download text data sets. If you already have NLTK datasets downloaded, just close the Python download window...'
    #nltk.download()  # Download text data sets, including stop words
    # Initialize an empty list to hold the clean reviews
    clean_train_reviews = []
    # Loop over each review; create an index i that goes from 0 to the length
    # of the movie review list
    print "Cleaning and parsing the training set movie reviews...\n"
    for i in xrange(0, len(train)):
        clean_train_reviews.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(train[i], True)))
    # ****** Create a bag of words from the training set
    #
    print "Creating the bag of words...\n"
    # Initialize the "CountVectorizer" object, scikit-learn's bag-of-words tool.
    # CountVectorizer transforms each document into a word-count vector, with each word as a feature.
    # Stop words are very frequent words in a language that carry little semantic weight and can
    # dilute the importance of other, more meaningful words.
    # N-grams provide word combinations as additional features.
    # stop_words: 'english' uses the stop words from sklearn.feature_extraction.stop_words.ENGLISH_STOP_WORDS,
    # which seems to make results less stable here.
    # max_features: keeps only the most frequently occurring words in the vocabulary; None allows
    # all features and increases the vector size.
    # vectorizer = CountVectorizer(analyzer="word", tokenizer=None, preprocessor=None, stop_words='english', max_features=None)
    # Tf-idf: a normalization method that reduces the weight of words appearing too frequently in the dataset.
    # TfidfVectorizer: a CountVectorizer that applies tf-idf normalization during transform.
    vectorizer = TfidfVectorizer(min_df=3, max_features=None, strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
                                 ngram_range=(1, 2), use_idf=1, smooth_idf=1, sublinear_tf=1, stop_words='english')
    # fit_transform() does two things: first, it fits the model and learns the vocabulary;
    # second, it transforms our training data into feature vectors. The input to
    # fit_transform should be a list of strings.
    train_data_features = vectorizer.fit_transform(clean_train_reviews)
    print 'Train data feature shape: ' + str(train_data_features.shape)
    print 'Number of vocabularies/features: %d\n' % len(vectorizer.get_feature_names())
    # Numpy arrays are easy to work with, so convert the result to an array
    train_data_features = train_data_features.toarray()
    # ******* Train a model using the bag of words
    #
    print "Training the model (this may take a while)..."
    # Alternative classifiers:
    # clf = RandomForestClassifier(n_estimators=100)  # a Random Forest with 100 trees
    # clf = svm.LinearSVC(C=1)
    clf = LogisticRegressionCV(cv=3, scoring='roc_auc', solver='liblinear', Cs=[3, 4, 5, 6, 7])
    # Cross validation, this takes a long time ...
    # print "4 Fold CV Score: ", np.mean(cross_validation.cross_val_score(clf, train_data_features, train_sentiment, cv=4, scoring='accuracy', n_jobs=4))
    # Fit the classifier to the training set, using the bag of words as
    # features and the sentiment labels as the response variable
    #
    # This may take a few minutes to run
    model = clf.fit(train_data_features, train_sentiment)
    # Create an empty list and append the clean reviews one by one
    clean_test_reviews = []
    print "Cleaning and parsing the test set movie reviews...\n"
    for i in xrange(0, len(test)):
        clean_test_reviews.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(test[i], True)))
    # Get a bag of words for the test set, and convert to a numpy array
    test_data_features = vectorizer.transform(clean_test_reviews)
    test_data_features = test_data_features.toarray()
    # Use the fitted model to make sentiment label predictions
    print "Predicting test labels...\n"
    # Copy the results to a pandas dataframe with an "id" column and
    # a "sentiment" column
    if GO_FOR_REAL:
        result = model.predict_proba(test_data_features)[:, 1]  # predict as probability
        output = pd.DataFrame(data={"id": test_id, "sentiment": result})
    else:
        result = model.predict(test_data_features)
        output = pd.DataFrame(data={"sentiment": test_sentiment, "predict_sentiment": result})
        output['succeed'] = output['sentiment'] == output['predict_sentiment']
#......... (part of the code is omitted here) .........
Example 15: clean_review_function
# Required import: from KaggleWord2VecUtility import KaggleWord2VecUtility [as alias]
# Or: from KaggleWord2VecUtility.KaggleWord2VecUtility import review_to_wordlist [as alias]
def clean_review_function(review):
    global master_word_dict, number_of_rows
    list_of_words = KaggleWord2VecUtility.review_to_wordlist(review, remove_stopwords=False)
    return ' '.join(list_of_words)