This page collects typical usage examples of the Python method KaggleWord2VecUtility.KaggleWord2VecUtility.review_to_wordlist. If you have been struggling with questions such as how exactly review_to_wordlist is used or what calling it looks like in practice, the curated code examples below may help. You can also explore further usage examples of its containing class, KaggleWord2VecUtility.KaggleWord2VecUtility.
The following shows 15 code examples of KaggleWord2VecUtility.review_to_wordlist, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
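For reference, here is a minimal sketch of the method all the examples below call, modeled on the Kaggle "Bag of Words Meets Bags of Popcorn" tutorial from which this utility originates; individual repositories may differ in details, and the NLTK stop-word list is assumed to be downloaded:

import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords

class KaggleWord2VecUtility(object):

    @staticmethod
    def review_to_wordlist(review, remove_stopwords=False):
        # 1. Strip any HTML markup
        review_text = BeautifulSoup(review, "html.parser").get_text()
        # 2. Keep letters only
        review_text = re.sub("[^a-zA-Z]", " ", review_text)
        # 3. Lower-case and split into individual words
        words = review_text.lower().split()
        # 4. Optionally remove English stop words
        if remove_stopwords:
            stops = set(stopwords.words("english"))
            words = [w for w in words if w not in stops]
        # 5. Return the list of words
        return words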
Example 1: main
# Required import: from KaggleWord2VecUtility import KaggleWord2VecUtility [as alias]
# Or: from KaggleWord2VecUtility.KaggleWord2VecUtility import review_to_wordlist [as alias]
def main():
    start_time = datetime.now()
    df = pd.read_csv(os.path.join(os.path.dirname(__file__), 'data', 'labeledTrainData.tsv'), header=0, delimiter="\t", quoting=3)
    # test = pd.read_csv(os.path.join(os.path.dirname(__file__), 'data', 'testData.tsv'), header=0, delimiter="\t", quoting=3)
    train, test, y, y_test = cross_validation.train_test_split(df['review'].values, df['sentiment'].values, test_size=0.4, random_state=0)
    print "Cleaning and parsing movie reviews...\n"
    traindata = []
    for i in xrange(0, len(train)):
        traindata.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(train[i], False)))
    testdata = []
    for i in xrange(0, len(test)):
        testdata.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(test[i], False)))
    print 'vectorizing... ',
    tfv = TfidfVectorizer(min_df=3, max_features=None, strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
                          ngram_range=(1, 2), use_idf=1, smooth_idf=1, sublinear_tf=1, stop_words='english')
    X_all = traindata + testdata
    lentrain = len(traindata)
    print "fitting pipeline... ",
    tfv.fit(X_all)
    X_all = tfv.transform(X_all)
    X = X_all[:lentrain]
    X_test = X_all[lentrain:]
    model = LogisticRegression(penalty='l2', dual=True, tol=0.0001,
                               C=1, fit_intercept=True, intercept_scaling=1.0,
                               class_weight=None, random_state=None)
    print "10 Fold CV Score: ", np.mean(cross_validation.cross_val_score(model, X, y, cv=10, scoring='roc_auc'))
    print "Retrain on all training data, predicting test labels...\n"
    model.fit(X, y)
    # result = model.predict_proba(X_test)[:, 1]  # predict as probability
    result = model.predict(X_test)
    # output = pd.DataFrame(data={"id": test["id"], "sentiment": result})
    # Copy the results to a pandas dataframe holding the true sentiment and the predicted sentiment
    output = pd.DataFrame(data={"sentiment": y_test, "predict_sentiment": result})
    output['succeed'] = output['sentiment'] == output['predict_sentiment']
    groupby = output.groupby('succeed')
    print 'Result Evaluation'
    print groupby['sentiment'].agg(['count'])
    # Use pandas to write the comma-separated output file
    output.to_csv(os.path.join(os.path.dirname(__file__), 'data', 'Bag_of_Words_model_linear.csv'), index=False, quoting=3)
    print "Wrote results to Bag_of_Words_model_linear.csv"
    print datetime.now() - start_time
Example 2: tokenize
# Required import: from KaggleWord2VecUtility import KaggleWord2VecUtility [as alias]
# Or: from KaggleWord2VecUtility.KaggleWord2VecUtility import review_to_wordlist [as alias]
def tokenize(sentence, grams):
    words = KaggleWord2VecUtility.review_to_wordlist(sentence)
    tokens = []
    for gram in grams:
        for i in range(len(words) - gram + 1):
            tokens += ["_*_".join(words[i:i + gram])]
    return tokens
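For instance, assuming review_to_wordlist lower-cases and strips punctuation as in the sketch at the top of this page, a hypothetical call with unigrams and bigrams would give:

>>> tokenize("This movie was great!", grams=[1, 2])
['this', 'movie', 'was', 'great', 'this_*_movie', 'movie_*_was', 'was_*_great']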
Example 3: create_task
# Required import: from KaggleWord2VecUtility import KaggleWord2VecUtility [as alias]
# Or: from KaggleWord2VecUtility.KaggleWord2VecUtility import review_to_wordlist [as alias]
def create_task():
    if not request.json or 'id' not in request.json:
        abort(400)
    task = {
        'id': request.json['id'],
        'text': request.json['text'],
    }
    clean_test_descripciones = []
    app.logger.info('petition_classification: ' + task['text'])
    features = review_words(task['text'])
    clean_test_descripciones.append(u" ".join(
        KaggleWord2VecUtility.review_to_wordlist(features, True)))
    # Uses a chord to run two jobs and a callback after processing ends:
    # 1) a text classifier
    # 2) a profanity filter
    # 3) a callback to put it all together into a JSON response
    callback = update_remote_petition.subtask()
    chord([
        evaluate_petition.s(task['id'], clean_test_descripciones),
        catch_bad_words_in_text.s(task['text'])
    ])(callback)
    return jsonify({'id': request.json['id'],
                    'text': request.json['text']}), 201
Example 4: getCleanDescriptions
# Required import: from KaggleWord2VecUtility import KaggleWord2VecUtility [as alias]
# Or: from KaggleWord2VecUtility.KaggleWord2VecUtility import review_to_wordlist [as alias]
def getCleanDescriptions(descriptions):
    clean_descriptions = []
    local_counter = 0
    for description in descriptions["description"]:
        clean_descriptions.append(KaggleWord2VecUtility.review_to_wordlist(description, remove_stopwords=True))
        local_counter += 1
        print('Adding line: ' + str(local_counter))
    return clean_descriptions
Example 5: getCleanLabeledReviews
# Required import: from KaggleWord2VecUtility import KaggleWord2VecUtility [as alias]
# Or: from KaggleWord2VecUtility.KaggleWord2VecUtility import review_to_wordlist [as alias]
def getCleanLabeledReviews(reviews):
    clean_reviews = []
    for review in reviews["review"]:
        clean_reviews.append(KaggleWord2VecUtility.review_to_wordlist(review, True))
    # Pair each cleaned review with its id (LabeledSentence comes from gensim.models.doc2vec)
    labelized = []
    for i, id_label in enumerate(reviews["id"]):
        labelized.append(LabeledSentence(clean_reviews[i], [id_label]))
    return labelized
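The labelized output is in the shape gensim's Doc2Vec expects. A hedged usage sketch (LabeledSentence and this constructor signature belong to older gensim releases; some_review_id is a hypothetical placeholder):

from gensim.models import Doc2Vec

labeled_train = getCleanLabeledReviews(train)
model = Doc2Vec(labeled_train, size=100, window=8, min_count=5, workers=4)
vector = model.docvecs[some_review_id]  # look up the vector learned for a given review id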
Example 6: review_to_sentences
# Required import: from KaggleWord2VecUtility import KaggleWord2VecUtility [as alias]
# Or: from KaggleWord2VecUtility.KaggleWord2VecUtility import review_to_wordlist [as alias]
def review_to_sentences(review, tokenizer, remove_stopwords=False):
    # Function to split a review into parsed sentences. Returns a
    # list of sentences, where each sentence is a list of words.
    #
    # 1. Use the NLTK tokenizer to split the paragraph into sentences
    raw_sentences = tokenizer.tokenize(review.strip())
    #
    # 2. Loop over each sentence
    sentences = []
    for raw_sentence in raw_sentences:
        # If a sentence is empty, skip it
        if len(raw_sentence) > 0:
            # Otherwise, call review_to_wordlist to get a list of words
            sentences.append(KaggleWord2VecUtility.review_to_wordlist(raw_sentence, remove_stopwords))
    #
    # Return the list of sentences (each sentence is a list of words,
    # so this returns a list of lists)
    return sentences
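A typical call, following the Kaggle tutorial this helper comes from (assumes the NLTK 'punkt' model is available, e.g. after nltk.download('punkt')):

import nltk.data

tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
sentences = review_to_sentences("Great film. Terrible ending.", tokenizer)
# -> [['great', 'film'], ['terrible', 'ending']], given the cleaning sketched above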
Example 7: xrange
# Required import: from KaggleWord2VecUtility import KaggleWord2VecUtility [as alias]
# Or: from KaggleWord2VecUtility.KaggleWord2VecUtility import review_to_wordlist [as alias]
from sklearn import cross_validation

model = Word2Vec.load_word2vec_format(constants.GOOGLE_WORD2VEC, binary=True)
train = pd.read_csv(os.path.join(
    os.path.dirname(__file__), '..', 'fixtures', 'labeledTrainData.tsv'),
    header=0, delimiter="\t", quoting=csv.QUOTE_NONE)
test = pd.read_csv(os.path.join(
    os.path.dirname(__file__), '..', 'fixtures', 'testData.tsv'),
    header=0, delimiter="\t", quoting=csv.QUOTE_NONE)
y = train["sentiment"]
print "Cleaning and parsing movie reviews...\n"
traindata = []
for i in xrange(0, len(train["review"])):
    traindata.append(" ".join(
        KaggleWord2VecUtility.review_to_wordlist(train["review"][i], True)))
testdata = []
for i in xrange(0, len(test["review"])):
    testdata.append(" ".join(
        KaggleWord2VecUtility.review_to_wordlist(test["review"][i], True)))
X_all = traindata + testdata
lentrain = len(traindata)
print "fitting pipeline... ",
vectorizer = CountVectorizer(min_df=4)
vectorizer.fit(X_all)
start = time.time()
# Set "k" (num_clusters) to be 1/5th of the vocabulary size, or an
Example 8: xrange
# Required import: from KaggleWord2VecUtility import KaggleWord2VecUtility [as alias]
# Or: from KaggleWord2VecUtility.KaggleWord2VecUtility import review_to_wordlist [as alias]
from KaggleWord2VecUtility import KaggleWord2VecUtility
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import cross_validation
import pandas as pd
import numpy as np
import os  # needed for the path handling below (missing from the excerpt)

train = pd.read_csv(os.path.join(os.path.dirname(__file__), '../', 'data', 'labeledTrainData.tsv'),
                    header=0, delimiter="\t", quoting=3)
test = pd.read_csv(os.path.join(os.path.dirname(__file__), '../', 'data', 'testData.tsv'),
                   header=0, delimiter="\t", quoting=3)
y = train["sentiment"]
print "Cleaning and parsing movie reviews...\n"
traindata = []
for i in xrange(0, len(train["review"])):
    traindata.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(train["review"][i], False)))
testdata = []
for i in xrange(0, len(test["review"])):
    testdata.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(test["review"][i], False)))
print 'vectorizing... ',
tfv = TfidfVectorizer(min_df=3, max_features=None,
                      strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
                      ngram_range=(1, 2), use_idf=1, smooth_idf=1, sublinear_tf=1,
                      stop_words='english')
X_all = traindata + testdata
lentrain = len(traindata)
print "fitting pipeline... ",
tfv.fit(X_all)
X_all = tfv.transform(X_all)
Example 9: TfidfVectorizer
# Required import: from KaggleWord2VecUtility import KaggleWord2VecUtility [as alias]
# Or: from KaggleWord2VecUtility.KaggleWord2VecUtility import review_to_wordlist [as alias]
train_file = 'data/labeledTrainData.tsv'
test_file = 'data/testData.tsv'
output_file = 'data/bow_predictions.csv'
#
train = pd.read_csv( train_file, header = 0, delimiter = "\t", quoting = 3 )
test = pd.read_csv( test_file, header = 0, delimiter = "\t", quoting = 3 )
#
print "Parsing train reviews..."
clean_train_reviews = []
for review in train['review']:
    clean_train_reviews.append( " ".join( KaggleWord2VecUtility.review_to_wordlist( review )))
print "Parsing test reviews..."
clean_test_reviews = []
for review in test['review']:
    clean_test_reviews.append( " ".join( KaggleWord2VecUtility.review_to_wordlist( review )))
#
print "Vectorizing train..."
vectorizer = TfidfVectorizer( max_features = 40000, ngram_range = ( 1, 3 ),
                              sublinear_tf = True )
train_x = vectorizer.fit_transform( clean_train_reviews )
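The excerpt stops after fitting on the training text; presumably the test reviews are then transformed with the same fitted vocabulary, along the lines of this sketch (not shown on this page):

print "Vectorizing test..."
test_x = vectorizer.transform( clean_test_reviews )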
Example 10: clean_review_function
# Required import: from KaggleWord2VecUtility import KaggleWord2VecUtility [as alias]
# Or: from KaggleWord2VecUtility.KaggleWord2VecUtility import review_to_wordlist [as alias]
def clean_review_function(review):
    list_of_words = KaggleWord2VecUtility.review_to_wordlist(review, remove_stopwords=False)
    return ' '.join(list_of_words)
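A wrapper like this is typically applied column-wise with pandas. A small hedged usage sketch (the file path and column names are assumptions):

import pandas as pd

df = pd.read_csv('data/labeledTrainData.tsv', header=0, delimiter="\t", quoting=3)
df['clean_review'] = df['review'].apply(clean_review_function)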
Example 11:
# Required import: from KaggleWord2VecUtility import KaggleWord2VecUtility [as alias]
# Or: from KaggleWord2VecUtility.KaggleWord2VecUtility import review_to_wordlist [as alias]
# Initialize empty lists to hold the clean reviews and their tag labels
traindata = []
testdata = []
Y1 = []
Y2 = []
# Loop over each review key, cleaning the review text and collecting
# the tag labels found in tag_dic
for i in train:
    buf = []
    traindata.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(train[i][0], True)))
    for j in train[i][3].split():
        if j in tag_dic:
            buf.append(tag_dic[j])
    Y1.append(buf)
for i in test:
    buf = []
    testdata.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(test[i][0], True)))
    for j in test[i][3].split():
        if j in tag_dic:
            buf.append(tag_dic[j])
    Y2.append(buf)
# ****** Create a bag of words from the training set
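The trailing comment announces a bag-of-words step whose code is not shown in this excerpt; a minimal sketch of what that step might look like (parameter values are assumptions):

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(analyzer="word", max_features=5000)
X_train = vectorizer.fit_transform(traindata)  # learn the vocabulary on the training text
X_test = vectorizer.transform(testdata)        # reuse it for the test text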
Example 12: train_test_split
# Required import: from KaggleWord2VecUtility import KaggleWord2VecUtility [as alias]
# Or: from KaggleWord2VecUtility.KaggleWord2VecUtility import review_to_wordlist [as alias]
train_i, test_i = data.ix[:, 11] != -1, data.ix[:, 11] == -1
train = data.ix[train_i]
test = data.ix[test_i]
#train_i, valid_i = train_test_split( np.arange( len( train )), train_size = 0.8, random_state = 88 )
#train = train.ix[train_i]
#validation = train.ix[valid_i]
#
print "Parsing train job titles..."
clean_train_reviews = []
for title in train['abstract']:
    clean_train_reviews.append( " ".join( KaggleWord2VecUtility.review_to_wordlist(title, remove_stopwords=False)))
print "Parsing test reviews..."
clean_test_reviews = []
for title in test['abstract']:
    clean_test_reviews.append( " ".join( KaggleWord2VecUtility.review_to_wordlist(title, remove_stopwords=False)))
#print "Parsing validation reviews..."
#clean_valid_reviews = []
#for title in validation['title']:
#    clean_valid_reviews.append( " ".join( KaggleWord2VecUtility.review_to_wordlist(title, remove_stopwords=False)))
#
Example 13: xrange
# Required import: from KaggleWord2VecUtility import KaggleWord2VecUtility [as alias]
# Or: from KaggleWord2VecUtility.KaggleWord2VecUtility import review_to_wordlist [as alias]
test_pkl = 'shortened_' + test_pkl
try:
    traindata = pickle.load(open(os.path.join(base_path, 'data',
                                              train_pkl), 'r'))
    testdata = pickle.load(open(os.path.join(base_path, 'data',
                                             test_pkl), 'r'))
except IOError as e:
    if e.errno != errno.ENOENT:
        raise e
    else:
        _logger.info('cleaning and parsing movie reviews')
        traindata = []
        for i in xrange(0, len(train["review"])):
            review = KaggleWord2VecUtility.review_to_wordlist(train["review"][i],
                                                              False)
            if SHORT_REVIEW:
                review = review[:4]
            traindata.append(' '.join(review))
        testdata = []
        for i in xrange(0, len(test["review"])):
            review = KaggleWord2VecUtility.review_to_wordlist(test["review"][i],
                                                              False)
            if SHORT_REVIEW:
                review = review[:4]
            testdata.append(' '.join(review))
        pickle.dump(traindata, open(os.path.join(base_path, 'data',
                                                 train_pkl), 'w'))
        pickle.dump(testdata, open(os.path.join(base_path, 'data',
                                                test_pkl), 'w'))
Example 14: main
# Required import: from KaggleWord2VecUtility import KaggleWord2VecUtility [as alias]
# Or: from KaggleWord2VecUtility.KaggleWord2VecUtility import review_to_wordlist [as alias]
def main():
    start_time = datetime.now()
    df = pd.read_csv(os.path.join(os.path.dirname(__file__), 'data', 'labeledTrainData.tsv'), header=0, delimiter="\t", quoting=3)
    if GO_FOR_REAL:
        test = pd.read_csv(os.path.join(os.path.dirname(__file__), 'data', 'testData.tsv'), header=0, delimiter="\t", quoting=3)
        train = df['review']
        train_sentiment = df['sentiment']
        test_id = test['id'].str.replace('"', '')
        test = test['review']
    else:
        train, test, train_sentiment, test_sentiment = cross_validation.train_test_split(df['review'].values, df['sentiment'].values, test_size=0.4, random_state=0)
    print 'Download text data sets. If you already have NLTK datasets downloaded, just close the Python download window...'
    #nltk.download()  # Download text data sets, including stop words
    # Initialize an empty list to hold the clean reviews
    clean_train_reviews = []
    # Loop over each review; create an index i that goes from 0 to the length
    # of the movie review list
    print "Cleaning and parsing the training set movie reviews...\n"
    for i in xrange(0, len(train)):
        clean_train_reviews.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(train[i], True)))
    # ****** Create a bag of words from the training set
    #
    print "Creating the bag of words...\n"
    # Initialize the "CountVectorizer" object, scikit-learn's bag-of-words tool.
    # CountVectorizer transforms each document into a word-count vector, with each word as a feature.
    # Stop words are very frequent words in a language that carry little semantic weight and can
    # dilute the importance of other, more meaningful words.
    # N-grams provide word combinations as additional features.
    # stop_words: 'english' uses the stop words from sklearn.feature_extraction.stop_words.ENGLISH_STOP_WORDS,
    # which seems to make results less stable here.
    # max_features: keeps only the most frequently occurring words in the vocabulary; None allows
    # all features and increases the vector size.
    # vectorizer = CountVectorizer(analyzer="word", tokenizer=None, preprocessor=None, stop_words='english', max_features=None)
    # Tf-idf: a normalization method that reduces the weight of words appearing too frequently in the dataset.
    # TfidfVectorizer: a CountVectorizer that applies tf-idf normalization during transform.
    vectorizer = TfidfVectorizer(min_df=3, max_features=None, strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
                                 ngram_range=(1, 2), use_idf=1, smooth_idf=1, sublinear_tf=1, stop_words='english')
    # fit_transform() does two things: first, it fits the model and learns the vocabulary;
    # second, it transforms our training data into feature vectors. The input to
    # fit_transform should be a list of strings.
    train_data_features = vectorizer.fit_transform(clean_train_reviews)
    print 'Train data feature shape: ' + str(train_data_features.shape)
    print 'Number of vocabularies/features: %d\n' % len(vectorizer.get_feature_names())
    # Numpy arrays are easy to work with, so convert the result to an array
    train_data_features = train_data_features.toarray()
    # ******* Train a model using the bag of words
    #
    print "Training the model (this may take a while)..."
    # Alternative classifiers:
    # clf = RandomForestClassifier(n_estimators=100)  # a Random Forest with 100 trees
    # clf = svm.LinearSVC(C=1)
    clf = LogisticRegressionCV(cv=3, scoring='roc_auc', solver='liblinear', Cs=[3, 4, 5, 6, 7])
    # Cross validation, this takes a long time ...
    # print "4 Fold CV Score: ", np.mean(cross_validation.cross_val_score(clf, train_data_features, train_sentiment, cv=4, scoring='accuracy', n_jobs=4))
    # Fit the classifier to the training set, using the bag of words as
    # features and the sentiment labels as the response variable
    #
    # This may take a few minutes to run
    model = clf.fit(train_data_features, train_sentiment)
    # Create an empty list and append the clean reviews one by one
    clean_test_reviews = []
    print "Cleaning and parsing the test set movie reviews...\n"
    for i in xrange(0, len(test)):
        clean_test_reviews.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(test[i], True)))
    # Get a bag of words for the test set, and convert to a numpy array
    test_data_features = vectorizer.transform(clean_test_reviews)
    test_data_features = test_data_features.toarray()
    # Use the fitted model to make sentiment label predictions
    print "Predicting test labels...\n"
    # Copy the results to a pandas dataframe with an "id" column and
    # a "sentiment" column
    if GO_FOR_REAL:
        result = model.predict_proba(test_data_features)[:, 1]  # predict as probability
        output = pd.DataFrame(data={"id": test_id, "sentiment": result})
    else:
        result = model.predict(test_data_features)
        output = pd.DataFrame(data={"sentiment": test_sentiment, "predict_sentiment": result})
        output['succeed'] = output['sentiment'] == output['predict_sentiment']
#......... (part of the code is omitted here) .........
Example 15: clean_review_function
# Required import: from KaggleWord2VecUtility import KaggleWord2VecUtility [as alias]
# Or: from KaggleWord2VecUtility.KaggleWord2VecUtility import review_to_wordlist [as alias]
def clean_review_function(review):
    global master_word_dict, number_of_rows
    list_of_words = KaggleWord2VecUtility.review_to_wordlist(review, remove_stopwords=False)
    return ' '.join(list_of_words)