This article collects typical usage examples of the KaggleWord2VecUtility class from the Python module KaggleWord2VecUtility. If you have been wondering what exactly KaggleWord2VecUtility does and how to use it, the curated class code examples below may help.
The following presents 15 code examples of the KaggleWord2VecUtility class, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
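All 15 examples revolve around the same small utility, which traces back to Kaggle's "Bag of Words Meets Bags of Popcorn" word2vec tutorial. As a reading aid, here is a minimal sketch of the interface the examples assume; it is reconstructed from the calls below, not copied from any one repository:

# Minimal sketch of the assumed interface (reconstructed from the examples below)
import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords

class KaggleWord2VecUtility(object):
    @staticmethod
    def review_to_wordlist(review, remove_stopwords=False):
        # Strip HTML, keep letters only, lowercase, optionally drop stopwords
        text = BeautifulSoup(review, "html.parser").get_text()
        words = re.sub("[^a-zA-Z]", " ", text).lower().split()
        if remove_stopwords:
            stops = set(stopwords.words("english"))
            words = [w for w in words if w not in stops]
        return words

    @staticmethod
    def review_to_sentences(review, tokenizer, remove_stopwords=False):
        # Split into sentences, then into word lists (compare Example 6)
        raw_sentences = tokenizer.tokenize(review.strip())
        return [KaggleWord2VecUtility.review_to_wordlist(s, remove_stopwords)
                for s in raw_sentences if len(s) > 0]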
Example 1: main
def main():
    start_time = datetime.now()
    df = pd.read_csv(os.path.join(os.path.dirname(__file__), 'data', 'labeledTrainData.tsv'), header=0, delimiter="\t", quoting=3)
    # test = pd.read_csv(os.path.join(os.path.dirname(__file__), 'data', 'testData.tsv'), header=0, delimiter="\t", quoting=3)
    train, test, y, y_test = cross_validation.train_test_split(df['review'].values, df['sentiment'].values, test_size=0.4, random_state=0)
    print "Cleaning and parsing movie reviews...\n"
    traindata = []
    for i in xrange(0, len(train)):
        traindata.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(train[i], False)))
    testdata = []
    for i in xrange(0, len(test)):
        testdata.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(test[i], False)))
    print 'vectorizing... ',
    tfv = TfidfVectorizer(min_df=3, max_features=None, strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
                          ngram_range=(1, 2), use_idf=1, smooth_idf=1, sublinear_tf=1, stop_words='english')
    X_all = traindata + testdata
    lentrain = len(traindata)
    print "fitting pipeline... ",
    tfv.fit(X_all)
    X_all = tfv.transform(X_all)
    X = X_all[:lentrain]
    X_test = X_all[lentrain:]
    model = LogisticRegression(penalty='l2', dual=True, tol=0.0001,
                               C=1, fit_intercept=True, intercept_scaling=1.0,
                               class_weight=None, random_state=None)
    print "10 Fold CV Score: ", np.mean(cross_validation.cross_val_score(model, X, y, cv=10, scoring='roc_auc'))
    print "Retrain on all training data, predicting test labels...\n"
    model.fit(X, y)
    # result = model.predict_proba(X_test)[:, 1]  # predict as probability
    result = model.predict(X_test)
    # output = pd.DataFrame(data={"id": test["id"], "sentiment": result})  # original Kaggle submission format; not applicable after the split above
    # Copy the true and predicted labels into a pandas dataframe with
    # "sentiment" and "predict_sentiment" columns
    output = pd.DataFrame(data={"sentiment": y_test, "predict_sentiment": result})
    output['succeed'] = output['sentiment'] == output['predict_sentiment']
    groupby = output.groupby('succeed')
    print 'Result Evaluation'
    print groupby['sentiment'].agg(['count'])
    # Use pandas to write the comma-separated output file
    output.to_csv(os.path.join(os.path.dirname(__file__), 'data', 'Bag_of_Words_model_linear.csv'), index=False, quoting=3)
    print "Wrote results to Bag_of_Words_model_linear.csv"
    print datetime.now() - start_time
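The snippet does not show how main() is invoked; presumably the script ends with the usual entry-point guard:

if __name__ == '__main__':
    main()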
Example 2: tokenize
def tokenize(sentence, grams):
    # Build overlapping n-gram tokens, joining the words of each n-gram with "_*_"
    words = KaggleWord2VecUtility.review_to_wordlist(sentence)
    tokens = []
    for gram in grams:
        for i in range(len(words) - gram + 1):
            tokens += ["_*_".join(words[i:i + gram])]
    return tokens
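For illustration, calling the function with unigrams and bigrams would produce something like the following (assuming review_to_wordlist lowercases and strips punctuation as in the sketch above):

# tokenize("This movie was great", [1, 2])
# -> ['this', 'movie', 'was', 'great',
#     'this_*_movie', 'movie_*_was', 'was_*_great']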
Example 3: create_task
def create_task():
    if not request.json or 'id' not in request.json:
        abort(400)
    task = {
        'id': request.json['id'],
        'text': request.json['text'],
    }
    clean_test_descripciones = []
    app.logger.info('petition_classification: ' + task['text'])
    features = review_words(task['text'])
    clean_test_descripciones.append(u" ".join(
        KaggleWord2VecUtility.review_to_wordlist(features, True)))
    # Use a chord to run two jobs and a callback after processing ends:
    # 1) a text classifier
    # 2) a profanity filter
    # 3) a callback that puts everything together in a JSON
    callback = update_remote_petition.subtask()
    chord([
        evaluate_petition.s(task['id'], clean_test_descripciones),
        catch_bad_words_in_text.s(task['text'])
    ])(callback)
    return jsonify({'id': request.json['id'],
                    'text': request.json['text']}), 201
Example 4: getCleanDescriptions
def getCleanDescriptions(descriptions):
    clean_descriptions = []
    local_counter = 0
    for description in descriptions["description"]:
        clean_descriptions.append(KaggleWord2VecUtility.review_to_wordlist(description, remove_stopwords=True))
        local_counter += 1
        print('Adding line : ' + str(local_counter))
    return clean_descriptions
Example 5: getCleanLabeledReviews
def getCleanLabeledReviews(reviews):
    clean_reviews = []
    for review in reviews["review"]:
        clean_reviews.append(KaggleWord2VecUtility.review_to_wordlist(review, True))
    labelized = []
    for i, id_label in enumerate(reviews["id"]):
        labelized.append(LabeledSentence(clean_reviews[i], [id_label]))
    return labelized
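LabeledSentence comes from gensim's doc2vec module (it was renamed TaggedDocument in later gensim releases). A minimal usage sketch, assuming an older gensim where LabeledSentence and the size keyword are available; the path and parameters are hypothetical:

from gensim.models import Doc2Vec
import pandas as pd

train = pd.read_csv('labeledTrainData.tsv', header=0, delimiter="\t", quoting=3)  # hypothetical path
labelized = getCleanLabeledReviews(train)
model = Doc2Vec(size=300, min_count=10)  # assumed parameters
model.build_vocab(labelized)
model.train(labelized)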
Example 6: review_to_sentences
def review_to_sentences(review, tokenizer, remove_stopwords=False):
    # Function to split a review into parsed sentences. Returns a
    # list of sentences, where each sentence is a list of words.
    #
    # 1. Use the NLTK tokenizer to split the paragraph into sentences
    raw_sentences = tokenizer.tokenize(review.strip())
    #
    # 2. Loop over each sentence
    sentences = []
    for raw_sentence in raw_sentences:
        # If a sentence is empty, skip it
        if len(raw_sentence) > 0:
            # Otherwise, call review_to_wordlist to get a list of words
            sentences.append(KaggleWord2VecUtility.review_to_wordlist(raw_sentence, remove_stopwords))
    #
    # Return the list of sentences (each sentence is a list of words,
    # so this returns a list of lists)
    return sentences
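A minimal usage sketch, loading NLTK's punkt tokenizer the same way Example 13 below does; the sample output is illustrative:

import nltk.data
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
sentences = review_to_sentences("Great film. Terrible ending.", tokenizer)
# -> [['great', 'film'], ['terrible', 'ending']]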
Example 7: xrange
from sklearn import cross_validation
model = Word2Vec.load_word2vec_format(constants.GOOGLE_WORD2VEC, binary=True)
train = pd.read_csv(os.path.join(
    os.path.dirname(__file__), '..', 'fixtures', 'labeledTrainData.tsv'),
    header=0, delimiter="\t", quoting=csv.QUOTE_NONE)
test = pd.read_csv(os.path.join(
    os.path.dirname(__file__), '..', 'fixtures', 'testData.tsv'),
    header=0, delimiter="\t", quoting=csv.QUOTE_NONE)
y = train["sentiment"]
print "Cleaning and parsing movie reviews...\n"
traindata = []
for i in xrange(0, len(train["review"])):
    traindata.append(" ".join(
        KaggleWord2VecUtility.review_to_wordlist(train["review"][i], True)))
testdata = []
for i in xrange(0, len(test["review"])):
    testdata.append(" ".join(
        KaggleWord2VecUtility.review_to_wordlist(test["review"][i], True)))
X_all = traindata + testdata
lentrain = len(traindata)
print "fitting pipeline... ",
vectorizer = CountVectorizer(min_df=4)
vectorizer.fit(X_all)
start = time.time()
# Set "k" (num_clusters) to be 1/5th of the vocabulary size, or an
# average of 5 words per cluster
Example 8: xrange
test_pkl = 'shortened_' + test_pkl
try:
    traindata = pickle.load(open(os.path.join(base_path, 'data',
                                              train_pkl), 'r'))
    testdata = pickle.load(open(os.path.join(base_path, 'data',
                                             test_pkl), 'r'))
except IOError as e:
    if e.errno != errno.ENOENT:
        raise e
    else:
        _logger.info('cleaning and parsing movie reviews')
        traindata = []
        for i in xrange(0, len(train["review"])):
            review = KaggleWord2VecUtility.review_to_wordlist(train["review"][i],
                                                              False)
            if SHORT_REVIEW:
                review = review[:4]
            traindata.append(' '.join(review))
        testdata = []
        for i in xrange(0, len(test["review"])):
            review = KaggleWord2VecUtility.review_to_wordlist(test["review"][i],
                                                              False)
            if SHORT_REVIEW:
                review = review[:4]
            testdata.append(' '.join(review))
        pickle.dump(traindata, open(os.path.join(base_path, 'data',
                                                 train_pkl), 'w'))
        pickle.dump(testdata, open(os.path.join(base_path, 'data',
                                                test_pkl), 'w'))
Example 9: xrange
from KaggleWord2VecUtility import KaggleWord2VecUtility
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import cross_validation
import pandas as pd
import numpy as np
train = pd.read_csv(os.path.join(os.path.dirname(__file__), '../', 'data', 'labeledTrainData.tsv'), header=0,
                    delimiter="\t", quoting=3)
test = pd.read_csv(os.path.join(os.path.dirname(__file__), '../', 'data', 'testData.tsv'), header=0, delimiter="\t",
                   quoting=3)
y = train["sentiment"]
print "Cleaning and parsing movie reviews...\n"
traindata = []
for i in xrange(0, len(train["review"])):
    traindata.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(train["review"][i], False)))
testdata = []
for i in xrange(0, len(test["review"])):
    testdata.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(test["review"][i], False)))
print 'vectorizing... ',
tfv = TfidfVectorizer(min_df=3, max_features=None,
                      strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
                      ngram_range=(1, 2), use_idf=1, smooth_idf=1, sublinear_tf=1,
                      stop_words='english')
X_all = traindata + testdata
lentrain = len(traindata)
print "fitting pipeline... ",
tfv.fit(X_all)
X_all = tfv.transform(X_all)
Example 10: clean_review_function
def clean_review_function(review):
    # `tokenizer` is expected in the enclosing scope (e.g., NLTK's punkt tokenizer)
    list_of_sentences = KaggleWord2VecUtility.review_to_sentences(review, tokenizer, remove_stopwords=False)
    return list_of_sentences
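A function of this shape is typically mapped over a pandas column. A minimal sketch, assuming a train dataframe with a "review" column and a punkt tokenizer in scope:

sentences_per_review = train["review"].apply(clean_review_function)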
Example 11
# Initialize empty lists to hold the clean reviews and their tag labels
traindata = []
testdata = []
Y1 = []
Y2 = []
# Loop over each key i in the train and test collections; element [0] holds
# the review text and element [3] holds its whitespace-separated tags
for i in train:
    buf = []
    traindata.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(train[i][0], True)))
    for j in train[i][3].split():
        if j in tag_dic:
            buf.append(tag_dic[j])
    Y1.append(buf)
for i in test:
    buf = []
    testdata.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(test[i][0], True)))
    for j in test[i][3].split():
        if j in tag_dic:
            buf.append(tag_dic[j])
    Y2.append(buf)
# ****** Create a bag of words from the training set
Example 12: getCleanTestReviews
def getCleanTestReviews(skucollection):
    clean_skucollection = []
    for sku in skucollection["query"]:
        clean_skucollection.append(KaggleWord2VecUtility.sku_to_wordlist(sku, remove_stopwords=False))
    return clean_skucollection
Example 13
print "Read %d labeled train reviews, %d labeled test reviews, " \
      "and %d unlabeled reviews\n" % (train["review"].size,
                                      test["review"].size, unlabeled_train["review"].size)
# Load the punkt tokenizer
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# ****** Split the labeled and unlabeled training sets into clean sentences
#
sentences = []  # Initialize an empty list of sentences
print "Parsing sentences from training set"
for review in train["review"]:
    sentences += KaggleWord2VecUtility.review_to_sentences(review, tokenizer)
print "Parsing sentences from unlabeled set"
for review in unlabeled_train["review"]:
    sentences += KaggleWord2VecUtility.review_to_sentences(review, tokenizer)
# ****** Set parameters and train the word2vec model
#
# Import the built-in logging module and configure it so that Word2Vec
# creates nice output messages
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)
# Set values for various parameters
num_features = 300    # Word vector dimensionality
min_word_count = 40   # Minimum word count
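The fragment stops just before training. In this tutorial-style code the parameter block is usually followed by a call like the sketch below; num_workers, context, and downsampling are assumed values, and size= is the older gensim keyword:

from gensim.models import word2vec
num_workers = 4       # Number of threads to run in parallel (assumed)
context = 10          # Context window size (assumed)
downsampling = 1e-3   # Downsample setting for frequent words (assumed)
model = word2vec.Word2Vec(sentences, workers=num_workers,
                          size=num_features, min_count=min_word_count,
                          window=context, sample=downsampling)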
Example 14: centroids
# Create clean_train_reviews and clean_test_reviews as we did before
#
# Read data from files
train = pd.read_csv(os.path.join(os.path.dirname(__file__), 'data', 'labeledTrainData.tsv'), header=0, delimiter="\t", quoting=3)
test = pd.read_csv(os.path.join(os.path.dirname(__file__), 'data', 'testData.tsv'), header=0, delimiter="\t", quoting=3)
print "Cleaning training reviews"
clean_train_reviews = []
for review in train["review"]:
    clean_train_reviews.append(KaggleWord2VecUtility.review_to_wordlist(review,
                                                                        remove_stopwords=True))
print "Cleaning test reviews"
clean_test_reviews = []
for review in test["review"]:
    clean_test_reviews.append(KaggleWord2VecUtility.review_to_wordlist(review,
                                                                       remove_stopwords=True))
# ****** Create bags of centroids
#
# Pre-allocate an array for the training set bags of centroids (for speed)
train_centroids = np.zeros((train["review"].size, num_clusters),
                           dtype="float32")
# Transform the training set reviews into bags of centroids
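The closing comment refers to a transformation step the fragment does not include. Below is a sketch of the usual bag-of-centroids helper, assuming word_centroid_map maps each vocabulary word to its k-means cluster index:

def create_bag_of_centroids(wordlist, word_centroid_map):
    # One slot per cluster; count how many of the review's words fall in each
    num_centroids = max(word_centroid_map.values()) + 1
    bag_of_centroids = np.zeros(num_centroids, dtype="float32")
    for word in wordlist:
        if word in word_centroid_map:
            bag_of_centroids[word_centroid_map[word]] += 1
    return bag_of_centroids

for i, review in enumerate(clean_train_reviews):
    train_centroids[i] = create_bag_of_centroids(review, word_centroid_map)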
Example 15: MongoClient
# Loop over each review (here capped at the first 500; the full-range loop is
# commented out) and insert its cleaned form into MongoDB
# Make a connection to MongoDB
client = MongoClient('localhost', 27017)
db = client.cs336
db.create_collection("unlabeled_review")
print "Cleaning and parsing the training set movie reviews...\n"
# for i in xrange(0, len(train["review"])):
for i in xrange(0, 500):
    # clean_train_reviews.append(KaggleWord2VecUtility.review_to_wordlist(train["review"][i], True))
    # clean_train_reviews.append(" ".join(KaggleWord2VecUtility.review_to_worddict(train["review"][i], True)))
    print i
    clean_train_review = KaggleWord2VecUtility.review_to_worddict(train["review"][i], True)
    # pprint(clean_train_reviews)
    record = {}
    record["id"] = train["id"][i]
    # record["sentiment"] = train["sentiment"][i]
    record["review"] = clean_train_review
    # pprint(record)
    db.unlabeled_review.insert_one(record)
print "Inserted all documents to the collection"
# pprint(clean_train_reviews[0])
# ****** Create a bag of words from the training set