

Python KaggleWord2VecUtility.KaggleWord2VecUtility Class Code Examples

This article collects typical usage examples of the Python class KaggleWord2VecUtility.KaggleWord2VecUtility. If you have been wondering what exactly the KaggleWord2VecUtility class does, how to use it, or where to find working examples, the curated class examples below should help.


Below are 15 code examples of the KaggleWord2VecUtility class, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
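None of the examples below show the class itself, so for orientation: KaggleWord2VecUtility comes from the code for Kaggle's "Bag of Words Meets Bags of Popcorn" tutorial. A minimal sketch of its two core static methods, assuming BeautifulSoup and NLTK stop words as in that tutorial (a sketch, not the verbatim original):

import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords

class KaggleWord2VecUtility(object):
    """Cleans and tokenizes raw IMDB review text."""

    @staticmethod
    def review_to_wordlist(review, remove_stopwords=False):
        # 1. Strip any HTML markup left over from scraping
        review_text = BeautifulSoup(review, "html.parser").get_text()
        # 2. Keep letters only
        review_text = re.sub("[^a-zA-Z]", " ", review_text)
        # 3. Lowercase and split on whitespace
        words = review_text.lower().split()
        # 4. Optionally drop English stop words
        if remove_stopwords:
            stops = set(stopwords.words("english"))
            words = [w for w in words if w not in stops]
        return words

    @staticmethod
    def review_to_sentences(review, tokenizer, remove_stopwords=False):
        # Split a review into sentences, each sentence a list of words
        raw_sentences = tokenizer.tokenize(review.strip())
        return [KaggleWord2VecUtility.review_to_wordlist(s, remove_stopwords)
                for s in raw_sentences if len(s) > 0]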

Example 1: main

def main():
    start_time = datetime.now()

    df = pd.read_csv(os.path.join(os.path.dirname(__file__), 'data', 'labeledTrainData.tsv'), header=0, delimiter="\t", quoting=3)
    # test = pd.read_csv(os.path.join(os.path.dirname(__file__), 'data', 'testData.tsv'), header=0, delimiter="\t", quoting=3)
    train, test, y, y_test = cross_validation.train_test_split(df['review'].values, df['sentiment'].values, test_size=0.4, random_state=0)

    print "Cleaning and parsing movie reviews...\n"      
    traindata = []
    for i in xrange(0, len(train)):
        traindata.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(train[i], False)))
    testdata = []
    for i in xrange(0, len(test)):
        testdata.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(test[i], False)))
    print 'vectorizing... ',
    tfv = TfidfVectorizer(min_df=3, max_features=None, strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
                          ngram_range=(1, 2), use_idf=1, smooth_idf=1, sublinear_tf=1, stop_words='english')
    X_all = traindata + testdata
    lentrain = len(traindata)

    print "fitting pipeline... ",
    tfv.fit(X_all)
    X_all = tfv.transform(X_all)

    X = X_all[:lentrain]
    X_test = X_all[lentrain:]

    model = LogisticRegression(penalty='l2', dual=True, tol=0.0001,
                               C=1, fit_intercept=True, intercept_scaling=1.0,
                               class_weight=None, random_state=None)
    print "10 Fold CV Score: ", np.mean(cross_validation.cross_val_score(model, X, y, cv=10, scoring='roc_auc'))

    print "Retrain on all training data, predicting test labels...\n"
    model.fit(X, y)
    # result = model.predict_proba(X_test)[:,1] # predict as probability
    result = model.predict(X_test)
    # output = pd.DataFrame( data={"id":test["id"], "sentiment":result} )

    # Copy the results to a pandas dataframe with an "id" column and a "sentiment" column
    output = pd.DataFrame(data={"sentiment":y_test, "predict_sentiment":result})
    output['succeed'] = output['sentiment'] == output['predict_sentiment']

    groupby = output.groupby('succeed')
    print 'Result Evaluation'
    print groupby['sentiment'].agg(['count'])

    # Use pandas to write the comma-separated output file
    output.to_csv(os.path.join(os.path.dirname(__file__), 'data', 'Bag_of_Words_model_linear.csv'), index=False, quoting=3)
    print "Wrote results to Bag_of_Words_model_linear.csv"

    print datetime.now() - start_time
Developer: Josephu | Project: DeepLearningMovies | Lines: 51 | Source: linear.py
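Example 1 is an excerpt, so its imports are not shown; judging from the names used (and from Example 9 below, which is a variant of the same linear.py), the source file presumably begins with something like:

import os
from datetime import datetime
import numpy as np
import pandas as pd
from sklearn import cross_validation   # sklearn.model_selection in modern scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from KaggleWord2VecUtility import KaggleWord2VecUtility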

Example 2: tokenize

def tokenize(sentence, grams):
    words = KaggleWord2VecUtility.review_to_wordlist(sentence)
    tokens = []
    for gram in grams:
        for i in range(len(words) - gram + 1):
            tokens += ["_*_".join(words[i : i + gram])]
    return tokens
Developer: StevenLOL | Project: kaggle-word2vec-movie-reviews | Lines: 7 | Source: nbsvm.py
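To make the `_*_` joining concrete, here is what a hypothetical call with unigrams and bigrams would return, assuming review_to_wordlist lowercases and strips punctuation:

>>> tokenize("This movie was great", [1, 2])
['this', 'movie', 'was', 'great', 'this_*_movie', 'movie_*_was', 'was_*_great']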

Example 3: create_task

def create_task():
    if not request.json or not 'id' in request.json:
        abort(400)
    task = {
        'id': request.json['id'],
        'text': request.json['text'],
    }
    clean_test_descripciones = []
    app.logger.info('petition_classification: ' + task['text'])
    features = review_words(task['text'])
    clean_test_descripciones.append(u" ".join(
        KaggleWord2VecUtility.review_to_wordlist(features, True)))

    # Uses chord to run two jobs and a callback after processing ends
    # 1) A text classifier
    # 2) A profanity filter
    # 3) A callback to put all together in a JSON
    callback = update_remote_petition.subtask()
    chord([
        evaluate_petition.s(task['id'], clean_test_descripciones),
        catch_bad_words_in_text.s(task['text'])
    ])(callback)

    return jsonify({'id': request.json['id'],
                    'text': request.json['text']}), 201
Developer: CodeandoMonterrey | Project: sara | Lines: 25 | Source: ml_classifier.py
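Celery's chord primitive runs the header tasks in parallel and passes their results, in header order, to the callback. The callback's body is not shown in this example; a hedged sketch of what update_remote_petition might look like (the task name comes from the example, the body is illustrative):

# assumes a Celery app instance named `celery`, as is common in Flask projects
@celery.task
def update_remote_petition(results):
    # a chord callback receives the header results as a list, in header order:
    # [petition classification, flagged profanity]
    classification, bad_words = results
    return {'classification': classification, 'bad_words': bad_words}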

Example 4: getCleanDescriptions

def getCleanDescriptions(descriptions):
    clean_descriptions = []
    local_counter=0
    for description in descriptions["description"]:
        clean_descriptions.append( KaggleWord2VecUtility.review_to_wordlist( description, remove_stopwords=True ))
        local_counter=local_counter+1
        print('Adding line : '+str(local_counter))
    return clean_descriptions
Developer: sduprey | Project: PYTHON_WEB | Lines: 8 | Source: deep_learning_prototyping.py

Example 5: getCleanLabeledReviews

def getCleanLabeledReviews(reviews):
    clean_reviews = []
    for review in reviews["review"]:
        clean_reviews.append(KaggleWord2VecUtility.review_to_wordlist(review, True))
    
    labelized = []
    for i, id_label in enumerate(reviews["id"]):
        labelized.append(LabeledSentence(clean_reviews[i], [id_label]))
    return labelized
Developer: StevenLOL | Project: kaggle-word2vec-movie-reviews | Lines: 9 | Source: predict.py
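LabeledSentence here is gensim's doc2vec wrapper (renamed TaggedDocument in later gensim releases); each cleaned review becomes a tagged training document. A sketch of how the output would typically be consumed, assuming the era-appropriate gensim API (parameters are illustrative):

from gensim.models import Doc2Vec

labelized = getCleanLabeledReviews(train)
model = Doc2Vec(size=300, min_count=10, workers=4)  # illustrative parameters
model.build_vocab(labelized)
model.train(labelized)  # newer gensim also requires total_examples and epochs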

Example 6: review_to_sentences

def review_to_sentences( review, tokenizer, remove_stopwords=False ):
    # Function to split a review into parsed sentences. Returns a 
    # list of sentences, where each sentence is a list of words
    #
    # 1. Use the NLTK tokenizer to split the paragraph into sentences
    raw_sentences = tokenizer.tokenize(review.strip())
    #
    # 2. Loop over each sentence
    sentences = []
    for raw_sentence in raw_sentences:
        # If a sentence is empty, skip it
        if len(raw_sentence) > 0:
            # Otherwise, call review_to_wordlist to get a list of words
            sentences.append(KaggleWord2VecUtility.review_to_wordlist(raw_sentence, remove_stopwords))
    #
    # Return the list of sentences (each sentence is a list of words,
    # so this returns a list of lists)
    return sentences
Developer: Gabrielvon | Project: Sentiment-Analysis | Lines: 18 | Source: Get_averge_Word2vec.py
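A minimal usage sketch; the punkt tokenizer is loaded exactly as in Example 13 below:

import nltk.data

tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
sentences = review_to_sentences("Great film. I loved it.", tokenizer)
# -> [['great', 'film'], ['i', 'loved', 'it']]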

Example 7: xrange

from sklearn import cross_validation

model = Word2Vec.load_word2vec_format(constants.GOOGLE_WORD2VEC, binary=True)

train = pd.read_csv(os.path.join(
	os.path.dirname(__file__), '..', 'fixtures', 'labeledTrainData.tsv'),
	header=0, delimiter="\t", quoting=csv.QUOTE_NONE)
test = pd.read_csv(os.path.join(
	os.path.dirname(__file__), '..', 'fixtures', 'testData.tsv'),
	header=0, delimiter="\t", quoting=csv.QUOTE_NONE)
y = train["sentiment"]
print "Cleaning and parsing movie reviews...\n"
traindata = []
for i in xrange(0, len(train["review"])):
    traindata.append(" ".join(
        KaggleWord2VecUtility.review_to_wordlist(train["review"][i], True)))
testdata = []
for i in xrange(0, len(test["review"])):
    testdata.append(" ".join(
        KaggleWord2VecUtility.review_to_wordlist(test["review"][i], True)))

X_all = traindata + testdata
lentrain = len(traindata)

print "fitting pipeline... ",
vectorizer = CountVectorizer(min_df=4)
vectorizer.fit(X_all)

start = time.time()

# Set "k" (num_clusters) to be 1/5th of the vocabulary size, or an
Developer: JasonWayne | Project: kaggle-sentiment | Lines: 31 | Source: word2vec_centroids.py
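The excerpt stops at the comment about choosing k. In the original Kaggle centroids tutorial, the clustering step that presumably follows looks roughly like this (model.syn0 is the word-vector matrix in that era's gensim; modern gensim exposes model.wv.vectors instead):

from sklearn.cluster import KMeans

word_vectors = model.syn0                     # one row per vocabulary word
num_clusters = word_vectors.shape[0] / 5      # k = 1/5 of the vocabulary size
kmeans_clustering = KMeans(n_clusters=num_clusters)
idx = kmeans_clustering.fit_predict(word_vectors)  # cluster index for each word

print "Time taken for K-Means clustering: ", time.time() - start, "seconds"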

Example 8: xrange

    # (excerpt begins mid-block: the pickle filename is prefixed when the
    # SHORT_REVIEW flag used below is set)
    test_pkl = 'shortened_' + test_pkl

try:
    traindata = pickle.load(open(os.path.join(base_path, 'data',
                                              train_pkl), 'r'))
    testdata = pickle.load(open(os.path.join(base_path, 'data',
                                             test_pkl), 'r'))
except IOError as e:
    if e.errno != errno.ENOENT:
        raise e
    else:
        _logger.info('cleaning and parsing movie reviews')

        traindata = []
        for i in xrange(0, len(train["review"])):
            review = KaggleWord2VecUtility.review_to_wordlist(train["review"][i],
                                                              False)
            if SHORT_REVIEW:
                review = review[:4]
            traindata.append(' '.join(review))
        testdata = []
        for i in xrange(0, len(test["review"])):
            review = KaggleWord2VecUtility.review_to_wordlist(test["review"][i],
                                                              False)
            if SHORT_REVIEW:
                review = review[:4]
            testdata.append(' '.join(review))

        pickle.dump(traindata, open(os.path.join(base_path, 'data',
                                                 train_pkl), 'w'))
        pickle.dump(testdata, open(os.path.join(base_path, 'data',
                                                test_pkl), 'w'))
Developer: waltherg | Project: DeepLearningMovies | Lines: 32 | Source: linear.py

Example 9: xrange

from KaggleWord2VecUtility import KaggleWord2VecUtility
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import cross_validation
import pandas as pd
import numpy as np

train = pd.read_csv(os.path.join(os.path.dirname(__file__), '../', 'data', 'labeledTrainData.tsv'), header=0, \
                delimiter="\t", quoting=3)
test = pd.read_csv(os.path.join(os.path.dirname(__file__), '../', 'data', 'testData.tsv'), header=0, delimiter="\t", \
               quoting=3 )
y = train["sentiment"]
print "Cleaning and parsing movie reviews...\n"
traindata = []
for i in xrange( 0, len(train["review"])):
    traindata.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(train["review"][i], False)))
testdata = []
for i in xrange(0,len(test["review"])):
    testdata.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(test["review"][i], False)))
print 'vectorizing... ',
tfv = TfidfVectorizer(min_df=3,  max_features=None,
        strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
        ngram_range=(1, 2), use_idf=1,smooth_idf=1,sublinear_tf=1,
        stop_words = 'english')
X_all = traindata + testdata
lentrain = len(traindata)

print "fitting pipeline... ",
tfv.fit(X_all)
X_all = tfv.transform(X_all)
Developer: AbelJiang | Project: kaggle-word2vec | Lines: 30 | Source: linear.py
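This excerpt ends after vectorization; the remainder of the original linear.py presumably mirrors Example 1: slice the matrix back apart, cross-validate a logistic regression, and write out predictions. A sketch based on Example 1 (the output filename is illustrative):

X = X_all[:lentrain]
X_test = X_all[lentrain:]

model = LogisticRegression(penalty='l2', dual=True, tol=0.0001, C=1)
print "10 Fold CV Score: ", np.mean(
    cross_validation.cross_val_score(model, X, y, cv=10, scoring='roc_auc'))

model.fit(X, y)
result = model.predict_proba(X_test)[:, 1]  # probability of the positive class
output = pd.DataFrame(data={"id": test["id"], "sentiment": result})
output.to_csv('Bag_of_Words_model.csv', index=False, quoting=3)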

Example 10: clean_review_function

def clean_review_function(review):
    list_of_sentences = KaggleWord2VecUtility.review_to_sentences(review, tokenizer, remove_stopwords=False)
    return list_of_sentences
Developer: ddboline | Project: kaggle_imdb_sentiment_model | Lines: 3 | Source: average_word_vectors.py

Example 11:


    # Initialize an empty list to hold the clean reviews
    traindata = []
    testdata = []
     
    Y1 = []
    Y2 = []
    
    # Loop over each review; create an index i that goes from 0 to the length
    # of the movie review list

    
    for i in train:
        buf=[]
        traindata.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(train[i][0], True)))
        for j in train[i][3].split():
            if j in tag_dic:
               buf.append(tag_dic[j])
        Y1.append(buf)



    for i in test:
        buf=[]
        testdata.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(test[i][0], True)))
        for j in test[i][3].split():
            if j in tag_dic:
               buf.append(tag_dic[j])
        Y2.append(buf)
    # ****** Create a bag of words from the training set
Developer: weichenzhao | Project: CS544_Project | Lines: 29 | Source: BogOfWords.py

Example 12: getCleanTestReviews

def getCleanTestReviews(skucollection):
    clean_skucollection = []
    for sku in skucollection["query"]:
        clean_skucollection.append( KaggleWord2VecUtility.sku_to_wordlist( sku, remove_stopwords=False ))
    return clean_skucollection
Developer: aviralmathur | Project: Word2Vec | Lines: 5 | Source: SKU_Desc.py
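sku_to_wordlist is a project-specific addition (the standard tutorial class only defines review_to_wordlist); judging from the call, it presumably applies the same cleaning to a product search query. A hypothetical sketch:

# hypothetical sketch, as it might appear inside KaggleWord2VecUtility:
@staticmethod
def sku_to_wordlist(sku, remove_stopwords=False):
    # delegate to the standard review cleaner
    return KaggleWord2VecUtility.review_to_wordlist(sku, remove_stopwords)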

Example 13:

     test["review"].size, unlabeled_train["review"].size )



    # Load the punkt tokenizer
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')



    # ****** Split the labeled and unlabeled training sets into clean sentences
    #
    sentences = []  # Initialize an empty list of sentences

    print "Parsing sentences from training set"
    for review in train["review"]:
        sentences += KaggleWord2VecUtility.review_to_sentences(review, tokenizer)

    print "Parsing sentences from unlabeled set"
    for review in unlabeled_train["review"]:
        sentences += KaggleWord2VecUtility.review_to_sentences(review, tokenizer)

    # ****** Set parameters and train the word2vec model
    #
    # Import the built-in logging module and configure it so that Word2Vec
    # creates nice output messages
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',\
        level=logging.INFO)

    # Set values for various parameters
    num_features = 300    # Word vector dimensionality
    min_word_count = 40   # Minimum word count
Developer: fujun-liu | Project: phone-scraping | Lines: 31 | Source: analyze_comments.py
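The excerpt cuts off inside the parameter block. In the original tutorial this step continues roughly as follows (the remaining parameter values are the tutorial's defaults; treat them as illustrative):

from gensim.models import word2vec

num_workers = 4       # number of threads to run in parallel
context = 10          # context window size
downsampling = 1e-3   # downsampling of frequent words

print "Training Word2Vec model..."
model = word2vec.Word2Vec(sentences, workers=num_workers,
                          size=num_features, min_count=min_word_count,
                          window=context, sample=downsampling, seed=1)

# Finalize the model; saves memory if no further training is planned
model.init_sims(replace=True)
model.save("300features_40minwords_10context")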

Example 14: centroids




    # Create clean_train_reviews and clean_test_reviews as we did before
    #

    # Read data from files
    train = pd.read_csv( os.path.join(os.path.dirname(__file__), 'data', 'labeledTrainData.tsv'), header=0, delimiter="\t", quoting=3 )
    test = pd.read_csv(os.path.join(os.path.dirname(__file__), 'data', 'testData.tsv'), header=0, delimiter="\t", quoting=3 )


    print "Cleaning training reviews"
    clean_train_reviews = []
    for review in train["review"]:
        clean_train_reviews.append( KaggleWord2VecUtility.review_to_wordlist( review, \
            remove_stopwords=True ))

    print "Cleaning test reviews"
    clean_test_reviews = []
    for review in test["review"]:
        clean_test_reviews.append( KaggleWord2VecUtility.review_to_wordlist( review, \
            remove_stopwords=True ))


    # ****** Create bags of centroids
    #
    # Pre-allocate an array for the training set bags of centroids (for speed)
    train_centroids = np.zeros( (train["review"].size, num_clusters), \
        dtype="float32" )

    # Transform the training set reviews into bags of centroids
Developer: navink | Project: Kaggle-Deep-Learning | Lines: 28 | Source: Word2VecCentroids.py
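The transform itself falls outside the excerpt. In the original tutorial it is done with a helper like the following, where word_centroid_map (assumed to come from earlier in that file) maps each vocabulary word to its K-Means cluster index:

def create_bag_of_centroids(wordlist, word_centroid_map):
    # number of clusters = highest cluster index + 1
    num_centroids = max(word_centroid_map.values()) + 1
    bag_of_centroids = np.zeros(num_centroids, dtype="float32")
    # count how many words of the review fall into each cluster
    for word in wordlist:
        if word in word_centroid_map:
            bag_of_centroids[word_centroid_map[word]] += 1
    return bag_of_centroids

for counter, review in enumerate(clean_train_reviews):
    train_centroids[counter] = create_bag_of_centroids(review, word_centroid_map)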

Example 15: MongoClient

    # Loop over each review; create an index i that goes from 0 to the length
    # of the movie review list

    # making a connection to mongoDB
    client = MongoClient('localhost', 27017)
    db = client.cs336
    db.create_collection("unlabeled_review")

    print "Cleaning and parsing the training set movie reviews...\n"
   # for i in xrange( 0, len(train["review"])):
    for i in xrange(0, 500):
        #clean_train_reviews.append(KaggleWord2VecUtility.review_to_wordlist(train["review"][i], True))
        #clean_train_reviews.append(" ".join(KaggleWord2VecUtility.review_to_worddict(train["review"][i], True)))
        print i
        clean_train_review = KaggleWord2VecUtility.review_to_worddict(train["review"][i], True)
        #pprint(clean_train_reviews)
        record = {}
        record["id"] = train["id"][i]
       # record["sentiment"] = train["sentiment"][i]
        record["review"] = clean_train_review
        
        #pprint(record)

        db.unlabeled_review.insert_one(record)
    
    print "Inserted all documents to the collection"

    #pprint(clean_train_reviews[0])

    # ****** Create a bag of words from the training set
Developer: dataspy | Project: mongo_cs336 | Lines: 30 | Source: qhu.py
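review_to_worddict is not part of the standard tutorial class. Judging from its use here, where the result is stored as a MongoDB document field, it presumably maps each cleaned word to its count; a hypothetical sketch:

# hypothetical sketch, as it might appear inside KaggleWord2VecUtility:
@staticmethod
def review_to_worddict(review, remove_stopwords=False):
    # reuse the list cleaner, then count occurrences of each word
    words = KaggleWord2VecUtility.review_to_wordlist(review, remove_stopwords)
    counts = {}
    for w in words:
        counts[w] = counts.get(w, 0) + 1
    return counts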


Note: The KaggleWord2VecUtility.KaggleWord2VecUtility class examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are selected from open-source projects contributed by their developers, and copyright in the source code remains with the original authors. Please consult each project's License before distributing or using the code; do not reproduce without permission.