

Python tokenize.word_tokenize Function Code Examples

This article collects typical usage examples of the nltk.tokenize.word_tokenize function in Python. If you are wondering how word_tokenize is used in practice, the curated examples below should help.


The following shows 15 code examples of the word_tokenize function, sorted by popularity by default.
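Before turning to the project examples, here is a minimal, self-contained sketch of the basic call; it assumes NLTK is installed and that the 'punkt' tokenizer data has been downloaded.

from nltk.tokenize import word_tokenize

# Run nltk.download('punkt') once beforehand if the tokenizer data is missing.
text = "Good muffins cost $3.88 in New York."
print(word_tokenize(text))
# Expected output, roughly: ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.']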

Example 1: createTrainingVectors

def createTrainingVectors(tokenized_texts_dict):
    """
        Given the filenames and their contents, this methods creates the training 
        vectors by creating a unique list of all words together in the training
        set
    """
    print("Creating vectors for training data")

    unique_words = []
    for filename, text in tokenized_texts_dict.iteritems():
        # print("Reading {0} and adding to unique word list".format(filename))
        unique_words.extend(word_tokenize(text))

    unique_words = set(unique_words)

    # Creating the initial vector with counts 0 for all training sets
    zero_vector = OrderedDict(zip(unique_words, [0] * len(unique_words)))
    print("Creating the zero vector")

    # For each training file, create an OrderedDict containing its word counts (together with zero counts),
    # and store it in a dict, indexed by its corresponding filename
    vectors = {}
    for filename, token_list in tokenized_texts_dict.iteritems():
        current_vector = zero_vector.copy()
        current_vector.update(Counter(word_tokenize(token_list)))
        vectors[filename] = current_vector

    return vectors, zero_vector
Developer ID: gkeswani92, Project: N-Gram-Language-Modeling, Lines of code: 28, Source file: KNearestNeighbourClassifier.py
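A hypothetical usage sketch for createTrainingVectors above; the filenames and texts are invented, and since the snippet targets Python 2 (dict.iteritems()), run it on Python 2 or swap in .items() first.

from collections import Counter, OrderedDict   # needed by createTrainingVectors
from nltk.tokenize import word_tokenize        # needed by createTrainingVectors

tokenized_texts_dict = {"doc1.txt": "the cat sat on the mat",
                        "doc2.txt": "the dog chased the cat"}
vectors, zero_vector = createTrainingVectors(tokenized_texts_dict)
print(vectors["doc1.txt"]["cat"])   # count of "cat" in doc1.txt, i.e. 1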

Example 2: max_similarity

def max_similarity(context_sentence, ambiguous_word, option="path", 
                   lemma=True, context_is_lemmatized=False, pos=None, best=True):
    """
    Perform WSD by maximizing the sum of maximum similarity between possible 
    synsets of all words in the context sentence and the possible synsets of the 
    ambiguous words (see http://goo.gl/XMq2BI):
    argmax_{synset(a)} \sum_{i}^{n} max_{synset(i)} sim(i, a)
    """
    ambiguous_word = lemmatize(ambiguous_word)
    # If ambiguous word not in WordNet return None
    if not wn.synsets(ambiguous_word):
        return None
    if context_is_lemmatized:
        context_sentence = word_tokenize(context_sentence)
    else:
        context_sentence = [lemmatize(w) for w in word_tokenize(context_sentence)]
    result = {}
    for i in wn.synsets(ambiguous_word):
        try:
            if pos and pos != str(i.pos()):
                continue
        except:
            if pos and pos != str(i.pos):
                continue 
        result[i] = sum(max([sim(i,k,option) for k in wn.synsets(j)]+[0]) \
                        for j in context_sentence)
    
    if option in ["res","resnik"]: # lower score = more similar
        result = sorted([(v,k) for k,v in result.items()])
    else: # higher score = more similar
        result = sorted([(v,k) for k,v in result.items()],reverse=True)
    ##print result
    if best:
        return result[0][1]
    return result
Developer ID: ChenglongChen, Project: pywsd, Lines of code: 34, Source file: similarity.py
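A hedged usage sketch for max_similarity above; it assumes the surrounding pywsd module (which provides lemmatize, sim and wordnet imported as wn) is importable and that the WordNet corpus is installed.

# Disambiguate "bank" against its sentence context, restricted to noun synsets.
best_synset = max_similarity("I went to the bank to deposit my money",
                             "bank", option="path", pos="n")
print(best_synset)   # one of the WordNet Synsets for "bank"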

Example 3: main

def main():
    # Load up txt files
    speech_file = open('trump-speeches/speeches.txt').read()
    tweets = json.load(open('trump_tweets.json'))
    tweet_list = []
    for tweet in tweets:
        tweet_list.append(tweet['text'])
    tweet_list = ' '.join(tweet_list)

    # Tokenize
    logging.info('Formatting training text')
    speech_token = word_tokenize(speech_file)
    tweet_token = word_tokenize(tweet_list)

    # Train trigram models
    logging.info('Setting up models')
    speech_gram, speech_format = ngram(speech_token, 3)
    tweet_gram, tweet_format = ngram(tweet_token, 3)

    # Generate responses
    cont = True
    while cont:
        response = input("Hello sir, what can I Trumpinate for you?: ")
        num_words = input("And how many words should I write?: ")

        # Print Phrases
        gen_phrase(speech_gram, int(num_words), starter_word=[response])
        print('')
        gen_phrase(tweet_gram, int(num_words), starter_word=[response])
        more = input("Would you like to generate more? (Yes, No): ")
        if more != 'Yes':
            cont = False
Developer ID: bhagerman00, Project: bh_lant, Lines of code: 32, Source file: trumpinator.py

Example 4: getBigramBeginWithNotCount

def getBigramBeginWithNotCount(sent):
    negative_keywords = ["bad", "sad", "don't", "could not", "crappy", "unfortunately", "remove", "why", "poor",
                     "bothersome", "terrible", "although", "complaints", "outrageous", "isn't", "poorly",
                     "drawback", "annoying", "against", "irritating", "wouldn't", "won't", "wasn't", "couldn't",
                     "awful", "didn't", "hasn't", "difficult", "hate", "incorrect", "junk", "trash", "removed",
                         "complain", "complained", "hated", "negative"]
    bigramPostiveCount = 0
    '''
    from nltk.corpus import brown
    brown_tagged_sents = brown.tagged_sents(categories='news')
    brown_sents = brown.sents(categories='news')
    unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)

    for bigram in nltk.bigrams(word_tokenize(sent)):
        if bigram[0].lower() == "not" and bigram[1].lower() in negative_keywords:
            print sent
            print bigram
            print unigram_tagger.tag(word_tokenize(sent))
            bigramNotCount += 1
    '''
    tokens = word_tokenize(sent)
    for i, word in enumerate(tokens):
        if word.lower() == "not":
            if i + 1 < len(tokens) and tokens[i + 1] in negative_keywords:    # e.g. NOT bad
                bigramPostiveCount += 1
            if i + 2 < len(tokens) and tokens[i + 2] in negative_keywords:    # e.g. NOT too bad
                bigramPostiveCount += 1
            else:                                                             # e.g. NOT good
                bigramPostiveCount -= 1
    return bigramPostiveCount
Developer ID: seekshreyas, Project: nlp-reviews-classifier, Lines of code: 29, Source file: extractor.py
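An illustrative call for getBigramBeginWithNotCount above; the review sentences are invented, and the function's word_tokenize import must be in scope.

from nltk.tokenize import word_tokenize   # needed by getBigramBeginWithNotCount

for sent in ["The battery is not bad at all", "The screen is not good"]:
    print(sent, "->", getBigramBeginWithNotCount(sent))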

Example 5: test

def test(testAccents, testNoAccents, dictnoAccents):
    
    count = 0
    correct = 0
    notWord = []
    result = []
    incorrect = {}
    wordCount = 0
    nonWordCount = 0
    for i in range(len(testAccents)):
        sent = ""
        sentenceAccents = testAccents[i]
        sentenceNoAccents = testNoAccents[i]

        tokensAccents = word_tokenize(sentenceAccents)
        tokensNoAccents = word_tokenize(sentenceNoAccents)

        if len(tokensAccents) == len(tokensNoAccents):
            for j in range(len(tokensAccents)):
                tA = tokensAccents[j]
                tNA = tokensNoAccents[j]
                if tNA not in punctuation and not tNA.isdigit():
                    wordCount += 1
                    if tNA in dictnoAccents:
                        newToken = max(dictnoAccents[tNA], key=dictnoAccents[tNA].get)
                    else:
                        newToken = tNA
                    if newToken == tA:
                        correct += 1
                    else:
                        incorrect[newToken] = tA
                    count += 1

                    if j != 0:
                        newToken = " " + newToken
                else:
                    nonWordCount += 1
                    notWord.append(tNA)
                    newToken = tNA
                sent = sent + newToken

            result.append(sent)
      
    print("Le nombre de mot dans le corpus: " + str(wordCount) )
    print("Le nombre de ponctuation et de nombres dans le corpus: " + str(nonWordCount))
    print("Nombre au total de changements/non changements possibles " + str(count ))
    print("Nombre au total de decisions correctes " + str(correct))
    print("Accuracy: " + str(correct/count) )
    return([incorrect,correct/count, wordCount, nonWordCount])
Developer ID: Alex-Fabbri, Project: DiacriticRestoration, Lines of code: 60, Source file: accents.py

Example 6: load_data

def load_data(loc='./data/'):
    """
    Load MSRP dataset
    """
    trainloc = os.path.join(loc, 'msr_paraphrase_train.txt')
    testloc = os.path.join(loc, 'msr_paraphrase_test.txt')

    trainA, trainB, testA, testB = [],[],[],[]
    trainS, devS, testS = [],[],[]

    f = open(trainloc, 'rb')
    for line in f:
        text = line.strip().split('\t')
        trainA.append(' '.join(word_tokenize(text[3])))
        trainB.append(' '.join(word_tokenize(text[4])))
        trainS.append(text[0])
    f.close()
    f = open(testloc, 'rb')
    for line in f:
        text = line.strip().split('\t')
        testA.append(' '.join(word_tokenize(text[3])))
        testB.append(' '.join(word_tokenize(text[4])))
        testS.append(text[0])
    f.close()

    trainS = [int(s) for s in trainS[1:]]
    testS = [int(s) for s in testS[1:]]

    return [trainA[1:], trainB[1:]], [testA[1:], testB[1:]], [trainS, testS]
Developer ID: 2020zyc, Project: nlg-eval, Lines of code: 29, Source file: eval_msrp.py

Example 7: tokenize

def tokenize(s, stem=True, digit=False, stop=True, use_re=False):
    """
    :type s: str
    :type stem: bool
    :type use_re: bool
    :rtype: set(str)
    """
    stop_words = stopwords.words('english')
    stemmer = SnowballStemmer('english')
    wordnet = WordNetLemmatizer()
    table = string.maketrans("", "")  # Python 2 only; Python 3 would use str.maketrans

    if use_re:
        s = re.sub('(.)([A-Z][a-z]+)', r'\1 \2', s)

    if digit:
        tokens = set(word_tokenize(unify_units(s).translate(table, string.punctuation + string.digits)))
    else:
        tokens = set(word_tokenize(unify_units(s).translate(table, string.punctuation)))

    if stop:
        tokens = set(word for word in tokens if word not in stop_words)

    if stem:
        tokens = set(stemmer.stem(word) for word in tokens)

    return tokens
Developer ID: lingcheng99, Project: search-term-relevance-home-depot, Lines of code: 27, Source file: preprocess.py

Example 8: clean_raw_txt

def clean_raw_txt(body, headline, punct_dct=None, stopwrds_set=None): 
    """Clean the body and headline to remove punctuation, stopwords, etc.

    Args: 
    ----
        body: str
        headline: str
        punct_dct (optional): dict 
            Translation dict resulting from a `str.maketrans()` call             
        stopwrds_set (optional): set

    Return:
    ------
        (body_wrds, headline_wrds): tuple
    """

    if punct_dct: 
        body = body.translate(punct_dct)
        headline = headline.translate(punct_dct)

    body_wrds = word_tokenize(body)
    headline_wrds = word_tokenize(headline)

    stopwrds_set = set() if stopwrds_set is None else stopwrds_set

    body_wrds = [wrd.lower() for wrd in body_wrds if wrd.lower() not in stopwrds_set] 
    headline_wrds = [wrd.lower() for wrd in headline_wrds if wrd.lower() not in stopwrds_set]

    return (body_wrds, headline_wrds)
Developer ID: sallamander, Project: headline-generation, Lines of code: 29, Source file: twenty_news_gen.py
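One possible way to build the optional arguments for clean_raw_txt above and call it; this assumes NLTK's stopwords and punkt data are installed, and the sample strings are invented.

import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize   # needed by clean_raw_txt

punct_dct = str.maketrans('', '', string.punctuation)
stopwrds_set = set(stopwords.words('english'))

body_wrds, headline_wrds = clean_raw_txt(
    "Stocks fell sharply on Monday, analysts said.",
    "Stocks Fall Sharply",
    punct_dct, stopwrds_set)
print(headline_wrds)   # e.g. ['stocks', 'fall', 'sharply']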

Example 9: obtaindata

def obtaindata(pos_file,neg_file):
    # Read the input files
    short_pos = open(pos_file, "r").read()
    short_neg = open(neg_file, "r").read()

    documents = []  # list of (review line, class) tuples, where class is "pos" or "neg"

    for r in short_pos.split('\n'):
        documents.append((r, "pos"))
    for r in short_neg.split('\n'):
        documents.append((r, "neg"))

    all_words = []  # all words from both corpora combined (duplicates kept)

    short_pos_words = word_tokenize(short_pos)
    short_neg_words = word_tokenize(short_neg)

    for w in short_pos_words:
        all_words.append(w.lower())
    for w in short_neg_words:
        all_words.append(w.lower())

    all_words = nltk.FreqDist(all_words)
    word_features = list(all_words.keys())[:5000]#gets the top 5000 most common words to use as features
    featuresets = [(find_features(rev,word_features), category) for (rev, category) in documents]
    random.shuffle(featuresets)
    return featuresets
Developer ID: akshaynavada, Project: NLP, Lines of code: 27, Source file: sentimentPractice.py

Example 10: load_samples

def load_samples(question, prop_labels):
    samples = []
    q = word_tokenize(question)
    for label in prop_labels:
        text = word_tokenize(label.lower())
        samples.append({'qtext': ' '.join(q), 'label': 0, 'atext': ' '.join(text)})    
    return samples
Developer ID: BenjaminHess, Project: dataset-sts, Lines of code: 7, Source file: scoring-api.py
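A small illustrative call for load_samples above; the question and property labels are invented.

from nltk.tokenize import word_tokenize   # needed by load_samples

samples = load_samples("Who directed Blade Runner?",
                       ["director", "screenwriter", "release date"])
print(samples[0]['qtext'])   # 'Who directed Blade Runner ?'
print(samples[0]['atext'])   # 'director'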

Example 11: _doc2vec_doc_stream

def _doc2vec_doc_stream(paths, n, sentences=True):
    """
    Generator to feed sentences to the doc2vec model.
    """
    phrases = Bigram()

    i = 0
    p = Progress()
    for path in paths:
        with open(path, 'r') as f:
            for line in f:
                i += 1
                p.print_progress(i/n)

                # We do minimal pre-processing here so the model can learn
                # punctuation
                line = line.lower()

                if sentences:
                    for sent in sent_tokenize(line):
                        tokens = word_tokenize(sent)
                        yield LabeledSentence(phrases[tokens], ['SENT_{}'.format(i)])
                else:
                    tokens = word_tokenize(line)
                    yield LabeledSentence(phrases[tokens], ['SENT_{}'.format(i)])
Developer ID: frnsys, Project: factory, Lines of code: 25, Source file: doc2vec.py

Example 12: load_sick2014

def load_sick2014(dsfile, mode='relatedness'):
    """ load a dataset in the sick2014 tsv .txt format;

    mode='relatedness': use the sts relatedness score as label
    mode='entailment': use -1 (contr.), 0 (neutral), 1 (ent.) as label """
    s0 = []
    s1 = []
    labels = []
    with open(dsfile) as f:
        first = True
        for line in f:
            if first:
                # skip first line with header
                first = False
                continue
            line = line.rstrip()
            pair_ID, sentence_A, sentence_B, relatedness_score, entailment_judgement = line.split('\t')
            if mode == 'relatedness':
                label = float(relatedness_score)
            elif mode == 'entailment':
                if entailment_judgement == 'CONTRADICTION':
                    label = -1
                elif entailment_judgement == 'NEUTRAL':
                    label = 0
                elif entailment_judgement == 'ENTAILMENT':
                    label = +1
                else:
                    raise ValueError('invalid label on line: %s' % (line,))
            else:
                raise ValueError('invalid mode: %s' % (mode,))
            labels.append(label)
            s0.append(word_tokenize(sentence_A))
            s1.append(word_tokenize(sentence_B))
    return (s0, s1, np.array(labels))
Developer ID: quinsulon, Project: dataset-sts, Lines of code: 34, Source file: loader.py
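A hedged call sketch for load_sick2014 above; the file name is hypothetical and must point at a local copy of the SICK 2014 tab-separated data.

# The loader module also needs: import numpy as np, from nltk.tokenize import word_tokenize
s0, s1, labels = load_sick2014('SICK_train.txt', mode='entailment')
print(len(s0), labels[:5])   # sentence-pair count and the first few -1/0/+1 labels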

Example 13: load_anssel

def load_anssel(dsfile, subsample0=3):
    """ load a dataset in the anssel csv format;

    subsample0=N denotes that only every N-th 0-labelled sample
    should be loaded; so e.g. N=3 reduces 80k negatives to 28k
    negatives in the training set (vs. 4k positives); N=10
    gets you just 8k negatives, etc. """
    s0 = []
    s1 = []
    labels = []
    i = 0
    with open(dsfile) as f:
        c = csv.DictReader(f)
        for l in c:
            label = int(l['label'])
            if label == 0 and (i % subsample0) != 0:
                i += 1
                continue
            labels.append(label)
            try:
                qtext = l['qtext'].decode('utf8')
                atext = l['atext'].decode('utf8')
            except AttributeError:  # python3 has no .decode()
                qtext = l['qtext']
                atext = l['atext']
            s0.append(word_tokenize(qtext))
            s1.append(word_tokenize(atext))
            i += 1
    return (s0, s1, np.array(labels))
Developer ID: quinsulon, Project: dataset-sts, Lines of code: 29, Source file: loader.py

Example 14: testing

def testing():
    # - tokenize on sentence and word
    ex_txt = "hello there Mr. Bartuska, How are you? The weather is great and I enjoy Python. cheers!"
    print(sent_tokenize(ex_txt))
    print(word_tokenize(ex_txt, language='english'))

    # - stop words (pre-defined by nltk)
    stop_words = set(stopwords.words('english'))
    print(stop_words)
    words = word_tokenize(ex_txt)
    print(words)
    filtered_sent = []
    for w in words:
        if w not in stop_words:
            filtered_sent.append(w)
    print(filtered_sent)
    filtered_sent = [w for w in words if not w in stop_words]
    print(filtered_sent)

    # - stemming
    ps = PorterStemmer()
    example_words = ['python', 'pythoner', 'pythoning', 'pythoned', 'pythonly']
    # for w in example_words:
    #     print(ps.stem(w))
    new_text = "it is very important to be pythonly while you are pythoning with python. All pythoners have pythoned poorly at least once."
    words = word_tokenize(new_text)
    for w in words:
        print(ps.stem(w))
Developer ID: gbartusk, Project: coursera_data_science_capstone, Lines of code: 28, Source file: capstone.py

Example 15: __init__

    def __init__(self, txt_type: str, txt: str):
        self.txt_type = txt_type

        if txt_type == "paragraph":
            self.sentences = [word_tokenize(w) for w in sent_tokenize(txt)]
        else:
            self.title = word_tokenize(txt)
Developer ID: NGrech, Project: FYP, Lines of code: 7, Source file: indexers.py


Note: The nltk.tokenize.word_tokenize examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by their developers; copyright of the source code remains with the original authors, and any distribution or use should follow the License of the corresponding project. Do not reproduce without permission.