This article collects typical usage examples of the Python method nltk.word_tokenize. If you have been wondering how exactly to use nltk.word_tokenize in Python, the curated code examples below may help. You can also explore the nltk module, where this method lives, in more depth.
Below are 15 code examples of nltk.word_tokenize, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
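Before diving into the examples, here is a minimal usage sketch of nltk.word_tokenize itself (it assumes the NLTK 'punkt' tokenizer data has already been downloaded):

import nltk
# nltk.download('punkt')  # one-time download of the Punkt tokenizer models
tokens = nltk.word_tokenize("NLTK makes tokenization easy, doesn't it?")
print(tokens)  # e.g. ['NLTK', 'makes', 'tokenization', 'easy', ',', 'does', "n't", 'it', '?']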
Example 1: analyze_en
# Required import: import nltk [as alias]
# Or: from nltk import word_tokenize [as alias]
def analyze_en():
    translation_path = os.path.join(train_translation_folder, train_translation_en_filename)
    with open(translation_path, 'r') as f:
        sentences = f.readlines()

    sent_lengths = []
    for sentence in tqdm(sentences):
        sentence_en = sentence.strip().lower()
        tokens = [normalizeString(s) for s in nltk.word_tokenize(sentence_en)]
        # Record the sentence length in NLTK tokens
        sent_lengths.append(len(tokens))

    num_bins = 100
    n, bins, patches = plt.hist(sent_lengths, num_bins, facecolor='blue', alpha=0.5)
    title = 'English Sentence Lengths Distribution'
    plt.title(title)
    plt.show()
Example 2: profile
# Required import: import nltk [as alias]
# Or: from nltk import word_tokenize [as alias]
def profile(self, text):
    ''' Create FreqDist of trigrams within text '''
    from nltk import word_tokenize, FreqDist, trigrams
    clean_text = self.remove_punctuation(text)
    tokens = word_tokenize(clean_text)
    fingerprint = FreqDist()
    for t in tokens:
        token_trigram_tuples = trigrams(self._START_CHAR + t + self._END_CHAR)
        token_trigrams = [''.join(tri) for tri in token_trigram_tuples]
        for cur_trigram in token_trigrams:
            if cur_trigram in fingerprint:
                fingerprint[cur_trigram] += 1
            else:
                fingerprint[cur_trigram] = 1
    return fingerprint
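Since profile relies on class helpers (remove_punctuation, _START_CHAR, _END_CHAR), here is a minimal standalone sketch of the same trigram-fingerprint idea using only NLTK; the function name and boundary markers below are illustrative:

from nltk import word_tokenize, trigrams, FreqDist

def trigram_profile(text, start_char='<', end_char='>'):
    # Count character trigrams over each token, padded with boundary markers
    fingerprint = FreqDist()
    for token in word_tokenize(text.lower()):
        for tri in trigrams(start_char + token + end_char):
            fingerprint[''.join(tri)] += 1
    return fingerprint

print(trigram_profile("Tokenize text with NLTK").most_common(5))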
Example 3: predict
# Required import: import nltk [as alias]
# Or: from nltk import word_tokenize [as alias]
def predict(self, query):
    usr = query
    print('Mem2Seq usr:', usr)
    # example input: 'please find a restaurant called nusha .'
    self.t += 1
    print('Mem2Seq turn:', self.t)
    usr = ' '.join(word_tokenize(usr.lower()))
    self.memory += generate_memory(usr, '$u', self.t)
    src_plain = (self.memory + [['$$$$'] * MEM_TOKEN_SIZE],)
    src_seqs = plain2tensor(self.lang.word2index, src_plain[0])
    words = self.model.evaluate_batch(1, src_seqs, [len(src_plain[0])], None, None, None, None, src_plain)
    row = np.transpose(words)[0].tolist()
    if '<EOS>' in row:
        row = row[:row.index('<EOS>')]
    sys = ' '.join(row)
    sys = denormalize(sys)
    print('Mem2Seq sys:', sys)
    self.memory += generate_memory(sys, '$s', self.t)
    return sys
Example 4: extract_features
# Required import: import nltk [as alias]
# Or: from nltk import word_tokenize [as alias]
def extract_features(corpus):
    '''Extract TF-IDF features from corpus'''
    stop_words = nltk.corpus.stopwords.words("english")
    # vectorize means we turn non-numerical data into an array of numbers
    count_vectorizer = feature_extraction.text.CountVectorizer(
        lowercase=True,  # for demonstration, True by default
        tokenizer=nltk.word_tokenize,  # use the NLTK tokenizer
        min_df=2,  # minimum document frequency, i.e. the word must appear in at least two documents
        ngram_range=(1, 2),
        stop_words=stop_words
    )
    processed_corpus = count_vectorizer.fit_transform(corpus)
    processed_corpus = feature_extraction.text.TfidfTransformer().fit_transform(
        processed_corpus)
    return processed_corpus
Developer: PacktPublishing, Project: Hands-on-NLP-with-NLTK-and-scikit-learn-, Lines: 20, Source file: nlp-5-document-classification.py
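A hypothetical call to extract_features on a toy corpus might look like this (it assumes `from sklearn import feature_extraction` plus the NLTK 'punkt' and 'stopwords' data, as in the original file):

corpus = [
    "the cat sat on the mat",
    "the cat chased the mouse",
    "a mouse ran under the mat",
]
tfidf_matrix = extract_features(corpus)
print(tfidf_matrix.shape)  # (3, number of uni-/bi-grams kept by min_df=2)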
Example 5: __init__
# Required import: import nltk [as alias]
# Or: from nltk import word_tokenize [as alias]
def __init__(self, sentences_file, stopwords):
    self.dictionary = None
    self.corpus = None
    f_sentences = codecs.open(sentences_file, encoding='utf-8')
    documents = list()
    count = 0
    print("Gathering sentences and removing stopwords")
    for line in f_sentences:
        line = re.sub('<[A-Z]+>[^<]+</[A-Z]+>', '', line)
        # remove stop words and tokenize
        document = [word for word in nltk.word_tokenize(line.lower()) if word not in stopwords]
        documents.append(document)
        count += 1
        if count % 10000 == 0:
            sys.stdout.write(".")
    f_sentences.close()

    self.dictionary = corpora.Dictionary(documents)
    self.corpus = [self.dictionary.doc2bow(text) for text in documents]
    self.tf_idf_model = TfidfModel(self.corpus)
    print(len(documents), "documents read")
    print(len(self.dictionary), "unique tokens")
Example 6: preprocess
# Required import: import nltk [as alias]
# Or: from nltk import word_tokenize [as alias]
def preprocess(text):
    min_length = 3
    text = re.sub(r'\d+', '#', text)
    text = re.sub(r'\.', ' eos ', text)
    # Tokenize
    words = map(lambda word: word.lower(), word_tokenize(text))
    tokens = words
    # Remove non characters
    p = re.compile('[a-zA-Z#]+')
    # Filter tokens (we do not remove stopwords)
    filtered_tokens = list(filter(lambda token: p.match(token) and len(token) >= min_length and (token not in english_stopwords), tokens))
    # Encode to ascii
    filtered_tokens = [token.encode('ascii', 'ignore') for token in filtered_tokens]
    return filtered_tokens

# Modify this path
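A hypothetical call; `english_stopwords` is assumed to be defined elsewhere in the source file, e.g. as set(nltk.corpus.stopwords.words('english')):

sample = "The model reached 95 percent accuracy. Training took three hours."
print(preprocess(sample))
# ascii-encoded tokens of length >= 3, e.g. [b'model', b'reached', b'percent', ...]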
Example 7: text_to_num
# Required import: import nltk [as alias]
# Or: from nltk import word_tokenize [as alias]
def text_to_num(text):
    tokenized = nltk.word_tokenize(text)
    tags = nltk.pos_tag(tokenized)
    print(tags)
    chunkPattern = r""" Chunk0: {((<NN|CD.?|RB>)<CD.?|VBD.?|VBP.?|VBN.?|NN.?|RB.?|JJ>*)<NN|CD.?>} """
    chunkParser = nltk.RegexpParser(chunkPattern)
    chunkedData = chunkParser.parse(tags)
    print(chunkedData)

    for subtree in chunkedData.subtrees(filter=lambda t: t.label() in "Chunk0"):
        exp = ""
        for l in subtree.leaves():
            exp += str(l[0]) + " "
        exp = exp[:-1]
        print(exp)
        try:
            text = text.replace(exp, str(t2n.text2num(exp)))
        except Exception as e:
            print("error text2num ->", e.args)
    print("text2num -> ", text)
    return text
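A hypothetical call, assuming the text2num package is importable as t2n and that the NLTK 'punkt' and 'averaged_perceptron_tagger' data are available:

print(text_to_num("I walked twenty five miles in two days"))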
Example 8: __init__
# Required import: import nltk [as alias]
# Or: from nltk import word_tokenize [as alias]
def __init__(self):
    import nltk
    from nltk.tag import PerceptronTagger
    from nltk.tokenize import TreebankWordTokenizer
    #return pkgutil.get_data('scattertext',
    #                        'data/viz/semiotic_new.html').decode('utf-8')
    path = os.path.dirname(sys.modules['scattertext'].__file__) + '/data/'
    tokenizer_fn = path + 'punkt.english.pickle'
    tagger_fn = path + 'averaged_perceptron_tagger.pickle'
    #tokenizer_fn = os.path.abspath(resource_filename('scattertext.data', 'punkt.english.pickle'))
    #tagger_fn = os.path.abspath(resource_filename('scattertext.data', 'averaged_perceptron_tagger.pickle'))
    # Load the tagger
    self.tagger = PerceptronTagger(load=False)
    self.tagger.load(tagger_fn)
    # note: nltk.word_tokenize calls the TreebankWordTokenizer, but uses the downloader.
    # Calling the TreebankWordTokenizer like this allows skipping the downloader.
    # It seems the TreebankWordTokenizer uses PTB tokenization = regexes, i.e. no downloads.
    # https://github.com/nltk/nltk/blob/develop/nltk/tokenize/treebank.py#L25
    self.tokenize = TreebankWordTokenizer().tokenize
    self.sent_detector = nltk.data.load(tokenizer_fn)

# http://www.nltk.org/book/ch05.html
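The comment block above is the key point of this example, illustrated standalone below: TreebankWordTokenizer is purely regex-based, so it works without any downloaded models, whereas nltk.word_tokenize also requires the Punkt sentence tokenizer data:

from nltk.tokenize import TreebankWordTokenizer
print(TreebankWordTokenizer().tokenize("Don't rely on the downloader, she said."))
# e.g. ['Do', "n't", 'rely', 'on', 'the', 'downloader', ',', 'she', 'said', '.']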
Example 9: get_story_question_answer_triples
# Required import: import nltk [as alias]
# Or: from nltk import word_tokenize [as alias]
def get_story_question_answer_triples(sqa_file):
    sqatriples = []
    fsqa = open(sqa_file, "rb")
    for line in fsqa:
        line = line.strip().decode("utf8").encode("ascii", "ignore")
        if line.startswith("#"):
            continue
        story, question, answer, correct = line.split("\t")
        swords = []
        story_sents = nltk.sent_tokenize(story)
        for story_sent in story_sents:
            swords.extend(nltk.word_tokenize(story_sent))
        qwords = nltk.word_tokenize(question)
        awords = nltk.word_tokenize(answer)
        is_correct = int(correct) == 1
        sqatriples.append((swords, qwords, awords, is_correct))
    fsqa.close()
    return sqatriples
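A hypothetical call, assuming a tab-separated file with story, question, answer and correctness columns (the path below is made up):

triples = get_story_question_answer_triples("data/story_qa.tsv")
swords, qwords, awords, is_correct = triples[0]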
Example 10: convert_string
# Required import: import nltk [as alias]
# Or: from nltk import word_tokenize [as alias]
def convert_string(text):
    """Convert a str of text into a tokenized and selected list of words.

    Parameters
    ----------
    text : str
        Text as one long string.

    Returns
    -------
    words_cleaned : list of str
        List of tokenized words, after processing.

    Notes
    -----
    This function sets text to lower case, and removes stopwords and punctuation.
    """
    words = word_tokenize(text)
    words_cleaned = [word.lower() for word in words if (
        (not word.lower() in stopwords.words('english')) & word.isalnum())]
    return words_cleaned
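An example call (requires the NLTK 'punkt' and 'stopwords' data):

print(convert_string("The quick brown fox jumps over the lazy dog!"))
# e.g. ['quick', 'brown', 'fox', 'jumps', 'lazy', 'dog']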
Example 11: evaluate_sentiment
# Required import: import nltk [as alias]
# Or: from nltk import word_tokenize [as alias]
def evaluate_sentiment(text):
    pos_score = 0
    neg_score = 0
    tokened = nltk.word_tokenize(text.decode('utf8', 'ignore').replace('<br />', ' '))
    pos_pairs = nltk.pos_tag(tokened)
    for tuple in pos_pairs:
        pos = ''
        if tuple[1] == "NN":
            pos = 'n/'
        if tuple[1] == "JJ":
            pos = 'a/'
        if tuple[1] == "VB":
            pos = 'v/'
        if tuple[1] == "RB":
            pos = 'r/'
        try:
            pos_score += sentiwordnet[pos + tuple[0].lower()][0]
            neg_score += sentiwordnet[pos + tuple[0].lower()][1]
        except KeyError:
            pass
    return pos_score, neg_score
Example 12: evaluate_sentiment
# Required import: import nltk [as alias]
# Or: from nltk import word_tokenize [as alias]
def evaluate_sentiment(text):
    pos_score = 0
    neg_score = 0
    tokened = nltk.word_tokenize(text)
    pos_pairs = nltk.pos_tag(tokened)
    for tuple in pos_pairs:
        pos = ''
        if tuple[1] == "NN":
            pos = 'n/'
        if tuple[1] == "JJ":
            pos = 'a/'
        if tuple[1] == "VB":
            pos = 'v/'
        if tuple[1] == "RB":
            pos = 'r/'
        try:
            pos_score += sentiwordnet[pos + tuple[0].lower()][0]
            neg_score += sentiwordnet[pos + tuple[0].lower()][1]
        except KeyError:
            pass
    return pos_score, neg_score
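In both versions above, `sentiwordnet` is assumed to be a dictionary mapping 'pos_prefix/word' keys to (positive_score, negative_score) pairs, typically built from the SentiWordNet lexicon. A toy stand-in for experimentation (the scores below are made up, and the NLTK 'punkt' and 'averaged_perceptron_tagger' data are required):

sentiwordnet = {'a/good': (0.75, 0.0), 'a/bad': (0.0, 0.625), 'n/movie': (0.0, 0.0)}
print(evaluate_sentiment("A good movie is not a bad movie"))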
Example 13: _nltk_process_sents
# Required import: import nltk [as alias]
# Or: from nltk import word_tokenize [as alias]
def _nltk_process_sents(self, sents):
    for sentence in sents:
        if isinstance(sentence, STRING_TYPES):
            if self._tokenizer_lang is None:
                raise ValueError(
                    "No word tokenizer available for this language. "
                    "Please tokenize before calling the parser."
                )
            sentence = nltk.word_tokenize(sentence, self._tokenizer_lang)

        if IS_PY2:
            sentence = [
                word.decode('utf-8', 'ignore') if isinstance(word, str) else word
                for word in sentence
            ]

        if not self._provides_tags:
            sentence = nltk.pos_tag(sentence)
            yield [word for word, tag in sentence], sentence
        else:
            yield sentence, sentence
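Note that nltk.word_tokenize accepts a language argument, which is what self._tokenizer_lang supplies above; it selects the Punkt sentence-splitting model used before word tokenization, e.g.:

print(nltk.word_tokenize("C'est un exemple simple.", language='french'))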
Example 14: encode_text
# Required import: import nltk [as alias]
# Or: from nltk import word_tokenize [as alias]
def encode_text(text, word_embeddings, max_sen_len):
    '''
    Encode a sequence of words into the corresponding vector representation.

    Input:
        text (string)          : text (space separated words, etc.)
        word_embeddings (dict) : dictionary mapping from words to their representation
        max_sen_len (int)      : maximum sentence length (in words)

    Returns:
        X (np.ndarray) : matrix of shape (max_sen_len, embedding_size) after zero padding
    '''
    default_embed = np.zeros(300)
    words = word_tokenize(text)[:max_sen_len]
    embeds = [word_embeddings.get(x, default_embed) for x in words]
    embeds += [default_embed] * (max_sen_len - len(embeds))
    return np.array(embeds, dtype=np.float32)
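A hypothetical usage with a toy 300-dimensional embedding table (requires the NLTK 'punkt' data):

import numpy as np
toy_embeddings = {'hello': np.ones(300), 'world': np.full(300, 0.5)}
X = encode_text("hello brave new world", toy_embeddings, max_sen_len=6)
print(X.shape)  # (6, 300)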
Example 15: encode_text
# Required import: import nltk [as alias]
# Or: from nltk import word_tokenize [as alias]
def encode_text(text, word_embeddings):
    '''
    Encode a sequence of words into the corresponding vector representation.

    Input:
        text (string)          : text (space separated words, etc.)
        word_embeddings (dict) : dictionary mapping from words to their representation

    Returns:
        X (np.array) : array of shape (embedding_size,) averaging all word vectors of text
    '''
    embed = np.zeros(300)
    count = 0
    words = word_tokenize(text)
    for word in words:
        if word in word_embeddings:
            embed += word_embeddings[word]
            count += 1
    # Guard against division by zero when no word is in the embedding vocabulary
    return embed / count if count else embed
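A matching hypothetical usage, reusing the toy_embeddings table from the sketch after Example 14:

vec = encode_text("hello unseen world", toy_embeddings)
print(vec.shape)  # (300,)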