This article collects typical usage examples of the Python method nltk.word_tokenize. If you have been wondering how exactly to use nltk.word_tokenize in Python, the curated code examples below may help. You can also explore the nltk module, where this method lives, in more depth.
Below are 15 code examples of nltk.word_tokenize, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
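Before diving into the examples, here is a minimal usage sketch of nltk.word_tokenize itself (it assumes the NLTK 'punkt' tokenizer data has already been downloaded):

import nltk
# nltk.download('punkt')  # one-time download of the Punkt tokenizer models
tokens = nltk.word_tokenize("NLTK makes tokenization easy, doesn't it?")
print(tokens)  # e.g. ['NLTK', 'makes', 'tokenization', 'easy', ',', 'does', "n't", 'it', '?']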
Example 1: analyze_en
# Required import: import nltk [as alias]
# Or: from nltk import word_tokenize [as alias]
def analyze_en():
    translation_path = os.path.join(train_translation_folder, train_translation_en_filename)
    with open(translation_path, 'r') as f:
        sentences = f.readlines()

    sent_lengths = []
    for sentence in tqdm(sentences):
        sentence_en = sentence.strip().lower()
        tokens = [normalizeString(s) for s in nltk.word_tokenize(sentence_en)]
        # Record the sentence length in NLTK tokens
        sent_lengths.append(len(tokens))

    num_bins = 100
    n, bins, patches = plt.hist(sent_lengths, num_bins, facecolor='blue', alpha=0.5)
    title = 'English Sentence Lengths Distribution'
    plt.title(title)
    plt.show()
Example 2: profile
# Required import: import nltk [as alias]
# Or: from nltk import word_tokenize [as alias]
def profile(self, text):
    ''' Create FreqDist of trigrams within text '''
    from nltk import word_tokenize, FreqDist, trigrams
    clean_text = self.remove_punctuation(text)
    tokens = word_tokenize(clean_text)
    fingerprint = FreqDist()
    for t in tokens:
        token_trigram_tuples = trigrams(self._START_CHAR + t + self._END_CHAR)
        token_trigrams = [''.join(tri) for tri in token_trigram_tuples]
        for cur_trigram in token_trigrams:
            if cur_trigram in fingerprint:
                fingerprint[cur_trigram] += 1
            else:
                fingerprint[cur_trigram] = 1
    return fingerprint
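Since profile relies on class helpers (remove_punctuation, _START_CHAR, _END_CHAR), here is a minimal standalone sketch of the same trigram-fingerprint idea using only NLTK; the function name and boundary markers below are illustrative:

from nltk import word_tokenize, trigrams, FreqDist

def trigram_profile(text, start_char='<', end_char='>'):
    # Count character trigrams over each token, padded with boundary markers
    fingerprint = FreqDist()
    for token in word_tokenize(text.lower()):
        for tri in trigrams(start_char + token + end_char):
            fingerprint[''.join(tri)] += 1
    return fingerprint

print(trigram_profile("Tokenize text with NLTK").most_common(5))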
Example 3: predict
# Required import: import nltk [as alias]
# Or: from nltk import word_tokenize [as alias]
def predict(self, query):
    usr = query
    print('Mem2Seq usr:', usr)
    # example input: 'please find a restaurant called nusha .'
    self.t += 1
    print('Mem2Seq turn:', self.t)
    usr = ' '.join(word_tokenize(usr.lower()))
    self.memory += generate_memory(usr, '$u', self.t)
    src_plain = (self.memory + [['$$$$'] * MEM_TOKEN_SIZE],)
    src_seqs = plain2tensor(self.lang.word2index, src_plain[0])
    words = self.model.evaluate_batch(1, src_seqs, [len(src_plain[0])], None, None, None, None, src_plain)
    row = np.transpose(words)[0].tolist()
    if '<EOS>' in row:
        row = row[:row.index('<EOS>')]
    sys = ' '.join(row)
    sys = denormalize(sys)
    print('Mem2Seq sys:', sys)
    self.memory += generate_memory(sys, '$s', self.t)
    return sys
Example 4: extract_features
# Required import: import nltk [as alias]
# Or: from nltk import word_tokenize [as alias]
def extract_features(corpus):
    '''Extract TF-IDF features from corpus'''
    stop_words = nltk.corpus.stopwords.words("english")
    # vectorize means we turn non-numerical data into an array of numbers
    count_vectorizer = feature_extraction.text.CountVectorizer(
        lowercase=True,  # for demonstration, True by default
        tokenizer=nltk.word_tokenize,  # use the NLTK tokenizer
        min_df=2,  # minimum document frequency, i.e. the word must appear in at least two documents
        ngram_range=(1, 2),
        stop_words=stop_words
    )
    processed_corpus = count_vectorizer.fit_transform(corpus)
    processed_corpus = feature_extraction.text.TfidfTransformer().fit_transform(
        processed_corpus)
    return processed_corpus
Developer: PacktPublishing, Project: Hands-on-NLP-with-NLTK-and-scikit-learn-, Lines: 20, Source file: nlp-5-document-classification.py
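A hypothetical call to extract_features on a toy corpus might look like this (it assumes `from sklearn import feature_extraction` plus the NLTK 'punkt' and 'stopwords' data, as in the original file):

corpus = [
    "the cat sat on the mat",
    "the cat chased the mouse",
    "a mouse ran under the mat",
]
tfidf_matrix = extract_features(corpus)
print(tfidf_matrix.shape)  # (3, number of uni-/bi-grams kept by min_df=2)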
Example 5: __init__
# Required import: import nltk [as alias]
# Or: from nltk import word_tokenize [as alias]
def __init__(self, sentences_file, stopwords):
    self.dictionary = None
    self.corpus = None
    f_sentences = codecs.open(sentences_file, encoding='utf-8')
    documents = list()
    count = 0
    print("Gathering sentences and removing stopwords")
    for line in f_sentences:
        line = re.sub('<[A-Z]+>[^<]+</[A-Z]+>', '', line)
        # remove stop words and tokenize
        document = [word for word in nltk.word_tokenize(line.lower()) if word not in stopwords]
        documents.append(document)
        count += 1
        if count % 10000 == 0:
            sys.stdout.write(".")
    f_sentences.close()

    self.dictionary = corpora.Dictionary(documents)
    self.corpus = [self.dictionary.doc2bow(text) for text in documents]
    self.tf_idf_model = TfidfModel(self.corpus)
    print(len(documents), "documents read")
    print(len(self.dictionary), "unique tokens")
Example 6: preprocess
# Required import: import nltk [as alias]
# Or: from nltk import word_tokenize [as alias]
def preprocess(text):
    min_length = 3
    text = re.sub(r'\d+', '#', text)
    text = re.sub(r'\.', ' eos ', text)
    # Tokenize
    words = map(lambda word: word.lower(), word_tokenize(text))
    tokens = words
    # Remove non characters
    p = re.compile('[a-zA-Z#]+')
    # Filter tokens (we do not remove stopwords)
    filtered_tokens = list(filter(lambda token: p.match(token) and len(token) >= min_length and (token not in english_stopwords), tokens))
    # Encode to ascii
    filtered_tokens = [token.encode('ascii', 'ignore') for token in filtered_tokens]
    return filtered_tokens

# Modify this path
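A hypothetical call; `english_stopwords` is assumed to be defined elsewhere in the source file, e.g. as set(nltk.corpus.stopwords.words('english')):

sample = "The model reached 95 percent accuracy. Training took three hours."
print(preprocess(sample))
# ascii-encoded tokens of length >= 3, e.g. [b'model', b'reached', b'percent', ...]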
Example 7: text_to_num
# Required import: import nltk [as alias]
# Or: from nltk import word_tokenize [as alias]
def text_to_num(text):
    tokenized = nltk.word_tokenize(text)
    tags = nltk.pos_tag(tokenized)
    print(tags)
    chunkPattern = r""" Chunk0: {((<NN|CD.?|RB>)<CD.?|VBD.?|VBP.?|VBN.?|NN.?|RB.?|JJ>*)<NN|CD.?>} """
    chunkParser = nltk.RegexpParser(chunkPattern)
    chunkedData = chunkParser.parse(tags)
    print(chunkedData)

    for subtree in chunkedData.subtrees(filter=lambda t: t.label() in "Chunk0"):
        exp = ""
        for l in subtree.leaves():
            exp += str(l[0]) + " "
        exp = exp[:-1]
        print(exp)
        try:
            text = text.replace(exp, str(t2n.text2num(exp)))
        except Exception as e:
            print("error text2num ->", e.args)
    print("text2num -> ", text)
    return text
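A hypothetical call, assuming the text2num package is importable as t2n and that the NLTK 'punkt' and 'averaged_perceptron_tagger' data are available:

print(text_to_num("I walked twenty five miles in two days"))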
Example 8: __init__
# Required import: import nltk [as alias]
# Or: from nltk import word_tokenize [as alias]
def __init__(self):
    import nltk
    from nltk.tag import PerceptronTagger
    from nltk.tokenize import TreebankWordTokenizer
    #return pkgutil.get_data('scattertext',
    #                        'data/viz/semiotic_new.html').decode('utf-8')
    path = os.path.dirname(sys.modules['scattertext'].__file__) + '/data/'
    tokenizer_fn = path + 'punkt.english.pickle'
    tagger_fn = path + 'averaged_perceptron_tagger.pickle'
    #tokenizer_fn = os.path.abspath(resource_filename('scattertext.data', 'punkt.english.pickle'))
    #tagger_fn = os.path.abspath(resource_filename('scattertext.data', 'averaged_perceptron_tagger.pickle'))
    # Load the tagger
    self.tagger = PerceptronTagger(load=False)
    self.tagger.load(tagger_fn)
    # note: nltk.word_tokenize calls the TreebankWordTokenizer, but uses the downloader.
    # Calling the TreebankWordTokenizer like this allows skipping the downloader.
    # It seems the TreebankWordTokenizer uses PTB tokenization = regexes, i.e. no downloads.
    # https://github.com/nltk/nltk/blob/develop/nltk/tokenize/treebank.py#L25
    self.tokenize = TreebankWordTokenizer().tokenize
    self.sent_detector = nltk.data.load(tokenizer_fn)

# http://www.nltk.org/book/ch05.html
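The comment block above is the key point of this example, illustrated standalone below: TreebankWordTokenizer is purely regex-based, so it works without any downloaded models, whereas nltk.word_tokenize also requires the Punkt sentence tokenizer data:

from nltk.tokenize import TreebankWordTokenizer
print(TreebankWordTokenizer().tokenize("Don't rely on the downloader, she said."))
# e.g. ['Do', "n't", 'rely', 'on', 'the', 'downloader', ',', 'she', 'said', '.']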
Example 9: get_story_question_answer_triples
# Required import: import nltk [as alias]
# Or: from nltk import word_tokenize [as alias]
def get_story_question_answer_triples(sqa_file):
    sqatriples = []
    fsqa = open(sqa_file, "rb")
    for line in fsqa:
        line = line.strip().decode("utf8").encode("ascii", "ignore")
        if line.startswith("#"):
            continue
        story, question, answer, correct = line.split("\t")
        swords = []
        story_sents = nltk.sent_tokenize(story)
        for story_sent in story_sents:
            swords.extend(nltk.word_tokenize(story_sent))
        qwords = nltk.word_tokenize(question)
        awords = nltk.word_tokenize(answer)
        is_correct = int(correct) == 1
        sqatriples.append((swords, qwords, awords, is_correct))
    fsqa.close()
    return sqatriples
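A hypothetical call, assuming a tab-separated file with story, question, answer and correctness columns (the path below is made up):

triples = get_story_question_answer_triples("data/story_qa.tsv")
swords, qwords, awords, is_correct = triples[0]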
Example 10: convert_string
# Required import: import nltk [as alias]
# Or: from nltk import word_tokenize [as alias]
def convert_string(text):
    """Convert a str of text into a tokenized and selected list of words.

    Parameters
    ----------
    text : str
        Text as one long string.

    Returns
    -------
    words_cleaned : list of str
        List of tokenized words, after processing.

    Notes
    -----
    This function sets text to lower case, and removes stopwords and punctuation.
    """
    words = word_tokenize(text)
    words_cleaned = [word.lower() for word in words if (
        (not word.lower() in stopwords.words('english')) & word.isalnum())]
    return words_cleaned
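An example call (requires the NLTK 'punkt' and 'stopwords' data):

print(convert_string("The quick brown fox jumps over the lazy dog!"))
# e.g. ['quick', 'brown', 'fox', 'jumps', 'lazy', 'dog']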
Example 11: evaluate_sentiment
# Required import: import nltk [as alias]
# Or: from nltk import word_tokenize [as alias]
def evaluate_sentiment(text):
    pos_score = 0
    neg_score = 0
    tokened = nltk.word_tokenize(text.decode('utf8', 'ignore').replace('<br />', ' '))
    pos_pairs = nltk.pos_tag(tokened)
    for tuple in pos_pairs:
        pos = ''
        if tuple[1] == "NN":
            pos = 'n/'
        if tuple[1] == "JJ":
            pos = 'a/'
        if tuple[1] == "VB":
            pos = 'v/'
        if tuple[1] == "RB":
            pos = 'r/'
        try:
            pos_score += sentiwordnet[pos + tuple[0].lower()][0]
            neg_score += sentiwordnet[pos + tuple[0].lower()][1]
        except KeyError:
            pass
    return pos_score, neg_score
Example 12: evaluate_sentiment
# Required import: import nltk [as alias]
# Or: from nltk import word_tokenize [as alias]
def evaluate_sentiment(text):
    pos_score = 0
    neg_score = 0
    tokened = nltk.word_tokenize(text)
    pos_pairs = nltk.pos_tag(tokened)
    for tuple in pos_pairs:
        pos = ''
        if tuple[1] == "NN":
            pos = 'n/'
        if tuple[1] == "JJ":
            pos = 'a/'
        if tuple[1] == "VB":
            pos = 'v/'
        if tuple[1] == "RB":
            pos = 'r/'
        try:
            pos_score += sentiwordnet[pos + tuple[0].lower()][0]
            neg_score += sentiwordnet[pos + tuple[0].lower()][1]
        except KeyError:
            pass
    return pos_score, neg_score
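In both versions above, `sentiwordnet` is assumed to be a dictionary mapping 'pos_prefix/word' keys to (positive_score, negative_score) pairs, typically built from the SentiWordNet lexicon. A toy stand-in for experimentation (the scores below are made up, and the NLTK 'punkt' and 'averaged_perceptron_tagger' data are required):

sentiwordnet = {'a/good': (0.75, 0.0), 'a/bad': (0.0, 0.625), 'n/movie': (0.0, 0.0)}
print(evaluate_sentiment("A good movie is not a bad movie"))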
Example 13: _nltk_process_sents
# Required import: import nltk [as alias]
# Or: from nltk import word_tokenize [as alias]
def _nltk_process_sents(self, sents):
    for sentence in sents:
        if isinstance(sentence, STRING_TYPES):
            if self._tokenizer_lang is None:
                raise ValueError(
                    "No word tokenizer available for this language. "
                    "Please tokenize before calling the parser."
                )
            sentence = nltk.word_tokenize(sentence, self._tokenizer_lang)

        if IS_PY2:
            sentence = [
                word.decode('utf-8', 'ignore') if isinstance(word, str) else word
                for word in sentence
            ]

        if not self._provides_tags:
            sentence = nltk.pos_tag(sentence)
            yield [word for word, tag in sentence], sentence
        else:
            yield sentence, sentence
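Note that nltk.word_tokenize accepts a language argument, which is what self._tokenizer_lang supplies above; it selects the Punkt sentence-splitting model used before word tokenization, e.g.:

print(nltk.word_tokenize("C'est un exemple simple.", language='french'))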
Example 14: encode_text
# Required import: import nltk [as alias]
# Or: from nltk import word_tokenize [as alias]
def encode_text(text, word_embeddings, max_sen_len):
    '''
    Encode a sequence of words into the corresponding vector representation.

    Input:
        text (string)          : text (space separated words, etc.)
        word_embeddings (dict) : dictionary mapping from words to their representation
        max_sen_len (int)      : maximum sentence length (in words)

    Returns:
        X (np.ndarray) : matrix of shape (max_sen_len, embedding_size) after zero padding
    '''
    default_embed = np.zeros(300)
    words = word_tokenize(text)[:max_sen_len]
    embeds = [word_embeddings.get(x, default_embed) for x in words]
    embeds += [default_embed] * (max_sen_len - len(embeds))
    return np.array(embeds, dtype=np.float32)
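A hypothetical usage with a toy 300-dimensional embedding table (requires the NLTK 'punkt' data):

import numpy as np
toy_embeddings = {'hello': np.ones(300), 'world': np.full(300, 0.5)}
X = encode_text("hello brave new world", toy_embeddings, max_sen_len=6)
print(X.shape)  # (6, 300)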
Example 15: encode_text
# Required import: import nltk [as alias]
# Or: from nltk import word_tokenize [as alias]
def encode_text(text, word_embeddings):
    '''
    Encode a sequence of words into the corresponding vector representation.

    Input:
        text (string)          : text (space separated words, etc.)
        word_embeddings (dict) : dictionary mapping from words to their representation

    Returns:
        X (np.array) : array of shape (embedding_size,) averaging all word vectors of text
    '''
    embed = np.zeros(300)
    count = 0
    words = word_tokenize(text)
    for word in words:
        if word in word_embeddings:
            embed += word_embeddings[word]
            count += 1
    # Guard against division by zero when no word is in the embedding vocabulary
    return embed / count if count else embed
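A matching hypothetical usage, reusing the toy_embeddings table from the sketch after Example 14:

vec = encode_text("hello unseen world", toy_embeddings)
print(vec.shape)  # (300,)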