This article collects and summarizes typical usage examples of the nltk.tokenize.WordPunctTokenizer class in Python. If you are wondering what tokenize.WordPunctTokenizer does, how to call it, or what real-world usage looks like, the curated code examples below should help. You can also explore further usage examples for the containing module, nltk.tokenize.
The code examples for tokenize.WordPunctTokenizer shown below are ordered by popularity by default. You can upvote the ones you like or find useful; your feedback helps the system recommend better Python code examples.
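Before the examples, a minimal standalone sketch of what WordPunctTokenizer does (the sample sentence is invented for illustration): it splits text into runs of alphanumeric characters and runs of punctuation, using the regular expression \w+|[^\w\s]+.
from nltk.tokenize import WordPunctTokenizer

tokenizer = WordPunctTokenizer()
# Words and punctuation become separate tokens; no trained model is needed.
print(tokenizer.tokenize("Good muffins cost $3.88 in New York."))
# ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.']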
Example 1: words
# Required module: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import WordPunctTokenizer [as alias]
def words(self, fileid=None):
    """
    Returns all of the words and punctuation symbols in the specified file
    that were in text nodes -- i.e., tags are ignored. Like the xml() method,
    fileid can only specify one file.
    :return: the given file's text nodes as a list of words and punctuation symbols
    :rtype: list(str)
    """
    elt = self.xml(fileid)
    encoding = self.encoding(fileid)
    word_tokenizer = WordPunctTokenizer()
    # Note: Element.getiterator() was removed in Python 3.9; use elt.iter() there.
    iterator = elt.getiterator()
    out = []
    for node in iterator:
        text = node.text
        if text is not None:
            if isinstance(text, bytes):
                text = text.decode(encoding)
            toks = word_tokenizer.tokenize(text)
            out.extend(toks)
    return out
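This words() method appears to be NLTK's own XMLCorpusReader.words. A quick way to see it in action is through one of NLTK's bundled XML corpora (this assumes the shakespeare corpus has been downloaded, e.g. via nltk.download('shakespeare')):
import nltk
from nltk.corpus import shakespeare

# nltk.download('shakespeare')  # one-time download
# shakespeare is an XMLCorpusReader, so it exposes the words() method shown above.
print(shakespeare.fileids()[:3])
print(shakespeare.words('dream.xml')[:10])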
Example 2: __init__
# Required module: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import WordPunctTokenizer [as alias]
def __init__(self, root, fileids=PKL_PATTERN, **kwargs):
    """
    Initialize the corpus reader. Categorization arguments
    (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to
    the ``CategorizedCorpusReader`` constructor. The remaining arguments
    are passed to the ``CorpusReader`` constructor.
    """
    # Add the default category pattern if not passed into the class.
    if not any(key.startswith('cat_') for key in kwargs.keys()):
        kwargs['cat_pattern'] = CAT_PATTERN

    # CategorizedCorpusReader expects the keyword-argument dict itself.
    CategorizedCorpusReader.__init__(self, kwargs)
    CorpusReader.__init__(self, root, fileids)

    self._word_tokenizer = WordPunctTokenizer()
    self._sent_tokenizer = nltk.data.LazyLoader(
        'tokenizers/punkt/english.pickle')
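The two tokenizers stored here are typically used together: Punkt splits raw text into sentences and WordPunctTokenizer splits each sentence into tokens. A minimal standalone sketch of that combination (the paragraph is invented; the Punkt model must be available, e.g. via nltk.download('punkt')):
import nltk
from nltk.tokenize import WordPunctTokenizer

sent_tokenizer = nltk.data.LazyLoader('tokenizers/punkt/english.pickle')
word_tokenizer = WordPunctTokenizer()

paragraph = "NLTK ships a pre-trained Punkt model. It segments sentences before word tokenization."
for sentence in sent_tokenizer.tokenize(paragraph):
    print(word_tokenizer.tokenize(sentence))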
Example 3: __init__
# Required module: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import WordPunctTokenizer [as alias]
def __init__(self, root, fileids=DOC_PATTERN,
             word_tokenizer=WordPunctTokenizer(),
             sent_tokenizer=nltk.data.LazyLoader(
                 'tokenizers/punkt/english.pickle'),
             encoding='latin-1', **kwargs):
    """
    Initialize the corpus reader. Categorization arguments
    (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to
    the ``CategorizedCorpusReader`` constructor. The remaining
    arguments are passed to the ``CorpusReader`` constructor.
    """
    # Add the default category pattern if not passed into the class.
    if not any(key.startswith('cat_') for key in kwargs.keys()):
        kwargs['cat_pattern'] = CAT_PATTERN

    CategorizedCorpusReader.__init__(self, kwargs)
    CorpusReader.__init__(self, root, fileids, encoding)

    self._word_tokenizer = word_tokenizer
    self._sent_tokenizer = sent_tokenizer
    self._tags = TAGS
Example 4: words
# Required module: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import WordPunctTokenizer [as alias]
def words(self, fileid=None):
    """
    Returns all of the words and punctuation symbols in the specified file
    that were in text nodes -- i.e., tags are ignored. Like the xml() method,
    fileid can only specify one file.
    :return: the given file's text nodes as a list of words and punctuation symbols
    :rtype: list(str)
    """
    elt = self.xml(fileid)
    word_tokenizer = WordPunctTokenizer()
    # Note: Element.getiterator() was removed in Python 3.9; use elt.iter() there.
    iterator = elt.getiterator()
    out = []
    for node in iterator:
        text = node.text
        if text is not None:
            toks = word_tokenizer.tokenize(text)
            out.extend(toks)
    return out
Example 5: __getitem__
# Required module: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import WordPunctTokenizer [as alias]
def __getitem__(self, idx):
    idx, start, end = self.lst[idx]
    dialog = self.raw[idx][start:end]
    # The last utterance is the prediction target; everything before it is the source context.
    source, target = dialog[:-1], dialog[-1]
    spks, utts = list(zip(*[(speaker, WordPunctTokenizer().tokenize(uttr))
                            for speaker, uttr, _ in source]))
    spks = list(spks)
    # Pad the speaker-id list to a fixed length of 10.
    while len(spks) < 10:
        spks.append(0)
    source = '|||'.join([' '.join(uttr) for uttr in utts])
    target_test = ' '.join(WordPunctTokenizer().tokenize(target[1]))
    return spks, source, target_test, target[0]
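A standalone sketch of the formatting step above, using an invented two-turn dialog (the speaker ids and the trailing tuple field are placeholders for whatever the dataset actually stores):
from nltk.tokenize import WordPunctTokenizer

tok = WordPunctTokenizer()
source = [(1, "Hi, how are you?", None), (2, "Fine, thanks!", None)]
spks, utts = zip(*[(spk, tok.tokenize(utt)) for spk, utt, _ in source])
print('|||'.join(' '.join(u) for u in utts))
# Hi , how are you ?|||Fine , thanks !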
Example 6: getredundantComponents
# Required module: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import WordPunctTokenizer [as alias]
def getredundantComponents(sentences):
    window_size = 4
    introList = []
    midlist = []
    endlist = []
    for sent in sentences:
        words = WordPunctTokenizer().tokenize(sent)
        length_sent = len(words)
        f_point = length_sent // 3  # computed but unused in this excerpt
        m_point = length_sent // 2
        index_span = window_size // 2
        # First window_size tokens, a window around the midpoint, and the last window_size tokens.
        intro = ' '.join(word for word in words[0:window_size])
        mid = ' '.join(word for word in words[m_point - index_span:m_point + index_span])
        end = ' '.join(word for word in words[-window_size:])
        introList.append(intro)
        midlist.append(mid)
        endlist.append(end)
    return introList, midlist, endlist
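A quick usage sketch with one invented sentence, to show what the three windows look like:
sentences = ["The quick brown fox jumps over the lazy dog near the river bank today."]
intro, mid, end = getredundantComponents(sentences)
print(intro)  # ['The quick brown fox']
print(mid)    # ['over the lazy dog']
print(end)    # ['river bank today .']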
Example 7: data_tockenize
# Required module: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import WordPunctTokenizer [as alias]
def data_tockenize(text):
    # Tokenize the text and rejoin with single spaces, normalizing whitespace
    # and separating punctuation from words.
    tokenizer = WordPunctTokenizer()
    tokens = tokenizer.tokenize(text)
    return (" ".join(tokens)).strip()
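A small illustration of the normalization this produces:
print(data_tockenize("Don't   stop-believing!"))
# Don ' t stop - believing !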
Example 8: __init__
# Required module: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import WordPunctTokenizer [as alias]
def __init__(self, model_path):
    self.model_path = model_path
    print("loading fastText model ...")
    # self.model = pickle.load(open(self.model_path, "rb"))
    self.model = KeyedVectors.load_word2vec_format(
        self.model_path, encoding='utf-8', unicode_errors='ignore')
    print("done loading fastText model")
    self.tokenizer = WordPunctTokenizer()
    self.stemmer = ARLSTem()
    self.SYMBOLS = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\"'
    # .vocab is the gensim 3.x attribute; gensim 4 renamed it to key_to_index.
    self.vocab = self.model.vocab
Example 9: __init__
# Required module: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import WordPunctTokenizer [as alias]
def __init__(self, P):
    self.tokenizer = WordPunctTokenizer()
    self.stemmer = ARLSTem()
    self.docs = self.get_answer_canditates(P)
    docs_stem = []
    for doc in self.docs:
        docs_stem.append(self.stem_string(doc))
    self.stopwords = stopwords.words('arabic')
    self.vectorizer = TfidfVectorizer(ngram_range=(1, 4), norm=None)  # , stop_words=self.stopwords)
    self.tfidf_matrix = self.vectorizer.fit_transform(docs_stem)
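The stem_string helper is not included in this excerpt. A plausible minimal version, assuming it tokenizes with WordPunctTokenizer and stems each token with the Arabic ARLSTem stemmer (an assumption for illustration, not the project's actual code):
from nltk.tokenize import WordPunctTokenizer
from nltk.stem.arlstem import ARLSTem

def stem_string(text):
    # Hypothetical helper: tokenize, stem each Arabic token, rejoin with spaces.
    tokenizer = WordPunctTokenizer()
    stemmer = ARLSTem()
    stemmed = []
    for token in tokenizer.tokenize(text):
        try:
            stemmed.append(stemmer.stem(token))
        except Exception:
            # Keep tokens the stemmer cannot handle unchanged.
            stemmed.append(token)
    return ' '.join(stemmed)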
Example 10: __init__
# Required module: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import WordPunctTokenizer [as alias]
def __init__(self):
    self.tokenizer = WordPunctTokenizer()
    self.stemmer = ARLSTem()
    self.SYMBOLS = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\"'
Example 11: __init__
# Required module: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import WordPunctTokenizer [as alias]
def __init__(self, docs, k, ngrams, vectorizer=None, tfidf_matrix=None):
    self.k = k  # number of documents to return
    self.tokenizer = WordPunctTokenizer()
    self.stemmer = ARLSTem()
    self.docs = docs
    self.stopwords = stopwords.words('arabic')
    self.vectorizer = TfidfVectorizer(ngram_range=(1, ngrams), norm=None, stop_words=self.stopwords)
    if tfidf_matrix is None or vectorizer is None:
        # Fit a fresh TF-IDF model over the stemmed documents.
        docs_stemmed = self.docs_stem()
        self.tfidf_matrix = self.vectorizer.fit_transform(docs_stemmed)
    else:
        # Reuse a previously fitted vectorizer and matrix.
        self.vectorizer = vectorizer
        self.tfidf_matrix = tfidf_matrix
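Once the TF-IDF matrix is fitted, retrieving the k most similar documents for a query is usually done with cosine similarity. A standalone sketch under that assumption (this is not the project's retrieval code):
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

docs = ["first candidate document", "second candidate document", "a different text"]
vectorizer = TfidfVectorizer(ngram_range=(1, 2), norm=None)
tfidf_matrix = vectorizer.fit_transform(docs)

query_vec = vectorizer.transform(["candidate document"])
scores = cosine_similarity(query_vec, tfidf_matrix).ravel()
k = 2
top_k = np.argsort(scores)[::-1][:k]  # indices of the k best-scoring documents
print(top_k, scores[top_k])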
Example 12: __get_words_boundaries
# Required module: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import WordPunctTokenizer [as alias]
def __get_words_boundaries(self):
    """
    Tokenize the words in the document and return the character-offset
    boundaries of each token, using the tokenizer's span_tokenize().
    :return: list of (start, end) offset pairs
    """
    tokenizer = WordPunctTokenizer()
    words = list(tokenizer.span_tokenize(self.text))
    return words
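span_tokenize() yields (start, end) character offsets rather than the token strings themselves, which makes it easy to map tokens back to positions in the original text:
from nltk.tokenize import WordPunctTokenizer

text = "Hello, world!"
spans = list(WordPunctTokenizer().span_tokenize(text))
print(spans)                          # [(0, 5), (5, 6), (7, 12), (12, 13)]
print([text[s:e] for s, e in spans])  # ['Hello', ',', 'world', '!']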
Example 13: __init__
# Required module: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import WordPunctTokenizer [as alias]
def __init__(self):
    self._sent_analyzer = SIA()
    # Store the bound tokenize methods directly.
    self._word_tokenizer = WordPunctTokenizer().tokenize
    self._sent_tokenizer = nltk.data.LazyLoader(
        'tokenizers/punkt/english.pickle'
    ).tokenize
    self._ids = []
Example 14: __init__
# Required module: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import WordPunctTokenizer [as alias]
def __init__(self):
    # One tokenizer per language: TweetTokenizer for English,
    # WordPunctTokenizer for the other languages and as the fallback.
    self.tokenizers = {
        'en': TweetTokenizer(),
        'de': WordPunctTokenizer(),
        'it': WordPunctTokenizer(),
        'fr': WordPunctTokenizer(),
        'default': WordPunctTokenizer()
    }
    self.tokenizer = TweetTokenizer()
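The excerpt only shows the constructor; a dispatch method built on this mapping would plausibly look like the sketch below (the tokenize method and its signature are assumptions for illustration, not part of the excerpt):
def tokenize(self, text, lang='default'):
    # Fall back to the default tokenizer for languages not in the mapping.
    tokenizer = self.tokenizers.get(lang, self.tokenizers['default'])
    return tokenizer.tokenize(text)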