This article collects typical usage code examples of the Python method nltk.tokenize.RegexpTokenizer. If you have been wondering exactly how to use tokenize.RegexpTokenizer, how it works, or what real examples of it look like, the curated code examples here may help. You can also explore further usage examples from its containing module, nltk.tokenize.
Below are 15 code examples of tokenize.RegexpTokenizer, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
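Before the collected examples, here is a minimal standalone sketch (not taken from any of the repositories below) showing the two ways RegexpTokenizer is used in them: matching tokens directly with a pattern such as r'\w+', and splitting on separators with gaps=True.

from nltk.tokenize import RegexpTokenizer

# Match runs of word characters; punctuation is dropped.
word_tokenizer = RegexpTokenizer(r'\w+')
print(word_tokenizer.tokenize("Hello, world! It's 2024."))
# ['Hello', 'world', 'It', 's', '2024']

# With gaps=True the pattern describes the separators rather than the tokens.
line_tokenizer = RegexpTokenizer('\n', gaps=True)
print(line_tokenizer.tokenize("first line\nsecond line"))
# ['first line', 'second line']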
Example 1: clean_text
# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import RegexpTokenizer [as alias]
def clean_text(df, wrong_words_dict, autocorrect=True):
    df.fillna("__NA__", inplace=True)
    tokenizer = RegexpTokenizer(r'\w+')
    regexps = [re.compile("([a-zA-Z]+)([0-9]+)"), re.compile("([0-9]+)([a-zA-Z]+)")]
    texts = df.tolist()
    result = []
    for text in tqdm(texts):
        tokens = tokenizer.tokenize(text.lower())
        tokens = [split_text_and_digits(token, regexps) for token in tokens]
        tokens = [substitute_repeats(token, 3) for token in tokens]
        text = ' '.join(tokens)
        if autocorrect:
            for wrong, right in wrong_words_dict.items():
                text = text.replace(wrong, right)
        result.append(text)
    return result
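The helper functions split_text_and_digits and substitute_repeats are not included in the snippet above. The following is a hypothetical sketch of what they might look like, inferred only from how they are called; the original repository's implementations may differ.

import re

def split_text_and_digits(token, regexps):
    # If the token is a glued letter/digit run such as "abc123",
    # separate the parts with a space; otherwise return it unchanged.
    for regexp in regexps:
        match = regexp.match(token)
        if match:
            return ' '.join(match.groups())
    return token

def substitute_repeats(token, n=3):
    # Collapse any character repeated n or more times in a row to a single
    # occurrence, e.g. "looool" -> "lol" for n=3.
    return re.sub(r'(.)\1{%d,}' % (n - 1), r'\1', token)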
Example 2: __init__
# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import RegexpTokenizer [as alias]
def __init__(self, root, fileids,
             sep='/', word_tokenizer=WhitespaceTokenizer(),
             sent_tokenizer=RegexpTokenizer('\n', gaps=True),
             alignedsent_block_reader=read_alignedsent_block,
             encoding='latin1'):
    """
    Construct a new Aligned Corpus reader for a set of documents
    located at the given root directory. Example usage:

        >>> root = '/...path to corpus.../'
        >>> reader = AlignedCorpusReader(root, '.*', '.txt') # doctest: +SKIP

    :param root: The root directory for this corpus.
    :param fileids: A list or regexp specifying the fileids in this corpus.
    """
    CorpusReader.__init__(self, root, fileids, encoding)
    self._sep = sep
    self._word_tokenizer = word_tokenizer
    self._sent_tokenizer = sent_tokenizer
    self._alignedsent_block_reader = alignedsent_block_reader
Example 3: __init__
# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import RegexpTokenizer [as alias]
def __init__(self, root, fileids,
             sep='/', word_tokenizer=WhitespaceTokenizer(),
             sent_tokenizer=RegexpTokenizer('\n', gaps=True),
             alignedsent_block_reader=read_alignedsent_block,
             encoding=None):
    """
    Construct a new Aligned Corpus reader for a set of documents
    located at the given root directory. Example usage:

        >>> root = '/...path to corpus.../'
        >>> reader = AlignedCorpusReader(root, '.*', '.txt')

    :param root: The root directory for this corpus.
    :param fileids: A list or regexp specifying the fileids in this corpus.
    """
    CorpusReader.__init__(self, root, fileids, encoding)
    self._sep = sep
    self._word_tokenizer = word_tokenizer
    self._sent_tokenizer = sent_tokenizer
    self._alignedsent_block_reader = alignedsent_block_reader
Example 4: get_ngram_features_from_map
# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import RegexpTokenizer [as alias]
def get_ngram_features_from_map(tweets, ngram_map, n):
    regexp_tknzr = RegexpTokenizer(r'\w+')
    tweet_tknzr = TweetTokenizer()
    features = []
    for tweet in tweets:
        # One count per known n-gram (the original "[0] * np.zeros(...)" is
        # equivalent to, but less clear than, a plain zero vector).
        feature_list = np.zeros(len(ngram_map))
        tweet = tweet.lower()
        ngram_list = get_ngram_list(tweet_tknzr, tweet, 1)
        if n > 1:
            ngram_list += get_ngram_list(regexp_tknzr, tweet, 2)
        if n > 2:
            ngram_list += get_ngram_list(regexp_tknzr, tweet, 3)
        for gram in ngram_list:
            if gram in ngram_map:
                feature_list[ngram_map[gram]] += 1.0
        features.append(feature_list)
    return features
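get_ngram_list is not defined in this snippet. A plausible sketch, inferred only from the call sites above (tokenizer, raw tweet text, n), is shown below; the original repository may implement it differently.

def get_ngram_list(tknzr, text, n):
    # Tokenize the text, then join each window of n consecutive tokens
    # into a single space-separated n-gram string.
    tokens = tknzr.tokenize(text)
    return [' '.join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]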
Example 5: vocab_index_descriptions
# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import RegexpTokenizer [as alias]
def vocab_index_descriptions(vocab_file, vectors_file):
    # load lookups
    vocab = set()
    with open(vocab_file, 'r') as vocabfile:
        for i, line in enumerate(vocabfile):
            line = line.strip()
            if line != '':
                vocab.add(line)
    ind2w = {i + 1: w for i, w in enumerate(sorted(vocab))}
    w2ind = {w: i for i, w in ind2w.items()}

    desc_dict = datasets.load_code_descriptions()

    tokenizer = RegexpTokenizer(r'\w+')

    with open(vectors_file, 'w') as of:
        w = csv.writer(of, delimiter=' ')
        w.writerow(["CODE", "VECTOR"])
        for code, desc in tqdm(desc_dict.items()):
            # same preprocessing steps as in get_discharge_summaries
            tokens = [t.lower() for t in tokenizer.tokenize(desc) if not t.isnumeric()]
            inds = [w2ind[t] if t in w2ind else len(w2ind) + 1 for t in tokens]
            w.writerow([code] + [str(i) for i in inds])
Example 6: process
# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import RegexpTokenizer [as alias]
def process(input_text):
    # Create a regular expression tokenizer
    tokenizer = RegexpTokenizer(r'\w+')

    # Create a Snowball stemmer
    stemmer = SnowballStemmer('english')

    # Get the list of stop words
    stop_words = stopwords.words('english')

    # Tokenize the input string
    tokens = tokenizer.tokenize(input_text.lower())

    # Remove the stop words
    tokens = [x for x in tokens if x not in stop_words]

    # Perform stemming on the tokenized words
    tokens_stemmed = [stemmer.stem(x) for x in tokens]

    return tokens_stemmed
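A quick usage check of process, assuming the function and its imports are defined as above and the NLTK stopwords corpus has been downloaded; the exact output depends on the stopword list shipped with your NLTK data.

print(process("The quick brown foxes were running quickly over the lazy dogs"))
# e.g. ['quick', 'brown', 'fox', 'run', 'quick', 'lazi', 'dog']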
Example 7: transform
# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import RegexpTokenizer [as alias]
def transform(self, texts, y=None):
    tokenizer = RegexpTokenizer(r'[a-z]+|\d+')
    tokenized_texts = []
    stoplist = []
    if self.ignore_stopwords:
        stoplist = stopwords.words('english')
    for text in texts:
        tokenized_text = []
        for word in tokenizer.tokenize(text.lower()):
            if word not in stoplist:
                tokenized_text.append(word.strip())
        tokenized_texts.append(tokenized_text)
    return tokenized_texts
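The pattern r'[a-z]+|\d+' used above splits lowercase alphabetic runs and digit runs into separate tokens. A minimal standalone illustration (not from the original repository):

from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'[a-z]+|\d+')
print(tokenizer.tokenize("order ab12 shipped on day 3".lower()))
# ['order', 'ab', '12', 'shipped', 'on', 'day', '3']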
Example 8: __init__
# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import RegexpTokenizer [as alias]
def __init__(
    self,
    root,
    fileids,
    sep='/',
    word_tokenizer=WhitespaceTokenizer(),
    sent_tokenizer=RegexpTokenizer('\n', gaps=True),
    alignedsent_block_reader=read_alignedsent_block,
    encoding='latin1',
):
    """
    Construct a new Aligned Corpus reader for a set of documents
    located at the given root directory. Example usage:

        >>> root = '/...path to corpus.../'
        >>> reader = AlignedCorpusReader(root, '.*', '.txt') # doctest: +SKIP

    :param root: The root directory for this corpus.
    :param fileids: A list or regexp specifying the fileids in this corpus.
    """
    CorpusReader.__init__(self, root, fileids, encoding)
    self._sep = sep
    self._word_tokenizer = word_tokenizer
    self._sent_tokenizer = sent_tokenizer
    self._alignedsent_block_reader = alignedsent_block_reader
Example 9: __init__
# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import RegexpTokenizer [as alias]
def __init__(self, root, items, encoding='utf8'):
    gaps_re = r'(?u)(?<=/\.)\s+|\s*\S*_CODE\s*|\s*\S*_ID\s*'
    sent_tokenizer = RegexpTokenizer(gaps_re, gaps=True)
    TaggedCorpusReader.__init__(self, root, items, sep='_',
                                sent_tokenizer=sent_tokenizer)

#: A list of all documents and their titles in ycoe.
Example 10: __init__
# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import RegexpTokenizer [as alias]
def __init__(self, rtepair, stop=True, use_lemmatize=False):
    """
    :param rtepair: a ``RTEPair`` from which features should be extracted
    :param stop: if ``True``, stopwords are thrown away.
    :type stop: bool
    """
    self.stop = stop
    self.stopwords = set(['a', 'the', 'it', 'they', 'of', 'in', 'to', 'is',
                          'have', 'are', 'were', 'and', 'very', '.', ','])
    self.negwords = set(['no', 'not', 'never', 'failed', 'rejected',
                         'denied'])

    # Try to tokenize so that abbreviations like U.S. and monetary amounts
    # like "$23.00" are kept as tokens.
    from nltk.tokenize import RegexpTokenizer
    tokenizer = RegexpTokenizer(r'([A-Z]\.)+|\w+|\$[\d\.]+')

    # Get the set of word types for text and hypothesis
    self.text_tokens = tokenizer.tokenize(rtepair.text)
    self.hyp_tokens = tokenizer.tokenize(rtepair.hyp)
    self.text_words = set(self.text_tokens)
    self.hyp_words = set(self.hyp_tokens)

    if use_lemmatize:
        # Note: the original parameter was named "lemmatize", which shadowed
        # the module-level lemmatize() helper called here and would fail.
        self.text_words = set(lemmatize(token) for token in self.text_tokens)
        self.hyp_words = set(lemmatize(token) for token in self.hyp_tokens)

    if self.stop:
        self.text_words = self.text_words - self.stopwords
        self.hyp_words = self.hyp_words - self.stopwords

    self._overlap = self.hyp_words & self.text_words
    self._hyp_extra = self.hyp_words - self.text_words
    self._txt_extra = self.text_words - self.hyp_words
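As a side note (an observation, not part of the original snippet): in recent NLTK versions the non-gaps mode of RegexpTokenizer is implemented with re.findall, so a capturing group such as ([A-Z]\.)+ can change what tokenize() returns; the non-capturing form (?:...) is safer. A small standalone check of the intended behaviour:

from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'(?:[A-Z]\.)+|\w+|\$[\d\.]+')
print(tokenizer.tokenize("The U.S. economy grew by $23.00 billion"))
# ['The', 'U.S.', 'economy', 'grew', 'by', '$23.00', 'billion']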
Example 11: get_tokens
# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import RegexpTokenizer [as alias]
def get_tokens(text):
    """Tokenize the input text."""
    soup = BeautifulSoup(text, "html.parser")
    tokenizer = RegexpTokenizer(r'\w+')
    return tokenizer.tokenize(soup.get_text())
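A quick usage sketch for get_tokens, assuming bs4 and nltk are installed and the function plus its imports are defined as above:

html = "<p>Hello <b>world</b>, this is <a href='#'>a link</a>.</p>"
print(get_tokens(html))
# ['Hello', 'world', 'this', 'is', 'a', 'link']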
Example 12: load_captions
# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import RegexpTokenizer [as alias]
def load_captions(self, data_dir, filenames):
    all_captions = []
    for i in range(len(filenames)):
        cap_path = '%s/text/%s.txt' % (data_dir, filenames[i])
        with open(cap_path, "r") as f:
            captions = f.read().decode('utf8').split('\n')
            cnt = 0
            for cap in captions:
                if len(cap) == 0:
                    continue
                cap = cap.replace("\ufffd\ufffd", " ")
                # picks out sequences of alphanumeric characters as tokens
                # and drops everything else
                tokenizer = RegexpTokenizer(r'\w+')
                tokens = tokenizer.tokenize(cap.lower())
                # print('tokens', tokens)
                if len(tokens) == 0:
                    print('cap', cap)
                    continue

                tokens_new = []
                for t in tokens:
                    t = t.encode('ascii', 'ignore').decode('ascii')
                    if len(t) > 0:
                        tokens_new.append(t)
                all_captions.append(tokens_new)

                cnt += 1
                if cnt == self.embeddings_num:
                    break
            if cnt < self.embeddings_num:
                print('ERROR: the captions for %s less than %d'
                      % (filenames[i], cnt))
    return all_captions
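The f.read().decode('utf8') call above is Python-2 style; in Python 3, reading in text mode already yields str. A hedged Python 3 equivalent of that line, using a hypothetical path purely for illustration:

cap_path = "text/example.txt"  # hypothetical path
with open(cap_path, "r", encoding="utf8") as f:
    captions = f.read().split('\n')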
Example 13: __tokenize
# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import RegexpTokenizer [as alias]
def __tokenize(self, docs):
    output = []
    for doc in docs:
        tokenizer = RegexpTokenizer(r'\w\w\w\w\w+')
        output.append(tokenizer.tokenize(doc.lower()))
    return output
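The pattern r'\w\w\w\w\w+' keeps only tokens of five or more word characters, which is a cheap way to drop short function words. A minimal standalone illustration (not from the original repository):

from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w\w\w\w\w+')
print(tokenizer.tokenize("topic models need plenty of training text"))
# ['topic', 'models', 'plenty', 'training']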
Example 14: cleanDocuments
# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import RegexpTokenizer [as alias]
def cleanDocuments(self):
    tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
    en_stop = set(stopwords.words('english'))
    self.cleaned = []
    for doc in self.documents:
        lowercase_doc = doc.lower()
        words = tokenizer.tokenize(lowercase_doc)
        non_stopped_words = [i for i in words if i not in en_stop]
        self.cleaned.append(non_stopped_words)
    print("INFO: Cleaning {} documents completed".format(len(self.documents)))
Author: PacktPublishing, Project: Natural-Language-Processing-with-Python-Cookbook, Lines of code: 12, Source file: IdentifyingTopic.py
Example 15: __init__
# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import RegexpTokenizer [as alias]
def __init__(self):
    # Create a regular expression tokenizer
    self.tokenizer = RegexpTokenizer(r'\w+')

    # Get the list of stop words
    self.stop_words_english = stopwords.words('english')

    # Create a Snowball stemmer
    self.stemmer = SnowballStemmer('english')

# Tokenizing, stop word removal, and stemming
Author: PacktPublishing, Project: Python-Machine-Learning-Cookbook-Second-Edition, Lines of code: 13, Source file: topic_modeling.py