This article collects typical usage examples of the Python method nltk.corpus.words.words. If you have been wondering what words.words does, how to call it, and what it looks like in real code, the curated examples below should help. You can also read further about the module the method belongs to, nltk.corpus.words.
The 15 code examples of words.words shown below are sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python examples.
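Before the individual examples, here is a minimal sketch of loading the corpus itself; the download call and the set conversion mirror what several of the examples below do:

import nltk
from nltk.corpus import words

# The corpus is a separate download; this is a no-op if it is already installed.
nltk.download('words', quiet=True)

wordlist = words.words()           # the full English word list
basic = words.words('en-basic')    # a small "Basic English" list (used in Example 8 below)
print(len(wordlist), wordlist[:5])

# Membership tests are much faster against a set than against the raw list.
english_vocab = set(w.lower() for w in wordlist)
print('language' in english_vocab)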
Example 1: word_finder
# Required import: from nltk.corpus import words [as alias]
# Or: from nltk.corpus.words import words [as alias]
def word_finder():
    from nltk.corpus import words
    # `random` and `wordfinder` come from the enclosing module
    # (this demo matches nltk.misc.wordfinder).
    wordlist = words.words()
    random.shuffle(wordlist)
    wordlist = wordlist[:200]
    wordlist = [w for w in wordlist if 3 <= len(w) <= 12]
    grid, used = wordfinder(wordlist)
    print("Word Finder\n")
    for i in range(len(grid)):
        for j in range(len(grid[i])):
            print(grid[i][j], end=' ')
        print()
    print()
    for i in range(len(used)):
        print("%d:" % (i+1), used[i])
Example 2: word_finder
# Required import: from nltk.corpus import words [as alias]
# Or: from nltk.corpus.words import words [as alias]
def word_finder():
    # Python 2 variant of the same demo (print statement syntax).
    from nltk.corpus import words
    wordlist = words.words()
    random.shuffle(wordlist)
    wordlist = wordlist[:200]
    wordlist = [w for w in wordlist if 3 <= len(w) <= 12]
    grid, used = wordfinder(wordlist)
    print "Word Finder\n"
    for i in range(len(grid)):
        for j in range(len(grid[i])):
            print grid[i][j],
        print
    print
    for i in range(len(used)):
        print "%d:" % (i+1), used[i]
Example 3: build_vocabulary
# Required import: from nltk.corpus import words [as alias]
# Or: from nltk.corpus.words import words [as alias]
def build_vocabulary(vocab_filename, lines, minimum_occurrence=1):
    if not os.path.exists(vocab_filename):
        stopwords = get_stopwords_list(filename="stopwords_loose.txt")
        print("Building vocabulary...")
        vocabulary = Counter()
        for line in lines:
            vocabulary.update([l.lower() for l in line.split() if l not in stopwords])
        print("The top 10 most common words: ", vocabulary.most_common(10))
        # Filter all words that appear too rarely or too frequently to be conclusive
        vocabulary = {key: vocabulary[key] for key in vocabulary
                      if vocabulary[key] >= minimum_occurrence}
        utils.save_file(vocabulary.keys(), vocab_filename)
        print("Vocabulary saved to file \"%s\"" % vocab_filename)
    vocabulary = set(utils.load_file(vocab_filename))
    print("Loaded vocabulary of size ", len(vocabulary))
    return vocabulary
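A hypothetical call, just to show the flow (get_stopwords_list and the utils save/load helpers are project-specific, so the exact result is an assumption):

# `lines` would be the project's pre-processed tweets or sentences.
lines = ["this movie was great", "great movie , would watch again"]
vocab = build_vocabulary("vocab.txt", lines, minimum_occurrence=2)
print(len(vocab))  # number of words kept after stop-word and frequency filtering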
Example 4: split_hashtag
# Required import: from nltk.corpus import words [as alias]
# Or: from nltk.corpus.words import words [as alias]
def split_hashtag(hashtag, word_list):
    split_words = []
    if hashtag != hashtag.lower() and hashtag != hashtag.upper():
        split_words = camel_case_split(hashtag)
    else:
        j = 0
        while j <= len(hashtag):
            loc = j
            for i in range(j + 1, len(hashtag) + 1, 1):
                if hashtag[j:i].lower() in word_list:
                    loc = i
            if loc == j:
                j += 1
            else:
                split_words.append(hashtag[j:loc])
                j = loc
    split_words = ['#' + str(s) for s in split_words]
    return split_words

# Select the best possible hashtag split based on upper-case
# or component words maximizing the length of the possible word split
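A quick usage sketch; building word_list from the NLTK corpus is an assumption here (Example 14 shows the project loading its own list from res/word_list.txt):

from nltk.corpus import words as nltk_words

word_list = set(w.lower() for w in nltk_words.words())
# All lower-case: greedy longest-match from the left, e.g. ['#world', '#cup']
print(split_hashtag("worldcup", word_list))
# Mixed case: falls back to camel_case_split, e.g. ['#Make', '#America', '#Great', '#Again']
print(split_hashtag("MakeAmericaGreatAgain", word_list))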
Example 5: __get_words_to_ignore
# Required import: from nltk.corpus import words [as alias]
# Or: from nltk.corpus.words import words [as alias]
def __get_words_to_ignore(self):
    """Compiles the list of all words to ignore.

    :return: List of words to ignore.
    """
    # Stop words in English.
    english_stopwords = stopwords.words("english")
    here = path.abspath(path.dirname(__file__))
    # Languages in git repositories.
    git_languages = []
    with open(path.join(here, "gitlang/languages.txt"), "r") as languages:
        git_languages = [line.strip() for line in languages]
    # Other words to avoid in git repositories.
    words_to_avoid = []
    with open(path.join(here, "gitlang/others.txt"), "r") as languages:
        words_to_avoid = [line.strip() for line in languages]
    return set(
        itertools.chain(english_stopwords, git_languages, words_to_avoid)
    )
Example 6: __init__
# Required import: from nltk.corpus import words [as alias]
# Or: from nltk.corpus.words import words [as alias]
def __init__(self, w2v, sif, log=None):
    # An NLPWord2Vec used to get the vector for a word
    self.word2vec = w2v
    # A ResSIF used to get word counts
    self.sif = sif
    # Utility to pre-process data
    self.utils = WordSentenceUtils()
    self.log = log if log else logging.getLogger(__name__)
    self.sentence_vectors = []
    self.feature_size = 0
    # Download the NLTK resource if necessary
    nltk.download('words', quiet=True)
    self.setofwords = set(nltk_words.words())
    # PCA vector
    self.pca_u = []
Example 7: word_finder
# Required import: from nltk.corpus import words [as alias]
# Or: from nltk.corpus.words import words [as alias]
def word_finder():
    from nltk.corpus import words
    wordlist = words.words()
    random.shuffle(wordlist)
    wordlist = wordlist[:200]
    wordlist = [w for w in wordlist if 3 <= len(w) <= 12]
    grid, used = wordfinder(wordlist)
    print("Word Finder\n")
    for i in range(len(grid)):
        for j in range(len(grid[i])):
            print(grid[i][j], end=' ')
        print()
    print()
    for i in range(len(used)):
        print("%d:" % (i + 1), used[i])
Example 8: _english_wordlist
# Required import: from nltk.corpus import words [as alias]
# Or: from nltk.corpus.words import words [as alias]
def _english_wordlist(self):
    try:
        wl = self._en_wordlist
    except AttributeError:
        from nltk.corpus import words
        self._en_wordlist = set(words.words('en-basic'))
        wl = self._en_wordlist
    return wl
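The try/except AttributeError above is a lazy cache: the small 'en-basic' list is loaded once per object and reused afterwards. Outside a class, a sketch of the same effect with functools.lru_cache (not part of the original source):

from functools import lru_cache

@lru_cache(maxsize=None)
def english_wordlist():
    # The import and the corpus read only happen on the first call;
    # later calls return the cached set.
    from nltk.corpus import words
    return set(words.words('en-basic'))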
Example 9: postag_tree
# Required import: from nltk.corpus import words [as alias]
# Or: from nltk.corpus.words import words [as alias]
def postag_tree(tree):
    # Part-of-speech tagging.
    words = tree.leaves()
    tag_iter = (pos for (word, pos) in pos_tag(words))
    newtree = Tree('S', [])
    for child in tree:
        if isinstance(child, Tree):
            newtree.append(Tree(child.label(), []))
            for subchild in child:
                newtree[-1].append((subchild, next(tag_iter)))
        else:
            newtree.append((child, next(tag_iter)))
    return newtree
Example 10: WPS
# Required import: from nltk.corpus import words [as alias]
# Or: from nltk.corpus.words import words [as alias]
def WPS(text):
    count = 0
    for word in text.split():
        if word in set(w.lower() for w in words.words()):
            count += 1
    return count

# Average number of syllables in a sentence (returns float):
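Note that WPS rebuilds the whole corpus set once per token, which gets very slow on long texts. A sketch of the same count with the set hoisted out of the loop (an optimization suggestion, not part of the original code):

from nltk.corpus import words

ENGLISH_WORDS = set(w.lower() for w in words.words())  # built once, reused

def wps(text):
    # Same counting rule as WPS above, without rebuilding the set per word.
    return sum(1 for word in text.split() if word in ENGLISH_WORDS)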
Example 11: __init__
# Required import: from nltk.corpus import words [as alias]
# Or: from nltk.corpus.words import words [as alias]
def __init__(self, config, abc):
    # def __init__(self):
    self.config = config
    self.seed()
    self.all_words = words.words()
    self.english_alpha = ''.join([c for c in abc if c in 'abcdefghijklmnopqrstuvwxyz0123456789']) + "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    self.english_symbols = ''.join([c for c in abc if c not in 'abcdefghijklmnopqrstuvwxyz0123456789'])
    self.english = self.english_alpha + self.english_symbols
    self.transparent_mean = 0.8
    self.transparent_gaussian = 0.06
    self.prob_lexi = 0.5
    self.symbol_word = 1
    self.art_font_size_range = self.config['augmentation']['font_range']
    self.border_range = self.config['augmentation']['border_range']
    self.font_dir_name = '/home/Common/Datasets_SSD/Dataset_Text/ART/fonts_for_text'
    # Probability distribution for the length of generated words
    self.probability_dist = np.array([0.1, 0.6, 2.6, 5.2, 8.5, 12.2, 14, 14, 12.6, 10.1, 7.5])  # , 5.2, 3.2, 2, 1, 0.6, 0.3, 0.2, 0.1, 0.1
    self.probability_dist = self.probability_dist / np.sum(self.probability_dist)
    list_of_files = self.get_list_of_files(self.font_dir_name)
    self.all_fonts = []
    for i in range(len(list_of_files)):
        with open(list_of_files[i], "rb") as f:
            font_bytes = f.read()
            self.all_fonts.append(font_bytes)
    self.image_net_location = "/media/mayank/0b40607e-7efc-4216-b12f-8bb86facfaed/Dataset_HDD/Image_Net/ILSVRC/Data/CLS-LOC/test/"
    # self.image_net_location = "/home/Common/ImageNet/test"
    self.images_orig = self.get_imagenet_images(self.config['augmentation']['imagenet_no'])  # self.config['augmentation']['base_number']
    # self.image_save_location = '/home/Common/Mayank/Text/Segmentation/Dataset/ART/Images/'
    # self.label_save_location = '/home/Common/Mayank/Text/Segmentation/Dataset/ART/Labels/'
Example 12: postag_tree
# Required import: from nltk.corpus import words [as alias]
# Or: from nltk.corpus.words import words [as alias]
def postag_tree(tree):
    # Older variant of Example 9: Python 2 iterators (.next()) and the
    # pre-NLTK-3 Tree API (child.node instead of child.label()).
    # Part-of-speech tagging.
    words = tree.leaves()
    tag_iter = (pos for (word, pos) in pos_tag(words))
    newtree = Tree('S', [])
    for child in tree:
        if isinstance(child, Tree):
            newtree.append(Tree(child.node, []))
            for subchild in child:
                newtree[-1].append((subchild, tag_iter.next()))
        else:
            newtree.append((child, tag_iter.next()))
    return newtree
Example 13: camel_case_split
# Required import: from nltk.corpus import words [as alias]
# Or: from nltk.corpus.words import words [as alias]
def camel_case_split(term):
    term = re.sub(r'([0-9]+)', r' \1', term)
    term = re.sub(r'(1st|2nd|3rd|4th|5th|6th|7th|8th|9th|0th)', r'\1 ', term)
    splits = re.finditer('.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)', term)
    return [s.group(0) for s in splits]

# Split a long, compound hashtag into its component tags. Given the character limit of tweets,
# people often stick words together to save space, so this is a useful tool.
# Examples of hashtag splits from real data (train set) are in /stats/hashtag_splits.txt
# Implementation adapted from https://github.com/matchado/HashTagSplitter
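A couple of illustrative calls, with the outputs expected from the regex above:

print(camel_case_split("MakeAmericaGreatAgain"))  # expected: ['Make', 'America', 'Great', 'Again']
print(camel_case_split("throwbackThursday"))      # expected: ['throwback', 'Thursday']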
Example 14: split_hashtag_long_version
# Required import: from nltk.corpus import words [as alias]
# Or: from nltk.corpus.words import words [as alias]
def split_hashtag_long_version(hashtag):
    word_file = path + "/res/word_list.txt"
    word_list = utils.load_file(word_file).split()
    word_dictionary = list(set(words.words()))
    for alphabet in "bcdefghjklmnopqrstuvwxyz":
        word_dictionary.remove(alphabet)
    all_poss = split_hashtag_to_words_all_possibilities(hashtag.lower(), word_dictionary)
    max_p = 0
    min_len = 1000
    found = False
    best_p = []
    for poss in all_poss:
        counter = 0
        for p in poss:
            if p in word_list:
                counter += 1
        if counter == len(poss) and min_len > counter:
            found = True
            min_len = counter
            best_p = poss
        else:
            if counter > max_p and not found:
                max_p = counter
                best_p = poss
    best_p_v2 = split_hashtag(hashtag, word_list)
    if best_p != [] and best_p_v2 != []:
        split_words = best_p if len(best_p) < len(best_p_v2) else best_p_v2
    else:
        if best_p == [] and best_p_v2 == []:
            split_words = [hashtag]
        else:
            split_words = best_p if best_p_v2 == [] else best_p_v2
    split_words = ['#' + str(s) for s in split_words]
    return split_words
Example 15: __get_words_to_consider
# Required import: from nltk.corpus import words [as alias]
# Or: from nltk.corpus.words import words [as alias]
def __get_words_to_consider(self):
    """Compiles the list of all words to consider.

    :return: List of words to consider.
    """
    return set(words.words())