

Python words.words Method Code Examples

This article collects typical usage examples of the nltk.corpus.words.words method in Python. If you are wondering what words.words does, how to call it, or what real-world uses of it look like, the curated code examples below should help. You can also explore further usage examples from nltk.corpus.words, the module that contains this method.


The sections below present 15 code examples of the words.words method, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
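
Before the examples, here is a minimal, self-contained sketch (not taken from any of the projects below) of what nltk.corpus.words.words returns. The 'en-basic' file id is the smaller Basic English word list that Example 8 uses; the exact corpus sizes depend on the installed NLTK data version.

# Minimal usage sketch (assumes the NLTK "words" corpus data is available).
import nltk
from nltk.corpus import words

nltk.download('words', quiet=True)        # no-op if the corpus is already installed

wordlist = words.words()                  # full English word list, returned as a list of str
basic = words.words('en-basic')           # smaller Basic English list (used in Example 8)

wordset = set(w.lower() for w in wordlist)
print(len(wordlist), len(basic))          # sizes depend on the installed NLTK data
print('language' in wordset)              # membership tests are fast on a set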

Example 1: word_finder

# Required import: from nltk.corpus import words [as alias]
# Or: from nltk.corpus.words import words [as alias]
def word_finder():
    from nltk.corpus import words
    wordlist = words.words()
    random.shuffle(wordlist)
    wordlist = wordlist[:200]
    wordlist = [w for w in wordlist if 3 <= len(w) <= 12]
    grid, used = wordfinder(wordlist)

    print("Word Finder\n")
    for i in range(len(grid)):
        for j in range(len(grid[i])):
            print(grid[i][j], end=' ')
        print()
    print()

    for i in range(len(used)):
        print("%d:" % (i+1), used[i]) 
Developer: rafasashi, Project: razzy-spinner, Lines of code: 19, Source file: wordfinder.py

Example 2: word_finder

# Required import: from nltk.corpus import words [as alias]
# Or: from nltk.corpus.words import words [as alias]
def word_finder():
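    # Note: this variant comes from a Python 2 code base, so the print statements
    # below use Python 2 syntax; the logic otherwise matches Example 1.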
    from nltk.corpus import words
    wordlist = words.words()
    random.shuffle(wordlist)
    wordlist = wordlist[:200]
    wordlist = [w for w in wordlist if 3 <= len(w) <= 12]
    grid, used = wordfinder(wordlist)

    print "Word Finder\n"
    for i in range(len(grid)):
        for j in range(len(grid[i])):
            print grid[i][j],
        print
    print

    for i in range(len(used)):
        print "%d:" % (i+1), used[i] 
Developer: blackye, Project: luscan-devel, Lines of code: 19, Source file: wordfinder.py

Example 3: build_vocabulary

# Required import: from nltk.corpus import words [as alias]
# Or: from nltk.corpus.words import words [as alias]
def build_vocabulary(vocab_filename, lines, minimum_occurrence=1):
    if not os.path.exists(vocab_filename):
        stopwords = get_stopwords_list(filename="stopwords_loose.txt")
        print("Building vocabulary...")
        vocabulary = Counter()
        for line in lines:
            vocabulary.update([l.lower() for l in line.split() if l not in stopwords])
        print("The top 10 most common words: ", vocabulary.most_common(10))
        # Filter all words that appear too rarely or too frequently to be conclusive
        vocabulary = {key: vocabulary[key] for key in vocabulary
                      if vocabulary[key] >= minimum_occurrence}
        utils.save_file(vocabulary.keys(), vocab_filename)
        print("Vocabulary saved to file \"%s\"" % vocab_filename)
    vocabulary = set(utils.load_file(vocab_filename))
    print("Loaded vocabulary of size ", len(vocabulary))
    return vocabulary 
Developer: MirunaPislar, Project: Sarcasm-Detection, Lines of code: 18, Source file: data_processing.py

Example 4: split_hashtag

# Required import: from nltk.corpus import words [as alias]
# Or: from nltk.corpus.words import words [as alias]
def split_hashtag(hashtag, word_list):
    split_words = []
    if hashtag != hashtag.lower() and hashtag != hashtag.upper():
        split_words = camel_case_split(hashtag)
    else:
        j = 0
        while j <= len(hashtag):
            loc = j
            for i in range(j + 1, len(hashtag) + 1, 1):
                if hashtag[j:i].lower() in word_list:
                    loc = i
            if loc == j:
                j += 1
            else:
                split_words.append(hashtag[j:loc])
                j = loc
    split_words = ['#' + str(s) for s in split_words]
    return split_words


# Select the best possible hashtag split based on upper-case
# or component words maximizing the length of the possible word split 
Developer: MirunaPislar, Project: Sarcasm-Detection, Lines of code: 24, Source file: data_processing.py
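
For illustration, a hypothetical call of the split_hashtag function above might look like the sketch below. The lower-cased set built from words.words() stands in for the word_list argument; the exact split on the all-lowercase path depends on which substrings happen to be in the word list.

# Hypothetical usage sketch for split_hashtag (the function defined in the example above).
from nltk.corpus import words

word_list = set(w.lower() for w in words.words())
print(split_hashtag('MondayMotivation', word_list))   # mixed case goes through camel_case_split
print(split_hashtag('nevergiveup', word_list))        # lowercase goes through the greedy longest-prefix loop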

Example 5: __get_words_to_ignore

# Required import: from nltk.corpus import words [as alias]
# Or: from nltk.corpus.words import words [as alias]
def __get_words_to_ignore(self):
        """Compiles list of all words to ignore.

        :return: List of words to ignore.
        """
        # Stop words in English.
        english_stopwords = stopwords.words("english")

        here = path.abspath(path.dirname(__file__))

        # Languages in git repositories.
        git_languages = []
        with open(path.join(here, "gitlang/languages.txt"), "r") as langauges:
            git_languages = [line.strip() for line in langauges]

        # Other words to avoid in git repositories.
        words_to_avoid = []
        with open(path.join(here, "gitlang/others.txt"), "r") as languages:
            words_to_avoid = [line.strip() for line in languages]

        return set(
            itertools.chain(english_stopwords, git_languages, words_to_avoid)
        ) 
Developer: csurfer, Project: gitsuggest, Lines of code: 25, Source file: suggest.py

Example 6: __init__

# Required import: from nltk.corpus import words [as alias]
# Or: from nltk.corpus.words import words [as alias]
def __init__(self, w2v, sif, log=None):
        # A NLPWord2Vec to get the vec for a word
        self.word2vec = w2v
        # A ResSIF used to get word count
        self.sif = sif
        # util to pre-process data
        self.utils = WordSentenceUtils()
        self.log = log if log else logging.getLogger(__name__)
        self.sentence_vectors = []
        self.feature_size = 0
        # download nltk resource if necessary
        nltk.download('words', quiet=True)
        self.setofwords = set(nltk_words.words())

        # pca vector
        self.pca_u = [] 
Developer: ibmresilient, Project: resilient-community-apps, Lines of code: 18, Source file: res_sen2vec.py

Example 7: word_finder

# Required import: from nltk.corpus import words [as alias]
# Or: from nltk.corpus.words import words [as alias]
def word_finder():
    from nltk.corpus import words

    wordlist = words.words()
    random.shuffle(wordlist)
    wordlist = wordlist[:200]
    wordlist = [w for w in wordlist if 3 <= len(w) <= 12]
    grid, used = wordfinder(wordlist)

    print("Word Finder\n")
    for i in range(len(grid)):
        for j in range(len(grid[i])):
            print(grid[i][j], end=' ')
        print()
    print()

    for i in range(len(used)):
        print("%d:" % (i + 1), used[i]) 
Developer: V1EngineeringInc, Project: V1EngineeringInc-Docs, Lines of code: 20, Source file: wordfinder.py

Example 8: _english_wordlist

# Required import: from nltk.corpus import words [as alias]
# Or: from nltk.corpus.words import words [as alias]
def _english_wordlist(self):
        try:
            wl = self._en_wordlist
        except AttributeError:
            from nltk.corpus import words
            self._en_wordlist = set(words.words('en-basic'))
            wl = self._en_wordlist
        return wl 
Developer: rafasashi, Project: razzy-spinner, Lines of code: 10, Source file: named_entity.py

Example 9: postag_tree

# Required import: from nltk.corpus import words [as alias]
# Or: from nltk.corpus.words import words [as alias]
def postag_tree(tree):
    # Part-of-speech tagging.
    words = tree.leaves()
    tag_iter = (pos for (word, pos) in pos_tag(words))
    newtree = Tree('S', [])
    for child in tree:
        if isinstance(child, Tree):
            newtree.append(Tree(child.label(), []))
            for subchild in child:
                newtree[-1].append( (subchild, next(tag_iter)) )
        else:
            newtree.append( (child, next(tag_iter)) )
    return newtree 
Developer: rafasashi, Project: razzy-spinner, Lines of code: 15, Source file: named_entity.py

Example 10: WPS

# Required import: from nltk.corpus import words [as alias]
# Or: from nltk.corpus.words import words [as alias]
def WPS(text):
    count = 0
    for word in text.split():
        if word in set(w.lower() for w in words.words()):
            count += 1
    return count

#Average Number Of Syllables In Sentence(Returns Float): 
Developer: GauravBh1010tt, Project: DeepLearn, Lines of code: 10, Source file: rd_ft.py
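
One design note on the WPS example above: the set of lower-cased dictionary words is rebuilt inside the loop for every token, which makes the function very slow on long texts. Below is a sketch of an equivalent variant that builds the set once; the names ENGLISH_WORDS and WPS_cached are made up here and do not come from the source project.

# Sketch of a cached variant of WPS (hypothetical names, same counting logic).
from nltk.corpus import words

ENGLISH_WORDS = set(w.lower() for w in words.words())   # built once, reused for every call

def WPS_cached(text):
    # Count the whitespace-separated tokens of `text` that appear in the English word list.
    return sum(1 for word in text.split() if word in ENGLISH_WORDS)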

Example 11: __init__

# Required import: from nltk.corpus import words [as alias]
# Or: from nltk.corpus.words import words [as alias]
def __init__(self, config, abc):
	# def __init__(self):

		self.config = config
		self.seed()

		self.all_words = words.words()
		self.english_alpha = ''.join([c for c in abc if c in 'abcdefghijklmnopqrstuvwxyz0123456789'])+"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
		self.english_symbols = ''.join([c for c in abc if c not in 'abcdefghijklmnopqrstuvwxyz0123456789'])

		self.english = self.english_alpha + self.english_symbols

		self.transparent_mean = 0.8		
		self.transparent_gaussian = 0.06
		
		self.prob_lexi = 0.5
		self.symbol_word = 1

		self.art_font_size_range = self.config['augmentation']['font_range']
		self.border_range = self.config['augmentation']['border_range']

		self.font_dir_name='/home/Common/Datasets_SSD/Dataset_Text/ART/fonts_for_text'

		# probability distribution for length of words
		self.probability_dist = np.array([0.1, 0.6, 2.6, 5.2, 8.5, 12.2, 14, 14, 12.6, 10.1, 7.5])#, 5.2, 3.2, 2, 1, 0.6, 0.3, 0.2, 0.1, 0.1
		self.probability_dist = self.probability_dist/np.sum(self.probability_dist)

		list_of_files = self.get_list_of_files(self.font_dir_name)

		self.all_fonts = []
		for i in range(len(list_of_files)):
			with open(list_of_files[i],"rb") as f:
				font_bytes=f.read()
				self.all_fonts.append(font_bytes)

		self.image_net_location = "/media/mayank/0b40607e-7efc-4216-b12f-8bb86facfaed/Dataset_HDD/Image_Net/ILSVRC/Data/CLS-LOC/test/"
		# self.image_net_location = "/home/Common/ImageNet/test"
		self.images_orig = self.get_imagenet_images(self.config['augmentation']['imagenet_no'])#self.config['augmentation']['base_number']
		# self.image_save_location = '/home/Common/Mayank/Text/Segmentation/Dataset/ART/Images/'
		# self.label_save_location = '/home/Common/Mayank/Text/Segmentation/Dataset/ART/Labels/' 
Developer: mayank-git-hub, Project: Text-Recognition, Lines of code: 42, Source file: art.py

Example 12: postag_tree

# Required import: from nltk.corpus import words [as alias]
# Or: from nltk.corpus.words import words [as alias]
def postag_tree(tree):
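    # Note: this variant targets Python 2 and the pre-3.0 NLTK API; it calls
    # tag_iter.next() instead of next(tag_iter) and uses the Tree.node attribute,
    # which NLTK 3 renamed to Tree.label() (compare Example 9).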
    # Part-of-speech tagging.
    words = tree.leaves()
    tag_iter = (pos for (word, pos) in pos_tag(words))
    newtree = Tree('S', [])
    for child in tree:
        if isinstance(child, Tree):
            newtree.append(Tree(child.node, []))
            for subchild in child:
                newtree[-1].append( (subchild, tag_iter.next()) )
        else:
            newtree.append( (child, tag_iter.next()) )
    return newtree 
Developer: blackye, Project: luscan-devel, Lines of code: 15, Source file: named_entity.py

Example 13: camel_case_split

# Required import: from nltk.corpus import words [as alias]
# Or: from nltk.corpus.words import words [as alias]
def camel_case_split(term):
    term = re.sub(r'([0-9]+)', r' \1', term)
    term = re.sub(r'(1st|2nd|3rd|4th|5th|6th|7th|8th|9th|0th)', r'\1 ', term)
    splits = re.finditer('.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)', term)
    return [s.group(0) for s in splits]


# Split a long, compound hash tag into its component tags. Given the character limit of tweets,
# people would stick words together to save space so this is a useful tool.
# Examples of hash splits from real data (train set) are in /stats/hashtag_splits.txt
# Implementation adapted from https://github.com/matchado/HashTagSplitter 
Developer: MirunaPislar, Project: Sarcasm-Detection, Lines of code: 13, Source file: data_processing.py
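
A quick hypothetical call of camel_case_split above shows what the regular expressions produce; the hashtag inputs are made up for illustration.

# Hypothetical inputs for camel_case_split (the function defined in the example above).
print(camel_case_split('ThrowbackThursday'))   # expected: ['Throwback', 'Thursday']
print(camel_case_split('iLoveNLTK'))           # expected: ['i', 'Love', 'NLTK']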

Example 14: split_hashtag_long_version

# Required import: from nltk.corpus import words [as alias]
# Or: from nltk.corpus.words import words [as alias]
def split_hashtag_long_version(hashtag):
    word_file = path + "/res/word_list.txt"
    word_list = utils.load_file(word_file).split()
    word_dictionary = list(set(words.words()))
    for alphabet in "bcdefghjklmnopqrstuvwxyz":
        word_dictionary.remove(alphabet)
    all_poss = split_hashtag_to_words_all_possibilities(hashtag.lower(), word_dictionary)
    max_p = 0
    min_len = 1000
    found = False
    best_p = []
    for poss in all_poss:
        counter = 0
        for p in poss:
            if p in word_list:
                counter += 1
        if counter == len(poss) and min_len > counter:
            found = True
            min_len = counter
            best_p = poss
        else:
            if counter > max_p and not found:
                max_p = counter
                best_p = poss
    best_p_v2 = split_hashtag(hashtag, word_list)
    if best_p != [] and best_p_v2 != []:
        split_words = best_p if len(best_p) < len(best_p_v2) else best_p_v2
    else:
        if best_p == [] and best_p_v2 == []:
            split_words = [hashtag]
        else:
            split_words = best_p if best_p_v2 == [] else best_p_v2
    split_words = ['#' + str(s) for s in split_words]
    return split_words 
Developer: MirunaPislar, Project: Sarcasm-Detection, Lines of code: 36, Source file: data_processing.py

Example 15: __get_words_to_consider

# Required import: from nltk.corpus import words [as alias]
# Or: from nltk.corpus.words import words [as alias]
def __get_words_to_consider(self):
        """Compiles list of all words to consider.

        :return: List of words to consider.
        """
        return set(words.words()) 
Developer: csurfer, Project: gitsuggest, Lines of code: 8, Source file: suggest.py


Note: The nltk.corpus.words.words examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other open-source code and documentation platforms. The code snippets are taken from open-source projects contributed by their respective developers, and the source code copyright remains with the original authors; please consult each project's License before distributing or using the code. Do not reproduce this article without permission.