

Python brown.words Method Code Examples

This article collects typical code examples of the Python method nltk.corpus.brown.words. If you are wondering what exactly brown.words does, how to use it, or what real-world usage looks like, the hand-picked code samples below may help. You can also explore further usage examples of the containing module, nltk.corpus.brown.


The sections below present 15 code examples of the brown.words method, ordered by popularity by default.
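
To set the stage, here is a minimal sketch of calling brown.words() directly. It assumes the Brown corpus has already been downloaded (for example via nltk.download('brown')); the slice sizes are illustrative.

import nltk
from nltk.corpus import brown

nltk.download('brown')                        # one-time corpus download
all_tokens = brown.words()                    # every token in the Brown corpus
news_tokens = brown.words(categories='news')  # tokens from a single category
print(len(all_tokens), news_tokens[:10])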

Example 1: collocations

# Required import: from nltk.corpus import brown [as alias]
# Or: from nltk.corpus.brown import words [as alias]
def collocations(self, num=20, window_size=2):
        """
        Print collocations derived from the text, ignoring stopwords.

        :seealso: find_collocations
        :param num: The maximum number of collocations to print.
        :type num: int
        :param window_size: The number of tokens spanned by a collocation (default=2)
        :type window_size: int
        """
        if not ('_collocations' in self.__dict__ and self._num == num and self._window_size == window_size):
            self._num = num
            self._window_size = window_size

            #print("Building collocations list")
            from nltk.corpus import stopwords
            ignored_words = stopwords.words('english')
            finder = BigramCollocationFinder.from_words(self.tokens, window_size)
            finder.apply_freq_filter(2)
            finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
            bigram_measures = BigramAssocMeasures()
            self._collocations = finder.nbest(bigram_measures.likelihood_ratio, num)
        colloc_strings = [w1+' '+w2 for w1, w2 in self._collocations]
        print(tokenwrap(colloc_strings, separator="; ")) 
Developer: rafasashi, Project: razzy-spinner, Lines: 26, Source: text.py
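
The snippet above is the Text.collocations method from NLTK's text.py, so one way to exercise it on Brown tokens is through nltk.Text, as in this sketch (it assumes the 'brown' and 'stopwords' corpora are available locally; the num and window_size values are illustrative).

from nltk.corpus import brown
from nltk.text import Text

news_text = Text(brown.words(categories='news'))
news_text.collocations(num=10, window_size=2)  # prints the top collocations, separated by "; "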

Example 2: splitter

# Required import: from nltk.corpus import brown [as alias]
# Or: from nltk.corpus.brown import words [as alias]
def splitter(data, num_words):
    words = data.split(' ')
    output = []

    cur_count = 0
    cur_words = []
    for word in words:
        cur_words.append(word)
        cur_count += 1
        if cur_count == num_words:
            output.append(' '.join(cur_words))
            cur_words = []
            cur_count = 0

    output.append(' '.join(cur_words))

    return output 
Developer: PacktPublishing, Project: Python-Machine-Learning-Cookbook-Second-Edition, Lines: 19, Source: chunking.py
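
As a usage sketch, one way to exercise splitter() is to join Brown tokens back into a single string and re-split it into fixed-size chunks; the 10000-token slice and the chunk size of 1700 below are illustrative assumptions.

from nltk.corpus import brown

data = ' '.join(brown.words(categories='news')[:10000])
chunks = splitter(data, 1700)
print('Number of text chunks =', len(chunks))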

Example 3: collocations

# Required import: from nltk.corpus import brown [as alias]
# Or: from nltk.corpus.brown import words [as alias]
def collocations(self, num=20, window_size=2):
        """
        Print collocations derived from the text, ignoring stopwords.

        :seealso: find_collocations
        :param num: The maximum number of collocations to print.
        :type num: int
        :param window_size: The number of tokens spanned by a collocation (default=2)
        :type window_size: int
        """
        if not ('_collocations' in self.__dict__ and self._num == num and self._window_size == window_size):
            self._num = num
            self._window_size = window_size

            print "Building collocations list"
            from nltk.corpus import stopwords
            ignored_words = stopwords.words('english')
            finder = BigramCollocationFinder.from_words(self.tokens, window_size)
            finder.apply_freq_filter(2)
            finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
            bigram_measures = BigramAssocMeasures()
            self._collocations = finder.nbest(bigram_measures.likelihood_ratio, num)
        colloc_strings = [w1+' '+w2 for w1, w2 in self._collocations]
        print(tokenwrap(colloc_strings, separator="; "))
Developer: blackye, Project: luscan-devel, Lines: 26, Source: text.py

Example 4: splitter

# Required import: from nltk.corpus import brown [as alias]
# Or: from nltk.corpus.brown import words [as alias]
def splitter(content, num_of_words):
    words = content.split(' ')
    result = []

    current_count = 0
    current_words = []

    for word in words:
        current_words.append(word)
        current_count += 1

        if current_count == num_of_words:
            result.append(' '.join(current_words))
            current_words = []
            current_count = 0

    result.append(' '.join(current_words))
    return result 
Developer: PacktPublishing, Project: Raspberry-Pi-3-Cookbook-for-Python-Programmers-Third-Edition, Lines: 20, Source: chunking.py

Example 5: brown_data

# Required import: from nltk.corpus import brown [as alias]
# Or: from nltk.corpus.brown import words [as alias]
def brown_data():
  """return the text_length first tokens of the brown corpus tagged in pyrata format"""
  tokens = brown.words()
  tokens = tokens[:text_length]

  pos_tags = nltk.pos_tag(tokens)

  return [{'raw':w, 'pos':p} for (w, p) in pos_tags]


# """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
# TEST 
# """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""


# """""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" 
Developer: nicolashernandez, Project: PyRATA, Lines: 18, Source: phrase-extraction.py
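
For a usage sketch, note that text_length is a module-level constant in the original PyRATA script; the value below is an assumption. The call also relies on the imports noted in the header comments (import nltk, from nltk.corpus import brown) and on the NLTK POS-tagger model (e.g. averaged_perceptron_tagger) being installed.

text_length = 1000          # assumed value; defined at module level in the source script
tagged = brown_data()
print(tagged[:3])           # a list of {'raw': ..., 'pos': ...} dicts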

Example 6: common_contexts

# Required import: from nltk.corpus import brown [as alias]
# Or: from nltk.corpus.brown import words [as alias]
def common_contexts(self, words, fail_on_unknown=False):
        """
        Find contexts where the specified words can all appear; and
        return a frequency distribution mapping each context to the
        number of times that context was used.

        :param words: The words used to seed the similarity search
        :type words: str
        :param fail_on_unknown: If true, then raise a value error if
            any of the given words do not occur at all in the index.
        """
        words = [self._key(w) for w in words]
        contexts = [set(self._word_to_contexts[w]) for w in words]
        empty = [words[i] for i in range(len(words)) if not contexts[i]]
        common = reduce(set.intersection, contexts)
        if empty and fail_on_unknown:
            raise ValueError("The following word(s) were not found:", " ".join(words))
        elif not common:
            # nothing in common -- just return an empty freqdist.
            return FreqDist()
        else:
            fd = FreqDist(
                c for w in words for c in self._word_to_contexts[w] if c in common
            )
            return fd 
Developer: V1EngineeringInc, Project: V1EngineeringInc-Docs, Lines: 27, Source: text.py
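
This appears to be ContextIndex.common_contexts from nltk.text, which returns a FreqDist rather than printing; a usage sketch over Brown news tokens might look like this (the word pair and category are illustrative choices).

from nltk.corpus import brown
from nltk.text import ContextIndex

idx = ContextIndex(brown.words(categories='news'), key=lambda s: s.lower())
fd = idx.common_contexts(['news', 'report'])
print(fd.most_common(5))    # the five contexts the two words share most often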

Example 7: __init__

# Required import: from nltk.corpus import brown [as alias]
# Or: from nltk.corpus.brown import words [as alias]
def __init__(self, tokens, key=lambda x: x):
        """
        Construct a new concordance index.

        :param tokens: The document (list of tokens) that this
            concordance index was created from.  This list can be used
            to access the context of a given word occurrence.
        :param key: A function that maps each token to a normalized
            version that will be used as a key in the index.  E.g., if
            you use ``key=lambda s:s.lower()``, then the index will be
            case-insensitive.
        """
        self._tokens = tokens
        """The document (list of tokens) that this concordance index
           was created from."""

        self._key = key
        """Function mapping each token to an index key (or None)."""

        self._offsets = defaultdict(list)
        """Dictionary mapping words (or keys) to lists of offset indices."""
        # Initialize the index (self._offsets)
        for index, word in enumerate(tokens):
            word = self._key(word)
            self._offsets[word].append(index) 
Developer: V1EngineeringInc, Project: V1EngineeringInc-Docs, Lines: 27, Source: text.py
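
The constructor above belongs to nltk.text.ConcordanceIndex; building a case-insensitive index over Brown tokens and querying a word's offsets might look like this sketch (assumes the 'brown' corpus is available).

from nltk.corpus import brown
from nltk.text import ConcordanceIndex

ci = ConcordanceIndex(brown.words(categories='news'), key=lambda s: s.lower())
print(ci.offsets('news')[:5])   # positions of the first few occurrences of "news"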

Example 8: word_similarity_dict

# Required import: from nltk.corpus import brown [as alias]
# Or: from nltk.corpus.brown import words [as alias]
def word_similarity_dict(self, word):
        """
        Return a dictionary mapping from words to 'similarity scores,'
        indicating how often these two words occur in the same
        context.
        """
        word = self._key(word)
        word_contexts = set(self._word_to_contexts[word])

        scores = {}
        for w, w_contexts in self._word_to_contexts.items():
            scores[w] = f_measure(word_contexts, set(w_contexts))

        return scores 
Developer: rafasashi, Project: razzy-spinner, Lines: 16, Source: text.py
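
word_similarity_dict is defined on nltk.text.ContextIndex; a usage sketch ranking words by how much context they share with "news" could look like the following (the f-measure can be None for words with no overlap, hence the "or 0" guard; the query word is an illustrative choice).

from nltk.corpus import brown
from nltk.text import ContextIndex

idx = ContextIndex(brown.words(categories='news'), key=lambda s: s.lower())
scores = idx.word_similarity_dict('news')
ranked = sorted(scores.items(), key=lambda kv: kv[1] or 0, reverse=True)
print(ranked[:5])           # words whose contexts overlap most with "news"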

Example 9: common_contexts

# Required import: from nltk.corpus import brown [as alias]
# Or: from nltk.corpus.brown import words [as alias]
def common_contexts(self, words, fail_on_unknown=False):
        """
        Find contexts where the specified words can all appear; and
        return a frequency distribution mapping each context to the
        number of times that context was used.

        :param words: The words used to seed the similarity search
        :type words: str
        :param fail_on_unknown: If true, then raise a value error if
            any of the given words do not occur at all in the index.
        """
        words = [self._key(w) for w in words]
        contexts = [set(self._word_to_contexts[w]) for w in words]
        empty = [words[i] for i in range(len(words)) if not contexts[i]]
        common = reduce(set.intersection, contexts)
        if empty and fail_on_unknown:
            raise ValueError("The following word(s) were not found:",
                             " ".join(words))
        elif not common:
            # nothing in common -- just return an empty freqdist.
            return FreqDist()
        else:
            fd = FreqDist(c for w in words
                          for c in self._word_to_contexts[w]
                          if c in common)
            return fd 
Developer: rafasashi, Project: razzy-spinner, Lines: 28, Source: text.py

Example 10: __init__

# Required import: from nltk.corpus import brown [as alias]
# Or: from nltk.corpus.brown import words [as alias]
def __init__(self, tokens, key=lambda x:x):
        """
        Construct a new concordance index.

        :param tokens: The document (list of tokens) that this
            concordance index was created from.  This list can be used
            to access the context of a given word occurrence.
        :param key: A function that maps each token to a normalized
            version that will be used as a key in the index.  E.g., if
            you use ``key=lambda s:s.lower()``, then the index will be
            case-insensitive.
        """
        self._tokens = tokens
        """The document (list of tokens) that this concordance index
           was created from."""

        self._key = key
        """Function mapping each token to an index key (or None)."""

        self._offsets = defaultdict(list)
        """Dictionary mapping words (or keys) to lists of offset
           indices."""

        # Initialize the index (self._offsets)
        for index, word in enumerate(tokens):
            word = self._key(word)
            self._offsets[word].append(index) 
Developer: rafasashi, Project: razzy-spinner, Lines: 29, Source: text.py

Example 11: print_concordance

# Required import: from nltk.corpus import brown [as alias]
# Or: from nltk.corpus.brown import words [as alias]
def print_concordance(self, word, width=75, lines=25):
        """
        Print a concordance for ``word`` with the specified context window.

        :param word: The target word
        :type word: str
        :param width: The width of each line, in characters (default=80)
        :type width: int
        :param lines: The number of lines to display (default=25)
        :type lines: int
        """
        half_width = (width - len(word) - 2) // 2
        context = width // 4 # approx number of words of context

        offsets = self.offsets(word)
        if offsets:
            lines = min(lines, len(offsets))
            print("Displaying %s of %s matches:" % (lines, len(offsets)))
            for i in offsets:
                if lines <= 0:
                    break
                left = (' ' * half_width +
                        ' '.join(self._tokens[i-context:i]))
                right = ' '.join(self._tokens[i+1:i+context])
                left = left[-half_width:]
                right = right[:half_width]
                print(left, self._tokens[i], right)
                lines -= 1
        else:
            print("No matches") 
Developer: rafasashi, Project: razzy-spinner, Lines: 32, Source: text.py
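
This is ConcordanceIndex.print_concordance in the NLTK version the snippet comes from; the same view is available at a higher level through Text.concordance, as in this sketch (word, width, and line count are illustrative).

from nltk.corpus import brown
from nltk.text import Text

Text(brown.words(categories='news')).concordance('news', width=75, lines=5)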

Example 12: dispersion_plot

# Required import: from nltk.corpus import brown [as alias]
# Or: from nltk.corpus.brown import words [as alias]
def dispersion_plot(self, words):
        """
        Produce a plot showing the distribution of the words through the text.
        Requires pylab to be installed.

        :param words: The words to be plotted
        :type words: list(str)
        :seealso: nltk.draw.dispersion_plot()
        """
        from nltk.draw import dispersion_plot
        dispersion_plot(self, words) 
Developer: rafasashi, Project: razzy-spinner, Lines: 13, Source: text.py
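
A usage sketch for the dispersion plot; recent NLTK versions draw it with matplotlib, so that package is assumed to be installed, and the plotted words are illustrative.

from nltk.corpus import brown
from nltk.text import Text

Text(brown.words(categories='news')).dispersion_plot(['news', 'report', 'said', 'announced'])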

Example 13: demo

# Required import: from nltk.corpus import brown [as alias]
# Or: from nltk.corpus.brown import words [as alias]
def demo():
    from nltk.corpus import brown
    text = Text(brown.words(categories='news'))
    print(text)
    print()
    print("Concordance:")
    text.concordance('news')
    print()
    print("Distributionally similar words:")
    text.similar('news')
    print()
    print("Collocations:")
    text.collocations()
    print()
    #print("Automatically generated text:")
    #text.generate()
    #print()
    print("Dispersion plot:")
    text.dispersion_plot(['news', 'report', 'said', 'announced'])
    print()
    print("Vocabulary plot:")
    text.plot(50)
    print()
    print("Indexing:")
    print("text[3]:", text[3])
    print("text[3:5]:", text[3:5])
    print("text.vocab()['news']:", text.vocab()['news']) 
Developer: rafasashi, Project: razzy-spinner, Lines: 29, Source: text.py

Example 14: get_nltk_freq_words

# Required import: from nltk.corpus import brown [as alias]
# Or: from nltk.corpus.brown import words [as alias]
def get_nltk_freq_words():
    """Use Brown corpus frequent words
    More corpora: https://www.nltk.org/book/ch02.html
    """
    freq_dict = nltk.FreqDist(brown.words())

    for fileid in gutenberg.fileids():
        freq_dict.update(nltk.FreqDist(gutenberg.words(fileid)))

    freq_words = [k for k, v in freq_dict.items() if v > 10]
    return freq_words, freq_dict 
Developer: easonnie, Project: combine-FEVER-NSMN, Lines: 13, Source: nltk_utils.py
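
The helper above merges Brown and Gutenberg counts; the Brown-only part can be reproduced with a plain FreqDist, as in this sketch (assumes the 'brown' corpus is downloaded).

import nltk
from nltk.corpus import brown

freq_dict = nltk.FreqDist(brown.words())
print(freq_dict.most_common(5))    # the most frequent Brown tokens
print(freq_dict['news'])           # raw count of one word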

Example 15: print_concordance

# Required import: from nltk.corpus import brown [as alias]
# Or: from nltk.corpus.brown import words [as alias]
def print_concordance(self, word, width=75, lines=25):
        """
        Print a concordance for ``word`` with the specified context window.

        :param word: The target word
        :type word: str
        :param width: The width of each line, in characters (default=80)
        :type width: int
        :param lines: The number of lines to display (default=25)
        :type lines: int
        """
        half_width = (width - len(word) - 2) // 2
        context = width // 4  # approx number of words of context

        offsets = self.offsets(word)
        if offsets:
            lines = min(lines, len(offsets))
            print("Displaying %s of %s matches:" % (lines, len(offsets)))
            for i in offsets:
                if lines <= 0:
                    break
                left = (' ' * half_width +
                        ' '.join(self._tokens[i-context:i]))
                right = ' '.join(self._tokens[i+1:i+context])
                left = left[-half_width:]
                right = right[:half_width]
                print(left, self._tokens[i], right)
                lines -= 1
        else:
            print("No matches") 
Developer: blackye, Project: luscan-devel, Lines: 32, Source: text.py


Note: The nltk.corpus.brown.words examples in this article were collected from open-source code hosted on platforms such as GitHub and MSDocs. The snippets come from open-source projects contributed by various developers, and copyright remains with the original authors; please consult each project's license before redistributing or using the code, and do not republish without permission.