

Python util.tokenwrap Function Code Examples

This article collects typical usage examples of the Python function nltk.util.tokenwrap. If you have been wondering exactly what tokenwrap does, how to call it, or what real-world usage looks like, the curated code examples below should help.


Fifteen code examples of the tokenwrap function are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
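For reference, nltk.util.tokenwrap is a small pretty-printing helper: it joins a sequence of tokens with a separator and wraps the joined string to a fixed line width (70 characters by default, via the standard-library textwrap module), returning a single string with embedded newlines. A minimal sketch of a typical call:

from nltk.util import tokenwrap

words = ['monied', 'nervous', 'dangerous', 'white', 'pious', 'queer'] * 4
# Join the tokens with '; ', then wrap the result at the default width of 70.
print(tokenwrap(words, separator='; '))

This is why most of the examples below simply print the return value: the line wrapping has already been done.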

Example 1: demo_similar

    def demo_similar(self, word, num=20):
        """
        Distributional similarity: find other words which appear in the
        same contexts as the specified word; list most similar words first.

        @param word: The word used to seed the similarity search
        @type word: C{str}
        @param num: The number of words to generate (default=20)
        @type num: C{int}
        @seealso: L{ContextIndex.similar_words()}
        """
        if '_word_context_index' not in self.__dict__:
            print 'Building word-context index...'
            self._word_context_index = nltk.text.ContextIndex(
                self.tokens,
                filter=lambda x: x.isalpha(),
                key=lambda s: s.lower())

        # words = self._word_context_index.similar_words(word, num)

        while 1:
            word = raw_input('Enter a Chinese word such as "開心" (type 0 to exit): ')
            print "word='" + word + "'"
            if word == '0':
                break
            word = word.decode('utf-8')
            wci = self._word_context_index._word_to_contexts
            if word in wci.conditions():
                contexts = set(wci[word])
                fd = FreqDist(w for w in wci.conditions() for c in wci[w]
                              if c in contexts and not w == word)
                words = fd.keys()[:num]  # FreqDist keys are frequency-sorted in NLTK 2
                print tokenwrap(words)
            else:
                print "No matches"
Developer: dreampocketit, Project: bocard, Lines: 33, Source: NLTK_tools.py

Example 2: demo_common_context

    def demo_common_context(self, num=20):
        """
        Find contexts where the specified words appear; list
        most frequent common contexts first.

        @seealso: L{ContextIndex.common_contexts()}
        """
        if '_word_context_index' not in self.__dict__:
            print 'Building word-context index...'
            self._word_context_index = nltk.text.ContextIndex(
                self.tokens, key=lambda s: s.lower())

        while 1:
            inp = raw_input('Enter two Chinese words such as "我 你" (type 0 to exit): ')
            print "inp='" + inp + "'"
            if inp == '0':
                break
            inp = inp.decode('utf-8')
            words = inp.split(u' ')
            try:
                fd = self._word_context_index.common_contexts(words, True)
                if not fd:
                    print "No common contexts were found"
                else:
                    ranked_contexts = fd.keys()[:num]  # frequency-sorted in NLTK 2
                    print tokenwrap(w1 + "_" + w2 for w1, w2 in ranked_contexts)
            except ValueError, e:
                print e
Developer: dreampocketit, Project: bocard, Lines: 25, Source: NLTK_tools.py

Example 3: demo_collocations

    def demo_collocations(self, num=40, window_size=2):
        """
        Print collocations derived from the text, ignoring stopwords.

        @seealso: L{find_collocations}
        @param num: The maximum number of collocations to print.
        @type num: C{int}
        @param window_size: The number of tokens spanned by a collocation (default=2)
        @type window_size: C{int}
        """
        if not ('_collocations' in self.__dict__ and self._num == num
                and self._window_size == window_size):
            self._num = num
            self._window_size = window_size
            print "Building collocations list"
            from nltk.corpus import stopwords
            ignored_words = stopwords.words('english')
            from nltk.collocations import BigramCollocationFinder
            finder = BigramCollocationFinder.from_words(self.tokens, window_size)
            finder.apply_freq_filter(2)
            finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
            from nltk.metrics import BigramAssocMeasures
            bigram_measures = BigramAssocMeasures()
            self._collocations = finder.nbest(bigram_measures.likelihood_ratio, num)
        colloc_strings = [w1 + u' ' + w2 for w1, w2 in self._collocations]
        print "List {0} collocations".format(num)
        print tokenwrap(colloc_strings, separator=u'; ')
Developer: dreampocketit, Project: bocard, Lines: 26, Source: NLTK_tools.py

Example 4: similar

    def similar(self, word, num=20):
        """
        Distributional similarity: find other words which appear in the
        same contexts as the specified word; list most similar words first.

        :param word: The word used to seed the similarity search
        :type word: str
        :param num: The number of words to generate (default=20)
        :type num: int
        :seealso: ContextIndex.similar_words()
        """
        if '_word_context_index' not in self.__dict__:
            # print('Building word-context index...')
            self._word_context_index = ContextIndex(
                self.tokens, filter=lambda x: x.isalpha(), key=lambda s: s.lower()
            )

        # words = self._word_context_index.similar_words(word, num)

        word = word.lower()
        wci = self._word_context_index._word_to_contexts
        if word in wci.conditions():
            contexts = set(wci[word])
            fd = Counter(
                w
                for w in wci.conditions()
                for c in wci[w]
                if c in contexts and not w == word
            )
            words = [w for w, _ in fd.most_common(num)]
            print(tokenwrap(words))
        else:
            print("No matches")
Developer: prz3m, Project: kind2anki, Lines: 33, Source: text.py
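Since this is essentially NLTK's own Text.similar() method, the behaviour is easy to try on any nltk.Text object, for example with the NLTK book corpora:

from nltk.book import text1  # Moby Dick

# Prints words that occur in similar contexts to 'monstrous',
# as a single block wrapped by tokenwrap.
text1.similar('monstrous')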

Example 5: collocations

    def collocations(self, num=20, window_size=2):
        """
        Print collocations derived from the text, ignoring stopwords.

        :seealso: find_collocations
        :param num: The maximum number of collocations to print.
        :type num: int
        :param window_size: The number of tokens spanned by a collocation (default=2)
        :type window_size: int
        """
        if not (
            '_collocations' in self.__dict__
            and self._num == num
            and self._window_size == window_size
        ):
            self._num = num
            self._window_size = window_size

            # print("Building collocations list")
            from nltk.corpus import stopwords

            ignored_words = stopwords.words('english')
            finder = BigramCollocationFinder.from_words(self.tokens, window_size)
            finder.apply_freq_filter(2)
            finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
            bigram_measures = BigramAssocMeasures()
            self._collocations = finder.nbest(bigram_measures.likelihood_ratio, num)
        colloc_strings = [w1 + ' ' + w2 for w1, w2 in self._collocations]
        print(tokenwrap(colloc_strings, separator="; "))
Developer: prz3m, Project: kind2anki, Lines: 29, Source: text.py

Example 6: findall

    def findall(self, regexp):
        """
        Find instances of the regular expression in the text.
        The text is a list of tokens, and a regexp pattern to match
        a single token must be surrounded by angle brackets.  E.g.

        >>> print('hack'); from nltk.book import text1, text5, text9
        hack...
        >>> text5.findall("<.*><.*><bro>")
        you rule bro; telling you bro; u twizted bro
        >>> text1.findall("<a>(<.*>)<man>")
        monied; nervous; dangerous; white; white; white; pious; queer; good;
        mature; white; Cape; great; wise; wise; butterless; white; fiendish;
        pale; furious; better; certain; complete; dismasted; younger; brave;
        brave; brave; brave
        >>> text9.findall("<th.*>{3,}")
        thread through those; the thought that; that the thing; the thing
        that; that that thing; through these than through; them that the;
        through the thick; them that they; thought that the

        :param regexp: A regular expression
        :type regexp: str
        """

        if "_token_searcher" not in self.__dict__:
            self._token_searcher = TokenSearcher(self)

        hits = self._token_searcher.findall(regexp)
        hits = [' '.join(h) for h in hits]
        print(tokenwrap(hits, "; "))
Developer: prz3m, Project: kind2anki, Lines: 30, Source: text.py

Example 7: sandwich

    def sandwich(cls, word):
        # Pick the corpus with the highest health score and delegate
        # the lookup to it, then wrap the results into one string.
        ind = cls.corpora_health.index(max(cls.corpora_health))
        results = cls.corpora[ind].sandwich(word)
        # results = [corpus.sandwich(word) for corpus in cls.corpora]
        return tokenwrap(results)
Developer: jktong, Project: content-consumption, Lines: 7, Source: managers.py

Example 8: common_contexts

    def common_contexts(self, words, num=20):
        """
        Find contexts where the specified words appear; list
        most frequent common contexts first.

        :param words: The words used to seed the context search
        :type words: list(str)
        :param num: The maximum number of common contexts to list (default=20)
        :type num: int
        :seealso: ContextIndex.common_contexts()
        """
        if '_word_context_index' not in self.__dict__:
            # print('Building word-context index...')
            self._word_context_index = ContextIndex(
                self.tokens, key=lambda s: s.lower()
            )

        try:
            fd = self._word_context_index.common_contexts(words, True)
            if not fd:
                print("No common contexts were found")
            else:
                ranked_contexts = [w for w, _ in fd.most_common(num)]
                print(tokenwrap(w1 + "_" + w2 for w1, w2 in ranked_contexts))

        except ValueError as e:
            print(e)
Developer: prz3m, Project: kind2anki, Lines: 27, Source: text.py

Example 9: gen

def gen(context='', hashtag='', tries=30):
    tokens = nltk.word_tokenize(corpus)  # `corpus` is a module-level string
    text = nltk.Text(tokens)
    text.generate(0)  # force the trigram model to be built

    n = 10
    r = tokenwrap(text._trigram_model.generate(n, context))
    return r[:140 - len(hashtag)] + ' ' + hashtag  # fit in a 140-char tweet
Developer: mdamien, Project: twitter-poetry, Lines: 8, Source: gen.py

Example 10: preprocessing

def preprocessing(comment):
    """
    Clean the comment: lowercase all words and remove Danish stop words.
    """
    words = nltk.word_tokenize(comment)
    clean_words = [word.lower() for word in words
                   if word.lower() not in stopwords.words('danish')]
    cleaned_comment = tokenwrap(clean_words)

    return cleaned_comment
Developer: dtu-02819-projects-fall2014, Project: InfoMine, Lines: 11, Source: gender_classifier.py
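A quick usage sketch for this function (the input sentence is made up; the punkt tokenizer and stopwords corpus must be downloaded first):

import nltk
from nltk.corpus import stopwords
from nltk.util import tokenwrap

# One-time setup:
# nltk.download('punkt'); nltk.download('stopwords')

print(preprocessing("Det er en rigtig god kommentar"))
# Prints roughly 'rigtig god kommentar': Danish stop words dropped,
# everything lowercased, and the tokens rejoined by tokenwrap.

Here tokenwrap does nothing more than rejoin the filtered tokens into one space-separated, line-wrapped string.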

Example 11: collocations

    def collocations(self, num=20, window_size=2):
        """
        Print collocations derived from the text, ignoring stopwords.

        :param num: The maximum number of collocations to print.
        :type num: int
        :param window_size: The number of tokens spanned by a collocation (default=2)
        :type window_size: int
        """

        collocation_strings = [w1 + ' ' + w2 for w1, w2 in self.collocation_list(num, window_size)]
        print(tokenwrap(collocation_strings, separator="; "))
Developer: rmalouf, Project: nltk, Lines: 12, Source: text.py
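This is the current NLTK (3.4+) implementation, where collocation_list() does the actual scoring and tokenwrap only formats the output. A usage sketch, assuming the genesis corpus has been downloaded:

import nltk
from nltk.text import Text

# nltk.download('genesis')  # one-time setup
text = Text(nltk.corpus.genesis.words('english-kjv.txt'))
# Prints the top bigrams joined by '; ' and wrapped by tokenwrap,
# e.g. 'said unto; pray thee; thou shalt; ...'
text.collocations(num=10)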

Example 12: demo_findall

def demo_findall(text):
    while 1:
        inp = raw_input('Enter two Chinese words such as "我:2 手:4" (type 0 to exit): ')
        print "inp='" + inp + "'"
        if inp == '0':
            break
        inp = inp.decode('big5')
        reg = "<1> <2> <3> <4> <5>"
        if len(inp) == 0:
            print 'no input words'
        else:
            # Substitute each word into its numbered slot in the template.
            for wp in inp.split(' '):
                (w, p) = wp.split(':')
                # reg = re.sub(p, w, reg)
                reg = re.sub(p, ''.join(['.*', w, '.*']), reg)
        reg = re.sub(r'\d', '.*', reg)  # unused slots match anything
        print "reg=", reg
        # text.findall(reg)
        if "_token_searcher" not in text.__dict__:
            text._token_searcher = nltk.text.TokenSearcher(text)
        hits = text._token_searcher.findall(reg)
        hits = [' '.join(h) for h in hits]
        print tokenwrap(hits, u"; ")
Developer: dreampocketit, Project: bocard, Lines: 22, Source: NLTK_tools.py

Example 13: generate

    def generate(self, length=100, context=()):
        """
        Return random text, generated using a trigram language model.

        :param length: The length of text to generate (default=100)
        :type length: int
        :seealso: NgramModel
        """
        if '_trigram_model' not in self.__dict__:
            print "Building ngram index..."
            estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
            self._trigram_model = NgramModel(3, self, estimator=estimator)
        text = self._trigram_model.generate(length, context=context)
        return tokenwrap(text)
Developer: cssndrx, Project: writers-block, Lines: 14, Source: nltk_custom.py
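Note: this example relies on the NLTK 2.x NgramModel and estimator API, which were removed in NLTK 3; the model-backed Text.generate() shown here does not exist in current releases (NLTK 3.4 reintroduced generate() on top of a different language-model API).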

Example 14: synonyms

    def synonyms(word):
        # todo: this should move because we want to cache the results
        # so we can calculate health!!
        results = []
        for synset in wn.synsets(word):
            results.extend(synset.lemma_names)  # an attribute in NLTK 2; a method in NLTK 3

        result_set = set(results)
        if word in result_set:
            result_set.remove(word)

        # todo: stopped here... should filter these down to some reasonable thing
        # todo: check if the above needs to be cached somewhere
        # (maybe it is cached by wn.synsets?)
        results = list(result_set)
        results = results[:MAX_SYNONYMS_TO_RETURN]

        return tokenwrap(results)
Developer: jktong, Project: content-consumption, Lines: 16, Source: managers.py

Example 15: similar

    def similar(self, word, num=20):
        """
        Return the most similar words as a single wrapped string.
        """
        if '_word_context_index' not in self.__dict__:
            print 'Building word-context index...'
            self._word_context_index = nltk.ContextIndex(
                self.tokens,
                filter=lambda x: x.isalpha(),
                key=lambda s: s.lower())

        # words = self._word_context_index.similar_words(word, num)

        word = word.lower()
        wci = self._word_context_index._word_to_contexts
        if word in wci.conditions():
            contexts = set(wci[word])
            fd = FreqDist(w for w in wci.conditions() for c in wci[w]
                          if c in contexts and not w == word)
            words = fd.keys()[:num]  # frequency-sorted in NLTK 2
            return tokenwrap(words)
        else:
            print "No matches"
Developer: cssndrx, Project: writers-block, Lines: 22, Source: nltk_custom.py


Note: the nltk.util.tokenwrap examples in this article were collected by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from community-contributed open-source projects; copyright in the source code remains with the original authors, and redistribution or use should follow the corresponding project's license. Please do not republish without permission.