当前位置: 首页>>代码示例>>Python>>正文


Python treebank.TreebankWordTokenizer方法代码示例

本文整理汇总了Python中nltk.tokenize.treebank.TreebankWordTokenizer方法的典型用法代码示例。如果您正苦于以下问题:Python treebank.TreebankWordTokenizer方法的具体用法?Python treebank.TreebankWordTokenizer怎么用?Python treebank.TreebankWordTokenizer使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在nltk.tokenize.treebank的用法示例。


在下文中一共展示了treebank.TreebankWordTokenizer方法的12个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: __init__

# 需要导入模块: from nltk.tokenize import treebank [as 别名]
# 或者: from nltk.tokenize.treebank import TreebankWordTokenizer [as 别名]
def __init__(self, *args, **kwargs):
        """Configure the encoder to tokenize and detokenize with NLTK's Treebank classes.

        ``tokenize`` and ``detokenize`` are supplied by this initializer, so
        passing either as a keyword argument is rejected. All other positional
        and keyword arguments are forwarded unchanged to the parent initializer.

        :raises TypeError: if ``tokenize`` or ``detokenize`` is passed as a keyword.
        :raises ImportError: re-raised, after printing an install hint, when the
            NLTK imports fail.
        """
        # Checked in this order so the first offending keyword is the one reported.
        reserved_keywords = (
            ('tokenize',
             '``TreebankEncoder`` does not take keyword argument ``tokenize``.'),
            ('detokenize',
             '``TreebankEncoder`` does not take keyword argument ``detokenize``.'),
        )
        for keyword, message in reserved_keywords:
            if keyword in kwargs:
                raise TypeError(message)

        try:
            import nltk

            # Resource downloads; the code this was taken from labels them
            # "Required for moses".
            for resource in ('perluniprops', 'nonbreaking_prefixes'):
                nltk.download(resource)

            from nltk.tokenize.treebank import TreebankWordTokenizer, TreebankWordDetokenizer
        except ImportError:
            print("Please install NLTK. " "See the docs at http://nltk.org for more information.")
            raise

        tokenize_fn = TreebankWordTokenizer().tokenize
        detokenize_fn = TreebankWordDetokenizer().detokenize
        super().__init__(*args, tokenize=tokenize_fn, detokenize=detokenize_fn, **kwargs)
开发者ID:PetrochukM,项目名称:PyTorch-NLP,代码行数:27,代码来源:treebank_encoder.py

示例2: word_tokenize

# 需要导入模块: from nltk.tokenize import treebank [as 别名]
# 或者: from nltk.tokenize.treebank import TreebankWordTokenizer [as 别名]
def word_tokenize(text, language='english', preserve_line=False):
    """
    Split *text* into a flat list of word tokens.

    Unless *preserve_line* is set, *text* is first divided into sentences by
    ``sent_tokenize`` for the given language. Every resulting piece is then
    passed to the module-level ``_treebank_word_tokenizer`` and the tokens
    are concatenated in order.

    :param text: text to split into words
    :type text: str
    :param language: the model name in the Punkt corpus
    :type language: str
    :param preserve_line: if true, skip sentence splitting and tokenize
        *text* as a single unit.
    :type preserve_line: bool
    """
    if preserve_line:
        sentences = [text]
    else:
        sentences = sent_tokenize(text, language)

    tokens = []
    for sent in sentences:
        tokens.extend(_treebank_word_tokenizer.tokenize(sent))
    return tokens
开发者ID:V1EngineeringInc,项目名称:V1EngineeringInc-Docs,代码行数:21,代码来源:__init__.py

示例3: word_tokenize

# 需要导入模块: from nltk.tokenize import treebank [as 别名]
# 或者: from nltk.tokenize.treebank import TreebankWordTokenizer [as 别名]
def word_tokenize(text, language='english'):
    """
    Split *text* into a flat list of word tokens.

    The text is divided into sentences with ``sent_tokenize`` for the given
    language, each sentence is passed to ``_treebank_word_tokenize``, and
    the per-sentence tokens are joined into one list in order.

    :param text: text to split into words
    :param language: the model name in the Punkt corpus
    """
    tokens = []
    for sent in sent_tokenize(text, language):
        tokens.extend(_treebank_word_tokenize(sent))
    return tokens
开发者ID:rafasashi,项目名称:razzy-spinner,代码行数:15,代码来源:__init__.py

示例4: demo_liu_hu_lexicon

# 需要导入模块: from nltk.tokenize import treebank [as 别名]
# 或者: from nltk.tokenize.treebank import TreebankWordTokenizer [as 别名]
def demo_liu_hu_lexicon(sentence, plot=False):
    """
    Basic example of sentiment classification using Liu and Hu opinion lexicon.
    This function simply counts the number of positive, negative and neutral words
    in the sentence and classifies it depending on which polarity is more represented.
    Words that do not appear in the lexicon are considered as neutral.

    The verdict ('Positive', 'Negative' or 'Neutral') is printed; nothing is returned.

    :param sentence: a sentence whose polarity has to be classified.
    :param plot: if True, plot a visual representation of the sentence polarity.
    """
    from nltk.corpus import opinion_lexicon
    from nltk.tokenize import treebank

    tokenizer = treebank.TreebankWordTokenizer()
    tokenized_sent = [word.lower() for word in tokenizer.tokenize(sentence)]

    # Fetch each word list once and keep it as a set, instead of calling the
    # lexicon accessors again for every token inside the loop.
    positive_lexicon = set(opinion_lexicon.positive())
    negative_lexicon = set(opinion_lexicon.negative())

    pos_words = 0
    neg_words = 0
    x = list(range(len(tokenized_sent)))  # x axis for the plot
    y = []  # one polarity value per token: 1, -1 or 0

    for word in tokenized_sent:
        # Positive is tested first, so a word present in both lists counts as positive.
        if word in positive_lexicon:
            pos_words += 1
            y.append(1)  # positive
        elif word in negative_lexicon:
            neg_words += 1
            y.append(-1)  # negative
        else:
            y.append(0)  # neutral

    if pos_words > neg_words:
        print('Positive')
    elif pos_words < neg_words:
        print('Negative')
    else:
        print('Neutral')

    if plot:
        _show_plot(x, y, x_labels=tokenized_sent, y_labels=['Negative', 'Neutral', 'Positive'])
开发者ID:rafasashi,项目名称:razzy-spinner,代码行数:42,代码来源:util.py

示例5: tokenize_text

# 需要导入模块: from nltk.tokenize import treebank [as 别名]
# 或者: from nltk.tokenize.treebank import TreebankWordTokenizer [as 别名]
def tokenize_text(text, language="english"):
    '''Turn a string into a filtered list of word tokens.
    The text is split into sentences with NLTK's sent_tokenize, each
    sentence is tokenized with NLTK's TreebankWordTokenizer, and the
    combined tokens are passed through filter_tokens.

    IN:
    - text, str
    - language, str; forwarded to sent_tokenize
    OUT:
    - list of strings
    '''
    ## sentence boundaries first, then one tokenizer shared by all sentences
    sentences = sent_tokenize(text, language=language)
    word_tokenizer = TreebankWordTokenizer()

    ## flatten the per-sentence tokens into a single list, in order
    raw_tokens = [
        token
        for sentence in sentences
        for token in word_tokenizer.tokenize(sentence)
    ]

    ## drop un-wanted tokens
    return filter_tokens(raw_tokens)
开发者ID:pgcorpus,项目名称:gutenberg,代码行数:30,代码来源:tokenizer.py

示例6: tokenize

# 需要导入模块: from nltk.tokenize import treebank [as 别名]
# 或者: from nltk.tokenize.treebank import TreebankWordTokenizer [as 别名]
def tokenize(self, text: str):
        """
        Split *text* into sentences with ``self.sent_tokenizer``, word-tokenize
        every sentence with a ``TreebankWordTokenizer``, and return all tokens
        as one flat list.

        :param text: text to be tokenized
        :type text: str
        :rtype: list
        """
        sentences = self.sent_tokenizer.tokenize(text)
        word_tokenizer = TreebankWordTokenizer()

        flat_tokens = []
        for sentence_tokens in word_tokenizer.tokenize_sents(sentences):
            flat_tokens.extend(sentence_tokens)
        return flat_tokens
开发者ID:cltk,项目名称:cltk,代码行数:11,代码来源:word.py

示例7: tokenize

# 需要导入模块: from nltk.tokenize import treebank [as 别名]
# 或者: from nltk.tokenize.treebank import TreebankWordTokenizer [as 别名]
def tokenize(self, text: str):
        """
        Return the word tokens of *text* as a single flat list.

        Sentences come from ``self.sent_tokenizer``; each one is then
        word-tokenized by a freshly created ``TreebankWordTokenizer``.

        :param text: text to be tokenized
        :type text: str
        :rtype: list
        """
        sentence_list = self.sent_tokenizer.tokenize(text)
        treebank = TreebankWordTokenizer()
        tokens_per_sentence = treebank.tokenize_sents(sentence_list)
        return [
            token
            for sentence_tokens in tokens_per_sentence
            for token in sentence_tokens
        ]
开发者ID:cltk,项目名称:cltk,代码行数:13,代码来源:word.py

示例8: word_tokenize

# 需要导入模块: from nltk.tokenize import treebank [as 别名]
# 或者: from nltk.tokenize.treebank import TreebankWordTokenizer [as 别名]
def word_tokenize(text):
    """
    Tokenize *text* by delegating to the module-level ``_word_tokenize``
    callable and return its result unchanged.

    The original documentation describes the underlying tokenizer as
    :class:`.TreebankWordTokenizer`, designed to work on one sentence at a time.
    """
    tokens = _word_tokenize(text)
    return tokens
开发者ID:blackye,项目名称:luscan-devel,代码行数:10,代码来源:__init__.py

示例9: word_tokenize

# 需要导入模块: from nltk.tokenize import treebank [as 别名]
# 或者: from nltk.tokenize.treebank import TreebankWordTokenizer [as 别名]
def word_tokenize(text, language='english'):
    """
    Return the word tokens of *text* as produced by ``_treebank_word_tokenize``.

    The whole of *text* is handed to the tokenizer in a single call; no
    sentence splitting is performed here.

    :param text: text to split into word tokens. On Python 3, a value that
        is not already ``str`` (e.g. ``bytes``) is decoded as UTF-8 first.
    :param language: not used by this implementation; kept so the signature
        stays compatible with callers that pass it.
    :return: list of tokens
    """
    # Previously every Python 3 input was decoded, which raised
    # AttributeError for ordinary ``str`` text. Decode only what is not
    # already text; Python 2 input is passed through untouched as before.
    if sys.version_info[0] >= 3 and not isinstance(text, str):
        text = text.decode("UTF-8")
    return list(_treebank_word_tokenize(text))
开发者ID:jinfengr,项目名称:neural-tweet-search,代码行数:17,代码来源:utils.py

示例10: __init__

# 需要导入模块: from nltk.tokenize import treebank [as 别名]
# 或者: from nltk.tokenize.treebank import TreebankWordTokenizer [as 别名]
def __init__(self):
        """Run the parent initializer, then attach a ``TreebankWordTokenizer`` as ``self.tokenizer``."""
        super().__init__()
        treebank_tokenizer = TreebankWordTokenizer()
        self.tokenizer = treebank_tokenizer
开发者ID:asyml,项目名称:forte,代码行数:5,代码来源:nltk_processors.py

示例11: __init__

# 需要导入模块: from nltk.tokenize import treebank [as 别名]
# 或者: from nltk.tokenize.treebank import TreebankWordTokenizer [as 别名]
def __init__(self, language):
        """Validate ``language`` and set up the matching word tokenizer.

        Sets ``self.language``, ``self.available_languages`` and ``self.toker``.
        ``'french'`` is accepted as an alias and rewritten to ``'old_french'``
        (with a logged warning). Languages in the available list that have no
        dedicated branch below fall back to NLTK's ``TreebankWordTokenizer``,
        also with a logged warning.

        :param language: name of the language to tokenize; must be one of
            ``available_languages``.
        :raises AssertionError: if ``language`` is not in ``available_languages``.
        """
        self.language = language
        self.available_languages = ['akkadian',
                                    'arabic',
                                    'french',  # defaults to old_french
                                    'greek',
                                    'latin',
                                    'middle_english',
                                    'middle_french',
                                    'middle_high_german',
                                    'old_french',
                                    'old_norse',
                                    'sanskrit',
                                    'multilingual']

        # Explicit raise instead of an ``assert`` statement so the check is not
        # stripped when Python runs with -O. AssertionError is kept so callers
        # see the same exception type as before.
        if self.language not in self.available_languages:
            raise AssertionError(
                "Specific tokenizer not available for '{0}'. Only available for: '{1}'.".format(
                    self.language,
                    self.available_languages))

        # raise languages-specific warnings
        if self.language == 'french':
            self.language = 'old_french'
            LOG.warning("'french' defaults to 'old_french'. 'middle_french' also available.")  # pylint: disable=line-too-long

        # No separate 'french' branch is needed: it has already been rewritten
        # to 'old_french' above, so such a branch could never be reached.
        if self.language == 'arabic':
            self.toker = BaseArabyWordTokenizer('arabic')
        elif self.language == 'greek':
            self.toker = BasePunktWordTokenizer('greek',
                                                GreekRegexSentenceTokenizer)
        elif self.language == 'latin':
            self.toker = LatinWordTokenizer()
        elif self.language == 'old_norse':
            self.toker = BaseRegexWordTokenizer('old_norse',
                                                OldNorseTokenizerPatterns)
        elif self.language == 'middle_english':
            self.toker = BaseRegexWordTokenizer('middle_english',
                                                MiddleEnglishTokenizerPatterns)
        elif self.language in ('middle_french', 'old_french'):
            # Both names are configured with the Old French patterns.
            self.toker = BaseRegexWordTokenizer('old_french',
                                                OldFrenchTokenizerPatterns)
        elif self.language == 'middle_high_german':
            self.toker = BaseRegexWordTokenizer('middle_high_german',
                                                MiddleHighGermanTokenizerPatterns)
        else:
            LOG.warning("Falling back to default tokenizer, the NLTK's `TreebankWordTokenizer()`.")
            self.toker = TreebankWordTokenizer()
开发者ID:cltk,项目名称:cltk,代码行数:57,代码来源:word.py

示例12: demo_liu_hu_lexicon

# 需要导入模块: from nltk.tokenize import treebank [as 别名]
# 或者: from nltk.tokenize.treebank import TreebankWordTokenizer [as 别名]
def demo_liu_hu_lexicon(sentence, plot=False):
    """
    Basic example of sentiment classification using Liu and Hu opinion lexicon.
    This function simply counts the number of positive, negative and neutral words
    in the sentence and classifies it depending on which polarity is more represented.
    Words that do not appear in the lexicon are considered as neutral.

    The verdict ('Positive', 'Negative' or 'Neutral') is printed; nothing is returned.

    :param sentence: a sentence whose polarity has to be classified.
    :param plot: if True, plot a visual representation of the sentence polarity.
    """
    from nltk.corpus import opinion_lexicon
    from nltk.tokenize import treebank

    tokenizer = treebank.TreebankWordTokenizer()
    tokenized_sent = [word.lower() for word in tokenizer.tokenize(sentence)]

    # Fetch each word list once and keep it as a set, instead of calling the
    # lexicon accessors again for every token inside the loop.
    positive_lexicon = set(opinion_lexicon.positive())
    negative_lexicon = set(opinion_lexicon.negative())

    pos_words = 0
    neg_words = 0
    x = list(range(len(tokenized_sent)))  # x axis for the plot
    y = []  # one polarity value per token: 1, -1 or 0

    for word in tokenized_sent:
        # Positive is tested first, so a word present in both lists counts as positive.
        if word in positive_lexicon:
            pos_words += 1
            y.append(1)  # positive
        elif word in negative_lexicon:
            neg_words += 1
            y.append(-1)  # negative
        else:
            y.append(0)  # neutral

    if pos_words > neg_words:
        print('Positive')
    elif pos_words < neg_words:
        print('Negative')
    else:
        print('Neutral')

    if plot:
        _show_plot(
            x, y, x_labels=tokenized_sent, y_labels=['Negative', 'Neutral', 'Positive']
        )
开发者ID:V1EngineeringInc,项目名称:V1EngineeringInc-Docs,代码行数:44,代码来源:util.py


注:本文中的nltk.tokenize.treebank.TreebankWordTokenizer方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。