

Python nltk.download Method Code Examples

This article collects typical code examples of the nltk.download method in Python. If you are wondering what nltk.download does, how to call it, or what real-world usage looks like, the curated examples below may help. You can also explore further usage examples from the nltk package that this method belongs to.


The following presents 15 code examples of the nltk.download method, ordered by popularity by default.
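Before looking at the individual examples, here is a minimal sketch (not taken from any of the projects below) of the pattern most of them share: try to locate an NLTK resource first, and call nltk.download only when a LookupError signals that the package is missing. The resource and package names used here ('punkt', 'stopwords') are just common examples.

import nltk

def ensure_nltk_resource(resource_path, package_name):
    """Download an NLTK data package only if it cannot be found locally."""
    try:
        nltk.data.find(resource_path)            # e.g. 'tokenizers/punkt'
    except LookupError:
        nltk.download(package_name, quiet=True)  # e.g. 'punkt'

ensure_nltk_resource('tokenizers/punkt', 'punkt')
ensure_nltk_resource('corpora/stopwords', 'stopwords')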

Example 1: __init__

# Required import: import nltk [as alias]
# Or: from nltk import download [as alias]
def __init__(self, hyperparameters, lookup_table):
        """Constructor for initializing ASAP-AES datasets.

        Args:
            - hyperparameters: hyperparameters of the experiments.
            - lookup_table: word embedding lookup table, which should be a dict
                            mapping words to their NumPy vector representations.
        """
        # This constructor tries to detect or download NLTK's tokenizer
        # automatically.
        try:
            self.s_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        except LookupError:
            nltk.download("punkt")
            self.s_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        # Also load hyperparameters and lookup table.
        self.lookup_table = lookup_table
        self.hp = hyperparameters 
Developer: zlliang, Project: essaysense, Lines of code: 21, Source: utils.py

Example 2: __init__

# Required import: import nltk [as alias]
# Or: from nltk import download [as alias]
def __init__(
        self,
        anonymize=True,
        trim_window=5,
        lowercase=True,
        drop_stopwords=True,
        stem=True,
        ngram_range=(1, 3),
        **vectorizer_kwargs,
    ):
        self.anonymize = anonymize
        self.lowercase = lowercase
        self.drop_stopwords = drop_stopwords
        if drop_stopwords:
            nltk.download("stopwords")
            self.stopwords = set(nltk.corpus.stopwords.words("english"))
        self.trim_window = trim_window
        self.stem = stem
        if stem:
            self.porter = nltk.PorterStemmer()

        self.vectorizer = CountVectorizer(
            ngram_range=ngram_range, binary=True, **vectorizer_kwargs
        ) 
Developer: HazyResearch, Project: metal, Lines of code: 26, Source: ngram_featurizer.py

Example 3: install_deps

# Required import: import nltk [as alias]
# Or: from nltk import download [as alias]
def install_deps(*plugins):
    installed = False
    nltk_resources = set()
    requirements = []
    for info in plugins:
        requirements = info.get('requirements', [])
        if requirements:
            requirements += missing_requirements(requirements)
        nltk_resources |= set(info.get('nltk_resources', []))
    if requirements:
        logger.info('Installing requirements: ' + str(requirements))
        pip_args = [sys.executable, '-m', 'pip', 'install']
        for req in requirements:
            pip_args.append(req)
        process = subprocess.Popen(
            pip_args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        _log_subprocess_output(process)
        exitcode = process.wait()
        installed = True
        if exitcode != 0:
            raise models.Error(
                "Dependencies not properly installed: {}".format(pip_args))
    installed |= download(list(nltk_resources))
    return installed 
Developer: gsi-upm, Project: senpy, Lines of code: 26, Source: __init__.py

Example 4: setUpClass

# Required import: import nltk [as alias]
# Or: from nltk import download [as alias]
def setUpClass(self):
        try:
            corpus_importer = CorpusImporter("latin")
            corpus_importer.import_corpus("latin_models_cltk")
            corpus_importer.import_corpus("latin_text_latin_library")
        except:
            raise Exception("Failure to download test corpus")
        self.reader = get_corpus_reader(
            language="latin", corpus_name="latin_text_latin_library"
        )
        self.reader._fileids = ["pervig.txt"]
        # Need an additional instance because the tests below change internals.  # TODO: fix
        self.reader_2 = get_corpus_reader(
            language="latin", corpus_name="latin_text_latin_library"
        )
        self.reader_3 = get_corpus_reader(
            language="latin", corpus_name="latin_text_latin_library"
        )
        self.reader_4 = get_corpus_reader(
            language="latin", corpus_name="latin_text_latin_library"
        ) 
Developer: cltk, Project: cltk, Lines of code: 23, Source: test_corpus.py

Example 5: __init__

# Required import: import nltk [as alias]
# Or: from nltk import download [as alias]
def __init__(self, *args, **kwargs):
        if 'tokenize' in kwargs:
            raise TypeError('``TreebankEncoder`` does not take keyword argument ``tokenize``.')

        if 'detokenize' in kwargs:
            raise TypeError('``TreebankEncoder`` does not take keyword argument ``detokenize``.')

        try:
            import nltk

            # Required for moses
            nltk.download('perluniprops')
            nltk.download('nonbreaking_prefixes')

            from nltk.tokenize.treebank import TreebankWordTokenizer
            from nltk.tokenize.treebank import TreebankWordDetokenizer
        except ImportError:
            print("Please install NLTK. " "See the docs at http://nltk.org for more information.")
            raise

        super().__init__(
            *args,
            tokenize=TreebankWordTokenizer().tokenize,
            detokenize=TreebankWordDetokenizer().detokenize,
            **kwargs) 
Developer: PetrochukM, Project: PyTorch-NLP, Lines of code: 27, Source: treebank_encoder.py

Example 6: print_corpus_download_warning

# Required import: import nltk [as alias]
# Or: from nltk import download [as alias]
def print_corpus_download_warning():
    corpus_warning = """
    Hmm...
    ---------------------

    We had some trouble downloading the NLTK corpora.
    Try running the following from a command line. This should
    download the needed packages, but it might also tell you if
    there is another issue.

    $ python3 -m nltk.downloader punkt averaged_perceptron_tagger
    """
    logger.warning(corpus_warning)


# Helpers 
Developer: korymath, Project: talk-generator, Lines of code: 18, Source: language_util.py
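The warning above points the user to the nltk.downloader command-line interface. As a rough equivalent (an assumption, not part of the talk-generator code), the same packages can also be fetched programmatically, which is the pattern most of the other examples in this article use:

import nltk

# Programmatic counterpart of:
#   $ python3 -m nltk.downloader punkt averaged_perceptron_tagger
for package in ("punkt", "averaged_perceptron_tagger"):
    nltk.download(package)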

Example 7: get_sentence_tokenizer

# Required import: import nltk [as alias]
# Or: from nltk import download [as alias]
def get_sentence_tokenizer():
    """
    Loads the nltk sentence tokenizer.
    """
    try:
        import nltk
    except ImportError:
        raise ImportError('Please install nltk (e.g. pip install nltk).')
    # nltk-specific setup
    st_path = 'tokenizers/punkt/{0}.pickle'.format('english')
    try:
        sent_tok = nltk.data.load(st_path)
    except LookupError:
        nltk.download('punkt')
        sent_tok = nltk.data.load(st_path)
    return sent_tok 
Developer: facebookresearch, Project: ParlAI, Lines of code: 18, Source: agents.py
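For context, the object returned by get_sentence_tokenizer is NLTK's Punkt sentence tokenizer, so a typical call (illustrative, not part of the ParlAI source) would look like:

sent_tok = get_sentence_tokenizer()
# Splits raw text into a list of sentences.
print(sent_tok.tokenize("The punkt model was missing. It was downloaded on demand."))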

Example 8: __init__

# Required import: import nltk [as alias]
# Or: from nltk import download [as alias]
def __init__(self, skip_download_check: bool = False, spacy_model="en_core_web_sm"):
        try:
            from nltk.corpus import wordnet
            import nltk
        except ImportError:
            raise ImportError(
                "WordNet-based data augmentation requires nltk to be installed."
            )

        self.wn = wordnet

        try:
            import spacy
            from spacy.tokens import Token
        except ImportError:
            raise ImportError(
                "WordNet-based data augmentation requires spaCy and a language "
                "model to be installed (for part of speech tagging)."
            )

        if not skip_download_check:
            nltk.download("wordnet")

        self.nlp = spacy.load(spacy_model, parser=False, tagger=True, entity=False)
        Token.set_extension("replacement", default=None, force=True) 
Developer: RTIInternational, Project: gobbli, Lines of code: 27, Source: wordnet.py
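As a brief aside on why the wordnet package is downloaded here: the WordNet corpus provides the synonym candidates used for augmentation. The lookup below is a minimal sketch of assumed usage, independent of the gobbli class above.

import nltk

nltk.download("wordnet", quiet=True)
from nltk.corpus import wordnet

# Gather lemma names across all synsets of a word as candidate replacements.
synonyms = {lemma.name() for syn in wordnet.synsets("quick") for lemma in syn.lemmas()}
print(synonyms)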

Example 9: download

# Required import: import nltk [as alias]
# Or: from nltk import download [as alias]
def download():
    '''
    Install required libraries.
    Note this library will install nltk dependencies into your
    user directory.
    '''

    click.echo("Installing nltk packages into your user directories in " +
               "the following order of existence (first found):\n" +
               '\n'.join(nltk.data.path))

    extensions = [("taggers", "averaged_perceptron_tagger"),
                  ("corpora", "wordnet"),
                  ("tokenizers", "punkt")]

    missing = check_packages_exist(extensions)

    for ext_tuple in missing:
        nltk.download(ext_tuple[1]) 
Developer: learntextvis, Project: textkit, Lines of code: 21, Source: download.py

Example 10: get_sentence_tokenizer

# Required import: import nltk [as alias]
# Or: from nltk import download [as alias]
def get_sentence_tokenizer():
    """
    Loads the nltk sentence tokenizer
    """
    try:
        import nltk
    except ImportError:
        raise ImportError('Please install nltk (e.g. pip install nltk).')
    # nltk-specific setup
    st_path = 'tokenizers/punkt/{0}.pickle'.format('english')
    try:
        sent_tok = nltk.data.load(st_path)
    except LookupError:
        nltk.download('punkt')
        sent_tok = nltk.data.load(st_path)
    return sent_tok 
Developer: natashamjaques, Project: neural_chat, Lines of code: 18, Source: agents.py

Example 11: get

# Required import: import nltk [as alias]
# Or: from nltk import download [as alias]
def get(self, lang):
        if lang not in self._spacys:
            import spacy
            # Hack to dynamically download languages on cluster machines,
            # you can remove if you have the models installed and just do:
            # cls._spacys[lang] = spacy.load(lang)
            try:
                old_exit = sys.exit
                sys.exit = None
                try:
                    self._spacys[lang] = spacy.load(lang)
                except Exception:
                    spacy.cli.download(lang)
                    self._spacys[lang] = spacy.load(lang)
            except Exception as e:
                raise Exception(
                    "Failed to find or download language {0}: {1}"
                    .format(lang, e))
            finally:
                sys.exit = old_exit

        return self._spacys[lang] 
Developer: sparklingpandas, Project: sparklingml, Lines of code: 24, Source: transformation_functions.py

Example 12: test_token_removal_filter

# Required import: import nltk [as alias]
# Or: from nltk import download [as alias]
def test_token_removal_filter():
    stim = TextStim(text='this is not a very long sentence')
    filt = TokenRemovalFilter()
    assert filt.transform(stim).text == 'long sentence'

    filt2 = TokenRemovalFilter(tokens=['a', 'the', 'is'])
    assert filt2.transform(stim).text == 'this not very long sentence'

    stim2 = TextStim(text='More. is Real, sentence that\'ll work')
    try:
        nltk.data.find('corpora/stopwords')
    except LookupError:
        nltk.download('stopwords')
    from nltk.corpus import stopwords
    tokens = set(stopwords.words('english')) | set(string.punctuation)
    filt3 = TokenRemovalFilter(tokens=tokens)
    assert filt3.transform(stim2).text == 'More Real sentence \'ll work' 
Developer: tyarkoni, Project: pliers, Lines of code: 19, Source: test_text_filters.py

Example 13: nltk_download

# Required import: import nltk [as alias]
# Or: from nltk import download [as alias]
def nltk_download(name, ignore_errors=True):
    r"""Like nltk.download, but be quiet about it, and get a room (separate python process)

    Does some simple whitespace normalization on `name`, but doesn't yet do fuzzy matching
    Caches the normalized names of packages already attempted, so they aren't re-tried

    >>> nltk_download('nonexistent dataset name', ignore_errors=True)
    False
    >>> nltk_download('WordNet', ignore_errors=True)
    True
    >>> nltk_download('wordnet', ignore_errors=True)
    True
    """
    name = re.sub(r"[-\s=+']+", '_', name.lower())
    if name in nltk_download.done:
        return nltk_download.done[name]
    proc = subprocess.Popen(["python", "-c", "import nltk; nltk.download('{}')".format(name)], stdout=subprocess.PIPE)
    msgs = [s for s in proc.communicate() if s is not None]
    if any(re.match(r'^\[nltk_data\]\s+Error', msg, flags=re.IGNORECASE) for msg in msgs):
        nltk_download.done[name] = False
        if ignore_errors:
            return nltk_download.done[name]
        raise ValueError('Unable to download the requested NLTK dataset: {}'.format('\n'.join(msgs)))
    nltk_download.done[name] = True
    return nltk_download.done[name] 
Developer: totalgood, Project: twip, Lines of code: 27, Source: nlp.py

Example 14: __init__

# Required import: import nltk [as alias]
# Or: from nltk import download [as alias]
def __init__(self, ngrams: Union[int, List[int]] = 1,
                 exclude_stopwords: bool = False,
                 stop_words: Optional[List] = None) -> None:
        """ Initialize the NGramsTokenizer

        Parameters
        ----------
        ngrams : Union[int, List[int]], optional
            The n-gram size (or list of sizes) to tokenize into, by default 1
        exclude_stopwords: bool
            Whether to drop stopwords from the tokens, by default False
        stop_words: Optional[List]
            Custom list of stopwords to exclude; if None, NLTK's English
            stopword list is downloaded and used, by default None

        """
        self.ngrams = ngrams
        self.exclude_stopwords = exclude_stopwords

        if self.exclude_stopwords:
            self.stop_words = stop_words
            if self.stop_words is None:
                nltk.download('stopwords', quiet=True)
                self.stop_words = stopwords.words('english')

        nltk.download('punkt', quiet=True) 
Developer: asappresearch, Project: flambe, Lines of code: 27, Source: word.py

Example 15: get_only_text_washingtonpost_url

# Required import: import nltk [as alias]
# Or: from nltk import download [as alias]
def get_only_text_washingtonpost_url(url):
    # this func will take the URL as an argument and return only
    # the raw text of the url.
    # this function works specifically for the washPost articles
    # because we know the structure of the pages
    page = urllib.urlopen(url).read().decode('utf8')
    # we download the URL
    soup = BeautifulSoup(page)
    # initialize a beautifulsoup object with the page we downloaded
    text = ' '.join(map(lambda p: p.text, soup.find_all('article')))
    # the above gets everything between a pair of HTML tags
    # that look a certain way e.g. <article> stuff</article>
    # the above format is specific to the washington post
    soup2 = BeautifulSoup(text)
    # find all the paragraph tags <p>
    text = ' '.join(map(lambda p: p.text, soup2.find_all('p')))
    return soup.title.text, text

#######################################################################

# TEST
###################################################################### 
Developer: qalhata, Project: Python-Scripts-Repo-on-Data-Science, Lines of code: 24, Source: NewsAutosummarize.py


Note: The nltk.download method examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets are drawn from open-source projects contributed by many developers, and copyright of the source code remains with the original authors. Please refer to the corresponding project's license before distributing or using the code; do not reproduce without permission.