This article collects typical usage examples of the nltk.download method in Python. If you have been wondering what nltk.download does in practice, how to call it, or what real-world code that uses it looks like, the hand-picked examples below should help. You can also explore further usage examples from the enclosing module, nltk.
The following 15 code examples of nltk.download are shown, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code samples.
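Before the individual examples, it may help to see the pattern most of them share: probe for a resource with nltk.data.find or nltk.data.load and call nltk.download only when that raises LookupError. A minimal sketch of that pattern (the helper name ensure_nltk_resource and the choice of the punkt package are just for illustration):

import nltk

def ensure_nltk_resource(resource_path, package_id):
    # Look for the resource locally first; download it only if the lookup fails.
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(package_id, quiet=True)

ensure_nltk_resource('tokenizers/punkt', 'punkt')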
Example 1: __init__
# Required import: import nltk [as alias]
# Or: from nltk import download [as alias]
def __init__(self, hyperparameters, lookup_table):
"""Constructor for initializing ASAP-AES datasets.
Args:
- hyperparameters: hyperparameters of the experiments.
- lookup_table: word embedding lookup table, which should be a dict
mapping words into their NumPy vector repre-
sentation.
"""
# This constructor tries to detect or download NLTK's tokenizer
# automatically.
try:
self.s_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
except LookupError:
nltk.download("punkt")
self.s_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# Also load hyperparameters and lookup table.
self.lookup_table = lookup_table
self.hp = hyperparameters
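For context, the Punkt tokenizer loaded by this constructor is normally used to split an essay into sentences; a small usage sketch (assuming the punkt package is already downloaded; the sample text is made up):

import nltk

tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
sentences = tokenizer.tokenize("This is the first sentence. This is the second one.")
# -> ['This is the first sentence.', 'This is the second one.']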
Example 2: __init__
# Required import: import nltk [as alias]
# Or: from nltk import download [as alias]
def __init__(
self,
anonymize=True,
trim_window=5,
lowercase=True,
drop_stopwords=True,
stem=True,
ngram_range=(1, 3),
**vectorizer_kwargs,
):
self.anonymize = anonymize
self.lowercase = lowercase
self.drop_stopwords = drop_stopwords
if drop_stopwords:
nltk.download("stopwords")
self.stopwords = set(nltk.corpus.stopwords.words("english"))
self.trim_window = trim_window
self.stem = stem
if stem:
self.porter = nltk.PorterStemmer()
self.vectorizer = CountVectorizer(
ngram_range=ngram_range, binary=True, **vectorizer_kwargs
)
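How these fields are applied later is not shown in this excerpt; a rough sketch of the usual pipeline built from the same pieces (the preprocess function below is an illustration, not part of the original class):

import nltk

nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)

stop_words = set(nltk.corpus.stopwords.words('english'))
porter = nltk.PorterStemmer()

def preprocess(text):
    # Lowercase, tokenize, drop stopwords, then stem the remaining tokens.
    tokens = nltk.word_tokenize(text.lower())
    return [porter.stem(tok) for tok in tokens if tok not in stop_words]

print(preprocess("The patients were transferred to the ward"))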
Example 3: install_deps
# Required import: import nltk [as alias]
# Or: from nltk import download [as alias]
def install_deps(*plugins):
installed = False
nltk_resources = set()
requirements = []
for info in plugins:
        # Accumulate requirements across all plugins instead of overwriting them each iteration.
        plugin_requirements = info.get('requirements', [])
        if plugin_requirements:
            plugin_requirements += missing_requirements(plugin_requirements)
            requirements += plugin_requirements
nltk_resources |= set(info.get('nltk_resources', []))
if requirements:
logger.info('Installing requirements: ' + str(requirements))
pip_args = [sys.executable, '-m', 'pip', 'install']
for req in requirements:
pip_args.append(req)
process = subprocess.Popen(
pip_args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
_log_subprocess_output(process)
exitcode = process.wait()
installed = True
if exitcode != 0:
raise models.Error(
"Dependencies not properly installed: {}".format(pip_args))
installed |= download(list(nltk_resources))
return installed
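The download helper called on the last line is not part of this excerpt; a plausible sketch, assuming it simply forwards the collected resource names to nltk.download and reports whether anything was requested:

import nltk

def download(resources):
    # Hypothetical helper: fetch each NLTK resource; return True if any download was attempted.
    if not resources:
        return False
    for resource in resources:
        nltk.download(resource, quiet=True)
    return True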
Example 4: setUpClass
# Required import: import nltk [as alias]
# Or: from nltk import download [as alias]
def setUpClass(self):
try:
corpus_importer = CorpusImporter("latin")
corpus_importer.import_corpus("latin_models_cltk")
corpus_importer.import_corpus("latin_text_latin_library")
    except Exception:
raise Exception("Failure to download test corpus")
self.reader = get_corpus_reader(
language="latin", corpus_name="latin_text_latin_library"
)
self.reader._fileids = ["pervig.txt"]
    # Need an additional instance because the tests below change internals  # TODO: fix
self.reader_2 = get_corpus_reader(
language="latin", corpus_name="latin_text_latin_library"
)
self.reader_3 = get_corpus_reader(
language="latin", corpus_name="latin_text_latin_library"
)
self.reader_4 = get_corpus_reader(
language="latin", corpus_name="latin_text_latin_library"
)
Example 5: __init__
# Required import: import nltk [as alias]
# Or: from nltk import download [as alias]
def __init__(self, *args, **kwargs):
if 'tokenize' in kwargs:
raise TypeError('``TreebankEncoder`` does not take keyword argument ``tokenize``.')
if 'detokenize' in kwargs:
raise TypeError('``TreebankEncoder`` does not take keyword argument ``detokenize``.')
try:
import nltk
# Required for moses
nltk.download('perluniprops')
nltk.download('nonbreaking_prefixes')
from nltk.tokenize.treebank import TreebankWordTokenizer
from nltk.tokenize.treebank import TreebankWordDetokenizer
except ImportError:
print("Please install NLTK. " "See the docs at http://nltk.org for more information.")
raise
super().__init__(
*args,
tokenize=TreebankWordTokenizer().tokenize,
detokenize=TreebankWordDetokenizer().detokenize,
**kwargs)
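For reference, the two NLTK classes wired into the encoder above behave roughly as follows (standard NLTK API; the sample sentence is made up):

from nltk.tokenize.treebank import TreebankWordTokenizer, TreebankWordDetokenizer

tokens = TreebankWordTokenizer().tokenize("They'll arrive soon, won't they?")
# e.g. ['They', "'ll", 'arrive', 'soon', ',', 'wo', "n't", 'they', '?']
text = TreebankWordDetokenizer().detokenize(tokens)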
Example 6: print_corpus_download_warning
# Required import: import nltk [as alias]
# Or: from nltk import download [as alias]
def print_corpus_download_warning():
corpus_warning = """
Hmm...
---------------------
    We had some trouble downloading the NLTK corpora.
    Try running the following from a command line. This should
    download the needed packages, but it might also tell you if
there is another issue.
$ python3 -m nltk.downloader punkt averaged_perceptron_tagger
"""
logger.warning(corpus_warning)
# Helpers
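The shell command in the warning can also be issued from Python; recent NLTK versions let nltk.download take a list of package identifiers, so the programmatic equivalent is roughly:

import nltk

# Fetch both packages mentioned in the warning above.
nltk.download(['punkt', 'averaged_perceptron_tagger'])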
Example 7: get_sentence_tokenizer
# Required import: import nltk [as alias]
# Or: from nltk import download [as alias]
def get_sentence_tokenizer():
"""
Loads the nltk sentence tokenizer.
"""
try:
import nltk
except ImportError:
raise ImportError('Please install nltk (e.g. pip install nltk).')
# nltk-specific setup
st_path = 'tokenizers/punkt/{0}.pickle'.format('english')
try:
sent_tok = nltk.data.load(st_path)
except LookupError:
nltk.download('punkt')
sent_tok = nltk.data.load(st_path)
return sent_tok
Example 8: __init__
# Required import: import nltk [as alias]
# Or: from nltk import download [as alias]
def __init__(self, skip_download_check: bool = False, spacy_model="en_core_web_sm"):
try:
from nltk.corpus import wordnet
import nltk
except ImportError:
raise ImportError(
"WordNet-based data augmentation requires nltk to be installed."
)
self.wn = wordnet
try:
import spacy
from spacy.tokens import Token
except ImportError:
raise ImportError(
"WordNet-based data augmentation requires spaCy and a language "
"model to be installed (for part of speech tagging)."
)
if not skip_download_check:
nltk.download("wordnet")
self.nlp = spacy.load(spacy_model, parser=False, tagger=True, entity=False)
Token.set_extension("replacement", default=None, force=True)
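Once the wordnet corpus is available, synonym lookups for the augmentation step typically go through the standard corpus reader; a small usage sketch (the word 'quick' is only an example):

from nltk.corpus import wordnet

synonyms = {lemma.name() for synset in wordnet.synsets('quick') for lemma in synset.lemmas()}
# e.g. {'quick', 'speedy', 'fast', 'agile', ...}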
Example 9: download
# Required import: import nltk [as alias]
# Or: from nltk import download [as alias]
def download():
'''
Install required libraries.
Note this library will install nltk dependencies into your
user directory.
'''
click.echo("Installing nltk packages into your user directories in " +
"the following order of existence (first found):\n" +
'\n'.join(nltk.data.path))
extensions = [("taggers", "averaged_perceptron_tagger"),
("corpora", "wordnet"),
("tokenizers", "punkt")]
missing = check_packages_exist(extensions)
for ext_tuple in missing:
nltk.download(ext_tuple[1])
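check_packages_exist is not shown in this excerpt; a plausible sketch, assuming it probes nltk.data.find for each (category, package) pair and returns the pairs that are missing:

import nltk

def check_packages_exist(extensions):
    # Hypothetical helper: return the (category, package) tuples that are not installed yet.
    missing = []
    for category, package in extensions:
        try:
            nltk.data.find('{}/{}'.format(category, package))
        except LookupError:
            missing.append((category, package))
    return missing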
Example 10: get_sentence_tokenizer
# Required import: import nltk [as alias]
# Or: from nltk import download [as alias]
def get_sentence_tokenizer():
"""
Loads the nltk sentence tokenizer
"""
try:
import nltk
except ImportError:
raise ImportError('Please install nltk (e.g. pip install nltk).')
# nltk-specific setup
st_path = 'tokenizers/punkt/{0}.pickle'.format('english')
try:
sent_tok = nltk.data.load(st_path)
except LookupError:
nltk.download('punkt')
sent_tok = nltk.data.load(st_path)
return sent_tok
Example 11: get
# Required import: import nltk [as alias]
# Or: from nltk import download [as alias]
def get(self, lang):
if lang not in self._spacys:
import spacy
        # Hack to dynamically download language models on cluster machines;
        # you can remove this if you have the models installed and just do:
        # self._spacys[lang] = spacy.load(lang)
try:
old_exit = sys.exit
sys.exit = None
try:
self._spacys[lang] = spacy.load(lang)
except Exception:
spacy.cli.download(lang)
self._spacys[lang] = spacy.load(lang)
except Exception as e:
raise Exception(
"Failed to find or download language {0}: {1}"
.format(lang, e))
finally:
sys.exit = old_exit
return self._spacys[lang]
Example 12: test_token_removal_filter
# Required import: import nltk [as alias]
# Or: from nltk import download [as alias]
def test_token_removal_filter():
stim = TextStim(text='this is not a very long sentence')
filt = TokenRemovalFilter()
assert filt.transform(stim).text == 'long sentence'
filt2 = TokenRemovalFilter(tokens=['a', 'the', 'is'])
assert filt2.transform(stim).text == 'this not very long sentence'
stim2 = TextStim(text='More. is Real, sentence that\'ll work')
try:
nltk.data.find('corpora/stopwords')
except LookupError:
nltk.download('stopwords')
from nltk.corpus import stopwords
tokens = set(stopwords.words('english')) | set(string.punctuation)
filt3 = TokenRemovalFilter(tokens=tokens)
assert filt3.transform(stim2).text == 'More Real sentence \'ll work'
Example 13: nltk_download
# Required import: import nltk [as alias]
# Or: from nltk import download [as alias]
def nltk_download(name, ignore_errors=True):
r"""Like nltk.download, but be quiet about it, and get a room (separate python process)
Does some simple whitespace normalization on `name`, but doesn't yet do fuzzy matching
Caches the normalized names of packages already attempted, so they aren't re-tried
>>> nltk_download('nonexistent dataset name', ignore_errors=True)
False
>>> nltk_download('WordNet', ignore_errors=True)
True
>>> nltk_download('wordnet', ignore_errors=True)
True
"""
name = re.sub(r"[-\s=+']+", '_', name.lower())
if name in nltk_download.done:
return nltk_download.done[name]
proc = subprocess.Popen(["python", "-c", "import nltk; nltk.download('{}')".format(name)], stdout=subprocess.PIPE)
msgs = [s for s in proc.communicate() if s is not None]
if any(re.match(r'^\[nltk_data\]\s+Error', msg, flags=re.IGNORECASE) for msg in msgs):
nltk_download.done[name] = False
if ignore_errors:
return nltk_download.done[name]
raise ValueError('Unable to download the requested NLTK dataset: {}'.format('\n'.join(msgs)))
nltk_download.done[name] = True
return nltk_download.done[name]
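Because results are cached on the function object itself, nltk_download.done has to exist before the first call; in the source this is presumably a one-line initialization placed right after the definition:

# Assumed to follow the function definition in the original module.
nltk_download.done = {}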
Example 14: __init__
# Required import: import nltk [as alias]
# Or: from nltk import download [as alias]
def __init__(self, ngrams: Union[int, List[int]] = 1,
exclude_stopwords: bool = False,
stop_words: Optional[List] = None) -> None:
""" Initialize the NGramsTokenizer
Parameters
----------
ngrams : Union[int, List[int]], optional
[description], by default 1
exclude_stopwords: bool
[description], by default False
stop_words: Optional[List]
[description], by default None
"""
self.ngrams = ngrams
self.exclude_stopwords = exclude_stopwords
if self.exclude_stopwords:
self.stop_words = stop_words
if self.stop_words is None:
nltk.download('stopwords', quiet=True)
self.stop_words = stopwords.words('english')
nltk.download('punkt', quiet=True)
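The tokenization step itself is outside this excerpt; a minimal sketch of how the configured pieces are commonly combined with nltk.word_tokenize and nltk.ngrams (the ngram_tokenize function below is an illustration, not the original implementation):

import nltk
from nltk import ngrams
from nltk.corpus import stopwords

nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)

def ngram_tokenize(text, n_values=(1, 2), stop_words=None):
    # Tokenize, optionally drop stopwords, then emit space-joined n-grams for each n.
    tokens = nltk.word_tokenize(text)
    if stop_words:
        tokens = [tok for tok in tokens if tok not in stop_words]
    return [' '.join(gram) for n in n_values for gram in ngrams(tokens, n)]

print(ngram_tokenize("natural language processing is fun", stop_words=stopwords.words('english')))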
Example 15: get_only_text_washingtonpost_url
# Required import: import nltk [as alias]
# Or: from nltk import download [as alias]
def get_only_text_washingtonpost_url(url):
    # This function takes a URL as an argument and returns only the raw
    # text of that page. It works specifically for Washington Post
    # articles, because we know the structure of those pages.
page = urllib.urlopen(url).read().decode('utf8')
# we download the URL
soup = BeautifulSoup(page)
# initialize a beautifulsoup object with the page we downloaded
text = ' '.join(map(lambda p: p.text, soup.find_all('article')))
    # the above gets everything between a pair of HTML tags
    # that look a certain way, e.g. <article> stuff </article>;
    # this format is specific to the Washington Post
soup2 = BeautifulSoup(text)
    # find all the paragraph tags <p>
text = ' '.join(map(lambda p: p.text, soup2.find_all('p')))
return soup.title.text, text
#######################################################################
# TEST
######################################################################
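Note that urllib.urlopen is the Python 2 API; under Python 3 the fetch would go through urllib.request.urlopen instead. A hedged usage sketch of the function above (the URL is a placeholder, and BeautifulSoup must be installed):

# Python 3 equivalent of the fetch inside the function:
#   import urllib.request
#   page = urllib.request.urlopen(url).read().decode('utf8')

# Calling the helper on an article page (placeholder URL):
# title, body = get_only_text_washingtonpost_url('https://www.washingtonpost.com/<article-path>')
# print(title)
# print(body[:200])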