This article collects typical usage examples of the nltk.download function in Python. If you have been wondering how nltk.download is used, what it is for, or what calling it looks like in real code, the hand-picked examples below should help.
A total of 15 code examples of the download function are shown, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
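Before diving into the examples, here is a minimal sketch of the most common ways to call nltk.download. The resource names ('punkt', 'stopwords') and the download directory are illustrative assumptions, not values taken from the examples below.

import nltk

# Download a single resource by name; the call is skipped if the resource
# is already installed and up to date.
nltk.download('punkt')

# Scripted, quiet download into a custom directory (illustrative path).
nltk.download('stopwords', quiet=True, download_dir='/tmp/nltk_data')

# Tell NLTK to also search that custom directory at lookup time.
nltk.data.path.append('/tmp/nltk_data')

# Called with no arguments, nltk.download() opens the interactive downloader
# (a Tk GUI when available, otherwise a text menu), as several examples below do.
# nltk.download()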
Example 1: compute_emb
def compute_emb(pages_path_in, pages_path_out, vocab):

    wemb = pkl.load(open(prm.wordemb_path, 'rb'))
    dim_emb = wemb[wemb.keys()[0]].shape[0]

    W = 0.01 * np.random.randn(len(vocab), dim_emb).astype(np.float32)
    for word, pos in vocab.items():
        if word in wemb:
            W[pos, :] = wemb[word]

    f = h5py.File(pages_path_in, 'r')

    if prm.att_doc and prm.att_segment_type == 'sentence':
        nltk.download('punkt')
        tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    os.remove(pages_path_out) if os.path.exists(pages_path_out) else None

    # Save to HDF5
    fout = h5py.File(pages_path_out, 'a')

    if prm.att_doc:
        shape = (f['text'].shape[0], prm.max_segs_doc, prm.dim_emb)
    else:
        shape = (f['text'].shape[0], prm.dim_emb)

    embs = fout.create_dataset('emb', shape=shape, dtype=np.float32)
    mask = fout.create_dataset('mask', shape=(f['text'].shape[0],), dtype=np.float32)

    i = 0
    for text in f['text']:
        st = time.time()

        if prm.att_doc:
            if prm.att_segment_type == 'section':
                segs = ['']
                for line in text.split('\n'):
                    if line.strip().startswith('==') and line.strip().endswith('=='):
                        segs.append('')
                    segs[-1] += line + '\n'
            elif prm.att_segment_type == 'sentence':
                segs = tokenizer.tokenize(text.decode('ascii', 'ignore'))
            else:
                raise ValueError('Not a valid value for the attention segment type (att_segment_type) parameter.')

            segs = segs[:prm.max_segs_doc]
            emb_ = utils.Word2Vec_encode(segs, wemb)
            embs[i, :len(emb_), :] = emb_
            mask[i] = len(emb_)
        else:
            bow0, bow1 = utils.BOW(wordpunct_tokenize(text.lower()), vocab)
            emb = (W[bow0] * bow1[:, None]).sum(0)
            embs[i, :] = emb
        i += 1
        #if i > 3000:
        #    break
        print 'processing article', i, 'time', time.time() - st

    f.close()
    fout.close()
Example 2: _post_install
def _post_install():
    from importlib import reload
    import site
    reload(site)
    import nltk
    nltk.download('punkt')
Example 3: installNLTKResources
def installNLTKResources():
    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    try:
        nltk.data.find('corpora/wordnet')
    except LookupError:
        nltk.download('wordnet')
    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    try:
        nltk.data.find('corpora/stopwords')
    except LookupError:
        nltk.download('stopwords')
    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    try:
        nltk.data.find('corpora/europarl_raw')
    except LookupError:
        nltk.download('europarl_raw')
    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        nltk.download('punkt')
    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    try:
        nltk.data.find('corpora/gutenberg')
    except LookupError:
        nltk.download('gutenberg')
    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###

    return( None )
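The lookup-then-download pattern repeated in installNLTKResources can be factored into a small helper. The sketch below is not part of the original example; the helper name and the resource list are illustrative.

import nltk

def ensure_nltk_resources(resources):
    """Download each (lookup_path, package_name) pair only if nltk.data.find() cannot locate it."""
    for lookup_path, package_name in resources:
        try:
            nltk.data.find(lookup_path)
        except LookupError:
            nltk.download(package_name)

# Usage with the same resources as installNLTKResources() above.
ensure_nltk_resources([
    ('corpora/wordnet', 'wordnet'),
    ('corpora/stopwords', 'stopwords'),
    ('corpora/europarl_raw', 'europarl_raw'),
    ('tokenizers/punkt', 'punkt'),
    ('corpora/gutenberg', 'gutenberg'),
])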
Example 4: _build_wordset
def _build_wordset(clazz, obscurity_limit):
    # I'm sorry this method is so disgusting.
    # It's all in the cause of fast loading in the main case.

    from nltk import FreqDist

    # Ensure corpora are loaded.
    try:
        from nltk.corpus import cmudict
        cmudict.entries()
    except LookupError:
        print "CMUDict corpus not found. Downloading..."
        from nltk import download
        download('cmudict')
        print "[Done]"
    if obscurity_limit is not None:
        from nltk.corpus import brown
        try:
            brown.words()
        except LookupError:
            print "Brown corpus not found. Downloading...",
            from nltk import download
            download('brown')
            print "[Done]"

    words = cmudict.entries()
    if obscurity_limit is not None:
        freqs = FreqDist([w.lower() for w in brown.words()])
        words = sorted(words,
                       key=lambda x: freqs[x[0].lower()],
                       reverse=True)
        return words[:obscurity_limit]
    else:
        return list(words)
Example 5: morphy
def morphy(doc):
    """Lemmatize tokens using morphy, WordNet's lemmatizer."""
    # XXX Results will be better if we do POS tagging first, but then we
    # need to map Penn Treebank tags to WordNet tags.
    nltk.download('wordnet', quiet=False)
    return map(nltk.WordNetLemmatizer().lemmatize,
               _tokenize_if_needed(fetch(doc)))
Example 6: _download_nltk_data
def _download_nltk_data():
    """Install corpus data.
    """
    for directory, data in nltk_data.iteritems():
        for datum in data:
            if not exists(join(NLTK_DATA_DIR, directory, datum)):
                nltk.download(datum, download_dir=NLTK_DATA_DIR)
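Example 6 relies on a module-level nltk_data mapping and an NLTK_DATA_DIR constant that are not shown, and it uses the Python 2 iteritems() method. A self-contained Python 3 sketch of the same idea, with illustrative values for those names, might look like this:

import os
from os.path import exists, join
import nltk

# Illustrative values; the original module defines its own.
NLTK_DATA_DIR = os.path.expanduser('~/nltk_data')
nltk_data = {
    'corpora': ['stopwords', 'wordnet'],
    'tokenizers': ['punkt'],
}

def download_nltk_data():
    """Download each resource into NLTK_DATA_DIR unless it is already on disk."""
    for directory, data in nltk_data.items():
        for datum in data:
            if not exists(join(NLTK_DATA_DIR, directory, datum)):
                nltk.download(datum, download_dir=NLTK_DATA_DIR)

# Make sure NLTK searches the custom directory at runtime.
nltk.data.path.append(NLTK_DATA_DIR)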
Example 7: search_for_all_strings
def search_for_all_strings(line, file_format):
    '''Search for all strings with NLTK'''
    result = []

    for regexp in Config.excluded_lines:
        for match in re.finditer(regexp, line):
            if match:
                return([])

    for regexp in Config.strings_patterns[file_format]:
        for match in re.finditer(regexp, line):
            if not match:
                continue
            group = match.group(1)
            if len(group) > 0 and not contains_forbidden_patterns(group):
                try:
                    tokens = nltk.word_tokenize(group)
                    if len(tokens) > 0:
                        for word in tokens:
                            morf = wn.morphy(word)
                            if morf and len(str(morf)) > 1:
                                if (output_format == "csv") | (group not in global_word_pull):
                                    result.append(group)
                                    global_word_pull.add(group)
                                break
                except:
                    print ("Unexpected error:{0}".format(sys.exc_info()))
                    traceback.print_tb(sys.exc_info()[2])
                    url = os.path.join(os.path.split(os.path.realpath(__file__))[0] + "/nltk_info.html")
                    print("See here for installation instructions:\n" + url)
                    webbrowser.open_new(url)
                    nltk.download()
                    sys.exit(2)

    return result
Example 8: main
def main():
    nltk.download('stopwords')
    nltk.download('vader_lexicon')

    print("\n================================================================================\n")
    print("---------------------------------- Platform Information ------------------------")
    print('machine: {}'.format(platform.machine()))
    print('node: {}'.format(platform.node()))
    print('processor: {}'.format(platform.processor()))
    print('release: {}'.format(platform.release()))
    print('system: {}'.format(platform.system()))
    print('version: {}'.format(platform.version()))
    print('uname: {}'.format(platform.uname()))
    #mem = virtual_memory()
    #print('memory: {}'.format(mem.total))  # total physical memory available
    print('python_build: {}'.format(platform.python_build()))
    print('python_compiler: {}'.format(platform.python_compiler()))
    print('python_branch: {}'.format(platform.python_branch()))
    print('python_implementation: {}'.format(platform.python_implementation()))
    print('python_revision: {}'.format(platform.python_revision()))
    print('python_version: {}'.format(platform.python_version()))
    print("\n================================================================================\n")
Example 9: lemma_tokenize
def lemma_tokenize(paragraph):
    lmtzr = WordNetLemmatizer()
    try:
        return [lmtzr.lemmatize(word).lower() for sentence in tokenize(paragraph) for word in sentence]
    except LookupError:
        nltk.download('wordnet')
        return [lmtzr.lemmatize(word).lower() for sentence in tokenize(paragraph) for word in sentence]
Example 10: __init__
def __init__(self, ngram=False, use_idf=False):
    self.ngram = ngram
    self.use_idf = use_idf

    # Load WordNet synsets and download data if necessary
    try:
        wordnet_path = nltk.data.find("corpora/wordnet")
    except LookupError:
        nltk.download("wordnet")
        wordnet_path = nltk.data.find("corpora/wordnet")
    self.wn = wordnet.WordNetCorpusReader(wordnet_path)

    # Initialize the two types of n-gram generators
    pentagram_vectorizer = CountVectorizer(
        ngram_range=(1, 5), token_pattern=r"\b[A-Za-z]+\b", min_df=1, stop_words=stop_list
    )
    unigram_vectorizer = CountVectorizer(
        ngram_range=(1, 1), token_pattern=r"\b[A-Za-z]+\b", min_df=1, stop_words=stop_list
    )

    # Function for generating five-grams through unigrams
    self.pent_analyze = pentagram_vectorizer.build_analyzer()

    # Function for generating just unigrams
    self.uni_analyze = unigram_vectorizer.build_analyzer()

    # Load IDF scores
    self.IDF = self.get_idf_scores()
    self.counts = self.get_counts()
Example 11: main
def main():
    import io
    with io.open(os.path.join(HERE, "README.rst"), "r") as readme:
        setup(
            name=app.__project__,
            version=app.__version__,
            description=app.__doc__,
            long_description=readme.read(),
            classifiers=app.__classifiers__,
            author=app.__author__,
            author_email=app.__author_email__,
            # url = app.__url__,
            license=[c.rsplit("::", 1)[1].strip() for c in app.__classifiers__ if c.startswith("License ::")][0],
            keywords=" ".join(app.__keywords__),
            packages=["mancify"],
            package_data={},
            include_package_data=True,
            platforms=app.__platforms__,
            install_requires=app.__requires__,
            extras_require=app.__extra_requires__,
            zip_safe=True,
            entry_points=app.__entry_points__,
            tests_require=["pytest-cov", "pytest", "mock"],
            cmdclass={"test": PyTest},
        )

    # Download the required NLTK packages automatically
    import nltk
    nltk.download("cmudict")
    nltk.download("maxent_treebank_pos_tagger")
Example 12: __init__
def __init__(self, save_path=None, download=False, tokenizer="wordpunct_tokenize", *args, **kwargs):
    super().__init__(save_path=save_path)
    if download:
        nltk.download()

    self.tokenizer = getattr(nltk.tokenize, tokenizer, None)
    if not callable(self.tokenizer):
        raise AttributeError("Tokenizer {} is not defined in nltk.tokenize".format(tokenizer))
Example 13: __init__
def __init__(self):
    print("Please Install the brown-corpus and wordnet on your machine : ")
    nltk.download()
    self.pfile = open("pcent_plurals.txt", "w")
    self.pfile.write("%s %s \n" % ("Plurals".ljust(20), "Percentages"))
    self.plural_dict = {}
    self.single_dict = {}
Example 14: handle
def handle(self, *args, **options):
    if args is None or len(args) < 2:
        pages = Page.objects.all()
        for page in pages:
            self._log.info("Page #%s: %s" % (page.id, page.fb_page_name))
        raise CommandError('Invalid arguments. Expected: <page_id> <action>, where action might be: extract, tfidf, webidf')

    page_id = args[0]
    action = args[1]

    if page_id == 'setup':
        self._log.info("invoking nltk download")
        nltk.download()
        exit()

    self._log.info('AnalyticsCommand initializing.')
    self._log.info('Page-Id: %s' % page_id)

    page = Page.objects.get(id=page_id)

    if action == "extract":
        self.processPageExtract(page)
    elif action == "tfidf":
        self.processTfIdf(page)
    elif action == "webidf":
        self.processWebIdf(page)
    else:
        self._log.warn("Unknown action: %s" % action)

    self._log.info("All done for now.")
Example 15: annotations_to_words
def annotations_to_words(terms, dag, ipr_map, lower):
    """
    Converts a string of accessions into a string of the corresponding English-text representations.
    """
    try:
        sws = stopwords.words('english')
    except LookupError:
        nltk.download('stopwords')
        sws = stopwords.words('english')

    if lower:
        sws = set([x.lower() for x in sws])
        case = string.lower
    else:
        sws = set([x.upper() for x in sws])
        case = string.upper

    go_terms = [t.upper() for t in terms if 'GO' in t.upper()]
    ipr_terms = [t.upper() for t in terms if t.upper() in ipr_map]

    go_descriptions = ' '.join([case(dag[t].name) for t in go_terms]).split(' ')
    ipr_descriptions = ' '.join([case(ipr_map[t]) for t in ipr_terms]).split(' ')

    go_descriptions = [x.translate(None, string.punctuation) for x in go_descriptions]
    ipr_descriptions = [x.translate(None, string.punctuation) for x in ipr_descriptions]

    go_descriptions = [x for x in go_descriptions if case(x) not in sws]
    ipr_descriptions = [x for x in ipr_descriptions if case(x) not in sws]

    line = ' '.join(go_descriptions + ipr_descriptions)
    return line