

Python nltk.download Function Code Examples

This article collects typical usage examples of the nltk.download function in Python. If you are wondering how to call nltk.download, what its arguments do, or what real-world code that uses it looks like, the curated examples below should help.


The sections below present 15 code examples of the download function, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
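Before the project snippets, here is a minimal standalone sketch of the usage patterns that recur in them: downloading a named resource, scripting a quiet download into a custom directory, and downloading only when a resource is missing. The resource names and the target directory are illustrative assumptions, not taken from any of the projects below.

import os
import nltk

# Download a single named resource (the Punkt sentence tokenizer here).
nltk.download('punkt')

# Scripted, non-interactive download into a custom directory
# (the directory path is an assumption for illustration).
data_dir = os.path.expanduser('~/nltk_data')
nltk.download('stopwords', download_dir=data_dir, quiet=True)

# Download only when the resource cannot be found locally -- the
# try/except LookupError idiom used by several of the examples below.
try:
    nltk.data.find('corpora/wordnet')
except LookupError:
    nltk.download('wordnet')

# Calling nltk.download() with no argument opens the interactive downloader.

Examples 3, 9, 10 and 15 below apply the same find-or-download idiom inside project code.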

Example 1: compute_emb

def compute_emb(pages_path_in, pages_path_out, vocab):

    wemb = pkl.load(open(prm.wordemb_path, 'rb'))
    dim_emb = wemb[wemb.keys()[0]].shape[0]
    W = 0.01 * np.random.randn(len(vocab), dim_emb).astype(np.float32)
    for word, pos in vocab.items():
        if word in wemb:
            W[pos,:] = wemb[word]

    f = h5py.File(pages_path_in, 'r')

    if prm.att_doc and prm.att_segment_type == 'sentence':
        nltk.download('punkt')
        tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    os.remove(pages_path_out) if os.path.exists(pages_path_out) else None

    # Save to HDF5
    fout = h5py.File(pages_path_out,'a')

    if prm.att_doc:
        shape = (f['text'].shape[0],prm.max_segs_doc,prm.dim_emb)
    else:
        shape=(f['text'].shape[0],prm.dim_emb)

    embs = fout.create_dataset('emb', shape=shape, dtype=np.float32)
    mask = fout.create_dataset('mask', shape=(f['text'].shape[0],), dtype=np.float32)

    i = 0
    for text in f['text']:
        st = time.time()

        if prm.att_doc:
            if prm.att_segment_type == 'section':
                segs = ['']
                for line in text.split('\n'):
                    if line.strip().startswith('==') and line.strip().endswith('=='):
                        segs.append('')
                    segs[-1] += line + '\n'
            elif prm.att_segment_type == 'sentence':
                segs = tokenizer.tokenize(text.decode('ascii', 'ignore'))
            else:
                raise ValueError('Not a valid value for the attention segment type (att_segment_type) parameter.')

            segs = segs[:prm.max_segs_doc]
            emb_ = utils.Word2Vec_encode(segs, wemb)
            embs[i,:len(emb_),:] = emb_
            mask[i] = len(emb_)
        else:
            bow0, bow1 = utils.BOW(wordpunct_tokenize(text.lower()), vocab)
            emb = (W[bow0] * bow1[:,None]).sum(0)
            embs[i,:] = emb
        i += 1
        #if i > 3000:
        #    break

        print 'processing article', i, 'time', time.time()-st

    f.close()
    fout.close()
Developer: jxwuyi, Project: WebNav, Lines: 60, Source: convert2emb.py

Example 2: _post_install

def _post_install():
    from importlib import reload
    import site
    reload(site)

    import nltk
    nltk.download('punkt')
Developer: alexlafroscia, Project: class-projects, Lines: 7, Source: setup.py

Example 3: installNLTKResources

def installNLTKResources():

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    try:
        nltk.data.find('corpora/wordnet')
    except LookupError:
        nltk.download('wordnet')

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    try:
        nltk.data.find('corpora/stopwords')
    except LookupError:
        nltk.download('stopwords')

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    try:
        nltk.data.find('corpora/europarl_raw')
    except LookupError:
        nltk.download('europarl_raw')

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        nltk.download('punkt')

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    try:
        nltk.data.find('corpora/gutenberg')
    except LookupError:
        nltk.download('gutenberg')

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    return( None )
Developer: paradisepilot, Project: statistics, Lines: 34, Source: InstallNLTKResources.py

Example 4: _build_wordset

    def _build_wordset(clazz, obscurity_limit):
        # I'm sorry this method is so disgusting.
        # It's all in the cause of fast loading in the main case.

        from nltk import FreqDist

        # Ensure corpora are loaded.
        try:
            from nltk.corpus import cmudict
            cmudict.entries()
        except LookupError:
            print "CMUDict corpus not found. Downloading..."
            from nltk import download
            download('cmudict')
            print "[Done]"
        if obscurity_limit is not None:
            from nltk.corpus import brown
            try:
                brown.words()
            except LookupError:
                print "Brown corpus not found. Downloading...",
                from nltk import download
                download('brown')
                print "[Done]"

        words = cmudict.entries()
        if obscurity_limit is not None:
            freqs = FreqDist([w.lower() for w in brown.words()])
            words = sorted(words,
                           key=lambda x: freqs[x[0].lower()],
                           reverse=True)
            return words[:obscurity_limit]
        else:
            return list(words)
Developer: StefanKopieczek, Project: pyverse, Lines: 34, Source: rhymelib.py

Example 5: morphy

def morphy(doc):
    """Lemmatize tokens using morphy, WordNet's lemmatizer."""
    # XXX Results will be better if we do POS tagging first, but then we
    # need to map Penn Treebank tags to WordNet tags.
    nltk.download('wordnet', quiet=False)
    return map(nltk.WordNetLemmatizer().lemmatize,
               _tokenize_if_needed(fetch(doc)))
Developer: IsaacHaze, Project: xtas, Lines: 7, Source: single.py

Example 6: _download_nltk_data

def _download_nltk_data():
    """Install corpus data.
    """
    for directory, data in nltk_data.iteritems():
        for datum in data:
            if not exists(join(NLTK_DATA_DIR, directory, datum)):
                nltk.download(datum, download_dir=NLTK_DATA_DIR)
Developer: ooda, Project: vwordnet, Lines: 7, Source: wordnet.py

Example 7: search_for_all_strings

def search_for_all_strings(line, file_format):
    '''Search for all strings with NLTK'''
    result = []
    for regexp in Config.excluded_lines:
        for match in re.finditer(regexp, line):
            if match:
                return([])

    for regexp in Config.strings_patterns[file_format]:
        for match in re.finditer(regexp, line):
            if not match:
                continue
            group = match.group(1)
            if len(group) > 0 and not contains_forbidden_patterns(group):
                try:
                    tokens = nltk.word_tokenize(group)
                    if len(tokens) > 0:
                        for word in tokens:
                            morf = wn.morphy(word)
                            if morf and len(str(morf)) > 1:
                                if (output_format == "csv") | (group not in global_word_pull):
                                    result.append(group)
                                    global_word_pull.add(group)
                                break
                except:
                    print ("Unexpected error:{0}".format(sys.exc_info()))
                    traceback.print_tb(sys.exc_info()[2])
                    url = os.path.join(os.path.split(os.path.realpath(__file__))[0] + "/nltk_info.html")
                    print("See here for installation instructions:\n" + url)
                    webbrowser.open_new(url)

                    nltk.download()
                    sys.exit(2)

    return result
Developer: alexsosn, Project: OwlLocalizer, Lines: 35, Source: loc_finder.py

Example 8: main

def main():

    nltk.download('stopwords')
    nltk.download('vader_lexicon')        
        
    print("\n================================================================================\n")
    print("---------------------------------- Platform Information ------------------------")
    print('machine: {}'.format(platform.machine()))
    print('node: {}'.format(platform.node()))    
    print('processor: {}'.format(platform.processor()))    
    print('release: {}'.format(platform.release()))
    print('system: {}'.format(platform.system()))    
    print('version: {}'.format(platform.version()))
    print('uname: {}'.format(platform.uname()))
    
    #mem = virtual_memory()
    #print('memory: {}'.format(mem.total))  # total physical memory available
    
    print('python_build: {}'.format(platform.python_build()))
    print('python_compiler: {}'.format(platform.python_compiler()))
    print('python_branch: {}'.format(platform.python_branch()))
    print('python_implementation: {}'.format(platform.python_implementation()))
    
    print('python_revision: {}'.format(platform.python_revision()))
    print('python_version: {}'.format(platform.python_version()))
    
    print("\n================================================================================\n")
Developer: Brbrew, Project: Docker, Lines: 27, Source: test.py

Example 9: lemma_tokenize

def lemma_tokenize(paragraph):
    lmtzr = WordNetLemmatizer()
    try:
        return [lmtzr.lemmatize(word).lower() for sentence in tokenize(paragraph) for word in sentence]
    except LookupError:
        nltk.download('wordnet')
        return [lmtzr.lemmatize(word).lower() for sentence in tokenize(paragraph) for word in sentence]
Developer: dirkneumann, Project: nuanceq, Lines: 7, Source: text_nltk.py

Example 10: __init__

    def __init__(self, ngram=False, use_idf=False):
        self.ngram = ngram
        self.use_idf = use_idf

        # Load WordNet synsets and download data if necessary
        try:
            wordnet_path = nltk.data.find("corpora/wordnet")
        except LookupError:
            nltk.download("wordnet")
            wordnet_path = nltk.data.find("corpora/wordnet")
        self.wn = wordnet.WordNetCorpusReader(wordnet_path)

        # Initialize the two types of n-gram generators
        pentagram_vectorizer = CountVectorizer(
            ngram_range=(1, 5), token_pattern=r"\b[A-Za-z]+\b", min_df=1, stop_words=stop_list
        )
        unigram_vectorizer = CountVectorizer(
            ngram_range=(1, 1), token_pattern=r"\b[A-Za-z]+\b", min_df=1, stop_words=stop_list
        )

        # Function for generating five-grams through unigrams
        self.pent_analyze = pentagram_vectorizer.build_analyzer()

        # Function for generating just unigrams
        self.uni_analyze = unigram_vectorizer.build_analyzer()

        # Load IDF scores
        self.IDF = self.get_idf_scores()
        self.counts = self.get_counts()
Developer: mmcauliffe, Project: linguistic-helper-functions, Lines: 29, Source: semantic_predictability.py

Example 11: main

def main():
    import io

    with io.open(os.path.join(HERE, "README.rst"), "r") as readme:
        setup(
            name=app.__project__,
            version=app.__version__,
            description=app.__doc__,
            long_description=readme.read(),
            classifiers=app.__classifiers__,
            author=app.__author__,
            author_email=app.__author_email__,
            # url                  = app.__url__,
            license=[c.rsplit("::", 1)[1].strip() for c in app.__classifiers__ if c.startswith("License ::")][0],
            keywords=" ".join(app.__keywords__),
            packages=["mancify"],
            package_data={},
            include_package_data=True,
            platforms=app.__platforms__,
            install_requires=app.__requires__,
            extras_require=app.__extra_requires__,
            zip_safe=True,
            entry_points=app.__entry_points__,
            tests_require=["pytest-cov", "pytest", "mock"],
            cmdclass={"test": PyTest},
        )

    # Download the required NLTK packages automatically
    import nltk

    nltk.download("cmudict")
    nltk.download("maxent_treebank_pos_tagger")
Developer: jvlomax, Project: mancify, Lines: 32, Source: setup.py

Example 12: __init__

    def __init__(self, save_path=None, download=False, tokenizer="wordpunct_tokenize", *args, **kwargs):
        super().__init__(save_path=save_path)
        if download:
            nltk.download()
        self.tokenizer = getattr(nltk.tokenize, tokenizer, None)
        if not callable(self.tokenizer):
            raise AttributeError("Tokenizer {} is not defined in nltk.tokenizer".format(tokenizer))
Developer: CuteCha, Project: DeepPavlov, Lines: 7, Source: nltk_tokenizer.py

Example 13: __init__

    def __init__(self):
        print("Please Install the brown-corpus and wordnet on your machine : ")
        nltk.download()
        self.pfile = open("pcent_plurals.txt","w")
        self.pfile.write("%s %s \n" % ("Plurals".ljust(20),"Percentages"))
        self.plural_dict = {}
        self.single_dict = {}
Developer: knkumar, Project: Plural_find, Lines: 7, Source: plural_Find.py

Example 14: handle

    def handle(self, *args, **options):
        if args is None or len(args) < 2:
            pages = Page.objects.all()
            for page in pages:
                self._log.info("Page #%s: %s" % (page.id, page.fb_page_name))
            raise CommandError('Invalid arguments. Expected: <page_id> <action>, where action might be: extract, tfidf, webidf')


        page_id = args[0]
        action = args[1]

        if page_id == 'setup':
            self._log.info("invoking nltk download")
            nltk.download()
            exit()

        self._log.info('AnalyticsCommand initializing.')

        self._log.info('Page-Id: %s' % page_id)
        page = Page.objects.get(id=page_id)

        if action == "extract":
            self.processPageExtract(page)
        elif action == "tfidf":
            self.processTfIdf(page)
        elif action == "webidf":
            self.processWebIdf(page)
        else:
            self._log.warn("Unknown action: %s" % action)

        self._log.info("All done for now.")
Developer: FrankGrimm, Project: text-insights, Lines: 31, Source: analytics.py

Example 15: annotations_to_words

def annotations_to_words(terms, dag, ipr_map, lower):
    """
    Converts a string of accesssions into a string of the corresponding english-text representations.
    """
    try:
        sws = stopwords.words('english')
    except LookupError:
        nltk.download('stopwords')
        sws = stopwords.words('english')

    if lower:
        sws = set([x.lower() for x in sws])
        case = string.lower
    else:
        sws = set([x.upper() for x in sws])
        case = string.upper

    go_terms = [t.upper() for t in terms if 'GO' in t.upper()]
    ipr_terms = [t.upper() for t in terms if t.upper() in ipr_map]

    go_descriptions = ' '.join([case(dag[t].name) for t in go_terms]).split(' ')
    ipr_descriptions = ' '.join([case(ipr_map[t]) for t in ipr_terms]).split(' ')

    go_descriptions = [x.translate(None, string.punctuation) for x in go_descriptions]
    ipr_descriptions = [x.translate(None, string.punctuation) for x in ipr_descriptions]

    go_descriptions = [x for x in go_descriptions if case(x) not in sws]
    ipr_descriptions = [x for x in ipr_descriptions if case(x) not in sws]

    line = ' '.join(go_descriptions + ipr_descriptions)
    return line
Developer: daniaki, Project: ppi_wrangler, Lines: 31, Source: preprocess.py


Note: The nltk.download examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by their original authors, and the source code copyright remains with those authors; consult each project's License before redistributing or reusing the code. Do not reproduce this article without permission.