

Python data.find Function Code Examples

This article collects typical usage examples of the Python nltk.data.find function. If you are wondering what exactly find does, how to call it, or what it looks like in real code, the hand-picked examples below should help.


The sections below show 15 code examples of the find function, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
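
Before looking at the individual examples, here is a minimal sketch of the call pattern most of them share: try to locate a resource under the local nltk_data directory with find(), and download it when a LookupError is raised. The resource name 'tokenizers/punkt/english.pickle' and the package name 'punkt' below are only illustrative choices, not anything find itself requires.

from nltk.data import find
from nltk import download

try:
    # find() returns a path pointer to the resource inside nltk_data;
    # it raises LookupError if the resource is not installed yet.
    path = find('tokenizers/punkt/english.pickle')
except LookupError:
    # Fetch the missing package, then locate the resource again.
    download('punkt')
    path = find('tokenizers/punkt/english.pickle')

print(path)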

Example 1: test_corpus_bleu

    def test_corpus_bleu(self):
        ref_file = find('models/wmt15_eval/ref.ru')
        hyp_file = find('models/wmt15_eval/google.ru')
        mteval_output_file = find('models/wmt15_eval/mteval-13a.output')

        # Reads the BLEU scores from the `mteval-13a.output` file.
        # The order of the list corresponds to the order of the ngrams.
        with open(mteval_output_file, 'r') as mteval_fin:
            # The scores are located on the second-to-last line of the file.
            # The first and last items on that line (the score and system names) are sliced off.
            # list() is required because the scores are iterated over twice below.
            mteval_bleu_scores = list(map(float, mteval_fin.readlines()[-2].split()[1:-1]))

        with io.open(ref_file, 'r', encoding='utf8') as ref_fin:
            with io.open(hyp_file, 'r', encoding='utf8') as hyp_fin:
                # Whitespace tokenize the file.
                # Note: split() strips surrounding whitespace automatically.
                hypothesis = list(map(lambda x: x.split(), hyp_fin))
                # Note that the corpus_bleu input is list of list of references.
                references = list(map(lambda x: [x.split()], ref_fin))
                # Without smoothing.
                for i, mteval_bleu in zip(range(1,10), mteval_bleu_scores):
                    nltk_bleu = corpus_bleu(references, hypothesis, weights=(1.0/i,)*i)
                    # Check that the BLEU score difference is less than 0.005.
                    # Note: This is an approximate comparison; as much as
                    #       +/- 0.01 BLEU might be "statistically significant",
                    #       the actual translation quality might not be.
                    assert abs(mteval_bleu - nltk_bleu) < 0.005

                # With the same smoothing method used in mteval-v13a.pl
                chencherry = SmoothingFunction()
                for i, mteval_bleu in zip(range(1,10), mteval_bleu_scores):
                    nltk_bleu = corpus_bleu(references, hypothesis,
                                            weights=(1.0/i,)*i,
                                            smoothing_function=chencherry.method3)
                    assert abs(mteval_bleu - nltk_bleu) < 0.005
Developer: DrDub, Project: nltk, Lines of code: 35, Source: test_bleu.py

Example 2: demo

def demo():
    from itertools import islice

#    zip_path = find('corpora/toolbox.zip')
#    lexicon = ToolboxData(ZipFilePathPointer(zip_path, 'toolbox/rotokas.dic')).parse()
    file_path = find('corpora/toolbox/rotokas.dic')
    lexicon = ToolboxData(file_path).parse()
    print('first field in fourth record:')
    print(lexicon[3][0].tag)
    print(lexicon[3][0].text)

    print('\nfields in sequential order:')
    for field in islice(lexicon.find('record'), 10):
        print(field.tag, field.text)

    print('\nlx fields:')
    for field in islice(lexicon.findall('record/lx'), 10):
        print(field.text)

    settings = ToolboxSettings()
    file_path = find('corpora/toolbox/MDF/MDF_AltH.typ')
    settings.open(file_path)
#    settings.open(ZipFilePathPointer(zip_path, entry='toolbox/MDF/MDF_AltH.typ'))
    tree = settings.parse(unwrap=False, encoding='cp1252')
    print(tree.find('expset/expMDF/rtfPageSetup/paperSize').text)
    settings_tree = ElementTree(tree)
    print(to_settings_string(settings_tree).encode('utf8'))
Developer: esabelhaus, Project: secret-octo-dubstep, Lines of code: 27, Source: toolbox.py

Example 3: build_model

def build_model(fmt="binary"):
    print("Loading training data...")
    train_paths = [
        find("corpora/ace_data/ace.dev"),
        find("corpora/ace_data/ace.heldout"),
        find("corpora/ace_data/bbn.dev"),
        find("corpora/ace_data/muc.dev"),
    ]
    train_trees = load_ace_data(train_paths, fmt)
    train_data = [postag_tree(t) for t in train_trees]
    print("Training...")
    cp = NEChunkParser(train_data)
    del train_data

    print("Loading eval data...")
    eval_paths = [find("corpora/ace_data/ace.eval")]
    eval_trees = load_ace_data(eval_paths, fmt)
    eval_data = [postag_tree(t) for t in eval_trees]

    print("Evaluating...")
    chunkscore = ChunkScore()
    for i, correct in enumerate(eval_data):
        guess = cp.parse(correct.leaves())
        chunkscore.score(correct, guess)
        if i < 3:
            cmp_chunks(correct, guess)
    print(chunkscore)

    outfilename = "/tmp/ne_chunker_%s.pickle" % fmt
    print("Saving chunker to %s..." % outfilename)

    with open(outfilename, "wb") as out:
        pickle.dump(cp, out, -1)

    return cp
Developer: huderlem, Project: nltk, Lines of code: 35, Source: named_entity.py

Example 4: nltk_download_corpus

def nltk_download_corpus(resource_path):
    """
    Download the specified NLTK corpus file
    unless it has already been downloaded.

    Returns True if the corpus needed to be downloaded.
    """
    from nltk.data import find
    from nltk import download
    from os.path import split

    # Download the corpus data only if it is not already downloaded
    _, corpus_name = split(resource_path)

    ## From http://www.nltk.org/api/nltk.html ##
    # When using find() to locate a directory contained in a zipfile,
    # the resource name must end with the forward slash character.
    # Otherwise, find() will not locate the directory.
    ####
    # Helps when resource_path == 'sentiment/vader_lexicon'
    if not resource_path.endswith('/'):
        resource_path = resource_path + '/'

    downloaded = False

    try:
        find(resource_path)
    except LookupError:
        download(corpus_name)
        downloaded = True

    return downloaded
Developer: jianjun66, Project: ChatterBot, Lines of code: 32, Source: utils.py

Example 5: build_model

def build_model(fmt='binary'):
    print('Loading training data...')
    train_paths = [find('corpora/ace_data/ace.dev'),
                   find('corpora/ace_data/ace.heldout'),
                   find('corpora/ace_data/bbn.dev'),
                   find('corpora/ace_data/muc.dev')]
    train_trees = load_ace_data(train_paths, fmt)
    train_data = [postag_tree(t) for t in train_trees]
    print('Training...')
    cp = NEChunkParser(train_data)
    del train_data

    print('Loading eval data...')
    eval_paths = [find('corpora/ace_data/ace.eval')]
    eval_trees = load_ace_data(eval_paths, fmt)
    eval_data = [postag_tree(t) for t in eval_trees]

    print('Evaluating...')
    chunkscore = ChunkScore()
    for i, correct in enumerate(eval_data):
        guess = cp.parse(correct.leaves())
        chunkscore.score(correct, guess)
        if i < 3: cmp_chunks(correct, guess)
    print(chunkscore)

    outfilename = '/tmp/ne_chunker_%s.pickle' % fmt
    print('Saving chunker to %s...' % outfilename)

    with open(outfilename, 'wb') as outfile:
        pickle.dump(cp, outfile, -1)

    return cp
Developer: chatbotimporved, Project: chatbot, Lines of code: 32, Source: informationextraction.py

Example 6: __init__

    def __init__(self):
        from nltk.data import find
        from nltk import download

        try:
            find('wordnet.zip')
        except LookupError:
            download('wordnet')
Developer: fmoliveira, Project: ChatterBot, Lines of code: 8, Source: word_net.py

Example 7: namedEntityRecognizer

def namedEntityRecognizer():
    echo2("Performing NER on incoming stream")
    content = request.stream.read()
    # print(content)

    if Verbose:
        echo2("Incoming content is "+content)
    PICKLE = "averaged_perceptron_tagger.pickle"
    AP_MODEL_LOC = 'file:'+str(find('taggers/averaged_perceptron_tagger/'+PICKLE))
    tagger = PerceptronTagger(load=False)
    tagger.load(AP_MODEL_LOC)
    pos_tag = tagger.tag
    start = time.time()
    #date_time = timex.tag(content)
    tokenized = nltk.word_tokenize(content)
    tagged = pos_tag(tokenized)
    namedEnt = nltk.ne_chunk(tagged, binary=True)
    names = extract_entity_names(namedEnt, 'NE')
    #names.extend(date_time)
    result = {"result" : "success", "names" : names}
    if Units:
        grammar = '''unit: {<CD><NNS>?<NN.*>?},
                     unit: {<CD><JJ>?<NN.*>}
                  '''
        parser = nltk.RegexpParser(grammar)
        units = extract_entity_names(parser.parse(tagged),'unit')
        result['units'] = units
    jsonDoc = json.dumps(result, sort_keys=True, indent=4, separators=(',', ': '))
    end = time.time()
    print "NER took "+str(end - start)+" seconds"
    return jsonDoc
Developer: anirbanmishra, Project: Content_Evaluation, Lines of code: 31, Source: NLTKRestServer.py

Example 8: _vocabulary

    def _vocabulary(self):
        return (
            data.find('stemmers/porter_test/porter_vocabulary.txt')
                .open(encoding='utf-8')
                .read()
                .splitlines()
        )
Developer: DrDub, Project: nltk, Lines of code: 7, Source: test_stem.py

Example 9: _vocabulary

    def _vocabulary(self):
        with closing(
            data.find('stemmers/porter_test/porter_vocabulary.txt').open(
                encoding='utf-8'
            )
        ) as fp:
            return fp.read().splitlines()
Developer: rmalouf, Project: nltk, Lines of code: 7, Source: test_stem.py

Example 10: demo

def demo():
    from nltk.data import find
    corpus_root = find('corpora/childes/data-xml/Eng-USA/')
    childes = CHILDESCorpusReader(corpus_root, '.*.xml')

    # describe all corpus
    for file in childes.fileids()[:5]:
        corpus = ''
        corpus_id = ''
        for (key, value) in childes.corpus(file)[0].items():
            if key == "Corpus": corpus = value
            if key == "Id": corpus_id = value
        print('Reading', corpus, corpus_id, ' .....')
        print("words:", childes.words(file)[:7], "...")
        print("words with replaced words:", childes.words(file, replace=True)[:7], " ...")
        print("words with pos tags:", childes.words(file, pos=True)[:7], " ...")
        print("words (only MOT):", childes.words(file, speaker='MOT')[:7], "...")
        print("words (only CHI):", childes.words(file, speaker='CHI')[:7], "...")
        print("stemmed words:", childes.words(file, stem=True)[:7], " ...")
        print("words with relations and pos-tag:", childes.words(file, relation=True)[:5], " ...")
        print("sentence:", childes.sents(file)[:2], " ...")
        for (participant, values) in childes.participants(file)[0].items():
            for (key, value) in values.items():
                print("\tparticipant", participant, key, ":", value)
        print("num of sent:", len(childes.sents(file)))
        print("num of morphemes:", len(childes.words(file, stem=True)))
        print("age:", childes.age(file))
        print("age in month:", childes.age(file, month=True))
        print("MLU:", childes.MLU(file))
        print('\r')
Developer: johndpope, Project: jazzparser, Lines of code: 30, Source: childes.py

Example 11: test_vocabulary_nltk_mode

    def test_vocabulary_nltk_mode(self):
        self._test_against_expected_output(
            PorterStemmer.NLTK_EXTENSIONS,
            data.find('stemmers/porter_test/porter_nltk_output.txt')
                .open(encoding='utf-8')
                .read()
                .splitlines()
        )
Developer: DrDub, Project: nltk, Lines of code: 8, Source: test_stem.py

Example 12: _get_tagger

def _get_tagger(lang=None):
    if lang == 'rus':
        tagger = PerceptronTagger(False)
        ap_russian_model_loc = 'file:' + str(find(RUS_PICKLE))
        tagger.load(ap_russian_model_loc)
    else:
        tagger = PerceptronTagger()
    return tagger
Developer: Weiming-Hu, Project: text-based-six-degree, Lines of code: 8, Source: __init__.py

Example 13: __init__

    def __init__(self):
        from nltk.data import find
        from nltk import download
        import os

        # Download the wordnet data only if it is not already downloaded
        wordnet_path = None
        if os.name == 'nt':
            wordnet_path = os.path.join(os.getenv('APPDATA'), 'nltk_data',
                                        'corpora', 'wordnet.zip')
        else:
            wordnet_path = os.path.join(os.path.expanduser('~'), 'nltk_data',
                                        'corpora', 'wordnet.zip')
        try:
            if not os.path.isfile(wordnet_path):
                find('wordnet.zip')
        except LookupError:
            download('wordnet')
Developer: AugustoQueiroz, Project: ChatterBot, Lines of code: 18, Source: wordnet.py

Example 14: demo

def demo(corpus_root=None):
    """
    The CHILDES corpus should be manually downloaded and saved
    to ``[NLTK_Data_Dir]/corpora/childes/``
    """
    if not corpus_root:
        from nltk.data import find

        corpus_root = find('corpora/childes/data-xml/Eng-USA/')

    try:
        childes = CHILDESCorpusReader(corpus_root, '.*.xml')
        # describe all corpus
        for file in childes.fileids()[:5]:
            corpus = ''
            corpus_id = ''
            for (key, value) in childes.corpus(file)[0].items():
                if key == "Corpus":
                    corpus = value
                if key == "Id":
                    corpus_id = value
            print('Reading', corpus, corpus_id, ' .....')
            print("words:", childes.words(file)[:7], "...")
            print(
                "words with replaced words:",
                childes.words(file, replace=True)[:7],
                " ...",
            )
            print("words with pos tags:", childes.tagged_words(file)[:7], " ...")
            print("words (only MOT):", childes.words(file, speaker='MOT')[:7], "...")
            print("words (only CHI):", childes.words(file, speaker='CHI')[:7], "...")
            print("stemmed words:", childes.words(file, stem=True)[:7], " ...")
            print(
                "words with relations and pos-tag:",
                childes.words(file, relation=True)[:5],
                " ...",
            )
            print("sentence:", childes.sents(file)[:2], " ...")
            for (participant, values) in childes.participants(file)[0].items():
                for (key, value) in values.items():
                    print("\tparticipant", participant, key, ":", value)
            print("num of sent:", len(childes.sents(file)))
            print("num of morphemes:", len(childes.words(file, stem=True)))
            print("age:", childes.age(file))
            print("age in month:", childes.age(file, month=True))
            print("MLU:", childes.MLU(file))
            print()

    except LookupError as e:
        print(
            """The CHILDES corpus, or the parts you need, should be manually
        downloaded from https://childes.talkbank.org/data-xml/ and saved at
        [NLTK_Data_Dir]/corpora/childes/
            Alternately, you can call the demo with the path to a portion of the CHILDES corpus, e.g.:
        demo('/path/to/childes/data-xml/Eng-USA/')
        """
        )
Developer: rmalouf, Project: nltk, Lines of code: 57, Source: childes.py

Example 15: __init__

    def __init__(self):
        from nltk.data import find
        from nltk import download
        import os

        # Download the punkt data only if it is not already downloaded
        punkt_path = None
        if os.name == 'nt':
            punkt_path = os.path.join(os.getenv('APPDATA'), 'nltk_data',
                                                'tokenizers', 'punkt.zip')
        else:
            punkt_path = os.path.join(os.path.expanduser('~'), 'nltk_data',
                                                'tokenizers', 'punkt.zip')
        try:
            if not os.path.isfile(punkt_path):
                find('punkt.zip')
        except LookupError:
            download('punkt')
Developer: AugustoQueiroz, Project: ChatterBot, Lines of code: 18, Source: tokenizer.py


Note: The nltk.data.find examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by their respective authors, and copyright for the source code remains with those authors. When redistributing or using the code, please follow the license of the corresponding project; do not republish without permission.