

Python data.find Method Code Examples

This article collects typical usage examples of the data.find method from the Python module nltk.data. If you are wondering what data.find does in practice, how to call it, or what real-world uses look like, the curated code examples below may help. You can also explore further usage examples from the nltk.data module.


The following presents 15 code examples of the data.find method, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
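
Before the project-specific examples, here is a minimal standalone sketch (not taken from any of the projects below) of the basic behaviour of nltk.data.find: it resolves a resource name against the directories in nltk.data.path, returns a path pointer whose open() method several examples use, and raises LookupError when the resource is not installed. The resource names are ordinary NLTK identifiers chosen for illustration.

from nltk import data

# find() resolves a resource name against the directories in nltk.data.path.
# On success it returns a path pointer (e.g. FileSystemPathPointer);
# if the resource is missing it raises LookupError rather than returning None.
try:
    wordnet_location = data.find('corpora/wordnet')
    print(wordnet_location)  # filesystem location of the installed corpus
except LookupError:
    print('Resource not installed; run nltk.download() first')

# Path pointers can be opened directly, which is how the Porter stemmer
# tests below read their fixture files:
#   data.find('stemmers/porter_test/porter_vocabulary.txt').open(encoding='utf-8')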

Example 1: test_vocabulary_martin_mode

# Required import: from nltk import data [as alias]
# Or: from nltk.data import find [as alias]
def test_vocabulary_martin_mode(self):
        """Tests all words from the test vocabulary provided by M Porter
        
        The sample vocabulary and output were sourced from:
            http://tartarus.org/martin/PorterStemmer/voc.txt
            http://tartarus.org/martin/PorterStemmer/output.txt
        and are linked to from the Porter Stemmer algorithm's homepage
        at
            http://tartarus.org/martin/PorterStemmer/
        """
        self._test_against_expected_output(
            PorterStemmer.MARTIN_EXTENSIONS,
            data.find('stemmers/porter_test/porter_martin_output.txt')
                .open(encoding='utf-8')
                .read()
                .splitlines()
        ) 
Author: sdoran35, Project: hate-to-hugs, Lines: 19, Source: test_stem.py

Example 2: __init__

# Required import: from nltk import data [as alias]
# Or: from nltk.data import find [as alias]
def __init__(self):
        """
        Contains various synset related functions.
        """
        try:
            data.find(os.path.join("corpora", "wordnet"))
        except LookupError:
            download("wordnet")

        self.API = ImageNetAPI() 
Author: zevisert, Project: Imagyn, Lines: 12, Source: lexicon.py
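
The find-or-download idiom in Example 2 is the standard way to make a script self-bootstrapping: data.find() is cheap and raises LookupError when the corpus is absent, so download() only runs on first use. A minimal standalone sketch of the same idea (the resource and package names are the usual NLTK identifiers, not part of the Imagyn project):

from nltk import data, download

# Ensure the WordNet corpus is present before importing its reader.
try:
    data.find('corpora/wordnet')
except LookupError:
    download('wordnet')

from nltk.corpus import wordnet

# Once the corpus is installed, the reader can be used normally.
print(wordnet.synsets('example')[0].definition())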

Example 3: build_model

# Required import: from nltk import data [as alias]
# Or: from nltk.data import find [as alias]
def build_model(fmt='binary'):
    print('Loading training data...')
    train_paths = [find('corpora/ace_data/ace.dev'),
                   find('corpora/ace_data/ace.heldout'),
                   find('corpora/ace_data/bbn.dev'),
                   find('corpora/ace_data/muc.dev')]
    train_trees = load_ace_data(train_paths, fmt)
    train_data = [postag_tree(t) for t in train_trees]
    print('Training...')
    cp = NEChunkParser(train_data)
    del train_data

    print('Loading eval data...')
    eval_paths = [find('corpora/ace_data/ace.eval')]
    eval_trees = load_ace_data(eval_paths, fmt)
    eval_data = [postag_tree(t) for t in eval_trees]

    print('Evaluating...')
    chunkscore = ChunkScore()
    for i, correct in enumerate(eval_data):
        guess = cp.parse(correct.leaves())
        chunkscore.score(correct, guess)
        if i < 3: cmp_chunks(correct, guess)
    print(chunkscore)

    outfilename = '/tmp/ne_chunker_%s.pickle' % fmt
    print('Saving chunker to %s...' % outfilename)

    with open(outfilename, 'wb') as outfile:
        pickle.dump(cp, outfile, -1)

    return cp 
Author: Thejas-1, Project: Price-Comparator, Lines: 34, Source: named_entity.py

Example 4: _chunk_parse

# Required import: from nltk import data [as alias]
# Or: from nltk.data import find [as alias]
def _chunk_parse(self, grammar=None, root_label='record', trace=0, **kwargs):
        """
        Returns an element tree structure corresponding to a toolbox data file
        parsed according to the chunk grammar.

        :type grammar: str
        :param grammar: Contains the chunking rules used to parse the
            database.  See ``chunk.RegExp`` for documentation.
        :type root_label: str
        :param root_label: The node value that should be used for the
            top node of the chunk structure.
        :type trace: int
        :param trace: The level of tracing that should be used when
            parsing a text.  ``0`` will generate no tracing output;
            ``1`` will generate normal tracing output; and ``2`` or
            higher will generate verbose tracing output.
        :type kwargs: dict
        :param kwargs: Keyword arguments passed to ``toolbox.StandardFormat.fields()``
        :rtype: ElementTree._ElementInterface
        """
        from nltk import chunk
        from nltk.tree import Tree

        cp = chunk.RegexpParser(grammar, root_label=root_label, trace=trace)
        db = self.parse(**kwargs)
        tb_etree = Element('toolbox_data')
        header = db.find('header')
        tb_etree.append(header)
        for record in db.findall('record'):
            parsed = cp.parse([(elem.text, elem.tag) for elem in record])
            tb_etree.append(self._tree2etree(parsed))
        return tb_etree 
Author: Thejas-1, Project: Price-Comparator, Lines: 34, Source: toolbox.py

Example 5: add_default_fields

# Required import: from nltk import data [as alias]
# Or: from nltk.data import find [as alias]
def add_default_fields(elem, default_fields):
    """
    Add blank elements and subelements specified in default_fields.

    :param elem: toolbox data in an elementtree structure
    :type elem: ElementTree._ElementInterface
    :param default_fields: fields to add to each type of element and subelement
    :type default_fields: dict(tuple)
    """
    for field in default_fields.get(elem.tag,  []):
        if elem.find(field) is None:
            SubElement(elem, field)
    for child in elem:
        add_default_fields(child, default_fields) 
Author: Thejas-1, Project: Price-Comparator, Lines: 16, Source: toolbox.py

Example 6: demo

# Required import: from nltk import data [as alias]
# Or: from nltk.data import find [as alias]
def demo():
    from itertools import islice

#    zip_path = find('corpora/toolbox.zip')
#    lexicon = ToolboxData(ZipFilePathPointer(zip_path, 'toolbox/rotokas.dic')).parse()
    file_path = find('corpora/toolbox/rotokas.dic')
    lexicon = ToolboxData(file_path).parse()
    print('first field in fourth record:')
    print(lexicon[3][0].tag)
    print(lexicon[3][0].text)

    print('\nfields in sequential order:')
    for field in islice(lexicon.find('record'), 10):
        print(field.tag, field.text)

    print('\nlx fields:')
    for field in islice(lexicon.findall('record/lx'), 10):
        print(field.text)

    settings = ToolboxSettings()
    file_path = find('corpora/toolbox/MDF/MDF_AltH.typ')
    settings.open(file_path)
#    settings.open(ZipFilePathPointer(zip_path, entry='toolbox/MDF/MDF_AltH.typ'))
    tree = settings.parse(unwrap=False, encoding='cp1252')
    print(tree.find('expset/expMDF/rtfPageSetup/paperSize').text)
    settings_tree = ElementTree(tree)
    print(to_settings_string(settings_tree).encode('utf8')) 
Author: Thejas-1, Project: Price-Comparator, Lines: 29, Source: toolbox.py

Example 7: __init__

# Required import: from nltk import data [as alias]
# Or: from nltk.data import find [as alias]
def __init__(self, load=True):
        '''
        :param load: Load the pickled model upon instantiation.
        '''
        self.model = AveragedPerceptron()
        self.tagdict = {}
        self.classes = set()
        if load:
            AP_MODEL_LOC = str(find('taggers/averaged_perceptron_tagger/'+PICKLE))
            self.load(AP_MODEL_LOC) 
Author: Thejas-1, Project: Price-Comparator, Lines: 12, Source: perceptron.py

Example 8: __init__

# Required import: from nltk import data [as alias]
# Or: from nltk.data import find [as alias]
def __init__(self, load=True):
        '''
        :param load: Load the pickled model upon instantiation.
        '''
        self.model = AveragedPerceptron()
        self.tagdict = {}
        self.classes = set()
        if load:
            AP_MODEL_LOC = 'file:'+str(find('taggers/averaged_perceptron_tagger/'+PICKLE))
            self.load(AP_MODEL_LOC) 
Author: SignalMedia, Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda, Lines: 12, Source: perceptron.py
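
Examples 7 and 8 are the same constructor from two snapshots of NLTK's perceptron.py and differ only in how the model location is passed to load(): Example 7 hands over the bare filesystem path produced by find(), while Example 8 prefixes it with 'file:' to form an explicit URL, a form nltk.data.load() also understands. A small sketch of the two forms (the PICKLE filename is an assumption based on the averaged_perceptron_tagger data package, which must be installed for find() to succeed):

from nltk.data import find

# Assumed model filename; in NLTK's perceptron.py this constant is defined
# elsewhere in the module.
PICKLE = 'averaged_perceptron_tagger.pickle'

bare_path = str(find('taggers/averaged_perceptron_tagger/' + PICKLE))
url_form = 'file:' + bare_path  # explicit file: URL form, as in Example 8
print(bare_path)
print(url_form)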

Example 9: nltk_download_corpus

# Required import: from nltk import data [as alias]
# Or: from nltk.data import find [as alias]
def nltk_download_corpus(resource_path):
    """
    Download the specified NLTK corpus file
    unless it has already been downloaded.

    Returns True if the corpus needed to be downloaded.
    """
    from nltk.data import find
    from nltk import download
    from os.path import split

    # Download the wordnet data only if it is not already downloaded
    _, corpus_name = split(resource_path)

    ## From http://www.nltk.org/api/nltk.html ##
    # When using find() to locate a directory contained in a zipfile,
    # the resource name must end with the forward slash character.
    # Otherwise, find() will not locate the directory.
    ####
    # Helps when resource_path == 'sentiment/vader_lexicon'
    if not resource_path.endswith('/'):
        resource_path = resource_path + '/'

    downloaded = False

    try:
        find(resource_path)
    except LookupError:
        download(corpus_name)
        downloaded = True

    return downloaded 
Author: isipalma, Project: Tutorial-Chatterbot, Lines: 34, Source: utils.py
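
A possible way of calling the helper above (a sketch only; it assumes nltk_download_corpus is in scope and uses standard NLTK resource identifiers for illustration):

# Returns True only when a download actually had to happen.
if nltk_download_corpus('sentiment/vader_lexicon'):
    print('vader_lexicon downloaded')
else:
    print('vader_lexicon was already available')

# Ordinary corpora work the same way; the trailing slash is appended automatically.
nltk_download_corpus('corpora/stopwords')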

Example 10: build_model

# Required import: from nltk import data [as alias]
# Or: from nltk.data import find [as alias]
def build_model(fmt='binary'):
    print('Loading training data...')
    train_paths = [find('corpora/ace_data/ace.dev'),
                   find('corpora/ace_data/ace.heldout'),
                   find('corpora/ace_data/bbn.dev'),
                   find('corpora/ace_data/muc.dev')]
    train_trees = load_ace_data(train_paths, fmt)
    train_data = [postag_tree(t) for t in train_trees]
    print('Training...')
    cp = NEChunkParser(train_data)
    del train_data

    print('Loading eval data...')
    eval_paths = [find('corpora/ace_data/ace.eval')]
    eval_trees = load_ace_data(eval_paths, fmt)
    eval_data = [postag_tree(t) for t in eval_trees]

    print('Evaluating...')
    chunkscore = ChunkScore()
    for i, correct in enumerate(eval_data):
        guess = cp.parse(correct.leaves())
        chunkscore.score(correct, guess)
        if i < 3: cmp_chunks(correct, guess)
    print(chunkscore)

    outfilename = '/tmp/ne_chunker_{0}.pickle'.format(fmt)
    print('Saving chunker to {0}...'.format(outfilename))

    with open(outfilename, 'wb') as outfile:
        pickle.dump(cp, outfile, -1)

    return cp 
Author: sdoran35, Project: hate-to-hugs, Lines: 34, Source: named_entity.py

Example 11: test_corpus_bleu

# Required import: from nltk import data [as alias]
# Or: from nltk.data import find [as alias]
def test_corpus_bleu(self):
        ref_file = find('models/wmt15_eval/ref.ru')
        hyp_file = find('models/wmt15_eval/google.ru')
        mteval_output_file = find('models/wmt15_eval/mteval-13a.output')

        # Reads the BLEU scores from the `mteval-13a.output` file.
        # The order of the list corresponds to the order of the ngrams.
        with open(mteval_output_file, 'r') as mteval_fin:
            # The numbers are located in the last 2nd line of the file.
            # The first and 2nd item in the list are the score and system names.
            mteval_bleu_scores = map(float, mteval_fin.readlines()[-2].split()[1:-1])

        with io.open(ref_file, 'r', encoding='utf8') as ref_fin:
            with io.open(hyp_file, 'r', encoding='utf8') as hyp_fin:
                # Whitespace tokenize the file.
                # Note: split() automatically strip().
                hypothesis = list(map(lambda x: x.split(), hyp_fin))
                # Note that the corpus_bleu input is list of list of references.
                references = list(map(lambda x: [x.split()], ref_fin))
                # Without smoothing.
                for i, mteval_bleu in zip(range(1,10), mteval_bleu_scores):
                    nltk_bleu = corpus_bleu(references, hypothesis, weights=(1.0/i,)*i)
                    # Check that the BLEU scores difference is less than 0.005 .
                    # Note: This is an approximate comparison; as much as
                    #       +/- 0.01 BLEU might be "statistically significant",
                    #       the actual translation quality might not be.
                    assert abs(mteval_bleu - nltk_bleu) < 0.005

                # With the same smoothing method used in mteval-v13a.pl
                chencherry = SmoothingFunction()
                for i, mteval_bleu in zip(range(1,10), mteval_bleu_scores):
                    nltk_bleu = corpus_bleu(references, hypothesis,
                                            weights=(1.0/i,)*i,
                                            smoothing_function=chencherry.method3)
                    assert abs(mteval_bleu - nltk_bleu) < 0.005 
Author: sdoran35, Project: hate-to-hugs, Lines: 37, Source: test_bleu.py

Example 12: _vocabulary

# Required import: from nltk import data [as alias]
# Or: from nltk.data import find [as alias]
def _vocabulary(self):
        return (
            data.find('stemmers/porter_test/porter_vocabulary.txt')
                .open(encoding='utf-8')
                .read()
                .splitlines()
        ) 
Author: sdoran35, Project: hate-to-hugs, Lines: 9, Source: test_stem.py

Example 13: test_vocabulary_original_mode

# Required import: from nltk import data [as alias]
# Or: from nltk.data import find [as alias]
def test_vocabulary_original_mode(self):
        # The list of stems for this test was generated by taking the
        # Martin-blessed stemmer from
        # http://tartarus.org/martin/PorterStemmer/c.txt
        # and removing all the --DEPARTURE-- sections from it and
        # running it against Martin's test vocabulary.
        self._test_against_expected_output(
            PorterStemmer.ORIGINAL_ALGORITHM,
            data.find('stemmers/porter_test/porter_original_output.txt')
                .open(encoding='utf-8')
                .read()
                .splitlines()
        ) 
Author: sdoran35, Project: hate-to-hugs, Lines: 15, Source: test_stem.py

Example 14: demo

# Required import: from nltk import data [as alias]
# Or: from nltk.data import find [as alias]
def demo():
    """This assumes the Python module bllipparser is installed."""

    # download and install a basic unified parsing model (Wall Street Journal)
    # sudo python -m nltk.downloader bllip_wsj_no_aux

    from nltk.data import find
    model_dir = find('models/bllip_wsj_no_aux').path

    print('Loading BLLIP Parsing models...')
    # the easiest way to get started is to use a unified model
    bllip = BllipParser.from_unified_model_dir(model_dir)
    print('Done.')

    sentence1 = 'British left waffles on Falklands .'.split()
    sentence2 = 'I saw the man with the telescope .'.split()
    # this sentence is known to fail under the WSJ parsing model
    fail1 = '# ! ? : -'.split()
    for sentence in (sentence1, sentence2, fail1):
        print('Sentence: %r' % ' '.join(sentence))
        try:
            tree = next(bllip.parse(sentence))
            print(tree)
        except StopIteration:
            print("(parse failed)")

    # n-best parsing demo
    for i, parse in enumerate(bllip.parse(sentence1)):
        print('parse %d:\n%s' % (i, parse))

    # using external POS tag constraints
    print("forcing 'tree' to be 'NN':",
          next(bllip.tagged_parse([('A', None), ('tree', 'NN')])))
    print("forcing 'A' to be 'DT' and 'tree' to be 'NNP':",
          next(bllip.tagged_parse([('A', 'DT'), ('tree', 'NNP')])))
    # constraints don't have to make sense... (though on more complicated
    # sentences, they may cause the parse to fail)
    print("forcing 'A' to be 'NNP':",
          next(bllip.tagged_parse([('A', 'NNP'), ('tree', None)]))) 
Author: sdoran35, Project: hate-to-hugs, Lines: 41, Source: bllip.py

Example 15: _get_tagger

# Required import: from nltk import data [as alias]
# Or: from nltk.data import find [as alias]
def _get_tagger(lang=None):
    if lang == 'rus':
        tagger = PerceptronTagger(False)
        ap_russian_model_loc = 'file:' + str(find(RUS_PICKLE))
        tagger.load(ap_russian_model_loc)
    elif lang == 'eng':
        tagger = PerceptronTagger()
    else:
        tagger = PerceptronTagger()
    return tagger 
Author: sdoran35, Project: hate-to-hugs, Lines: 12, Source: __init__.py


Note: The nltk.data.find examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets are drawn from open-source projects contributed by their respective developers, and copyright of the source code remains with the original authors. Please consult the corresponding project's license before distributing or using the code; do not reproduce this article without permission.