

Python tokens.DocumentFeature Class Code Examples

This article collects typical usage examples of the Python class discoutils.tokens.DocumentFeature. If you are wondering how the DocumentFeature class is used in practice - what it does, how to call it, and what real code that uses it looks like - the hand-picked examples below should help.


Below are 15 code examples of the DocumentFeature class, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
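Before the individual examples, here is a minimal orientation sketch of the DocumentFeature API as it appears on this page (from_string, type, tokens, slicing and smart_lower). This sketch is not taken from the project itself: it assumes discoutils is installed and that the default separators are '/' for PoS tags and '_' for n-grams, as Examples 1, 3 and 13 below suggest.

# minimal orientation sketch (an assumption-laden illustration, not part of the original examples)
from discoutils.tokens import DocumentFeature

DocumentFeature.recompile_pattern()              # reset to the default separators, as Example 3 does

df = DocumentFeature.from_string('big/J_cat/N')  # parse an adjective-noun phrase
print(df.type)    # expected to be 'AN', one of the feature types listed in Example 8
print(df.tokens)  # the underlying Token objects
print(df[0])      # slicing yields DocumentFeature objects, as Example 3 shows

# the word is lowercased but the PoS tag is preserved (see Example 1)
print(DocumentFeature.smart_lower('Cat/N'))      # 'cat/N'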

Example 1: test_smart_lower

def test_smart_lower():
    # test that the PoS of an n-gram entry is not lowercased
    assert DocumentFeature.smart_lower('Cat/N') == 'cat/N'
    assert DocumentFeature.smart_lower('Cat/n') == 'cat/n'
    assert DocumentFeature.smart_lower('Red/J_CaT/N') == 'red/J_cat/N'
    assert DocumentFeature.smart_lower('Red/J CaT/N', separator=' ') == 'red/J cat/N'
    # test that features are not touched
    assert DocumentFeature.smart_lower('amod-DEP:former', lowercasing=False) == 'amod-DEP:former'
Author: jt86, Project: DiscoUtils, Lines: 8, Source: test_token.py

Example 2: remove_overlapping_neighbours

    def remove_overlapping_neighbours(cls, entry, to_insert):
        """

        :type entry: DocumentFeature or str
        :type to_insert: list of (str, float) tuples
        """
        if isinstance(entry, (six.string_types, six.text_type)):
            entry = DocumentFeature.from_string(entry)
        features = [(DocumentFeature.from_string(x[0]), x[1]) for x in to_insert]
        to_insert = [(f[0].tokens_as_str(), f[1]) for f in features
                     if not any(t in entry.tokens for t in f[0].tokens)]
        return to_insert
Author: jt86, Project: DiscoUtils, Lines: 12, Source: thesaurus_loader.py

Example 3: test_document_feature_slicing

def test_document_feature_slicing():
    DocumentFeature.recompile_pattern()
    x = DocumentFeature.from_string('big/J_cat/N')
    assert x[0] == DocumentFeature.from_string('big/J')
    assert x[1] == DocumentFeature.from_string('cat/N')
    assert x[1] == DocumentFeature('1-GRAM', (Token('cat', 'N', 1), ))
    assert x[0:] == DocumentFeature.from_string('big/J_cat/N')

    x = DocumentFeature.from_string('cat/N')
    assert x[0] == DocumentFeature.from_string('cat/N')
    assert x[0:] == DocumentFeature.from_string('cat/N')
    assert x[:] == DocumentFeature.from_string('cat/N')
Author: mbatchkarov, Project: DiscoUtils, Lines: 12, Source: test_token.py

Example 4: contains_impl

    def contains_impl(self, feature):
        if isinstance(feature, six.string_types):
            feature = DocumentFeature.from_string(feature)
        if feature.type not in self.entry_types:
            # no point in composing single-word document features
            return False
        return str(feature[self.hardcoded_index]) in self.unigram_source
Author: mbatchkarov, Project: vector_builder, Lines: 7, Source: vectorstore.py

Example 5: to_tsv

    def to_tsv(self, events_path, entries_path='', features_path='',
               entry_filter=lambda x: True, row_transform=lambda x: x,
               gzipped=False, enforce_word_entry_pos_format=True, dense_hd5=False):
        """
        Writes this thesaurus to a Byblo-compatible file like the one it was most likely read from. In the
        process converts all entries to a DocumentFeature, so all entries must be parsable into one. May reorder the
        features of each entry.

        :param events_path: file to write to
        :param entry_filter: Called for every DocumentFeature that is an entry in this thesaurus. The vector will
         only be written if this callable returns True
        :param row_transform: Callable, any transformation that might need to be done to each entry before converting
         it to a DocumentFeature. This is needed because some entries (e.g. african/J:amod-HEAD:leader) are not
         directly convertible (needs to be african/J_leader/N). Use this if the entries cannot be converted to
         DocumentFeature, e.g. if the data isn't PoS tagged.
        :param dense_hd5: if True, convert to a pandas `DataFrame` and write to a compressed HDF file. This is about
         30% faster and produces about 30% smaller files than using `gzipped`. This is only suitable for matrices
         with a small number of columns - this method enforces a hard limit of 1000. Requires PyTables and HDF5.
        :return: the file name
        """
        if enforce_word_entry_pos_format:
            rows = {i: DocumentFeature.from_string(row_transform(feat)) for (feat, i) in self.name2row.items()}
        else:
            rows = {i: feat for (feat, i) in self.name2row.items()}

        if dense_hd5 and len(self.columns) <= 1000:
            write_vectors_to_hdf(self.matrix, self.row_names, self.columns, events_path)
        else:
            write_vectors_to_disk(coo_matrix(self.matrix), rows, self.columns, events_path,
                                  features_path=features_path, entries_path=entries_path,
                                  entry_filter=entry_filter, gzipped=gzipped)
        return events_path
Author: tttthomasssss, Project: DiscoUtils, Lines: 33, Source: thesaurus_loader.py

Example 6: reformat_socher_vectors

def reformat_socher_vectors():
    """
    Formats the files output by Socher (2011)'s Matlab code into Byblo-compatible files.

    Before running this, a list of all phrases needs to be extracted from the labelled data, and these need to
    be composed with Socher's Matlab code. See note "Socher vectors" in Evernote.

    """
    logging.info('Reformatting events file %s ---> %s',
                 socher_output_vectors_file, socher_composed_vectors_file)

    # socher's code removes all PoS tags, so we can't translate his output
    # back to a DocumentFeature. Let's read the input to his code instead and
    # get the corresponding output vectors
    # get a list of all phrases that we attempted to compose
    with open(plaintext_socher_input_file) as infile:
        composed_phrases = [DocumentFeature.from_string(line.strip()) for line in infile]

    # get a list of all phrases where composition worked (no unknown words)
    with open(socher_output_phrases_file) as infile:
        success = [i for i, line in enumerate(infile) if '*UNKNOWN*' not in line]
    # pick out just the phrases that composed successfully
    composed_phrases = itemgetter(*success)(composed_phrases)

    # load all vectors, remove these containing unknown words
    mat = np.loadtxt(socher_output_vectors_file, delimiter=',')
    mat = mat[success, :]
    assert len(composed_phrases) == mat.shape[0]  # same number of rows

    # do the actual writing
    write_vectors_to_hdf(sp.coo_matrix(mat),
                         composed_phrases,
                         ['RAE-feat%d' % i for i in range(100)],  # Socher provides 100-dimensional vectors
                         socher_composed_vectors_file)
Author: mbatchkarov, Project: vector_builder, Lines: 34, Source: socher_vectors.py

Example 7: train_verb_tensors

def train_verb_tensors(svos_file, noun_vectors_file, output_filename):
    """
    Trains Verb-bar matrices, as described in Milajevs et al. (EMNLP-14, §3)
    :param svos_file: file containing a list of all SVOs in unlabelled data, one per line. May contain other document
     features too. Such a file is output by `find_all_NPs.py`, which is called from `observed_vectors.py`
    :param noun_vectors_file: a vector store containing noun vectors
    :param output_filename: name of the output file - must identify the noun vectors and the unlabelled corpus
    """
    mkdirs_if_not_exists(os.path.dirname(output_filename))

    v = Vectors.from_tsv(noun_vectors_file)

    with open(svos_file) as infile:
        phrases = set()
        for line in infile:
            if DocumentFeature.from_string(line.strip()).type == 'SVO':
                phrases.add(tuple(line.strip().split('_')))
    phrases = [(subj, verb, obj) for subj, verb, obj in phrases if subj in v and obj in v]
    phrases = sorted(phrases, key=itemgetter(1))
    logging.info('Found %d SVOs in list', len(phrases))

    verb_tensors = dict()
    for verb, svos in groupby(phrases, itemgetter(1)):
        svos = list(svos)
        if len(svos) < MIN_SVO_PER_VERB:
            continue
        logging.info('Training matrix for %s from %d SVOs', verb, len(svos))
        vt = np.sum(np.outer(v.get_vector(subj).A, v.get_vector(obj).A) for subj, _, obj in svos)
        verb_tensors[verb] = vt

    logging.info('Trained %d verb matrices, saving...', len(verb_tensors))
    for verb, tensor in verb_tensors.items():
        df = pd.DataFrame(tensor)
        df.to_hdf(output_filename, verb.split('/')[0], complevel=9, complib='zlib')
Author: mbatchkarov, Project: vector_builder, Lines: 34, Source: categorical_composers.py

Example 8: get_all_document_features

def get_all_document_features(include_unigrams=False, remove_pos=False):
    """
    Finds all adj-noun and noun-noun compounds, verb-object and SVO features (and optionally unigrams) in all
    labelled corpora mentioned in the conf files.
    :param include_unigrams: if False, only phrases (no unigrams) will be returned
    :param remove_pos: whether to remove PoS tags if present; the result will be either "cat/N" or "cat"
    :rtype: set of DocumentFeature
    """
    result = set()
    accepted_df_types = {'AN', 'NN', 'VO', 'SVO', '1-GRAM'} if include_unigrams else {'AN', 'NN', 'VO', 'SVO'}
    for corpus_name, _ in get_all_corpora():
        path = os.path.abspath(os.path.join(__file__, '..', '..', '..', ROOT, '%s_all_features.txt' % corpus_name))
        with open(path) as infile:
            for line in infile:
                df = DocumentFeature.from_string(line.strip())
                if df.type in accepted_df_types:
                    if remove_pos:
                        # todo these are of type str, in the other branch it's DocumentFeature. things will likely break
                        result.add(df.ngram_separator.join(t.text for t in df.tokens))
                    else:
                        result.add(df)

    logging.info('Found a total of %d features in all corpora', len(result))
    if not remove_pos:
        logging.info('Their types are %r', Counter(df.type for df in result))
    if include_unigrams:
        logging.info('PoS tags of unigrams are %r',
                     Counter(df.tokens[0].pos for df in result if df.type == '1-GRAM'))
    else:
        logging.info('Unigram features not included!')
    return result
Author: mbatchkarov, Project: dc_evaluation, Lines: 31, Source: compress_labelled_data.py

Example 9: test_write_vectors_to_disk

def test_write_vectors_to_disk(resources, tmpdir):
    """
    Checks the entries/features files; the events file is checked by
    thesisgenerator.tests.test_thesaurus.test_to_file

    :type th: Thesaurus
    """
    th, expected_entries, expected_features, filter_callable = resources
    events_file = str(tmpdir.join('events.txt'))
    entries_file = str(tmpdir.join('entries.txt'))
    features_file = str(tmpdir.join('features.txt'))

    if not th: # empty thesaurus should raise an error
        with pytest.raises(ValueError):
            matrix, cols, rows = th.to_sparse_matrix()
    else:
        matrix, cols, rows = th.to_sparse_matrix()
        rows = [DocumentFeature.from_string(x) for x in rows]
        write_vectors_to_disk(sp.coo_matrix(matrix), rows, cols,
                              events_file, features_file, entries_file,
                              entry_filter=filter_callable)

        if expected_entries:
            # the file will not be written at all if there's nothing to put in it
            entries = [x.split('\t')[0] for x in _read_and_strip_lines(entries_file)]
            assert set(entries) == set(expected_entries)
        else:
            assert not os.path.exists(entries_file)

        if expected_features:
            features = [x.split('\t')[0] for x in _read_and_strip_lines(features_file)]
            assert features == expected_features
        else:
            assert not os.path.exists(features_file)
Author: jt86, Project: DiscoUtils, Lines: 34, Source: test_io_utils.py

Example 10: __iter__

    def __iter__(self):
        for fname in self.files:
            filename = join(self.dirname, fname)
            infile = gzip.open(filename) if is_gzipped(filename) else open(filename)
            with contextlib.closing(infile):
                for line in infile:
                    # yield gensim.utils.tokenize(line, lower=True)
                    if isinstance(line, bytes):
                        line = line.decode()
                    res = [DocumentFeature.smart_lower(w) for w in line.split() if
                           DocumentFeature.from_string(w).type != 'EMPTY']
                    if len(res) > 8:
                        # ignore short sentences, they are probably noise
                        if self.remove_pos:
                            yield [x.split('/')[0] for x in res]
                        else:
                            yield res
Author: mbatchkarov, Project: vector_builder, Lines: 17, Source: get_word2vec_vectors.py

Example 11: get_vector

    def get_vector(self, feature):
        """
        :type feature: DocumentFeature
        :rtype: scipy.sparse.csr_matrix
        """
        if isinstance(feature, six.string_types):
            feature = DocumentFeature.from_string(feature)
        return sp.csr_matrix(reduce(self.function,
                                    [self.unigram_source.get_vector(str(t)).A for t in feature[:]]))
Author: mbatchkarov, Project: vector_builder, Lines: 9, Source: vectorstore.py

Example 12: __contains__

    def __contains__(self, feature):
        if isinstance(feature, six.string_types):
            feature = DocumentFeature.from_string(feature)

        # True iff this is an SVO and we have a verb tensor and vectors for both arguments
        return feature.type in self.entry_types and \
               str(feature[1]) in self.verb_tensors and \
               str(feature[0]) in self.unigram_source and \
               str(feature[2]) in self.unigram_source
Author: mbatchkarov, Project: vector_builder, Lines: 9, Source: vectorstore.py

Example 13: test_with_different_separators

def test_with_different_separators():
    DocumentFeature.recompile_pattern(pos_separator='_', ngram_separator='!')
    assert DocumentFeature('2-GRAM', (Token('very', 'RB'), Token('big', 'J'))) == \
           DocumentFeature.from_string('very_RB!big_J')

    DocumentFeature.recompile_pattern(pos_separator='-', ngram_separator=' ')
    assert DocumentFeature('1-GRAM', (Token('very', 'RB'),)) == DocumentFeature.from_string('very-RB')
    assert DocumentFeature('2-GRAM', (Token('very', 'RB'), Token('big', 'J'))) == \
           DocumentFeature.from_string('very-RB big-J')
Author: mbatchkarov, Project: DiscoUtils, Lines: 9, Source: test_token.py

Example 14: _paraphrase

    def _paraphrase(self, feature, vocabulary, j_indices, values, stats, **kwargs):
        """
        Replaces a term with its k nearest neighbours from the thesaurus

        Parameters
        ----------
        neighbour_source : callable, returns a thesaurus-like object (a list of
          (neighbour, sim) tuples, sorted by highest sim first,
          acts as a defaultdict(list)). The callable takes one parameter for
          compatibility purposes - one of the possible callables I want to
          use here requires access to the vocabulary.
          The default behaviour is to return a callable pointing to the
          currently loaded thesaurus.
        """

        # logging.debug('Paraphrasing %r in doc %d', feature, doc_id)
        neighbours = self.thesaurus.get_nearest_neighbours(feature)
        if self.thesaurus.__class__.__name__ == 'Thesaurus':
            # todo this will also activate for DenseVectors, because they are also instances of thesaurus
            # the check needs to be self.thesaurus.__class__.__name__ == 'Thesaurus', but then
            # we need to make sure init_sims is called with the correct vocabulary so that all neighbours are IV

            # precomputed thesauri do not guarantee that the returned neighbours will be in vocabulary
            # these should by now only be used in testing though
            neighbours = [(neighbour, sim) for (neighbour, sim) in neighbours
                          if DocumentFeature.from_string(neighbour) in vocabulary]
        event = [str(feature), len(neighbours)]
        for neighbour, sim in neighbours[:self.k]:
            # the document may already contain the feature we
            # are about to insert into it,
            # a merging strategy is required,
            # e.g. what do we do if the document has the word X
            # in it and we encounter X again. By default,
            # scipy uses addition
            df = DocumentFeature.from_string(neighbour)
            j_indices.append(vocabulary.get(df))
            values.append(self.sim_transformer(sim))
            # track the event
            event.extend([neighbour, sim])
        stats.register_paraphrase(tuple(event))
Author: mbatchkarov, Project: dc_evaluation, Lines: 40, Source: feature_handlers.py

Example 15: filter_out_infrequent_entries

def filter_out_infrequent_entries(desired_counts_per_feature_type, vectors):
    logging.info('Converting thesaurus to sparse matrix')
    mat, cols, rows = vectors.to_sparse_matrix()
    logging.info('Got a data matrix of shape %r', mat.shape)
    # convert to document feature for access to PoS tag
    document_features = [DocumentFeature.from_string(r) for r in rows]
    # don't want to do dimensionality reduction on composed vectors
    feature_types = [df.type for df in document_features]
    assert all(x == '1-GRAM' or x == 'AN' or x == 'NN' for x in feature_types), Counter(feature_types)
    # get the PoS tags of each row in the matrix
    pos_tags = np.array([df.tokens[0].pos if df.type == '1-GRAM' else df.type for df in document_features])
    # find the rows of the matrix that correspond to the most frequent nouns, verbs, ...,
    # as measured by sum of feature counts. This is Byblo's definition of frequency (which is in fact a marginal),
    # but it is strongly correlated with what one normally thinks of as entry frequency
    desired_rows = []
    if desired_counts_per_feature_type is not None:
        for desired_pos, desired_count in desired_counts_per_feature_type:
            row_of_current_pos = pos_tags == desired_pos  # boolean mask: which rows have the desired PoS tag
            # indices of the array sorted by row sum, and where the pos == desired_pos
            if desired_count > 0:
                sorted_idx_by_sum = np.ravel(mat.sum(1)).argsort()
                row_of_current_pos = row_of_current_pos[sorted_idx_by_sum]
                sorted_idx_and_pos_matching = sorted_idx_by_sum[row_of_current_pos]
                # slice off the top desired_count and store them
                desired_rows.extend(list(sorted_idx_and_pos_matching[-desired_count:]))
            else:
                # do not include
                pass

            logging.info('Frequency filter keeping %d/%d %s entries ', desired_count,
                         sum(row_of_current_pos), desired_pos)
    else:
        logging.info('Not filtering any of the entries')
        desired_rows = range(len(vectors))

    # remove the vectors for infrequent entries, update list of pos tags too
    if desired_counts_per_feature_type is not None:
        # if some rows have been removed update respective data structures
        mat = mat[desired_rows, :]
        rows = itemgetter(*desired_rows)(document_features)
        pos_tags = pos_tags[desired_rows]

        # removing rows may empty some columns, so remove these as well. This is probably not very likely to occur,
        # as we have already filtered out infrequent features, so the column count will stay roughly the same
        desired_cols = np.ravel(mat.sum(0)) > 0
        mat = mat[:, desired_cols]
        col_indices = list(np.where(desired_cols)[0])
        cols = itemgetter(*col_indices)(cols)

    logging.info('Selected only the most frequent entries, matrix size is now %r', mat.shape)
    assert mat.shape == (len(rows), len(cols))
    return mat, pos_tags, rows, cols
Author: mbatchkarov, Project: DiscoUtils, Lines: 52, Source: reduce_dimensionality.py


Note: The discoutils.tokens.DocumentFeature class examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub/MSDocs. The code snippets are selected from open-source projects contributed by various developers; copyright of the source code belongs to the original authors. For distribution and use, please refer to the License of the corresponding project. Do not reproduce without permission.