當前位置: 首頁>>代碼示例>>Python>>正文


Python DocumentFeature.from_string方法代碼示例

本文整理匯總了Python中discoutils.tokens.DocumentFeature.from_string方法的典型用法代碼示例。如果您正苦於以下問題:Python DocumentFeature.from_string方法的具體用法?Python DocumentFeature.from_string怎麼用?Python DocumentFeature.from_string使用的例子?那麼, 這裡精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在discoutils.tokens.DocumentFeature的用法示例。


在下文中一共展示了DocumentFeature.from_string方法的15個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Python代碼示例。

示例1: test_with_different_separators

# 需要導入模塊: from discoutils.tokens import DocumentFeature [as 別名]
# 或者: from discoutils.tokens.DocumentFeature import from_string [as 別名]
def test_with_different_separators():
    """Parsing must honour whichever PoS / n-gram separators were compiled last."""
    # underscore between word and PoS tag, bang between the tokens of an n-gram
    DocumentFeature.recompile_pattern(pos_separator='_', ngram_separator='!')
    expected_bigram = DocumentFeature('2-GRAM', (Token('very', 'RB'), Token('big', 'J')))
    assert expected_bigram == DocumentFeature.from_string('very_RB!big_J')

    # dash between word and PoS tag, space between the tokens of an n-gram
    DocumentFeature.recompile_pattern(pos_separator='-', ngram_separator=' ')
    expected_unigram = DocumentFeature('1-GRAM', (Token('very', 'RB'),))
    assert expected_unigram == DocumentFeature.from_string('very-RB')
    expected_bigram = DocumentFeature('2-GRAM', (Token('very', 'RB'), Token('big', 'J')))
    assert expected_bigram == DocumentFeature.from_string('very-RB big-J')
開發者ID:mbatchkarov,項目名稱:DiscoUtils,代碼行數:11,代碼來源:test_token.py

示例2: remove_overlapping_neighbours

# 需要導入模塊: from discoutils.tokens import DocumentFeature [as 別名]
# 或者: from discoutils.tokens.DocumentFeature import from_string [as 別名]
    def remove_overlapping_neighbours(cls, entry, to_insert):
        """
        Drop the neighbours in `to_insert` that share at least one token with `entry`.

        :param entry: the feature the neighbours belong to; a string is parsed with
         `DocumentFeature.from_string` first
        :type entry: DocumentFeature or str
        :param to_insert: candidate neighbours, each a (feature string, score) pair
        :type to_insert: list of (str, float) tuples
        :return: list of (str, score) pairs for the neighbours that survive, where the
         string is what `tokens_as_str()` returns for the parsed neighbour
        """
        if isinstance(entry, (six.string_types, six.text_type)):
            entry = DocumentFeature.from_string(entry)

        kept = []
        for pair in to_insert:
            neighbour = DocumentFeature.from_string(pair[0])
            overlaps = False
            for token in neighbour.tokens:
                if token in entry.tokens:
                    overlaps = True
                    break
            if not overlaps:
                kept.append((neighbour.tokens_as_str(), pair[1]))
        return kept
開發者ID:jt86,項目名稱:DiscoUtils,代碼行數:14,代碼來源:thesaurus_loader.py

示例3: get_all_document_features

# 需要導入模塊: from discoutils.tokens import DocumentFeature [as 別名]
# 或者: from discoutils.tokens.DocumentFeature import from_string [as 別名]
def get_all_document_features(include_unigrams=False, remove_pos=False):
    """
    Finds all noun-noun and adj-noun compounds (and optionally adjs and nouns) in all labelled corpora
    mentioned in the conf files.

    Features are read from one `<corpus_name>_all_features.txt` file per corpus returned by
    `get_all_corpora()`; every line of such a file is parsed with `DocumentFeature.from_string`.

    :param include_unigrams: if False, only features of type AN, NN, VO and SVO will be returned;
     if True, features of type 1-GRAM are accepted as well
    :param remove_pos: whether to remove PoS tags if present, result will be either "cat/N" or "cat".
     NOTE: when True the returned set contains plain `str`, otherwise `DocumentFeature` objects
    :rtype: set of DocumentFeature (or set of str if `remove_pos` is True)
    """
    result = set()
    accepted_df_types = {'AN', 'NN', 'VO', 'SVO'}
    if include_unigrams:
        accepted_df_types.add('1-GRAM')

    for corpus_name, _ in get_all_corpora():
        path = os.path.abspath(os.path.join(__file__, '..', '..', '..', ROOT, '%s_all_features.txt' % corpus_name))
        with open(path) as infile:
            for line in infile:
                df = DocumentFeature.from_string(line.strip())
                if df.type not in accepted_df_types:
                    continue
                if remove_pos:
                    # these are of type str, in the other branch it's DocumentFeature
                    result.add(df.ngram_separator.join(t.text for t in df.tokens))
                else:
                    result.add(df)

    logging.info('Found a total of %d features in all corpora', len(result))
    if not remove_pos:
        logging.info('Their types are %r', Counter(df.type for df in result))
    if not include_unigrams:
        logging.info('Unigram features not included!')
    elif not remove_pos:
        # Only DocumentFeature objects carry type/PoS information. With remove_pos=True the
        # set holds plain strings, and inspecting `.type` on them would raise AttributeError.
        logging.info('PoS tags of unigrams are are %r',
                     Counter(df.tokens[0].pos for df in result if df.type == '1-GRAM'))
    return result
開發者ID:mbatchkarov,項目名稱:dc_evaluation,代碼行數:33,代碼來源:compress_labelled_data.py

示例4: contains_impl

# 需要導入模塊: from discoutils.tokens import DocumentFeature [as 別名]
# 或者: from discoutils.tokens.DocumentFeature import from_string [as 別名]
 def contains_impl(self, feature):
     """
     Tell whether this object can handle `feature`.

     :param feature: a DocumentFeature, or a string that is parsed into one
     :return: True if the feature's type is listed in `self.entry_types` and the string form
      of its element at `self.hardcoded_index` is found in `self.unigram_source`
     """
     parsed = feature
     if isinstance(parsed, six.string_types):
         parsed = DocumentFeature.from_string(parsed)

     if parsed.type in self.entry_types:
         return str(parsed[self.hardcoded_index]) in self.unigram_source
     # no point in composing single-word document features
     return False
開發者ID:mbatchkarov,項目名稱:vector_builder,代碼行數:9,代碼來源:vectorstore.py

示例5: to_tsv

# 需要導入模塊: from discoutils.tokens import DocumentFeature [as 別名]
# 或者: from discoutils.tokens.DocumentFeature import from_string [as 別名]
    def to_tsv(self, events_path, entries_path='', features_path='',
               entry_filter=lambda x: True, row_transform=lambda x: x,
               gzipped=False, enforce_word_entry_pos_format=True, dense_hd5=False):
        """
        Writes this thesaurus to a Byblo-compatible file like the one it was most likely read from.
        Unless `enforce_word_entry_pos_format` is switched off, every entry is converted to a
        DocumentFeature along the way, so all entries must be parsable into one. May reorder the
        features of each entry.

        :param events_path: file to write to
        :param entries_path: passed through to `write_vectors_to_disk` (text output only)
        :param features_path: passed through to `write_vectors_to_disk` (text output only)
        :param entry_filter: Called for every DocumentFeature that is an entry in this thesaurus. The
         vector will only be written if this callable returns true. Only forwarded on the text
         output path
        :param row_transform: Callable, any transformation that might need to be done to each entry
         before converting it to a DocumentFeature. This is needed because some entries
         (e.g. african/J:amod-HEAD:leader) are not directly convertible (needs to be
         african/J_leader/N)
        :param gzipped: passed through to `write_vectors_to_disk` (text output only)
        :param enforce_word_entry_pos_format: if true, parse each (transformed) entry name with
         `DocumentFeature.from_string`; if false, entry names are used as they are
        :param dense_hd5: if true and there are at most 1000 columns, write with
         `write_vectors_to_hdf` instead. With more than 1000 columns the regular text output is
         used even if this flag is set. Requires PyTables and HDF5.
        :return: the file name, i.e. `events_path`
        """
        # map row index -> entry; built up-front regardless of the output format
        rows = {}
        for name, idx in self.name2row.items():
            if enforce_word_entry_pos_format:
                rows[idx] = DocumentFeature.from_string(row_transform(name))
            else:
                rows[idx] = name

        if dense_hd5 and len(self.columns) <= 1000:
            # the dense writer takes the row names directly, `rows` is not used here
            write_vectors_to_hdf(self.matrix, self.row_names, self.columns, events_path)
            return events_path

        write_vectors_to_disk(coo_matrix(self.matrix), rows, self.columns, events_path,
                              features_path=features_path, entries_path=entries_path,
                              entry_filter=entry_filter, gzipped=gzipped)
        return events_path
開發者ID:tttthomasssss,項目名稱:DiscoUtils,代碼行數:35,代碼來源:thesaurus_loader.py

示例6: reformat_socher_vectors

# 需要導入模塊: from discoutils.tokens import DocumentFeature [as 別名]
# 或者: from discoutils.tokens.DocumentFeature import from_string [as 別名]
def reformat_socher_vectors():
    """
    Formats the files output by Socher (2011)'s matlab code into byblo-compatible files.

    Before running this a list of all phrases needs to be extracted from the labelled data, and these need to
    be composed with Socher's matlab code. See note "Socher vectors" in Evernote.

    Reads `plaintext_socher_input_file`, `socher_output_phrases_file` and
    `socher_output_vectors_file`, and writes the result to `socher_composed_vectors_file`
    via `write_vectors_to_hdf`. Returns nothing.
    """
    logging.info('Reformatting events file %s ---> %s',
                 socher_output_vectors_file, socher_composed_vectors_file)

    # socher's code removes all PoS tags, so we can't translate his output
    # back to a DocumentFeature. Let's read the input to his code instead and
    # get the corresponding output vectors
    # get a list of all phrases that we attempted to compose
    with open(plaintext_socher_input_file) as infile:
        composed_phrases = [DocumentFeature.from_string(line.strip()) for line in infile]

    # get the line numbers of all phrases where composition worked (no unknown words)
    with open(socher_output_phrases_file) as infile:
        success = [i for i, line in enumerate(infile) if '*UNKNOWN*' not in line]

    # Pick out just the phrases that composed successfully. Explicit indexing is used instead
    # of itemgetter(*success): itemgetter returns a bare item (not a tuple) when given a single
    # index and raises TypeError when given none, which would break the length check below.
    composed_phrases = tuple(composed_phrases[i] for i in success)

    # load all vectors, remove these containing unknown words
    mat = np.loadtxt(socher_output_vectors_file, delimiter=',')
    mat = mat[success, :]
    assert len(composed_phrases) == mat.shape[0]  # same number of rows

    # do the actual writing
    write_vectors_to_hdf(sp.coo_matrix(mat),
                         composed_phrases,
                         ['RAE-feat%d' % i for i in range(100)],  # Socher provides 100-dimensional vectors
                         socher_composed_vectors_file)
開發者ID:mbatchkarov,項目名稱:vector_builder,代碼行數:36,代碼來源:socher_vectors.py

示例7: train_verb_tensors

# 需要導入模塊: from discoutils.tokens import DocumentFeature [as 別名]
# 或者: from discoutils.tokens.DocumentFeature import from_string [as 別名]
def train_verb_tensors(svos_file, noun_vectors_file, output_filename):
    """
    Trains Verb-bar matrices, as described in Milajevs et al (EMNLP-14, §3)

    For every verb with at least `MIN_SVO_PER_VERB` usable SVOs, the matrix is the sum of the
    outer products of the subject and object vectors of those SVOs. An SVO is usable if both
    its subject and its object are contained in the noun vector store.

    :param svos_file: file containing a list of all SVOs in unlabelled data, one per line. May contain other document
     features too. Such a file is output by `find_all_NPs.py`, which is called from `observed_vectors.py`
    :param noun_vectors_file: a vector store containing noun vectors
    :param output_filename: name of output file- must identify the noun vectors and the unlabelled corpus.
     Each matrix is stored in this HDF file under the part of the verb that precedes the first '/'
    """
    mkdirs_if_not_exists(os.path.dirname(output_filename))

    v = Vectors.from_tsv(noun_vectors_file)

    with open(svos_file) as infile:
        phrases = set()
        for line in infile:
            stripped = line.strip()
            if DocumentFeature.from_string(stripped).type == 'SVO':
                phrases.add(tuple(stripped.split('_')))
    phrases = [(subj, verb, obj) for subj, verb, obj in phrases if subj in v and obj in v]
    # groupby only merges adjacent items, so the phrases must be sorted by verb first
    phrases = sorted(phrases, key=itemgetter(1))
    logging.info('Found %d SVOs in list', len(phrases))

    verb_tensors = dict()
    for verb, svos in groupby(phrases, itemgetter(1)):
        svos = list(svos)
        if len(svos) < MIN_SVO_PER_VERB:
            continue
        logging.info('Training matrix for %s from %d SVOs', verb, len(svos))
        # builtin sum: calling np.sum on a generator is deprecated in NumPy (it merely
        # falls back to the builtin after emitting a DeprecationWarning)
        vt = sum(np.outer(v.get_vector(subj).A, v.get_vector(obj).A) for subj, _, obj in svos)
        verb_tensors[verb] = vt

    logging.info('Trained %d verb matrices, saving...', len(verb_tensors))
    for verb, tensor in verb_tensors.items():
        df = pd.DataFrame(tensor)
        # `key` is passed by keyword: recent pandas deprecates passing it positionally
        df.to_hdf(output_filename, key=verb.split('/')[0], complevel=9, complib='zlib')
開發者ID:mbatchkarov,項目名稱:vector_builder,代碼行數:36,代碼來源:categorical_composers.py

示例8: test_write_vectors_to_disk

# 需要導入模塊: from discoutils.tokens import DocumentFeature [as 別名]
# 或者: from discoutils.tokens.DocumentFeature import from_string [as 別名]
def test_write_vectors_to_disk(resources, tmpdir):
    """
    Checks the entries/features files, the events file is checked by
    thesisgenerator.tests.test_thesaurus.test_to_file

    `resources` unpacks into the thesaurus under test, the expected entries, the expected
    features and the entry filter handed to `write_vectors_to_disk`.

    :type th: Thesaurus
    """
    th, expected_entries, expected_features, filter_callable = resources
    events_file, entries_file, features_file = (
        str(tmpdir.join(name)) for name in ('events.txt', 'entries.txt', 'features.txt')
    )

    if not th:
        # empty thesaurus should raise an error, nothing else to verify
        with pytest.raises(ValueError):
            matrix, cols, rows = th.to_sparse_matrix()
        return

    matrix, cols, rows = th.to_sparse_matrix()
    rows = [DocumentFeature.from_string(x) for x in rows]
    write_vectors_to_disk(sp.coo_matrix(matrix), rows, cols,
                          events_file, features_file, entries_file,
                          entry_filter=filter_callable)

    if not expected_entries:
        # the file will not be written at all if there's nothing to put in it
        assert not os.path.exists(entries_file)
    else:
        entries = [x.split('\t')[0] for x in _read_and_strip_lines(entries_file)]
        assert set(entries) == set(expected_entries)

    if not expected_features:
        assert not os.path.exists(features_file)
    else:
        features = [x.split('\t')[0] for x in _read_and_strip_lines(features_file)]
        assert features == expected_features
開發者ID:jt86,項目名稱:DiscoUtils,代碼行數:36,代碼來源:test_io_utils.py

示例9: __contains__

# 需要導入模塊: from discoutils.tokens import DocumentFeature [as 別名]
# 或者: from discoutils.tokens.DocumentFeature import from_string [as 別名]
    def __contains__(self, feature):
        """
        Tell whether `feature` can be handled by this object.

        :param feature: a DocumentFeature, or a string that is parsed into one
        :return: True only if the feature's type is in `self.entry_types`, its element at
         index 1 is in `self.verb_tensors`, and its elements at index 0 and 2 are both in
         `self.unigram_source` (all looked up by their string form)
        """
        if isinstance(feature, six.string_types):
            feature = DocumentFeature.from_string(feature)

        # this is a SVO, we need a verb tensor and vectors for both arguments
        if feature.type not in self.entry_types:
            return False
        if str(feature[1]) not in self.verb_tensors:
            return False
        first_known = str(feature[0]) in self.unigram_source
        return first_known and str(feature[2]) in self.unigram_source
開發者ID:mbatchkarov,項目名稱:vector_builder,代碼行數:11,代碼來源:vectorstore.py

示例10: get_vector

# 需要導入模塊: from discoutils.tokens import DocumentFeature [as 別名]
# 或者: from discoutils.tokens.DocumentFeature import from_string [as 別名]
 def get_vector(self, feature):
     """
     Build a vector for `feature` by folding `self.function` over the vectors that
     `self.unigram_source` returns for each of its elements.

     :param feature: the feature to build a vector for; a string is parsed first
     :type feature: DocumentFeature
     :rtype: scipy.sparse.csr_matrix
     """
     if isinstance(feature, six.string_types):
         feature = DocumentFeature.from_string(feature)

     # one dense array (the `.A` of the stored vector) per element of the feature
     dense_parts = []
     for token in feature[:]:
         dense_parts.append(self.unigram_source.get_vector(str(token)).A)

     combined = reduce(self.function, dense_parts)
     return sp.csr_matrix(combined)
開發者ID:mbatchkarov,項目名稱:vector_builder,代碼行數:11,代碼來源:vectorstore.py

示例11: test_document_feature_slicing

# 需要導入模塊: from discoutils.tokens import DocumentFeature [as 別名]
# 或者: from discoutils.tokens.DocumentFeature import from_string [as 別名]
def test_document_feature_slicing():
    """Indexing and slicing a DocumentFeature must yield DocumentFeature objects."""
    DocumentFeature.recompile_pattern()

    bigram = DocumentFeature.from_string('big/J_cat/N')
    assert bigram[0] == DocumentFeature.from_string('big/J')
    # the second element must equal both the parsed and the hand-built unigram
    for expected in (DocumentFeature.from_string('cat/N'),
                     DocumentFeature('1-GRAM', (Token('cat', 'N', 1), ))):
        assert bigram[1] == expected
    assert bigram[0:] == DocumentFeature.from_string('big/J_cat/N')

    # for a unigram, every way of selecting "everything" gives back the same feature
    unigram = DocumentFeature.from_string('cat/N')
    for piece in (unigram[0], unigram[0:], unigram[:]):
        assert piece == DocumentFeature.from_string('cat/N')
開發者ID:mbatchkarov,項目名稱:DiscoUtils,代碼行數:14,代碼來源:test_token.py

示例12: _paraphrase

# 需要導入模塊: from discoutils.tokens import DocumentFeature [as 別名]
# 或者: from discoutils.tokens.DocumentFeature import from_string [as 別名]
    def _paraphrase(self, feature, vocabulary, j_indices, values, stats, **kwargs):
        """
        Replaces term with its k nearest neighbours from the thesaurus

        Parameters
        ----------
        feature : the feature to paraphrase, handed to
          `self.thesaurus.get_nearest_neighbours`
        vocabulary : mapping from DocumentFeature to an index; used both for membership
          tests and for `.get` lookups
        j_indices : list, the vocabulary index of every inserted neighbour is appended
        values : list, the transformed similarity (`self.sim_transformer(sim)`) of every
          inserted neighbour is appended
        stats : object whose `register_paraphrase` is called once with a tuple
          `(str(feature), number_of_neighbours, neighbour_1, sim_1, ...)`
        kwargs : accepted but not used here
        """

        # logging.debug('Paraphrasing %r in doc %d', feature, doc_id)
        neighbours = self.thesaurus.get_nearest_neighbours(feature)
        if self.thesaurus.__class__.__name__ == 'Thesaurus':
            # The comparison is on the class name, so only objects whose class is called
            # exactly 'Thesaurus' get here.
            # precomputed thesauri do not guarantee that the returned neighbours will be in vocabulary
            # these should by now only the used in testing though
            in_vocabulary = []
            for neighbour, sim in neighbours:
                if DocumentFeature.from_string(neighbour) in vocabulary:
                    in_vocabulary.append((neighbour, sim))
            neighbours = in_vocabulary

        event = [str(feature), len(neighbours)]
        for neighbour, sim in neighbours[:self.k]:
            # the document may already contain the feature we are about to insert into it,
            # a merging strategy is required, e.g. what do we do if the document has the
            # word X in it and we encounter X again. By default, scipy uses addition
            j_indices.append(vocabulary.get(DocumentFeature.from_string(neighbour)))
            values.append(self.sim_transformer(sim))
            # track the event
            event += [neighbour, sim]
        stats.register_paraphrase(tuple(event))
開發者ID:mbatchkarov,項目名稱:dc_evaluation,代碼行數:42,代碼來源:feature_handlers.py

示例13: filter_out_infrequent_entries

# 需要導入模塊: from discoutils.tokens import DocumentFeature [as 別名]
# 或者: from discoutils.tokens.DocumentFeature import from_string [as 別名]
def filter_out_infrequent_entries(desired_counts_per_feature_type, vectors):
    """
    Keep only the most frequent entries of each requested PoS tag / feature type.

    Frequency of an entry is the sum of its row in the matrix returned by
    `vectors.to_sparse_matrix()`.

    :param desired_counts_per_feature_type: iterable of (pos_or_type, count) pairs, or None to
     keep everything. For each pair the `count` rows with the highest sum among those tagged
     `pos_or_type` are kept; pairs with a non-positive count contribute no rows
    :param vectors: object providing `to_sparse_matrix()` (returning matrix, columns, rows) and
     `len()`; all of its rows must parse to 1-GRAM, AN or NN features
    :return: tuple (mat, pos_tags, rows, cols). If filtering took place, `rows` is a tuple of
     DocumentFeature and `cols` a tuple of the surviving column names; otherwise both are
     whatever `to_sparse_matrix()` returned
    """
    logging.info('Converting thesaurus to sparse matrix')
    mat, cols, rows = vectors.to_sparse_matrix()
    logging.info('Got a data matrix of shape %r', mat.shape)
    # convert to document feature for access to PoS tag
    document_features = [DocumentFeature.from_string(r) for r in rows]
    # don't want to do dimensionality reduction on composed vectors
    feature_types = [df.type for df in document_features]
    assert all(x == '1-GRAM' or x == 'AN' or x == 'NN' for x in feature_types), Counter(feature_types)
    # get the PoS tags of each row in the matrix
    pos_tags = np.array([df.tokens[0].pos if df.type == '1-GRAM' else df.type for df in document_features])
    # find the rows of the matrix that correspond to the most frequent nouns, verbs, ...,
    # as measured by sum of feature counts. This is Byblo's definition of frequency (which is in fact a marginal),
    # but it is strongly correlated with one normally thinks of as entry frequency
    desired_rows = []
    if desired_counts_per_feature_type is not None:
        for desired_pos, desired_count in desired_counts_per_feature_type:
            row_of_current_pos = pos_tags == desired_pos  # what rows are the right PoS tags at, boolean mask array
            # indices of the array sorted by row sum, and where the pos == desired_pos
            if desired_count > 0:
                sorted_idx_by_sum = np.ravel(mat.sum(1)).argsort()
                row_of_current_pos = row_of_current_pos[sorted_idx_by_sum]
                sorted_idx_and_pos_matching = sorted_idx_by_sum[row_of_current_pos]
                # slice off the top desired_count and store them
                desired_rows.extend(list(sorted_idx_and_pos_matching[-desired_count:]))
            else:
                # do not include
                pass

            logging.info('Frequency filter keeping %d/%d %s entries ', desired_count,
                         sum(row_of_current_pos), desired_pos)
    else:
        logging.info('Not filtering any of the entries')
        desired_rows = range(len(vectors))

    # remove the vectors for infrequent entries, update list of pos tags too
    if desired_counts_per_feature_type is not None:
        # if some rows have been removed update respective data structures
        mat = mat[desired_rows, :]
        # Explicit indexing instead of itemgetter(*indices): itemgetter returns a bare item
        # (not a tuple) for a single index and raises TypeError for no indices at all, either
        # of which would break the len() based shape check at the end.
        rows = tuple(document_features[i] for i in desired_rows)
        pos_tags = pos_tags[desired_rows]

        # removing rows may empty some columns, remove these as well. This is probably not very like to occur as we have
        # already filtered out infrequent features, so the column count will stay roughly the same
        desired_cols = np.ravel(mat.sum(0)) > 0
        mat = mat[:, desired_cols]
        col_indices = list(np.where(desired_cols)[0])
        cols = tuple(cols[i] for i in col_indices)

    logging.info('Selected only the most frequent entries, matrix size is now %r', mat.shape)
    assert mat.shape == (len(rows), len(cols))
    return mat, pos_tags, rows, cols
開發者ID:mbatchkarov,項目名稱:DiscoUtils,代碼行數:54,代碼來源:reduce_dimensionality.py

示例14: run_glove

# 需要導入模塊: from discoutils.tokens import DocumentFeature [as 別名]
# 或者: from discoutils.tokens.DocumentFeature import from_string [as 別名]
def run_glove():
    """
    Run the GloVe training script and convert its output to our own vector format.

    The script is run from inside `args.glove_dir`. Its raw vectors are then read from
    `raw_vectors_file`, filtered and written to `formatted_vectors_file` with
    `write_vectors_to_hdf`. An entry survives the filter if it parses to a non-EMPTY
    DocumentFeature and is between 4 and 19 characters long.
    """
    logging.info('Starting training')
    with temp_chdir(args.glove_dir):
        run_and_log_output('sh {} {}'.format(glove_script, unlabelled_data))

    # convert their format to ours
    raw = pd.read_csv(raw_vectors_file, sep=' ', index_col=0, header=None)
    logging.info('Done training, filtering junk and converting %d vectors to Byblo-compatible format', len(raw))

    # remove any odd-looking tokens, they'll get in the way later
    mask = []
    for entry in raw.index:
        parsable = DocumentFeature.from_string(entry).type != 'EMPTY'
        mask.append(parsable and 3 < len(entry) < 20)
    logging.info('Keeping %d entries', sum(mask))

    logging.info('Shape of vectors before filtering %r', raw.shape)
    filtered = raw[mask]
    logging.info('Shape of vectors after filtering %r', filtered.shape)

    cols = ['f%d' % i for i in range(filtered.shape[1])]
    mkdirs_if_not_exists(output_dir)
    write_vectors_to_hdf(filtered.values, filtered.index, cols, formatted_vectors_file)
開發者ID:mbatchkarov,項目名稱:vector_builder,代碼行數:19,代碼來源:get_glove_vectors.py

示例15: __iter__

# 需要導入模塊: from discoutils.tokens import DocumentFeature [as 別名]
# 或者: from discoutils.tokens.DocumentFeature import from_string [as 別名]
 def __iter__(self):
     """
     Yield one list of tokens per sufficiently long line of every file in `self.files`.

     Files are looked up in `self.dirname` and opened with `gzip.open` when `is_gzipped`
     says so. Tokens that parse to an EMPTY DocumentFeature are dropped and the rest are
     passed through `DocumentFeature.smart_lower`. Lines with 8 or fewer remaining tokens
     are skipped. If `self.remove_pos` is set, only the part of each token before the
     first '/' is yielded.
     """
     for fname in self.files:
         filename = join(self.dirname, fname)
         if is_gzipped(filename):
             infile = gzip.open(filename)
         else:
             infile = open(filename)

         with contextlib.closing(infile):
             for line in infile:
                 # yield gensim.utils.tokenize(line, lower=True)
                 if isinstance(line, bytes):
                     # lines may arrive as bytes (handled for the gzip case)
                     line = line.decode()

                 res = []
                 for w in line.split():
                     if DocumentFeature.from_string(w).type != 'EMPTY':
                         res.append(DocumentFeature.smart_lower(w))

                 if len(res) <= 8:
                     # ignore short sentences, they are probably noise
                     continue
                 if self.remove_pos:
                     yield [x.split('/')[0] for x in res]
                 else:
                     yield res
開發者ID:mbatchkarov,項目名稱:vector_builder,代碼行數:19,代碼來源:get_word2vec_vectors.py


注:本文中的discoutils.tokens.DocumentFeature.from_string方法示例由純淨天空整理自Github/MSDocs等開源代碼及文檔管理平台,相關代碼片段篩選自各路編程大神貢獻的開源項目,源碼版權歸原作者所有,傳播和使用請參考對應項目的License;未經允許,請勿轉載。