This article compiles typical usage examples of the DocumentFeature class from the Python module discoutils.tokens. If you are unsure what DocumentFeature is for or how to use it, the curated class code examples below may help.
The following 15 code examples of the DocumentFeature class are shown, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
Example 1: test_smart_lower
def test_smart_lower():
    # test that the PoS of an n-gram entry is not lowercased
    assert DocumentFeature.smart_lower('Cat/N') == 'cat/N'
    assert DocumentFeature.smart_lower('Cat/n') == 'cat/n'
    assert DocumentFeature.smart_lower('Red/J_CaT/N') == 'red/J_cat/N'
    assert DocumentFeature.smart_lower('Red/J CaT/N', separator=' ') == 'red/J cat/N'

    # test that features are not touched
    assert DocumentFeature.smart_lower('amod-DEP:former', lowercasing=False) == 'amod-DEP:former'
Example 2: remove_overlapping_neighbours
def remove_overlapping_neighbours(cls, entry, to_insert):
    """
    :type entry: DocumentFeature or str
    :type to_insert: list of (str, float) tuples
    """
    if isinstance(entry, (six.string_types, six.text_type)):
        entry = DocumentFeature.from_string(entry)

    features = [(DocumentFeature.from_string(x[0]), x[1]) for x in to_insert]
    to_insert = [(f[0].tokens_as_str(), f[1]) for f in features
                 if not any(t in entry.tokens for t in f[0].tokens)]
    return to_insert
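A minimal usage sketch of the filter above, assuming discoutils is installed and that entries and neighbours use the word/PoS format seen throughout these examples. The filter is reproduced inline here because the class that declares this classmethod is not shown in the snippet:

from discoutils.tokens import DocumentFeature

entry = 'red/J_car/N'
to_insert = [('car/N', 0.9), ('blue/J_bike/N', 0.8)]

# drop neighbours that share a token with the entry, as the classmethod above does
entry_df = DocumentFeature.from_string(entry)
kept = [(n, s) for n, s in to_insert
        if not any(t in entry_df.tokens for t in DocumentFeature.from_string(n).tokens)]
# kept == [('blue/J_bike/N', 0.8)] -- 'car/N' overlaps with the entry and is removed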
Example 3: test_document_feature_slicing
def test_document_feature_slicing():
    DocumentFeature.recompile_pattern()
    x = DocumentFeature.from_string('big/J_cat/N')
    assert x[0] == DocumentFeature.from_string('big/J')
    assert x[1] == DocumentFeature.from_string('cat/N')
    assert x[1] == DocumentFeature('1-GRAM', (Token('cat', 'N', 1), ))
    assert x[0:] == DocumentFeature.from_string('big/J_cat/N')

    x = DocumentFeature.from_string('cat/N')
    assert x[0] == DocumentFeature.from_string('cat/N')
    assert x[0:] == DocumentFeature.from_string('cat/N')
    assert x[:] == DocumentFeature.from_string('cat/N')
Example 4: contains_impl
def contains_impl(self, feature):
    if isinstance(feature, six.string_types):
        feature = DocumentFeature.from_string(feature)
    if feature.type not in self.entry_types:
        # no point in composing single-word document features
        return False
    return str(feature[self.hardcoded_index]) in self.unigram_source
Example 5: to_tsv
def to_tsv(self, events_path, entries_path='', features_path='',
           entry_filter=lambda x: True, row_transform=lambda x: x,
           gzipped=False, enforce_word_entry_pos_format=True, dense_hd5=False):
    """
    Writes this thesaurus to a Byblo-compatible file like the one it was most likely read from. In the
    process converts all entries to a DocumentFeature, so all entries must be parsable into one. May reorder
    the features of each entry.

    :param events_path: file to write to
    :param entry_filter: called for every DocumentFeature that is an entry in this thesaurus. The vector will
     only be written if this callable returns True
    :param row_transform: callable, any transformation that might need to be done to each entry before
     converting it to a DocumentFeature. This is needed because some entries (e.g. african/J:amod-HEAD:leader)
     are not directly convertible (needs to be african/J_leader/N). Use this if the entries cannot be converted
     to DocumentFeature, e.g. if the data isn't PoS tagged.
    :param dense_hd5: if True, convert to a pandas `DataFrame` and write to a compressed HDF file. This is
     30% faster and produces 30% smaller files than using `gzipped`. It is only suitable for matrices with a
     small number of columns - this method enforces a hard limit of 1000.
     Requires PyTables and HDF5.
    :return: the file name
    """
    if enforce_word_entry_pos_format:
        rows = {i: DocumentFeature.from_string(row_transform(feat)) for (feat, i) in self.name2row.items()}
    else:
        rows = {i: feat for (feat, i) in self.name2row.items()}

    if dense_hd5 and len(self.columns) <= 1000:
        write_vectors_to_hdf(self.matrix, self.row_names, self.columns, events_path)
    else:
        write_vectors_to_disk(coo_matrix(self.matrix), rows, self.columns, events_path,
                              features_path=features_path, entries_path=entries_path,
                              entry_filter=entry_filter, gzipped=gzipped)
    return events_path
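A sketch of what a row_transform callable for the dependency-style entries mentioned in the docstring might look like. The helper name and the hard-coded noun PoS tag are illustrative assumptions, not part of the library:

def amod_to_ngram(entry):
    # illustrative transform: 'african/J:amod-HEAD:leader' -> 'african/J_leader/N'
    # (assumes the head is a noun; a real transform would look the PoS tag up)
    if ':amod-HEAD:' in entry:
        modifier, head = entry.split(':amod-HEAD:')
        return '%s_%s/N' % (modifier, head)
    return entry

# thesaurus.to_tsv('events.txt', entries_path='entries.txt',
#                  features_path='features.txt', row_transform=amod_to_ngram)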
Example 6: reformat_socher_vectors
def reformat_socher_vectors():
    """
    Formats the files output by Socher (2011)'s matlab code into byblo-compatible files.

    Before running this a list of all phrases needs to be extracted from the labelled data, and these need to
    be composed with Socher's matlab code. See note "Socher vectors" in Evernote.
    """
    logging.info('Reformatting events file %s ---> %s',
                 socher_output_vectors_file, socher_composed_vectors_file)

    # socher's code removes all PoS tags, so we can't translate his output
    # back to a DocumentFeature. Let's read the input to his code instead and
    # get the corresponding output vectors

    # get a list of all phrases that we attempted to compose
    with open(plaintext_socher_input_file) as infile:
        composed_phrases = [DocumentFeature.from_string(line.strip()) for line in infile]

    # get a list of all phrases where composition worked (no unknown words)
    with open(socher_output_phrases_file) as infile:
        success = [i for i, line in enumerate(infile) if '*UNKNOWN*' not in line]

    # pick out just the phrases that composed successfully
    composed_phrases = itemgetter(*success)(composed_phrases)

    # load all vectors, remove those containing unknown words
    mat = np.loadtxt(socher_output_vectors_file, delimiter=',')
    mat = mat[success, :]
    assert len(composed_phrases) == mat.shape[0]  # same number of rows

    # do the actual writing
    write_vectors_to_hdf(sp.coo_matrix(mat),
                         composed_phrases,
                         ['RAE-feat%d' % i for i in range(100)],  # Socher provides 100-dimensional vectors
                         socher_composed_vectors_file)
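The itemgetter(*success) call above selects only the successfully composed phrases by index. A toy illustration; note that itemgetter returns a tuple when given two or more indices, and a bare item when given exactly one:

from operator import itemgetter

phrases = ['big/J_cat/N', 'red/J_unknownword/N', 'old/J_dog/N']
success = [0, 2]  # rows whose output line contained no *UNKNOWN* token
kept = itemgetter(*success)(phrases)
# kept == ('big/J_cat/N', 'old/J_dog/N')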
Example 7: train_verb_tensors
def train_verb_tensors(svos_file, noun_vectors_file, output_filename):
    """
    Trains Verb-bar matrices, as described in Milajevs et al (EMNLP-14, §3)

    :param svos_file: file containing a list of all SVOs in unlabelled data, one per line. May contain other
     document features too. Such a file is output by `find_all_NPs.py`, which is called from `observed_vectors.py`
    :param noun_vectors_file: a vector store containing noun vectors
    :param output_filename: name of the output file - must identify the noun vectors and the unlabelled corpus
    """
    mkdirs_if_not_exists(os.path.dirname(output_filename))

    v = Vectors.from_tsv(noun_vectors_file)

    with open(svos_file) as infile:
        phrases = set()
        for line in infile:
            if DocumentFeature.from_string(line.strip()).type == 'SVO':
                phrases.add(tuple(line.strip().split('_')))
    phrases = [(subj, verb, obj) for subj, verb, obj in phrases if subj in v and obj in v]
    phrases = sorted(phrases, key=itemgetter(1))
    logging.info('Found %d SVOs in list', len(phrases))

    verb_tensors = dict()
    for verb, svos in groupby(phrases, itemgetter(1)):
        svos = list(svos)
        if len(svos) < MIN_SVO_PER_VERB:
            continue
        logging.info('Training matrix for %s from %d SVOs', verb, len(svos))
        vt = np.sum(np.outer(v.get_vector(subj).A, v.get_vector(obj).A) for subj, _, obj in svos)
        verb_tensors[verb] = vt

    logging.info('Trained %d verb matrices, saving...', len(verb_tensors))
    for verb, tensor in verb_tensors.items():
        df = pd.DataFrame(tensor)
        df.to_hdf(output_filename, verb.split('/')[0], complevel=9, complib='zlib')
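A toy numpy sketch of the training step above: each SVO contributes the outer product of its subject and object vectors, and the verb matrix is the sum over all SVOs that contain that verb. The vectors here are made up for illustration:

import numpy as np

subj_vectors = {'dog/N': np.array([1.0, 0.0]), 'cat/N': np.array([0.0, 1.0])}
obj_vectors = {'bone/N': np.array([2.0, 1.0]), 'mouse/N': np.array([1.0, 3.0])}
svos = [('dog/N', 'chase/V', 'bone/N'), ('cat/N', 'chase/V', 'mouse/N')]

# sum of outer products, one term per SVO involving this verb
verb_matrix = sum(np.outer(subj_vectors[s], obj_vectors[o]) for s, _, o in svos)
# verb_matrix == [[2., 1.],
#                 [1., 3.]]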
Example 8: get_all_document_features
def get_all_document_features(include_unigrams=False, remove_pos=False):
    """
    Finds all noun-noun and adj-noun compounds (and optionally adjs and nouns) in all labelled corpora
    mentioned in the conf files.

    :param include_unigrams: if False, only NPs will be returned
    :param remove_pos: whether to remove PoS tags if present; the result will be either "cat/N" or "cat"
    :rtype: set of DocumentFeature
    """
    result = set()
    accepted_df_types = {'AN', 'NN', 'VO', 'SVO', '1-GRAM'} if include_unigrams else {'AN', 'NN', 'VO', 'SVO'}
    for corpus_name, _ in get_all_corpora():
        path = os.path.abspath(os.path.join(__file__, '..', '..', '..', ROOT, '%s_all_features.txt' % corpus_name))
        with open(path) as infile:
            for line in infile:
                df = DocumentFeature.from_string(line.strip())
                if df.type in accepted_df_types:
                    if remove_pos:
                        # todo these are of type str, in the other branch it's DocumentFeature. things will likely break
                        result.add(df.ngram_separator.join(t.text for t in df.tokens))
                    else:
                        result.add(df)

    logging.info('Found a total of %d features in all corpora', len(result))
    if not remove_pos:
        logging.info('Their types are %r', Counter(df.type for df in result))
        if include_unigrams:
            logging.info('PoS tags of unigrams are %r',
                         Counter(df.tokens[0].pos for df in result if df.type == '1-GRAM'))
        else:
            logging.info('Unigram features not included!')
    return result
Example 9: test_write_vectors_to_disk
def test_write_vectors_to_disk(resources, tmpdir):
    """
    Checks the entries/features files; the events file is checked by
    thesisgenerator.tests.test_thesaurus.test_to_file

    :type th: Thesaurus
    """
    th, expected_entries, expected_features, filter_callable = resources
    events_file = str(tmpdir.join('events.txt'))
    entries_file = str(tmpdir.join('entries.txt'))
    features_file = str(tmpdir.join('features.txt'))

    if not th:  # empty thesaurus should raise an error
        with pytest.raises(ValueError):
            matrix, cols, rows = th.to_sparse_matrix()
    else:
        matrix, cols, rows = th.to_sparse_matrix()
        rows = [DocumentFeature.from_string(x) for x in rows]
        write_vectors_to_disk(sp.coo_matrix(matrix), rows, cols,
                              events_file, features_file, entries_file,
                              entry_filter=filter_callable)

        if expected_entries:
            # the file will not be written at all if there's nothing to put in it
            entries = [x.split('\t')[0] for x in _read_and_strip_lines(entries_file)]
            assert set(entries) == set(expected_entries)
        else:
            assert not os.path.exists(entries_file)

        if expected_features:
            features = [x.split('\t')[0] for x in _read_and_strip_lines(features_file)]
            assert features == expected_features
        else:
            assert not os.path.exists(features_file)
Example 10: __iter__
def __iter__(self):
    for fname in self.files:
        filename = join(self.dirname, fname)
        infile = gzip.open(filename) if is_gzipped(filename) else open(filename)
        with contextlib.closing(infile):
            for line in infile:
                # yield gensim.utils.tokenize(line, lower=True)
                if isinstance(line, bytes):
                    line = line.decode()
                res = [DocumentFeature.smart_lower(w) for w in line.split() if
                       DocumentFeature.from_string(w).type != 'EMPTY']
                if len(res) > 8:
                    # ignore short sentences, they are probably noise
                    if self.remove_pos:
                        yield [x.split('/')[0] for x in res]
                    else:
                        yield res
Example 11: get_vector
def get_vector(self, feature):
    """
    :type feature: DocumentFeature
    :rtype: scipy.sparse.csr_matrix
    """
    if isinstance(feature, six.string_types):
        feature = DocumentFeature.from_string(feature)
    return sp.csr_matrix(reduce(self.function,
                                [self.unigram_source.get_vector(str(t)).A for t in feature[:]]))
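The reduce call above folds self.function (for example numpy addition or multiplication) over the constituent word vectors. A toy version on dense arrays, with made-up vectors:

import numpy as np
from functools import reduce

word_vectors = {'big/J': np.array([1.0, 2.0]), 'cat/N': np.array([3.0, 4.0])}
constituents = ['big/J', 'cat/N']

additive = reduce(np.add, [word_vectors[w] for w in constituents])            # array([4., 6.])
multiplicative = reduce(np.multiply, [word_vectors[w] for w in constituents])  # array([3., 8.])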
Example 12: __contains__
def __contains__(self, feature):
    if isinstance(feature, six.string_types):
        feature = DocumentFeature.from_string(feature)

    # this is an SVO, and we have a verb tensor and vectors for both arguments
    return feature.type in self.entry_types and \
           str(feature[1]) in self.verb_tensors and \
           str(feature[0]) in self.unigram_source and \
           str(feature[2]) in self.unigram_source
Example 13: test_with_different_separators
def test_with_different_separators():
    DocumentFeature.recompile_pattern(pos_separator='_', ngram_separator='!')
    assert DocumentFeature('2-GRAM', (Token('very', 'RB'), Token('big', 'J'))) == \
           DocumentFeature.from_string('very_RB!big_J')

    DocumentFeature.recompile_pattern(pos_separator='-', ngram_separator=' ')
    assert DocumentFeature('1-GRAM', (Token('very', 'RB'),)) == DocumentFeature.from_string('very-RB')
    assert DocumentFeature('2-GRAM', (Token('very', 'RB'), Token('big', 'J'))) == \
           DocumentFeature.from_string('very-RB big-J')
Example 14: _paraphrase
def _paraphrase(self, feature, vocabulary, j_indices, values, stats, **kwargs):
    """
    Replaces a term with its k nearest neighbours from the thesaurus

    Parameters
    ----------
    neighbour_source : callable, returns a thesaurus-like object (a list of
        (neighbour, sim) tuples, sorted by highest sim first,
        acts as a defaultdict(list)). The callable takes one parameter for
        compatibility purposes - one of the possible callables I want to
        use here requires access to the vocabulary.
        The default behaviour is to return a callable pointing to the
        currently loaded thesaurus.
    """
    # logging.debug('Paraphrasing %r in doc %d', feature, doc_id)
    neighbours = self.thesaurus.get_nearest_neighbours(feature)
    if self.thesaurus.__class__.__name__ == 'Thesaurus':
        # todo this will also activate for DenseVectors, because they are also instances of thesaurus
        # the check needs to be self.thesaurus.__class__.__name__ == 'Thesaurus', but then
        # we need to make sure init_sims is called with the correct vocabulary so that all neighbours are IV
        # precomputed thesauri do not guarantee that the returned neighbours will be in vocabulary
        # these should by now only be used in testing, though
        neighbours = [(neighbour, sim) for (neighbour, sim) in neighbours
                      if DocumentFeature.from_string(neighbour) in vocabulary]
    event = [str(feature), len(neighbours)]
    for neighbour, sim in neighbours[:self.k]:
        # the document may already contain the feature we
        # are about to insert into it,
        # a merging strategy is required,
        # e.g. what do we do if the document has the word X
        # in it and we encounter X again. By default,
        # scipy uses addition
        df = DocumentFeature.from_string(neighbour)
        j_indices.append(vocabulary.get(df))
        values.append(self.sim_transformer(sim))

        # track the event
        event.extend([neighbour, sim])
    stats.register_paraphrase(tuple(event))
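To illustrate the "scipy uses addition" remark in the code above: the (column index, value) pairs collected in j_indices/values eventually build a sparse document row, and scipy sums duplicate coordinates when constructing the matrix. A minimal sketch:

import scipy.sparse as sp

j_indices = [3, 7, 3]        # column 3 appears twice: the neighbour was already in the document
values = [0.9, 0.5, 0.2]
row = sp.csr_matrix((values, ([0, 0, 0], j_indices)), shape=(1, 10))
# row[0, 3] == 1.1 and row[0, 7] == 0.5 -- duplicate entries are added together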
Example 15: filter_out_infrequent_entries
def filter_out_infrequent_entries(desired_counts_per_feature_type, vectors):
    logging.info('Converting thesaurus to sparse matrix')
    mat, cols, rows = vectors.to_sparse_matrix()
    logging.info('Got a data matrix of shape %r', mat.shape)

    # convert to document feature for access to PoS tag
    document_features = [DocumentFeature.from_string(r) for r in rows]

    # don't want to do dimensionality reduction on composed vectors
    feature_types = [sorted_idx_and_pos_matching.type for sorted_idx_and_pos_matching in document_features]
    assert all(x == '1-GRAM' or x == 'AN' or x == 'NN' for x in feature_types), Counter(feature_types)

    # get the PoS tags of each row in the matrix
    pos_tags = np.array([df.tokens[0].pos if df.type == '1-GRAM' else df.type for df in document_features])

    # find the rows of the matrix that correspond to the most frequent nouns, verbs, ...,
    # as measured by sum of feature counts. This is Byblo's definition of frequency (which is in fact a
    # marginal), but it is strongly correlated with what one normally thinks of as entry frequency
    desired_rows = []
    if desired_counts_per_feature_type is not None:
        for desired_pos, desired_count in desired_counts_per_feature_type:
            row_of_current_pos = pos_tags == desired_pos  # boolean mask: which rows have the desired PoS tag
            # indices of the array sorted by row sum, and where the pos == desired_pos
            if desired_count > 0:
                sorted_idx_by_sum = np.ravel(mat.sum(1)).argsort()
                row_of_current_pos = row_of_current_pos[sorted_idx_by_sum]
                sorted_idx_and_pos_matching = sorted_idx_by_sum[row_of_current_pos]
                # slice off the top desired_count and store them
                desired_rows.extend(list(sorted_idx_and_pos_matching[-desired_count:]))
            else:
                # do not include entries of this type
                pass
            logging.info('Frequency filter keeping %d/%d %s entries ', desired_count,
                         sum(row_of_current_pos), desired_pos)
    else:
        logging.info('Not filtering any of the entries')
        desired_rows = range(len(vectors))

    # remove the vectors for infrequent entries, update list of pos tags too
    if desired_counts_per_feature_type is not None:
        # if some rows have been removed update respective data structures
        mat = mat[desired_rows, :]
        rows = itemgetter(*desired_rows)(document_features)
        pos_tags = pos_tags[desired_rows]

        # removing rows may empty some columns, remove these as well. This is probably not very likely to occur
        # as we have already filtered out infrequent features, so the column count will stay roughly the same
        desired_cols = np.ravel(mat.sum(0)) > 0
        mat = mat[:, desired_cols]
        col_indices = list(np.where(desired_cols)[0])
        cols = itemgetter(*col_indices)(cols)

    logging.info('Selected only the most frequent entries, matrix size is now %r', mat.shape)
    assert mat.shape == (len(rows), len(cols))
    return mat, pos_tags, rows, cols
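A toy numpy sketch of the row selection inside the loop above: sort the row indices by row sum, re-order the PoS mask the same way, then take the top desired_count indices of the desired PoS. The numbers are made up for illustration:

import numpy as np

row_sums = np.array([5., 1., 7., 3.])        # sum of feature counts for each row
pos_tags = np.array(['N', 'J', 'N', 'N'])
desired_pos, desired_count = 'N', 2

sorted_idx_by_sum = row_sums.argsort()                               # [1, 3, 0, 2], least to most frequent
row_of_current_pos = (pos_tags == desired_pos)[sorted_idx_by_sum]    # PoS mask in the same order
sorted_idx_and_pos_matching = sorted_idx_by_sum[row_of_current_pos]  # [3, 0, 2]
desired_rows = list(sorted_idx_and_pos_matching[-desired_count:])    # [0, 2] -- the two most frequent nouns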