This article collects typical usage examples of the DocumentFeature.smart_lower method from the Python module discoutils.tokens. If you are wondering what DocumentFeature.smart_lower does, how to call it, or where to find examples of its use, the selected code samples below should help. You can also explore further usage examples of the containing class, discoutils.tokens.DocumentFeature.
The following presents 4 code examples of the DocumentFeature.smart_lower method, sorted by popularity by default.
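Before the examples, a quick orientation: judging from the tests and call sites below, smart_lower lowercases the word part of each token while leaving the PoS tag after the slash untouched, and it accepts an n-gram separator plus a lowercasing switch. The snippet below is a minimal sketch of that behaviour, assuming discoutils is installed; the defaults shown are inferred from the examples rather than taken from the library documentation.

from discoutils.tokens import DocumentFeature

# Word parts are lowercased, PoS tags are preserved (inferred from the tests in Example 1).
print(DocumentFeature.smart_lower('Cat/N'))        # -> 'cat/N'
print(DocumentFeature.smart_lower('Red/J_CaT/N'))  # -> 'red/J_cat/N', applied to each token of the n-gram
# With lowercasing disabled, the input is expected to pass through unchanged.
print(DocumentFeature.smart_lower('Cat/N', lowercasing=False))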
Example 1: test_smart_lower
# Required import: from discoutils.tokens import DocumentFeature [as alias]
# Or: from discoutils.tokens.DocumentFeature import smart_lower [as alias]
def test_smart_lower():
    # test that the PoS of an n-gram entry is not lowercased
    assert DocumentFeature.smart_lower('Cat/N') == 'cat/N'
    assert DocumentFeature.smart_lower('Cat/n') == 'cat/n'
    assert DocumentFeature.smart_lower('Red/J_CaT/N') == 'red/J_cat/N'
    assert DocumentFeature.smart_lower('Red/J CaT/N', separator=' ') == 'red/J cat/N'

    # test that features are not touched
    assert DocumentFeature.smart_lower('amod-DEP:former', lowercasing=False) == 'amod-DEP:former'
Example 2: _read_vector
# Required import: from discoutils.tokens import DocumentFeature [as alias]
# Or: from discoutils.tokens.DocumentFeature import smart_lower [as alias]
def _read_vector(vector_file):
    bn = os.path.basename(vector_file)
    sent_file = os.path.join(os.path.dirname(vector_file), "%s.sent" % bn.split(".")[0])
    if not os.path.exists(sent_file):
        return "__MISSING__", {}

    with open(sent_file) as infile:
        phrase = " ".join(line.strip().split("\t")[1] for line in infile if line.strip())
    with gzip.open(vector_file) as infile:
        file_content = infile.readline().decode("utf8").strip().split("\t")
        features = [
            (DocumentFeature.smart_lower(word, lowercasing=True), float(count))
            for (word, count) in walk_nonoverlapping_pairs(file_content, beg=0)
        ]
    return phrase, features
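Example 2 relies on walk_nonoverlapping_pairs (imported from discoutils elsewhere in the original module) to turn a flat, tab-separated line of alternating words and counts into (word, count) pairs. The helper below is a hypothetical stand-in written purely to illustrate that pairing; it is not the discoutils implementation.

def walk_nonoverlapping_pairs_sketch(sequence, beg=0):
    # Yield consecutive, non-overlapping pairs starting at index `beg` (illustrative only).
    for i in range(beg, len(sequence) - 1, 2):
        yield sequence[i], sequence[i + 1]

line = "cat/N\t12\tdog/N\t7"
print(list(walk_nonoverlapping_pairs_sketch(line.split("\t"), beg=0)))
# [('cat/N', '12'), ('dog/N', '7')]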
Example 3: __iter__
# Required import: from discoutils.tokens import DocumentFeature [as alias]
# Or: from discoutils.tokens.DocumentFeature import smart_lower [as alias]
def __iter__(self):
    for fname in self.files:
        filename = join(self.dirname, fname)
        infile = gzip.open(filename) if is_gzipped(filename) else open(filename)
        with contextlib.closing(infile):
            for line in infile:
                # yield gensim.utils.tokenize(line, lower=True)
                if isinstance(line, bytes):
                    line = line.decode()
                res = [DocumentFeature.smart_lower(w) for w in line.split()
                       if DocumentFeature.from_string(w).type != 'EMPTY']
                if len(res) > 8:
                    # ignore short sentences, they are probably noise
                    if self.remove_pos:
                        yield [x.split('/')[0] for x in res]
                    else:
                        yield res
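The commented-out gensim call suggests that this __iter__ belongs to a corpus class meant to feed a gensim model. The sketch below shows that consumption pattern with a stand-in list of token lists, since the surrounding class is not reproduced here; with the real class, the corpus object itself would be passed as sentences.

from gensim.models import Word2Vec

# Stand-in for the PoS-stripped sentences the iterator would yield when remove_pos is True.
sentences = [['the', 'cat', 'sat', 'on', 'the', 'mat', 'next', 'to', 'the', 'dog'],
             ['the', 'dog', 'chased', 'the', 'cat', 'around', 'the', 'garden', 'all', 'day']]
model = Word2Vec(sentences=sentences, min_count=1, window=3, workers=1)
print(model.wv.most_similar('cat', topn=2))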
Example 4: from_tsv
# Required import: from discoutils.tokens import DocumentFeature [as alias]
# Or: from discoutils.tokens.DocumentFeature import smart_lower [as alias]
def from_tsv(cls, tsv_file, sim_threshold=0, include_self=False,
             lowercasing=False, ngram_separator='_', allow_lexical_overlap=True,
             row_filter=lambda x, y: True, column_filter=lambda x: True, max_len=50,
             max_neighbours=1e8, merge_duplicates=False, immutable=True,
             enforce_word_entry_pos_format=True, tar=False, **kwargs):
    """
    Create a Thesaurus by parsing a Byblo-compatible TSV file (events or sims).
    If duplicate values are encountered during parsing, only the latest will be kept.

    :param tsv_file: path to input TSV file
    :type tsv_file: str
    :param sim_threshold: min similarity between an entry and its neighbour for the neighbour to be included
    :type sim_threshold: float
    :param include_self: whether to include self as nearest neighbour.
    :type include_self: bool
    :param lowercasing: if true, most of what is read will be lowercased (excluding PoS tags), so
     Cat/N -> cat/N. This is desirable when reading thesauri with this class. If False, no lowercasing
     will take place. This might be desirable when reading feature lists or already lowercased neighbour
     lists. FET + Byblo thesauri are already lowercased.
    :type lowercasing: bool
    :param ngram_separator: the separator between the individual tokens of an n-gram entry as it is read in
    :param column_filter: a function that takes a string (column in the file) and returns whether or not
     the string should be kept
    :param row_filter: takes a string and its corresponding DocumentFeature and determines if it should be loaded.
     If `enforce_word_entry_pos_format` is `False`, the second parameter to this function will be `None`
    :param allow_lexical_overlap: whether neighbours/features are allowed to overlap lexically with the entry
     they are neighbours/features of. NOTE: THE BEHAVIOUR OF THIS PARAMETER IS SLIGHTLY DIFFERENT FROM THE
     EQUIVALENT IN VECTORS. SEE COMMENT THERE.
    :param max_len: maximum length (in characters) of permissible **entries**. Longer entries are ignored.
    :param max_neighbours: maximum neighbours per entry. This is applied AFTER the filtering defined by
     column_filter and allow_lexical_overlap is finished.
    :param merge_duplicates: whether to raise an error if multiple entries exist, or concatenate/add them together.
     The former is appropriate for `Thesaurus`, the latter for `Vectors`.
    :param enforce_word_entry_pos_format: if true, entries that are not in a `word/POS` format are skipped. This
     must be true for `allow_lexical_overlap` to work.
    :param tar: whether the file is compressed by running `tar -zcvf file.gz file.txt`. The tar is assumed to
     contain a single file.
    """
    if not tsv_file:
        raise ValueError("No thesaurus specified")

    to_return = dict()
    logging.info('Loading thesaurus %s from disk', tsv_file)

    gz_file = tsv_file + '.gz'
    if os.path.exists(gz_file) and tar:
        logging.warning('Using .gz version of thesaurus')
        tsv_file = gz_file

    if not allow_lexical_overlap:
        logging.warning('DISALLOWING LEXICAL OVERLAP')
    if not allow_lexical_overlap and not enforce_word_entry_pos_format:
        raise ValueError('allow_lexical_overlap requires entries to be converted to a DocumentFeature. '
                         'Please enable enforce_word_entry_pos_format')

    FILTERED = '___FILTERED___'.lower()

    if tar:
        tarf = tarfile.open(tsv_file, 'r')
        members = tarf.getmembers()
        if len(members) != 1:
            # todo this is odd, I don't know why it is happening
            # on some machines tar adds a second hidden file to the archive
            logging.warning('Tar archive contains multiple files: %r' % members)
            logging.warning('Using the last file in the tar')
        fhandle = tarf.extractfile(members[-1])
    else:
        fhandle = open(tsv_file)

    with fhandle as infile:
        for line in infile.readlines():
            if tar:
                # this is a byte stream, it needs to be decoded
                tokens = line.decode('UTF8').strip().split('\t')
            else:
                tokens = line.strip().split('\t')

            if len(tokens) % 2 == 0:
                # must have an odd number of fields: one for the entry,
                # plus pairs of (neighbour, similarity)
                logging.warning('Skipping dodgy line in thesaurus file: %s\n %s', tsv_file, line)
                continue

            if tokens[0] != FILTERED:
                key = DocumentFeature.smart_lower(tokens[0], ngram_separator, lowercasing)
                dfkey = DocumentFeature.from_string(key) if enforce_word_entry_pos_format else None

                if enforce_word_entry_pos_format and dfkey.type == 'EMPTY':
                    # do not load things in the wrong format, they'll get in the way later
                    logging.warning('%s is not in the word/POS format, skipping', tokens[0])
                    continue

                if (not row_filter(key, dfkey)) or len(key) > max_len:
                    logging.warning('Skipping entry for %s', key)
                    continue

                to_insert = [(DocumentFeature.smart_lower(word, ngram_separator, lowercasing), float(sim))
                             for (word, sim) in walk_nonoverlapping_pairs(tokens, 1)
                             if word.lower() != FILTERED and column_filter(word) and float(sim) > sim_threshold]
                if not allow_lexical_overlap:
#.........the rest of this code is omitted.........
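To make the expected input concrete: each line of the TSV pairs an entry with alternating neighbour/similarity columns, which is why the code above insists on an odd number of tab-separated fields. The hedged usage sketch below follows; the file contents are illustrative, and the import path of the Thesaurus class is an assumption, since only the method body is shown here.

# thesaurus.tsv (tab-separated, illustrative content):
#   cat/N    dog/N    0.8    kitten/N    0.7
#   dog/N    cat/N    0.8    puppy/N     0.6
from discoutils.thesaurus_loader import Thesaurus  # assumed import path

th = Thesaurus.from_tsv('thesaurus.tsv', sim_threshold=0.5, lowercasing=True)
print(th['cat/N'])  # expected to contain only neighbours above the 0.5 similarity threshold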