This article collects typical usage examples of collections.Counter in Python. If you have been wondering how to use collections.Counter, what it is good for, or what working examples look like, the hand-picked code samples below may help. You can also explore further examples from the containing collections module.
The following shows 15 code examples of collections.Counter, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
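For readers who just need the basics first, here is a minimal, self-contained sketch of the most common Counter operations (the sample sentence is made up for illustration): counting items from an iterable, looking up missing keys, listing the most frequent elements, and updating counts incrementally.

from collections import Counter

# Count hashable items from any iterable; missing keys return 0 instead of raising KeyError.
word_counts = Counter("the quick brown fox jumps over the lazy dog".split())
print(word_counts["the"])          # 2
print(word_counts["cat"])          # 0

# The most frequent elements, as (element, count) pairs.
print(word_counts.most_common(2))  # [('the', 2), ('quick', 1)]

# Counters can be updated incrementally from further iterables.
word_counts.update(["fox", "fox"])
print(word_counts["fox"])          # 3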
Example 1: value_counts
# Required import: import collections [as alias]
# Or: from collections import Counter [as alias]
def value_counts(self, subset, attr, value, base=False):
    """
    Get the number of occurrences per value of the dependent variable when
    the given attribute is equal to the given value.

    FIXME: Can attr/value be eliminated??

    Args:
        subset: the subset of rows to act upon.
        attr: the attribute of the value.
        value: the value with which to track counts.
        base: whether or not to calculate values based on the dependent
            value (default False).

    Returns:
        A Counter instance detailing the number of occurrences per
        dependent variable value.
    """
    counts = Counter()
    for row in subset:
        if row[attr] == value or base:
            counts[row[self.dependent]] += 1
    return counts
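The same counting pattern can be tried outside the class as a minimal, self-contained sketch; the row dictionaries and the 'play' label below are invented stand-ins for the subset and the dependent variable:

from collections import Counter

rows = [
    {"outlook": "sunny", "play": "no"},
    {"outlook": "sunny", "play": "yes"},
    {"outlook": "rain", "play": "yes"},
]

# Count the dependent label only for rows matching attr == value.
counts = Counter(row["play"] for row in rows if row["outlook"] == "sunny")
print(counts)  # Counter({'no': 1, 'yes': 1})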
Example 2: attr_counts
# Required import: import collections [as alias]
# Or: from collections import Counter [as alias]
def attr_counts(self, subset, attr):
    """
    Get the number of occurrences per value of the given attribute.

    Args:
        subset: the subset of rows to act upon.
        attr: the selected attribute.

    Returns:
        A Counter instance detailing the number of occurrences per
        attribute value.
    """
    counts = Counter()
    for row in subset:
        counts[row[attr]] += 1
    return counts
Example 3: get_buckets
# Required import: import collections [as alias]
# Or: from collections import Counter [as alias]
def get_buckets(stories, max_ignore_unbatched=100, max_pad_amount=25):
    sentencecounts = [len(sents_graphs) for (sents_graphs, query, answer) in stories]
    countpairs = sorted(collections.Counter(sentencecounts).items())
    buckets = []
    smallest_left_val = 0
    num_unbatched = max_ignore_unbatched
    for val, ct in countpairs:
        num_unbatched += ct
        if val - smallest_left_val > max_pad_amount or num_unbatched > max_ignore_unbatched:
            buckets.append(val)
            smallest_left_val = val
            num_unbatched = 0
    if buckets[-1] != countpairs[-1][0]:
        buckets.append(countpairs[-1][0])
    return buckets
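A hedged usage sketch, assuming the get_buckets function above (and its collections import) is in scope; each story is a (sentences, query, answer) tuple and the data is invented to show how bucket boundaries come out:

stories = [
    (["s1", "s2"], "q", "a"),
    (["s1", "s2"], "q", "a"),
    (["s1", "s2", "s3"], "q", "a"),
    (["s"] * 5, "q", "a"),
    (["s"] * 40, "q", "a"),
]

# Nearby lengths share a bucket; the outlier length 40 gets its own boundary.
print(get_buckets(stories))  # [2, 40]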
Example 4: _get_genome_amounts
# Required import: import collections [as alias]
# Or: from collections import Counter [as alias]
# Note: Python 2 code (uses the built-in 'long' type).
def _get_genome_amounts(self, probability, max_genome_amount):
    """
    Get amounts of genomes by original genome

    @param probability: Proportion of simulated original genomes
    @type probability: int | long | float
    @param max_genome_amount: Total number of genomes
    @type max_genome_amount: int | long

    @return: List of integers representing the amount of strains
    @rtype: list[int]
    """
    assert isinstance(probability, (int, long, float))
    assert 0 <= probability <= 1
    assert isinstance(max_genome_amount, (int, long))
    genome_amounts = self._get_genome_amounts_geometric(probability, max_genome_amount)
    divergence = Counter(genome_amounts)[1] / float(len(genome_amounts))
    if max_genome_amount >= 10:
        while abs(divergence - probability) > 0.05:
            # print "need: {}, gotten: {}".format(probability, divergence)
            genome_amounts = self._get_genome_amounts_geometric(probability, max_genome_amount)
            divergence = Counter(genome_amounts)[1] / float(len(genome_amounts))
    return genome_amounts
Example 5: build_vocab
# Required import: import collections [as alias]
# Or: from collections import Counter [as alias]
def build_vocab(words, vocab_size, visual_fld):
    """ Build vocabulary of VOCAB_SIZE most frequent words and write it to
    visualization/vocab.tsv
    """
    utils.safe_mkdir(visual_fld)
    file = open(os.path.join(visual_fld, 'vocab.tsv'), 'w')

    dictionary = dict()
    count = [('UNK', -1)]
    index = 0
    count.extend(Counter(words).most_common(vocab_size - 1))

    for word, _ in count:
        dictionary[word] = index
        index += 1
        file.write(word + '\n')

    index_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    file.close()
    return dictionary, index_dictionary
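The Counter part of this recipe can be sketched on its own, without the file output or the project's utils helper; the token list below is invented, and index 0 is reserved for unknown words:

from collections import Counter

words = ["the", "cat", "sat", "on", "the", "mat", "the", "cat"]
vocab_size = 4

count = [("UNK", -1)]
count.extend(Counter(words).most_common(vocab_size - 1))

dictionary = {word: index for index, (word, _) in enumerate(count)}
index_dictionary = {index: word for word, index in dictionary.items()}
print(dictionary)        # {'UNK': 0, 'the': 1, 'cat': 2, 'sat': 3}
print(index_dictionary)  # {0: 'UNK', 1: 'the', 2: 'cat', 3: 'sat'}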
Example 6: generate_bi_graphemes_dictionary
# Required import: import collections [as alias]
# Or: from collections import Counter [as alias]
def generate_bi_graphemes_dictionary(label_list):
    freqs = Counter()
    for label in label_list:
        label = label.split(' ')
        for i in label:
            for pair in split_every(2, i):
                if len(pair) == 2:
                    freqs[pair] += 1

    with open('resources/unicodemap_en_baidu_bi_graphemes.csv', 'w') as bigram_label:
        bigramwriter = csv.writer(bigram_label, delimiter=',')
        baidu_labels = list('\' abcdefghijklmnopqrstuvwxyz')
        for index, key in enumerate(baidu_labels):
            bigramwriter.writerow((key, index + 1))
        for index, key in enumerate(freqs.keys()):
            bigramwriter.writerow((key, index + len(baidu_labels) + 1))
Example 7: test_tokens_to_indices
# Required import: import collections [as alias]
# Or: from collections import Counter [as alias]
def test_tokens_to_indices():
    counter = Counter(['a', 'b', 'b', 'c', 'c', 'c', 'some_word$'])

    vocab = text.vocab.Vocabulary(counter, most_freq_count=None, min_freq=1,
                                  unknown_token='<unk>', reserved_tokens=None)

    i1 = vocab.to_indices('c')
    assert i1 == 1

    i2 = vocab.to_indices(['c'])
    assert i2 == [1]

    i3 = vocab.to_indices(['<unk>', 'non-exist'])
    assert i3 == [0, 0]

    i4 = vocab.to_indices(['a', 'non-exist', 'a', 'b'])
    assert i4 == [3, 0, 3, 2]
Example 8: test_indices_to_tokens
# Required import: import collections [as alias]
# Or: from collections import Counter [as alias]
def test_indices_to_tokens():
    counter = Counter(['a', 'b', 'b', 'c', 'c', 'c', 'some_word$'])

    vocab = text.vocab.Vocabulary(counter, most_freq_count=None, min_freq=1,
                                  unknown_token='<unknown>', reserved_tokens=None)

    i1 = vocab.to_tokens(1)
    assert i1 == 'c'

    i2 = vocab.to_tokens([1])
    assert i2 == ['c']

    i3 = vocab.to_tokens([0, 0])
    assert i3 == ['<unknown>', '<unknown>']

    i4 = vocab.to_tokens([3, 0, 3, 2])
    assert i4 == ['a', '<unknown>', 'a', 'b']

    assertRaises(ValueError, vocab.to_tokens, 100)
Example 9: CountErrors
# Required import: import collections [as alias]
# Or: from collections import Counter [as alias]
def CountErrors(ocr_text, truth_text):
    """Counts the drops and adds between 2 bags of iterables.

    Simple bag-of-objects counting returns the number of dropped and added
    elements, regardless of order, from anything that is iterable, e.g.
    a pair of strings gives character errors, and a pair of word lists gives
    word errors.

    Args:
        ocr_text: OCR text iterable (e.g. string for chars, word list for words).
        truth_text: Truth text iterable.

    Returns:
        ErrorCounts named tuple.
    """
    counts = collections.Counter(truth_text)
    counts.subtract(ocr_text)
    drops = sum(c for c in counts.values() if c > 0)
    adds = sum(-c for c in counts.values() if c < 0)
    return ErrorCounts(drops, adds, len(truth_text), len(ocr_text))
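The subtract-then-split idea is easy to see on plain strings, independent of the ErrorCounts tuple used above (the two words below are made up for illustration):

import collections

truth = "kitten"
ocr = "sitting"

counts = collections.Counter(truth)
counts.subtract(ocr)

drops = sum(c for c in counts.values() if c > 0)   # chars in truth missing from OCR
adds = sum(-c for c in counts.values() if c < 0)   # chars in OCR not in truth
print(drops, adds)  # 2 3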
Example 10: _get_ngrams
# Required import: import collections [as alias]
# Or: from collections import Counter [as alias]
def _get_ngrams(segment, max_order):
    """Extracts all n-grams up to a given maximum order from an input segment.

    Args:
        segment: text segment from which n-grams will be extracted.
        max_order: maximum length in tokens of the n-grams returned by this
            method.

    Returns:
        The Counter containing all n-grams up to max_order in segment
        with a count of how many times each n-gram occurred.
    """
    ngram_counts = collections.Counter()
    for order in range(1, max_order + 1):
        for i in range(0, len(segment) - order + 1):
            ngram = tuple(segment[i:i + order])
            ngram_counts[ngram] += 1
    return ngram_counts
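The same counting idea works standalone on a small token list; this compact sketch (with an invented three-token segment) mirrors the loops above:

import collections

segment = ["the", "cat", "sat"]
max_order = 2

ngram_counts = collections.Counter(
    tuple(segment[i:i + order])
    for order in range(1, max_order + 1)
    for i in range(len(segment) - order + 1)
)
print(ngram_counts)
# Counter({('the',): 1, ('cat',): 1, ('sat',): 1, ('the', 'cat'): 1, ('cat', 'sat'): 1})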
Example 11: _build_vocab
# Required import: import collections [as alias]
# Or: from collections import Counter [as alias]
def _build_vocab(generator, vocab_dir, vocab_name):
    """Build a vocabulary from examples.

    Args:
        generator: text generator for creating vocab.
        vocab_dir: directory where to save the vocabulary.
        vocab_name: vocab file name.

    Returns:
        text encoder.
    """
    vocab_path = os.path.join(vocab_dir, vocab_name)
    if not tf.gfile.Exists(vocab_path):
        data = []
        for line in generator:
            data.extend(line.split())
        counter = collections.Counter(data)
        count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
        words, _ = list(zip(*count_pairs))
        encoder = text_encoder.TokenTextEncoder(None, vocab_list=words)
        encoder.store_to_file(vocab_path)
    else:
        encoder = text_encoder.TokenTextEncoder(vocab_path)
    return encoder
Example 12: _build_vocab
# Required import: import collections [as alias]
# Or: from collections import Counter [as alias]
def _build_vocab(filename, vocab_dir, vocab_name):
    """Reads a file to build a vocabulary.

    Args:
        filename: file to read list of words from.
        vocab_dir: directory where to save the vocabulary.
        vocab_name: vocab file name.

    Returns:
        text encoder.
    """
    vocab_path = os.path.join(vocab_dir, vocab_name)
    if not tf.gfile.Exists(vocab_path):
        with tf.gfile.GFile(filename, "r") as f:
            data = f.read().split()
        counter = collections.Counter(data)
        count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
        words, _ = list(zip(*count_pairs))
        encoder = text_encoder.TokenTextEncoder(None, vocab_list=words)
        encoder.store_to_file(vocab_path)
    else:
        encoder = text_encoder.TokenTextEncoder(vocab_path)
    return encoder
Example 13: test_custom_reserved_tokens
# Required import: import collections [as alias]
# Or: from collections import Counter [as alias]
def test_custom_reserved_tokens(self):
    """Test that we can pass custom reserved tokens to SubwordTextEncoder."""
    corpus = "The quick brown fox jumps over the lazy dog"
    token_counts = collections.Counter(corpus.split(" "))

    start_symbol = "<S>"
    end_symbol = "<E>"
    reserved_tokens = text_encoder.RESERVED_TOKENS + [start_symbol, end_symbol]
    encoder = text_encoder.SubwordTextEncoder.build_to_target_size(
        10, token_counts, 2, 10, reserved_tokens=reserved_tokens)

    # Make sure that reserved tokens appear in the right places.
    self.assertEqual(encoder.decode([2]), start_symbol)
    self.assertEqual(encoder.decode([3]), end_symbol)

    # Make sure that we haven't messed up the ability to reconstruct.
    reconstructed_corpus = encoder.decode(encoder.encode(corpus))
    self.assertEqual(corpus, reconstructed_corpus)
Example 14: test_encodable_when_not_in_alphabet
# Required import: import collections [as alias]
# Or: from collections import Counter [as alias]
def test_encodable_when_not_in_alphabet(self):
    corpus = "the quick brown fox jumps over the lazy dog"
    token_counts = collections.Counter(corpus.split(" "))
    encoder = text_encoder.SubwordTextEncoder.build_to_target_size(
        100, token_counts, 2, 10)
    original = "This has UPPER CASE letters that are out of alphabet"

    # Early versions could have an infinite loop when breaking into subtokens
    # if there were any out-of-alphabet characters in the encoded string.
    encoded = encoder.encode(original)

    decoded = encoder.decode(encoded)
    self.assertEqual(original, decoded)
    encoded_str = "".join(encoder.all_subtoken_strings[i] for i in encoded)
    self.assertIn("\\84;", encoded_str)
Example 15: test_reserved_token_chars_not_in_alphabet
# Required import: import collections [as alias]
# Or: from collections import Counter [as alias]
def test_reserved_token_chars_not_in_alphabet(self):
    corpus = "dog"
    token_counts = collections.Counter(corpus.split(" "))
    encoder1 = text_encoder.SubwordTextEncoder.build_to_target_size(
        100, token_counts, 2, 100)
    filename = os.path.join(self.test_temp_dir, "out.voc")
    encoder1.store_to_file(filename)
    encoder2 = text_encoder.SubwordTextEncoder(filename=filename)

    self.assertEqual(encoder1._alphabet, encoder2._alphabet)

    for t in text_encoder.RESERVED_TOKENS:
        for c in t:
            # Verify that encoders can encode all reserved token chars.
            encoder1.encode(c)
            encoder2.encode(c)