Python collections.Counter方法代码示例

本文整理汇总了 Python 中 collections.Counter 类的典型用法代码示例。如果您正苦于以下问题：Python collections.Counter 的具体用法？Python collections.Counter 怎么用？Python collections.Counter 使用的例子？那么恭喜您，这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该类所在模块 collections 的用法示例。


示例1: value_counts

def value_counts(self, subset, attr, value, base=False):
    """Count occurrences of each dependent-variable value within a subset.

    A row is counted when its ``attr`` entry equals ``value``; when ``base``
    is true every row is counted instead.

    FIXME: Can attr/value be eliminated??

    Args:
        subset: iterable of rows; each row is indexed by ``attr`` and by
            ``self.dependent``.
        attr: the attribute whose entry is compared against ``value``.
        value: the attribute value that selects which rows are counted.
        base: when true, count every row regardless of ``attr``/``value``
            (default False).

    Returns:
        A Counter mapping each dependent-variable value to the number of
        selected rows in which it occurs.
    """
    counts = Counter()
    for row in subset:
        # Check ``base`` first so a base count never has to index the row
        # by ``attr`` (which may be meaningless in that mode).
        if base or row[attr] == value:
            counts[row[self.dependent]] += 1
    return counts

示例2: attr_counts

def attr_counts(self, subset, attr):
    """Count how often each value of the given attribute occurs in a subset.

    Args:
        subset: iterable of rows; each row is indexed by ``attr``.
        attr: the selected attribute.

    Returns:
        A Counter mapping each attribute value to its number of occurrences.
    """
    return Counter(row[attr] for row in subset)

示例3: get_buckets

def get_buckets(stories, max_ignore_unbatched=100, max_pad_amount=25):
    """Choose bucket boundaries from the sentence counts of the stories.

    The distinct sentence counts are scanned in ascending order. A count
    becomes a bucket boundary when it lies more than ``max_pad_amount``
    above the previous boundary, or when more than ``max_ignore_unbatched``
    stories have accumulated since the previous boundary. The largest
    sentence count is always the last boundary.

    Args:
        stories: iterable of ``(sents_graphs, query, answer)`` triples; only
            ``len(sents_graphs)`` is used.
        max_ignore_unbatched: number of stories that may accumulate before a
            new boundary is forced.
        max_pad_amount: largest allowed gap between a boundary and the next
            sentence count before a new boundary is forced.

    Returns:
        Ascending list of sentence counts used as bucket boundaries; an empty
        list when ``stories`` is empty.
    """
    sentencecounts = [len(sents_graphs) for (sents_graphs, query, answer) in stories]
    countpairs = sorted(collections.Counter(sentencecounts).items())
    if not countpairs:
        # No stories: nothing to bucket (and no last element to inspect).
        return []

    buckets = []
    smallest_left_val = 0
    # Start "full" so that the smallest sentence count always opens a bucket.
    num_unbatched = max_ignore_unbatched
    for val, ct in countpairs:
        num_unbatched += ct
        if val - smallest_left_val > max_pad_amount or num_unbatched > max_ignore_unbatched:
            buckets.append(val)
            smallest_left_val = val
            num_unbatched = 0

    # Make sure the longest stories fall inside the final bucket.
    largest = countpairs[-1][0]
    if buckets[-1] != largest:
        buckets.append(largest)

    return buckets

示例4: _get_genome_amounts

def _get_genome_amounts(self, probability, max_genome_amount):
	"""
	Get amounts of genomes by original genome

	A draw is taken from self._get_genome_amounts_geometric. When
	max_genome_amount is at least 10, the draw is repeated until the
	fraction of entries equal to 1 is within 0.05 of the requested
	probability.

	NOTE(review): the redraw loop has no iteration limit; it relies on the
	geometric helper eventually producing a matching draw.

	@param probability: Proportion of simulated original genomes
	@type probability: int | long | float
	@param max_genome_amount: Total number of genomes
	@type max_genome_amount: int | long

	@return: List of integers representing amount of strains
	@rtype: list[int]
	"""
	# Local import: the numeric ABCs cover int/long/float on Python 2 and
	# int/float on Python 3, where the name `long` no longer exists.
	import numbers

	assert isinstance(probability, numbers.Real)
	assert 0 <= probability <= 1
	assert isinstance(max_genome_amount, numbers.Integral)

	genome_amounts = self._get_genome_amounts_geometric(probability, max_genome_amount)
	# Fraction of entries in the draw that are exactly 1.
	single_ratio = Counter(genome_amounts)[1] / float(len(genome_amounts))
	if max_genome_amount >= 10:
		while abs(single_ratio - probability) > 0.05:
			genome_amounts = self._get_genome_amounts_geometric(probability, max_genome_amount)
			single_ratio = Counter(genome_amounts)[1] / float(len(genome_amounts))
	return genome_amounts

示例5: build_vocab

def build_vocab(words, vocab_size, visual_fld):
    """Build the vocabulary of the VOCAB_SIZE most frequent words.

    Index 0 is reserved for the placeholder token 'UNK'; the remaining
    ``vocab_size - 1`` slots go to the most common words in descending order
    of frequency. The vocabulary is also written, one word per line in index
    order, to ``vocab.tsv`` inside ``visual_fld``.

    Args:
        words: iterable of word tokens.
        vocab_size: total vocabulary size, including the 'UNK' entry.
        visual_fld: existing directory in which ``vocab.tsv`` is written.

    Returns:
        A ``(dictionary, index_dictionary)`` pair: ``dictionary`` maps each
        word to its index, ``index_dictionary`` maps each index back to its
        word.
    """
    count = [('UNK', -1)]
    count.extend(Counter(words).most_common(vocab_size - 1))

    dictionary = {}
    # The context manager guarantees the file is flushed and closed, even if
    # a write fails part-way through.
    with open(os.path.join(visual_fld, 'vocab.tsv'), 'w') as vocab_file:
        for index, (word, _) in enumerate(count):
            dictionary[word] = index
            vocab_file.write(word + '\n')

    index_dictionary = {index: word for word, index in dictionary.items()}
    return dictionary, index_dictionary

示例6: generate_bi_graphemes_dictionary

def generate_bi_graphemes_dictionary(label_list):
    """Write the bi-grapheme label map for the given transcripts to CSV.

    Every label is split on single spaces and each resulting word is passed
    to ``split_every(2, word)``; the pieces of length 2 are tallied. The
    output file first lists the 28 base labels (apostrophe, space, a-z)
    numbered from 1, followed by each tallied pair, in the Counter's
    iteration order, numbered consecutively after the base labels.

    Args:
        label_list: iterable of label strings.
    """
    pair_counts = Counter()
    for transcript in label_list:
        for word in transcript.split(' '):
            # Only full two-character pieces count as bi-graphemes.
            pair_counts.update(
                piece for piece in split_every(2, word) if len(piece) == 2)

    with open('resources/unicodemap_en_baidu_bi_graphemes.csv', 'w') as bigram_label:
        writer = csv.writer(bigram_label, delimiter = ',')
        base_labels = list('\' abcdefghijklmnopqrstuvwxyz')
        for number, symbol in enumerate(base_labels, 1):
            writer.writerow((symbol, number))
        # Pair ids continue right after the last base-label id.
        for number, pair in enumerate(pair_counts.keys(), len(base_labels) + 1):
            writer.writerow((pair, number))

示例7: test_tokens_to_indices

def test_tokens_to_indices():
    """Check that Vocabulary.to_indices maps tokens to the expected indices.

    The assertions expect index 0 for the unknown token (and for any token
    not in the vocabulary), with 'c', 'b' and 'a' at indices 1, 2 and 3.
    Both a single token and a list of tokens are exercised.
    """
    counter = Counter(['a', 'b', 'b', 'c', 'c', 'c', 'some_word$'])

    vocab = text.vocab.Vocabulary(counter, most_freq_count=None, min_freq=1, unknown_token='<unk>',
                                  reserved_tokens=None)

    # A bare token yields a bare index.
    i1 = vocab.to_indices('c')
    assert i1 == 1

    # A list of tokens yields a list of indices.
    i2 = vocab.to_indices(['c'])
    assert i2 == [1]

    # The unknown token and out-of-vocabulary tokens both map to index 0.
    i3 = vocab.to_indices(['<unk>', 'non-exist'])
    assert i3 == [0, 0]

    i4 = vocab.to_indices(['a', 'non-exist', 'a', 'b'])
    assert i4 == [3, 0, 3, 2]

示例8: test_indices_to_tokens

def test_indices_to_tokens():
    """Check that Vocabulary.to_tokens maps indices back to tokens.

    Covers a single index, lists of indices (including index 0, which is
    expected to be the unknown token), and an index of 100, for which a
    ValueError is expected.
    """
    token_counts = Counter(['a', 'b', 'b', 'c', 'c', 'c', 'some_word$'])
    vocab = text.vocab.Vocabulary(token_counts, most_freq_count=None, min_freq=1,
                                  unknown_token='<unknown>', reserved_tokens=None)

    # (indices passed to to_tokens, tokens expected back)
    cases = (
        (1, 'c'),
        ([1], ['c']),
        ([0, 0], ['<unknown>', '<unknown>']),
        ([3, 0, 3, 2], ['a', '<unknown>', 'a', 'b']),
    )
    for indices, expected_tokens in cases:
        assert vocab.to_tokens(indices) == expected_tokens

    assertRaises(ValueError, vocab.to_tokens, 100)

示例9: CountErrors

def CountErrors(ocr_text, truth_text):
  """Counts the drops and adds between 2 bags of iterables.

  Simple bag of objects count returns the number of dropped and added
  elements, regardless of order, from anything that is iterable, eg
  a pair of strings gives character errors, and a pair of word lists give
  word errors.

  Args:
    ocr_text:    OCR text iterable (eg string for chars, word list for words).
    truth_text:  Truth text iterable.

  Returns:
    ErrorCounts named tuple built from (drops, adds, len(truth_text),
    len(ocr_text)).
  """
  counts = collections.Counter(truth_text)
  # Subtracting the OCR bag leaves a positive count for every element the
  # OCR output is missing (a drop) and a negative count for every element
  # it has in excess (an add).
  counts.subtract(ocr_text)
  drops = sum(c for c in counts.values() if c > 0)
  adds = sum(-c for c in counts.values() if c < 0)
  return ErrorCounts(drops, adds, len(truth_text), len(ocr_text))

示例10: _get_ngrams

def _get_ngrams(segment, max_order):
  """Extracts all n-grams up to a given maximum order from an input segment.

  Args:
    segment: text segment from which n-grams will be extracted.
    max_order: maximum length in tokens of the n-grams returned by this
        method.

  Returns:
    The Counter containing all n-grams up to max_order in segment
    with a count of how many times each n-gram occurred. Each n-gram is
    stored as a tuple of tokens.
  """
  ngram_counts = collections.Counter()
  for order in range(1, max_order + 1):
    # An order longer than the segment yields an empty range, so no
    # n-grams of that order are produced.
    for start in range(len(segment) - order + 1):
      ngram_counts[tuple(segment[start:start + order])] += 1
  return ngram_counts

示例11: _build_vocab

def _build_vocab(generator, vocab_dir, vocab_name):
  """Build a vocabulary from examples.

  If the vocabulary file already exists it is loaded as-is. Otherwise every
  line from the generator is split on whitespace, the tokens are ordered by
  descending frequency (ties broken alphabetically), and the resulting
  vocabulary is saved to the vocabulary file.

  Args:
    generator: text generator for creating vocab.
    vocab_dir: directory where to save the vocabulary.
    vocab_name: vocab file name.

  Returns:
    text encoder.
  """
  vocab_path = os.path.join(vocab_dir, vocab_name)
  if tf.gfile.Exists(vocab_path):
    # A vocabulary was stored earlier: reuse it instead of rebuilding.
    return text_encoder.TokenTextEncoder(vocab_path)

  counter = collections.Counter()
  for line in generator:
    counter.update(line.split())
  count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
  # Built directly rather than via zip(*count_pairs), which cannot be
  # unpacked when the generator yields no tokens.
  words = tuple(word for word, _ in count_pairs)
  encoder = text_encoder.TokenTextEncoder(None, vocab_list=words)
  # Persist the vocabulary so later calls take the load path above.
  encoder.store_to_file(vocab_path)
  return encoder

示例12: _build_vocab

def _build_vocab(filename, vocab_dir, vocab_name):
  """Reads a file to build a vocabulary.

  If the vocabulary file already exists it is loaded as-is. Otherwise the
  input file is split on whitespace, the words are ordered by descending
  frequency (ties broken alphabetically), and the resulting vocabulary is
  saved to the vocabulary file.

  Args:
    filename: file to read list of words from.
    vocab_dir: directory where to save the vocabulary.
    vocab_name: vocab file name.

  Returns:
    text encoder.
  """
  vocab_path = os.path.join(vocab_dir, vocab_name)
  if tf.gfile.Exists(vocab_path):
    # A vocabulary was stored earlier: reuse it instead of rebuilding.
    return text_encoder.TokenTextEncoder(vocab_path)

  with tf.gfile.GFile(filename, "r") as f:
    counter = collections.Counter(f.read().split())
  count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
  # Built directly rather than via zip(*count_pairs), which cannot be
  # unpacked when the file contains no words.
  words = tuple(word for word, _ in count_pairs)
  encoder = text_encoder.TokenTextEncoder(None, vocab_list=words)
  # Persist the vocabulary so later calls take the load path above.
  encoder.store_to_file(vocab_path)
  return encoder

示例13: test_custom_reserved_tokens

def test_custom_reserved_tokens(self):
    """Test that we can pass custom reserved tokens to SubwordTextEncoder."""
    corpus = "The quick brown fox jumps over the lazy dog"
    token_counts = collections.Counter(corpus.split(" "))

    start_symbol = "<S>"
    end_symbol = "<E>"
    # The custom symbols are appended after the library's default reserved
    # tokens; the assertions below expect them at ids 2 and 3.
    reserved_tokens = text_encoder.RESERVED_TOKENS + [start_symbol,
                                                      end_symbol]
    encoder = text_encoder.SubwordTextEncoder.build_to_target_size(
        10, token_counts, 2, 10, reserved_tokens=reserved_tokens)

    # Make sure that reserved tokens appear in the right places.
    self.assertEqual(encoder.decode([2]), start_symbol)
    self.assertEqual(encoder.decode([3]), end_symbol)

    # Make sure that we haven't messed up the ability to reconstruct.
    reconstructed_corpus = encoder.decode(encoder.encode(corpus))
    self.assertEqual(corpus, reconstructed_corpus)

示例14: test_encodable_when_not_in_alphabet

def test_encodable_when_not_in_alphabet(self):
    """Text with characters unseen at build time must round-trip.

    The encoder is built from an all-lowercase corpus and then asked to
    encode a sentence containing upper-case letters.
    """
    training_text = "the quick brown fox jumps over the lazy dog"
    encoder = text_encoder.SubwordTextEncoder.build_to_target_size(
        100, collections.Counter(training_text.split(" ")), 2, 10)

    original = "This has UPPER CASE letters that are out of alphabet"

    # Early versions could have an infinite loop when breaking into subtokens
    # if there was any out-of-alphabet characters in the encoded string.
    subtoken_ids = encoder.encode(original)
    self.assertEqual(original, encoder.decode(subtoken_ids))

    # 84 is the code point of "T"; presumably out-of-alphabet characters are
    # escaped as a backslash, the code point and a semicolon -- TODO confirm
    # against the encoder implementation.
    subtokens = [encoder.all_subtoken_strings[idx] for idx in subtoken_ids]
    self.assertIn("\\84;", "".join(subtokens))

示例15: test_reserved_token_chars_not_in_alphabet

def test_reserved_token_chars_not_in_alphabet(self):
    corpus = "dog"
    token_counts = collections.Counter(corpus.split(" "))
    encoder1 = text_encoder.SubwordTextEncoder.build_to_target_size(
        100, token_counts, 2, 100)
    filename = os.path.join(self.test_temp_dir, "out.voc")
    encoder2 = text_encoder.SubwordTextEncoder(filename=filename)

    self.assertEqual(encoder1._alphabet, encoder2._alphabet)

    for t in text_encoder.RESERVED_TOKENS:
      for c in t:
        # Verify that encoders can encode all reserved token chars.
