

Python SonicScrewdriver Module Code Examples

This article collects typical usage examples of the SonicScrewdriver module in Python. If you have been wondering exactly how SonicScrewdriver is used, or looking for working examples of it in real code, the curated samples below may help.


Fifteen code examples using SonicScrewdriver are shown below, ordered by popularity by default.

Example 1: open

import csv
from collections import Counter

import matplotlib.pyplot as plt
import SonicScrewdriver as utils

targetfile = input('Path to input file? ')

counts = dict()
alltags = set()
alldecades = set()
allcounts = Counter()

with open(targetfile, encoding = 'utf-8') as f:
    reader = csv.DictReader(f)
    for row in reader:
        date = row['date']
        decade = 10 * int(int(date)/10)
        tagset = utils.get_tagset(row['genretags'])
        for tag in tagset:
            if tag == 'chirandom' and ('chiscifi' in tagset):
                continue
            if tag not in counts:
                counts[tag] = Counter()

            counts[tag][decade] += 1
            alltags.add(tag)
            alldecades.add(decade)
            allcounts[decade] += 1

sorted_decades = sorted(list(alldecades))
numdecs = len(sorted_decades)

colors = ['g-', 'b-', 'r-', 'k-', 'ro', 'go', 'bo', 'ko']
Author: tedunderwood, Project: fiction, Lines: 31, Source: graph_tags_over_time.py
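
The excerpt stops right after the color list; the plotting step from graph_tags_over_time.py is not shown. The following is a minimal sketch of how the collected counts might be graphed, normalizing each tag's count by the per-decade totals. It is a guess at the continuation, not the original code.

# Hypothetical continuation: plot each tag's share of volumes per decade.
for tag, style in zip(sorted(alltags), colors):
    yvals = [counts[tag][decade] / allcounts[decade] for decade in sorted_decades]
    plt.plot(sorted_decades, yvals, style, label = tag)

plt.xlabel('decade')
plt.ylabel('fraction of volumes')
plt.legend()
plt.show()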

Example 2: passfilter

# refine fiction

import SonicScrewdriver as utils

def passfilter(genrestring):
	fields = genrestring.split(';')
	if "Autobiography" in fields or "Biography" in fields:
		return False
	else:
		return True

rows19c, columns19c, table19c = utils.readtsv('/Volumes/TARDIS/work/metadata/19cMetadata.tsv')

rows20c, columns20c, table20c = utils.readtsv('/Volumes/TARDIS/work/metadata/20cMonographMetadata.tsv')

with open("/Users/tunder/Dropbox/GenreProject/python/piketty/roughfiction.txt", encoding = 'utf-8') as f:
	filelines = f.readlines()

idlist = [utils.pairtreelabel(x.split('\t')[0]) for x in filelines]

filteredrows = list()

missing = 0

for anid in idlist:
	if anid in rows19c:
		genrestring = table19c["genres"][anid]
		rowdict = dict()
		for col in columns19c:
			rowdict[col] = table19c[col][anid]
	elif anid in rows20c:
		# The excerpt is truncated here; presumably this branch mirrors
		# the 19c case above, reading from table20c and columns20c.
		genrestring = table20c["genres"][anid]
		rowdict = dict()
		for col in columns20c:
			rowdict[col] = table20c[col][anid]
Author: tedunderwood, Project: GenreProject, Lines: 31, Source: refine_fiction.py
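
Both this example and several below convert between "clean" and "dirty" forms of HathiTrust volume IDs. The SonicScrewdriver source for pairtreelabel and clean_pairtree is not included on this page; the sketch below shows the usual pairtree convention and is an assumption, not the module's actual code.

def clean_pairtree(htid):
    # Hypothetical stand-in: pairtree "cleaning" replaces characters that
    # are awkward in filenames (':' and '/') with '+' and '='.
    return htid.replace(':', '+').replace('/', '=')

def pairtreelabel(htid):
    # The inverse direction: recover the "dirty" label used in metadata tables.
    return htid.replace('+', ':').replace('=', '/')

assert pairtreelabel(clean_pairtree('uc1.ark:/13960/t0dv22x8g')) == 'uc1.ark:/13960/t0dv22x8g'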

Example 3: select_common_features

def select_common_features(trainingset, n):
	''' Very simply, selects the top n features in the training set.
	Not a sophisticated feature-selection strategy, but in many
	cases it gets the job done.
	'''
	allwordcounts = dict()

	for avolume in trainingset:
		utils.add_dicts(avolume.rawcounts, allwordcounts)
		# The add_dicts function will add up all the raw counts into
		# a single master dictionary.

	descendingbyfreq = utils.sortkeysbyvalue(allwordcounts, whethertoreverse = True)
	# This returns a list of 2-tuple (frequency, word) pairs.

	if n > len(descendingbyfreq):
		n = len(descendingbyfreq)
		print("We only have " + str(n) + " features.")

	# List comprehension that gets the second element of each tuple, up to
	# a total of n tuples.

	topfeatures = [x[1] for x in descendingbyfreq[0 : n]]

	return topfeatures
Author: tedunderwood, Project: GenreProject, Lines: 25, Source: logistic.py
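
Several examples on this page call utils.sortkeysbyvalue. Its implementation is not shown, but the comment above ("a list of 2-tuple (frequency, word) pairs") implies behavior roughly like the sketch below; treat it as an assumption rather than the module's real code.

def sortkeysbyvalue(d, whethertoreverse = False):
    # Return (value, key) pairs sorted by value; whethertoreverse = True
    # puts the largest values first, matching the usage in these examples.
    return sorted(((v, k) for k, v in d.items()), reverse = whethertoreverse)

# sortkeysbyvalue({'the': 50, 'of': 30}, whethertoreverse = True)
# -> [(50, 'the'), (30, 'of')]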

Example 4: sequence_to_counts

def sequence_to_counts(genresequence):
    '''Converts a sequence of page-level predictions to
    a dictionary of counts reflecting the number of pages
    assigned to each genre. Also reports the largest genre.
    Note that this function cannot return "bio." If
    biography is the largest genre, it returns "non" (nonfiction).
    It counts bio, but ensures that every vote for bio is also a vote
    for non.
    '''

    genrecounts = dict()

    for page in genresequence:
        utils.addtodict(page, 1, genrecounts)
        if page == 'bio':
            utils.addtodict('non', 1, genrecounts)

    # Convert the dictionary of counts into a sorted list, and take the max.
    genretuples = utils.sortkeysbyvalue(genrecounts, whethertoreverse = True)
    maxgenre = genretuples[0][1]

    if maxgenre == 'bio':
        maxgenre = 'non'

    return genrecounts, maxgenre
Author: deepfriedrabbit, Project: genre, Lines: 25, Source: logisticconfidence.py
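
A quick usage sketch. utils.addtodict is not defined in the excerpt; judging from how it is called, it increments a key in a dictionary, as in the hypothetical stub below.

def addtodict(key, increment, d):
    # Hypothetical stand-in for utils.addtodict.
    d[key] = d.get(key, 0) + increment

# With the stubs for addtodict and sortkeysbyvalue in place of utils:
# sequence_to_counts(['fic', 'bio', 'bio', 'bio', 'non'])
# -> ({'fic': 1, 'bio': 3, 'non': 4}, 'non')
# The three 'bio' pages are counted, but each also votes for 'non',
# so biography can never be the genre this function reports.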

Example 5: sequence_to_counts

def sequence_to_counts(genresequence):
    '''Converts a sequence of page-level predictions to
    a dictionary of counts reflecting the number of pages
    assigned to each genre. Also reports the largest genre.'''

    genrecounts = dict()
    genrecounts['fic'] = 0
    genrecounts['poe'] = 0
    genrecounts['dra'] = 0
    genrecounts['non'] = 0

    for page in genresequence:
        indexas = page

        # For this purpose, we treat biography, indexes, back matter ("back"),
        # and travel ("trv") as equivalent to nonfiction.
        if page == "bio" or page == "index" or page == "back" or page == "trv":
            indexas = "non"

        utils.addtodict(indexas, 1, genrecounts)

    # Convert the dictionary of counts into a sorted list, and take the max.
    genretuples = utils.sortkeysbyvalue(genrecounts, whethertoreverse = True)
    maxgenre = genretuples[0][1]

    return genrecounts, maxgenre
Author: tedunderwood, Project: HathiGenreTrainingset, Lines: 25, Source: MetadataSorter.py

Example 6: sequence_to_counts

def sequence_to_counts(genresequence):
    '''Converts a sequence of page-level predictions to
    a dictionary of counts reflecting the number of pages
    assigned to each genre.

    Note that this version of the function is slightly different
    from the version in MetadataCascades, in allowing a wider range
    of genres and not initializing anything to zero.'''

    genrecounts = dict()

    for page in genresequence:
        utils.addtodict(page, 1, genrecounts)

    return genrecounts
Author: tedunderwood, Project: HathiGenreTrainingset, Lines: 15, Source: CRF.py

Example 7: get_classvector

import numpy as np
import SonicScrewdriver as utils

def get_classvector(classpath, volumeIDs):
	with open(classpath, encoding = 'utf-8') as f:
		filelines = f.readlines()
	classdict = dict()
	for line in filelines:
		line = line.rstrip()
		fields = line.split('\t')
		volid = utils.clean_pairtree(fields[0])
		theclass = fields[1]
		if theclass == 'elite':
			intclass = 1
		elif theclass == 'vulgar':
			intclass = 0
		else:
			intclass = int(theclass)
		classdict[volid] = intclass

	if len(volumeIDs) < 1:
		volumeIDs = [x for x in classdict.keys()]

	classvector = np.zeros(len(volumeIDs))
	for idx, anid in enumerate(volumeIDs):
		if anid in classdict:
			classvector[idx] = classdict[anid]
		else:
			print('Missing from class metadata: ' + anid)

	return classvector, volumeIDs
Author: tedunderwood, Project: GenreProject, Lines: 28, Source: test_boundary.py
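
The expected layout of the class-metadata file is implicit in the parsing code: one volume per line, tab-separated, volume ID first, then either a named class ('elite' / 'vulgar') or a plain integer. A toy call with hypothetical data:

# classdata.tsv (hypothetical contents, tab-separated):
#   mdp.39015012345678	elite
#   uc1.b3342759	vulgar
#   njp.32101044567890	1

classvector, volumeIDs = get_classvector('classdata.tsv', [])
# Passing an empty ID list makes the function take its IDs from the file,
# so classvector here would be array([1., 0., 1.]).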

Example 8: add_to_ficgenre

def add_to_ficgenre(docid, existingfile, tagas):
    global outfieldnames, metadata
    with open(existingfile, mode = 'a', encoding = 'utf-8') as f:
        writer = csv.DictWriter(f, fieldnames = outfieldnames)
        o = dict()
        j = metadata[docid]
        fields = [j['HTid'], str(j['date']), j['author'], j['title'], j['enumcron']]
        print(" | ".join(fields))
        o['docid'] = utils.clean_pairtree(j['HTid'])
        o['recordid'] = j['recordid']
        o['oclc'] = j['OCLC']
        o['locnum'] = j['LOCnum']
        o['author'] = j['author']
        o['imprint'] = j['imprint']
        o['date'] = j['date']
        o['firstpub'] = input('First publication date? ')
        o['birthdate'] = input('Author birth year? ')
        o['nationality'] = input('Nationality? ')
        o['gender'] = input('Gender? ')
        o['title'] = j['title']
        o['subjects'] = j['subjects']
        o['enumcron'] = j['enumcron']
        o['genretags'] = tagas
        for key, value in o.items():
            if value == '<blank>':
                o[key] = ''
        writer.writerow(o)
    print('Done.')
Author: tedunderwood, Project: fiction, Lines: 28, Source: fiction_browser.py

Example 9: get_featureframe

import csv
import os

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

import SonicScrewdriver as utils

def get_featureframe(vocabulary, positiveIDs, negativeIDs, sourcedir):
    ''' Returns a pandas dataframe with feature counts for all the volumes
    to be used in this model.
    '''

    df = dict()
    # We initially construct the data frame as a dictionary of Series.
    vocabset = set(vocabulary)
    allIDs = positiveIDs + negativeIDs

    for v in vocabulary:
        df[v] = pd.Series(np.zeros(len(allIDs)), index = allIDs)

    for docid in allIDs:
        path = os.path.join(sourcedir, utils.clean_pairtree(docid) + '.csv')
        with open(path, encoding = 'utf-8') as f:
            reader = csv.DictReader(f)
            for row in reader:
                feature = row['feature']

                if feature.startswith('#header'):
                    feature = feature.replace('#header', '')

                if feature in vocabset:
                    df[feature].loc[docid] = float(row['count'])

    # Now let's refashion the dictionary as an actual dataframe.
    df = pd.DataFrame(df, index = allIDs)
    df = df[vocabulary]
    # This reorders the columns to be in vocab order

    stdscaler = StandardScaler()
    scaleddf = pd.DataFrame(stdscaler.fit_transform(df), index = allIDs, columns = vocabulary)
    # fit_transform returns a bare array, so we pass index and columns
    # explicitly to keep the volume IDs and vocabulary labels.

    return scaleddf
Author: tedunderwood, Project: 20cgenres, Lines: 35, Source: trainamodel.py
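
The per-volume feature files this function reads have a layout implied by the DictReader calls: a header row with 'feature' and 'count' columns, one word per row. The snippet below writes a toy file in that layout; the docid and counts are hypothetical.

import csv

with open('mdp.39015012345678.csv', mode = 'w', encoding = 'utf-8', newline = '') as f:
    writer = csv.DictWriter(f, fieldnames = ['feature', 'count'])
    writer.writeheader()
    for feature, count in [('the', 512), ('of', 301), ('#headerchapter', 12)]:
        writer.writerow({'feature': feature, 'count': count})
# '#header'-prefixed rows are header tokens; get_featureframe strips the
# prefix so they merge with body-text counts.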

Example 10: choose_cascade

def choose_cascade(htid):
    '''Reads metadata about this volume and uses it to decide what metadata-level features should be assigned.'''

    global rowindices, columns, metadata, modelindices, modeldata

    probablydrama = False
    probablypoetry = False
    probablybiography = False
    probablyfiction = False
    maybefiction = False

    htid = utils.pairtreelabel(htid)
    # convert the clean pairtree filename into a dirty pairtree label for metadata matching

    if htid not in rowindices:
        # We have no metadata for this volume.
        print("Volume missing from ExtractedMetadata.tsv: " + htid)

    else:
        genrestring = metadata["genres"][htid]
        genreinfo = genrestring.split(";")
        # It's a semicolon-delimited list of items.

        for info in genreinfo:

            if info == "Biography" or info == "Autobiography":
                probablybiography = True

            if info == "Fiction" or info == "Novel":
                probablyfiction = True

            if (info == "Poetry" or info == "Poems"):
                probablypoetry = True

            if (info == "Drama" or info == "Tragedies" or info == "Comedies"):
                probablydrama = True

    if htid in modelindices:

        title = metadata["title"][htid].lower()
        titlewords = title.split()

        maxgenre = maxoption((modeldata["bio"][htid], modeldata["dra"][htid], modeldata["fic"][htid], modeldata["non"][htid], modeldata["poe"][htid]))

        if maxgenre == 4 and ("poems" in titlewords or "poetical" in titlewords):
            # Index 4 in the tuple passed to maxoption corresponds to "poe".
            probablypoetry = True

        if maxgenre == 1:
            probablydrama = True

        if maxgenre == 2:
            maybefiction = True

    return probablybiography, probablydrama, probablyfiction, probablypoetry, maybefiction
Author: tedunderwood, Project: HathiGenreTrainingset, Lines: 55, Source: MetadataSorter.py
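
maxoption is not defined in this excerpt. From the way it is used above, with indices 0 through 4 standing for bio, dra, fic, non, and poe, it presumably returns the index of the largest value; the sketch below rests on that assumption.

def maxoption(options):
    # Hypothetical stand-in: index of the largest value in a sequence.
    # maxoption((0.1, 0.2, 0.6, 0.05, 0.05)) -> 2, i.e. "fic".
    return max(range(len(options)), key = lambda i: options[i])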

Example 11: compare_two_lists

def compare_two_lists(truelist, predicted, wordsperpage, whethertocountwords):
    global genretranslations
    assert(len(truelist) == len(predicted))

    errorsbygenre = dict()
    correctbygenre = dict()
    accurate = 0
    inaccurate = 0
    totaltruegenre = dict()

    for index, truegenre in enumerate(truelist):
        if truegenre in genretranslations:
            truegenre = genretranslations[truegenre]

        if whethertocountwords:
            increment = wordsperpage[index]
        else:
            increment = 1

        utils.addtodict(truegenre, increment, totaltruegenre)

        predictedgenre = predicted[index]

        if genresareequal(truegenre, predictedgenre):
            utils.addtodict(truegenre, increment, correctbygenre)
            accurate += increment
        else:
            utils.addtodict((truegenre, predictedgenre), increment, errorsbygenre)
            inaccurate += increment

    return totaltruegenre, correctbygenre, errorsbygenre, accurate, inaccurate
Author: tedunderwood, Project: HathiGenreTrainingset, Lines: 31, Source: EnsembleModule.py
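
genretranslations and genresareequal are defined elsewhere in EnsembleModule.py. As a rough illustration of their apparent roles, both stubs below are assumptions, not the project's actual definitions:

# Hypothetical stand-ins for names defined elsewhere in EnsembleModule.py.
genretranslations = {'bio': 'non', 'trv': 'non'}
# Maps fine-grained genres onto the broader categories used for scoring.

def genresareequal(truegenre, predictedgenre):
    # A minimal version: exact match. The real function may also accept
    # near-equivalent genre pairs.
    return truegenre == predictedgenre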

Example 12: comparelists

def comparelists(firstmap, secondmap, genremistakes, correctbygenre, wordcounts):
	if len(firstmap) > len(secondmap):
		length = len(secondmap)
	elif len(firstmap) == len(secondmap):
		length = len(firstmap)
	else:
		print("Error, Will Robinson. The consensus version can be shorter, but there is no valid reason for it to be longer.")
		length = len(firstmap)

	divergence = 0.0

	for i in range(length):

		generalizedfirst = translate(firstmap[i])
		generalizedsecond = translate(secondmap[i])

		if effectively_equal(generalizedfirst, generalizedsecond):
			utils.addtodict(generalizedsecond, wordcounts[i], correctbygenre)
		else:
			divergence += wordcounts[i]
			utils.addtodict((generalizedsecond, generalizedfirst), wordcounts[i], genremistakes)

	return divergence
Author: deepfriedrabbit, Project: genre, Lines: 22, Source: HumanDissensus.py

Example 13: addmetadata

    def addmetadata(self, row, table):
        self.author = table['author'][row]
        self.title = table['title'][row]
        self.date = utils.simple_date(row, table)
        genrelist = table['genres'][row].split(';')
        self.genres = set(genrelist)

        varietiesofnon = ['Bibliographies', 'Catalog', 'Dictionary', 'Encyclopedia', 'Handbooks', 'Indexes', 'Legislation', 'Directories', 'Statistics', 'Legal cases', 'Legal articles', 'Calendars', 'Autobiography', 'Biography', 'Letters', 'Essays', 'Speeches']

        self.nonmetaflag = False
        for genre in varietiesofnon:
            if genre in self.genres:
                self.nonmetaflag = True
Author: deepfriedrabbit, Project: genre, Lines: 13, Source: logisticconfidence.py

Example 14: resolve_voting

import random

import SonicScrewdriver as utils

def resolve_voting(votes, tiebreaker):
    electorate = len(votes)

    results = dict()
    for vote in votes:
        # if vote == "bio":
        #   vote = "non"
        utils.addtodict(vote, 1, results)
    candidate = utils.sortkeysbyvalue(results, whethertoreverse = True)

    dissent = (electorate - candidate[0][0]) / electorate

    if len(candidate) < 2:
        # There is only one candidate.
        return candidate[0][1], dissent, candidate[0][1]

    elif candidate[0][0] > candidate[1][0]:
        # We have a majority.
        return candidate[0][1], dissent, candidate[1][1]

    else:
        # We have a tie.
        if tiebreaker == candidate[0][1]:
            print("Tiebreaker " + tiebreaker)
            return candidate[0][1], dissent, candidate[1][1]
        elif tiebreaker == candidate[1][1]:
            print("Tiebreaker " + tiebreaker)
            return candidate[1][1], dissent, candidate[0][1]
        else:
            print("Tie in spite of " + tiebreaker)
            win = random.choice([candidate[0][1], candidate[1][1]])
            if win == candidate[0][1]:
                runnerup = candidate[1][1]
            else:
                runnerup = candidate[0][1]

            return win, dissent, runnerup
Author: tedunderwood, Project: HathiGenreTrainingset, Lines: 37, Source: EnsembleModule.py
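
A usage sketch with toy votes, assuming the addtodict and sortkeysbyvalue stubs from earlier in place of the SonicScrewdriver versions:

# Clear majority: 'fic' wins with one dissenting vote out of four.
# resolve_voting(['fic', 'fic', 'fic', 'poe'], 'non')
# -> ('fic', 0.25, 'poe')

# Two-way tie: the tiebreaker argument decides, and dissent is 2/4.
# resolve_voting(['fic', 'fic', 'poe', 'poe'], 'poe')
# prints "Tiebreaker poe" and returns ('poe', 0.5, 'fic')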

Example 15: get_vocabulary_and_counts

import csv
import os
from collections import Counter

import SonicScrewdriver as utils

def get_vocabulary_and_counts(metadata, positiveIDs, negativeIDs, sourcedir, n):
    ''' Gets the top n words by docfrequency in positiveIDs + negativeIDs, but also
    returns a dictionary of wordcounts so we don't have to read them again from the
    file when generating a feature dataframe.
    '''

    allIDs = positiveIDs + negativeIDs

    doc_freq = Counter()
    counts = dict()

    for docid in allIDs:
        counts[docid] = Counter()
        path = os.path.join(sourcedir, utils.clean_pairtree(docid) + '.csv')
        with open(path, encoding = 'utf-8') as f:
            reader = csv.DictReader(f)
            for row in reader:
                word = row['feature']
                if len(word) < 1:
                    continue

                ct = float(row['count'])

                if word.startswith('#header'):
                    word = word.replace('#header', '')
                # This debatable choice treats header words as equivalent
                # to occurrences in the body text. In practice, it seems
                # to slightly improve performance, at least when you're using
                # SVMs and relatively low numbers of features (140-300).
                # Otherwise header words are in practice just discarded, because
                # e.g. #headeract won't be one of the top 250 words.

                doc_freq[word] += 1
                counts[docid][word] += ct

    vocab = [x[0] for x in doc_freq.most_common(n)]
    print('Vocabulary constructed.')

    return vocab, counts
Author: tedunderwood, Project: 20cgenres, Lines: 40, Source: trainamodel.py
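
get_vocabulary_and_counts and get_featureframe (Example 9) both come from trainamodel.py, and a plausible way they fit together is sketched below; the ID lists and directory are hypothetical, and the original script may wire the pieces differently.

positiveIDs = ['mdp.39015012345678', 'uc1.b3342759']   # hypothetical
negativeIDs = ['njp.32101044567890']                   # hypothetical
sourcedir = '/path/to/featurefiles/'

vocab, counts = get_vocabulary_and_counts(dict(), positiveIDs, negativeIDs, sourcedir, 250)
features = get_featureframe(vocab, positiveIDs, negativeIDs, sourcedir)
# features is a volumes-by-vocabulary dataframe of scaled counts, ready
# for a classifier such as sklearn's LogisticRegression.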


Note: The SonicScrewdriver examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are drawn from open-source projects contributed by their original authors, and copyright remains with those authors; consult the corresponding project's license before redistributing or reusing the code. Do not reproduce without permission.