This page collects typical usage examples of SonicScrewdriver in Python. If you are wondering what SonicScrewdriver is for, how to call it, or what working examples look like, the selected code samples below may help.
The following shows 15 code examples that use SonicScrewdriver, sorted by popularity by default.
Example 1: open
import csv
from collections import Counter

import matplotlib.pyplot as plt

import SonicScrewdriver as utils

targetfile = input('Path to input file? ')

counts = dict()
alltags = set()
alldecades = set()
allcounts = Counter()

with open(targetfile, encoding = 'utf-8') as f:
    reader = csv.DictReader(f)
    for row in reader:
        date = row['date']
        decade = 10 * int(int(date)/10)
        tagset = utils.get_tagset(row['genretags'])
        for tag in tagset:
            if tag == 'chirandom' and ('chiscifi' in tagset):
                continue
            if tag not in counts:
                counts[tag] = Counter()
            counts[tag][decade] += 1
            alltags.add(tag)
        alldecades.add(decade)
        allcounts[decade] += 1

sorted_decades = sorted(list(alldecades))
numdecs = len(sorted_decades)

colors = ['g-', 'b-', 'r-', 'k-', 'ro', 'go', 'bo', 'ko']
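The excerpt stops right after the colors list, before any plotting happens. A minimal sketch of how the collected counts could then be plotted with matplotlib follows; the per-decade normalization here is an assumption for illustration, not necessarily what the original script did:

for idx, tag in enumerate(sorted(alltags)):
    # share of volumes in each decade carrying this tag (assumed normalization)
    yvals = [counts[tag][dec] / allcounts[dec] if allcounts[dec] else 0 for dec in sorted_decades]
    plt.plot(sorted_decades, yvals, colors[idx % len(colors)], label = tag)
plt.legend()
plt.show()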
Example 2: passfilter
# refine fiction

import SonicScrewdriver as utils

def passfilter(genrestring):
    fields = genrestring.split(';')
    if "Autobiography" in fields or "Biography" in fields:
        return False
    else:
        return True

rows19c, columns19c, table19c = utils.readtsv('/Volumes/TARDIS/work/metadata/19cMetadata.tsv')
rows20c, columns20c, table20c = utils.readtsv('/Volumes/TARDIS/work/metadata/20cMonographMetadata.tsv')

with open("/Users/tunder/Dropbox/GenreProject/python/piketty/roughfiction.txt", encoding = 'utf-8') as f:
    filelines = f.readlines()

idlist = [utils.pairtreelabel(x.split('\t')[0]) for x in filelines]

filteredrows = list()
missing = 0

for anid in idlist:
    if anid in rows19c:
        genrestring = table19c["genres"][anid]
        rowdict = dict()
        for col in columns19c:
            rowdict[col] = table19c[col][anid]
    elif anid in rows20c:
Example 3: select_common_features
def select_common_features(trainingset, n):
    ''' Very simply, selects the top n features in the training set.
    Not a sophisticated feature-selection strategy, but in many
    cases it gets the job done.
    '''
    allwordcounts = dict()

    for avolume in trainingset:
        utils.add_dicts(avolume.rawcounts, allwordcounts)
        # The add_dicts function will add up all the raw counts into
        # a single master dictionary.

    descendingbyfreq = utils.sortkeysbyvalue(allwordcounts, whethertoreverse = True)
    # This returns a list of 2-tuple (frequency, word) pairs.

    if n > len(descendingbyfreq):
        n = len(descendingbyfreq)
        print("We only have " + str(n) + " features.")

    # List comprehension that gets the second element of each tuple, up to
    # a total of n tuples.
    topfeatures = [x[1] for x in descendingbyfreq[0 : n]]

    return topfeatures
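A minimal usage sketch, assuming SonicScrewdriver is importable as utils and that training volumes expose a rawcounts dictionary, as the function implies; the Volume class below is a hypothetical stand-in:

import SonicScrewdriver as utils

class Volume:
    # hypothetical stand-in: any object with a rawcounts dict works here
    def __init__(self, rawcounts):
        self.rawcounts = rawcounts

trainingset = [Volume({'the': 40, 'whale': 3}), Volume({'the': 25, 'ship': 7})]
print(select_common_features(trainingset, 2))
# under these assumptions: ['the', 'ship'], the two words with the highest summed counts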
Example 4: sequence_to_counts
def sequence_to_counts(genresequence):
    '''Converts a sequence of page-level predictions to
    a dictionary of counts reflecting the number of pages
    assigned to each genre. Also reports the largest genre.

    Note that this function cannot return "bio." If
    biography is the largest genre it returns "non" (nonfiction).
    It counts bio, but ensures that all votes for bio are also votes
    for non.
    '''
    genrecounts = dict()

    for page in genresequence:
        utils.addtodict(page, 1, genrecounts)
        if page == 'bio':
            utils.addtodict('non', 1, genrecounts)

    # Convert the dictionary of counts into a sorted list, and take the max.
    genretuples = utils.sortkeysbyvalue(genrecounts, whethertoreverse = True)
    maxgenre = genretuples[0][1]

    if maxgenre == 'bio':
        maxgenre = 'non'

    return genrecounts, maxgenre
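A minimal usage sketch, assuming SonicScrewdriver is importable as utils:

import SonicScrewdriver as utils

pages = ['fic', 'fic', 'bio', 'bio', 'bio']
counts, biggest = sequence_to_counts(pages)
print(counts)   # {'fic': 2, 'bio': 3, 'non': 3}: every bio vote also counted as non
print(biggest)  # 'non', because the function never reports 'bio' as the largest genre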
Example 5: sequence_to_counts
def sequence_to_counts(genresequence):
    '''Converts a sequence of page-level predictions to
    a dictionary of counts reflecting the number of pages
    assigned to each genre. Also reports the largest genre.'''

    genrecounts = dict()
    genrecounts['fic'] = 0
    genrecounts['poe'] = 0
    genrecounts['dra'] = 0
    genrecounts['non'] = 0

    for page in genresequence:
        indexas = page

        # For this purpose, we treat biography and indexes as equivalent to nonfiction.
        if page == "bio" or page == "index" or page == "back" or page == "trv":
            indexas = "non"

        utils.addtodict(indexas, 1, genrecounts)

    # Convert the dictionary of counts into a sorted list, and take the max.
    genretuples = utils.sortkeysbyvalue(genrecounts, whethertoreverse = True)
    maxgenre = genretuples[0][1]

    return genrecounts, maxgenre
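For comparison with Example 4, a minimal sketch of this variant (again assuming SonicScrewdriver is importable as utils); here biography pages are folded directly into 'non' instead of being counted twice:

counts, biggest = sequence_to_counts(['fic', 'fic', 'bio', 'bio', 'bio'])
print(counts)   # {'fic': 2, 'poe': 0, 'dra': 0, 'non': 3}
print(biggest)  # 'non'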
Example 6: sequence_to_counts
def sequence_to_counts(genresequence):
    '''Converts a sequence of page-level predictions to
    a dictionary of counts reflecting the number of pages
    assigned to each genre.

    Note that this version of the function is slightly different
    from the version in MetadataCascades, in allowing a wider range
    of genres and not initializing anything to zero.'''

    genrecounts = dict()

    for page in genresequence:
        utils.addtodict(page, 1, genrecounts)

    return genrecounts
Example 7: get_classvector
import numpy as np

import SonicScrewdriver as utils

def get_classvector(classpath, volumeIDs):
    with open(classpath, encoding = 'utf-8') as f:
        filelines = f.readlines()

    classdict = dict()
    for line in filelines:
        line = line.rstrip()
        fields = line.split('\t')
        volid = utils.clean_pairtree(fields[0])
        theclass = fields[1]
        if theclass == 'elite':
            intclass = 1
        elif theclass == 'vulgar':
            intclass = 0
        else:
            intclass = int(theclass)
        classdict[volid] = intclass

    if len(volumeIDs) < 1:
        volumeIDs = [x for x in classdict.keys()]

    classvector = np.zeros(len(volumeIDs))
    for idx, anid in enumerate(volumeIDs):
        if anid in classdict:
            classvector[idx] = classdict[anid]
        else:
            print('Missing from class metadata: ' + anid)

    return classvector, volumeIDs
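A minimal usage sketch; the file name and volume IDs below are hypothetical, and the expected file format (one tab-separated volume ID and class per line, where the class is 'elite', 'vulgar', or an integer) is inferred from the parsing code above:

# classes.tsv (hypothetical):
# uc1.b3342759<TAB>elite
# mdp.39015012345678<TAB>vulgar
classvector, volumeIDs = get_classvector('classes.tsv', [])
print(volumeIDs)    # IDs read from the file, because we passed an empty list
print(classvector)  # numpy array of 1s (elite) and 0s (vulgar) in the same order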
Example 8: add_to_ficgenre
import csv

import SonicScrewdriver as utils

def add_to_ficgenre(docid, existingfile, tagas):
    global outfieldnames, metadata
    with open(existingfile, mode = 'a', encoding = 'utf-8') as f:
        writer = csv.DictWriter(f, fieldnames = outfieldnames)
        o = dict()
        j = metadata[docid]
        fields = [j['HTid'], str(j['date']), j['author'], j['title'], j['enumcron']]
        print(" | ".join(fields))

        o['docid'] = utils.clean_pairtree(j['HTid'])
        o['recordid'] = j['recordid']
        o['oclc'] = j['OCLC']
        o['locnum'] = j['LOCnum']
        o['author'] = j['author']
        o['imprint'] = j['imprint']
        o['date'] = j['date']
        o['firstpub'] = input('First publication date? ')
        o['birthdate'] = input('Author birth year? ')
        o['nationality'] = input('Nationality? ')
        o['gender'] = input('Gender? ')
        o['title'] = j['title']
        o['subjects'] = j['subjects']
        o['enumcron'] = j['enumcron']
        o['genretags'] = tagas

        for key, value in o.items():
            if o[key] == '<blank>':
                o[key] = ''

        writer.writerow(o)
        print('Done.')
Example 9: get_featureframe
import csv
import os

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

import SonicScrewdriver as utils

def get_featureframe(vocabulary, positiveIDs, negativeIDs, sourcedir):
    ''' Returns a pandas dataframe with feature counts for all the volumes
    to be used in this model.
    '''

    df = dict()
    # We initially construct the data frame as a dictionary of Series.
    vocabset = set(vocabulary)

    allIDs = positiveIDs + negativeIDs

    for v in vocabulary:
        df[v] = pd.Series(np.zeros(len(allIDs)), index = allIDs)

    for docid in allIDs:
        path = os.path.join(sourcedir, utils.clean_pairtree(docid) + '.csv')
        with open(path, encoding = 'utf-8') as f:
            reader = csv.DictReader(f)
            for row in reader:
                feature = row['feature']
                if feature.startswith('#header'):
                    feature = feature.replace('#header', '')
                if feature in vocabset:
                    df[feature].loc[docid] = float(row['count'])

    # Now let's refashion the dictionary as an actual dataframe.
    df = pd.DataFrame(df, index = allIDs)
    df = df[vocabulary]
    # This reorders the columns to be in vocab order.

    stdscaler = StandardScaler()
    scaleddf = pd.DataFrame(stdscaler.fit_transform(df), index = allIDs)

    return scaleddf
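A minimal usage sketch; the volume IDs and the directory of per-volume CSV files (each with 'feature' and 'count' columns, as the reader above expects) are hypothetical:

vocab = ['the', 'of', 'whale']
positives = ['uc1.b3342759']
negatives = ['mdp.39015012345678']
scaled = get_featureframe(vocab, positives, negatives, '/path/to/feature_csvs')
print(scaled.shape)  # (2, 3): one row per volume, one standardized column per vocabulary word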
Example 10: choose_cascade
def choose_cascade(htid):
    '''Reads metadata about this volume and uses it to decide what metadata-level features should be assigned.'''

    global rowindices, columns, metadata, modelindices, modeldata

    probablydrama = False
    probablypoetry = False
    probablybiography = False
    probablyfiction = False
    maybefiction = False

    htid = utils.pairtreelabel(htid)
    # convert the clean pairtree filename into a dirty pairtree label for metadata matching

    if htid not in rowindices:
        # We have no metadata for this volume.
        print("Volume missing from ExtractedMetadata.tsv: " + htid)
    else:
        genrestring = metadata["genres"][htid]
        genreinfo = genrestring.split(";")
        # It's a semicolon-delimited list of items.
        for info in genreinfo:
            if info == "Biography" or info == "Autobiography":
                probablybiography = True
            if info == "Fiction" or info == "Novel":
                probablyfiction = True
            if (info == "Poetry" or info == "Poems"):
                probablypoetry = True
            if (info == "Drama" or info == "Tragedies" or info == "Comedies"):
                probablydrama = True

    if htid in modelindices:
        title = metadata["title"][htid].lower()
        titlewords = title.split()
        maxgenre = maxoption((modeldata["bio"][htid], modeldata["dra"][htid], modeldata["fic"][htid], modeldata["non"][htid], modeldata["poe"][htid]))

        if maxgenre == 4 and ("poems" in titlewords or "poetical" in titlewords):
            probablypoetry = True

        if maxgenre == 1:
            probablydrama = True

        if maxgenre == 2:
            maybefiction = True

    return probablybiography, probablydrama, probablyfiction, probablypoetry, maybefiction
Example 11: compare_two_lists
def compare_two_lists(truelist, predicted, wordsperpage, whethertocountwords):
    global genretranslations

    assert(len(truelist) == len(predicted))
    errorsbygenre = dict()
    correctbygenre = dict()
    accurate = 0
    inaccurate = 0
    totaltruegenre = dict()

    for index, truegenre in enumerate(truelist):
        if truegenre in genretranslations:
            truegenre = genretranslations[truegenre]

        if whethertocountwords:
            increment = wordsperpage[index]
        else:
            increment = 1

        utils.addtodict(truegenre, increment, totaltruegenre)

        predictedgenre = predicted[index]

        if genresareequal(truegenre, predictedgenre):
            utils.addtodict(truegenre, increment, correctbygenre)
            accurate += increment
        else:
            utils.addtodict((truegenre, predictedgenre), increment, errorsbygenre)
            inaccurate += increment

    return totaltruegenre, correctbygenre, errorsbygenre, accurate, inaccurate
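A minimal usage sketch; genretranslations and genresareequal are defined elsewhere in this codebase, so simple stand-ins are used here, and SonicScrewdriver is assumed importable as utils:

import SonicScrewdriver as utils

genretranslations = {'bio': 'non'}   # stand-in for the real mapping
def genresareequal(a, b):            # stand-in for the real, more forgiving comparison
    return a == b

true = ['fic', 'bio', 'poe']
pred = ['fic', 'non', 'dra']
totals, correct, errors, acc, inacc = compare_two_lists(true, pred, [300, 250, 280], True)
print(acc, inacc)  # 550 280: words on correctly vs. incorrectly predicted pages
print(errors)      # {('poe', 'dra'): 280}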
Example 12: comparelists
def comparelists(firstmap, secondmap, genremistakes, correctbygenre, wordcounts):
    if len(firstmap) > len(secondmap):
        length = len(secondmap)
    elif len(firstmap) == len(secondmap):
        length = len(firstmap)
    else:
        print("Error, Will Robinson. There are occasions where the consensus version is shorter but no valid reason for it to be longer.")
        # Fall back to the shorter list so we can still compare the overlap.
        length = len(firstmap)

    divergence = 0.0

    for i in range(length):
        generalizedfirst = translate(firstmap[i])
        generalizedsecond = translate(secondmap[i])

        if effectively_equal(generalizedfirst, generalizedsecond):
            utils.addtodict(generalizedsecond, wordcounts[i], correctbygenre)
        else:
            divergence += wordcounts[i]
            utils.addtodict((generalizedsecond, generalizedfirst), wordcounts[i], genremistakes)

    return divergence
Example 13: addmetadata
def addmetadata(self, row, table):
    self.author = table['author'][row]
    self.title = table['title'][row]
    self.date = utils.simple_date(row, table)

    genrelist = table['genres'][row].split(';')
    self.genres = set(genrelist)

    varietiesofnon = ['Bibliographies', 'Catalog', 'Dictionary', 'Encyclopedia', 'Handbooks', 'Indexes', 'Legislation', 'Directories', 'Statistics', 'Legal cases', 'Legal articles', 'Calendars', 'Autobiography', 'Biography', 'Letters', 'Essays', 'Speeches']

    self.nonmetaflag = False
    for genre in varietiesofnon:
        if genre in self.genres:
            self.nonmetaflag = True
Example 14: resolve_voting
import random

import SonicScrewdriver as utils

def resolve_voting(votes, tiebreaker):
    electorate = len(votes)
    results = dict()

    for vote in votes:
        # if vote == "bio":
        #     vote = "non"
        utils.addtodict(vote, 1, results)

    candidate = utils.sortkeysbyvalue(results, whethertoreverse = True)

    dissent = (electorate - candidate[0][0]) / electorate

    if len(candidate) < 2:
        # There is only one candidate.
        return candidate[0][1], dissent, candidate[0][1]

    elif candidate[0][0] > candidate[1][0]:
        # We have a majority.
        return candidate[0][1], dissent, candidate[1][1]

    else:
        # We have a tie.
        if tiebreaker == candidate[0][1]:
            print("Tiebreaker " + tiebreaker)
            return candidate[0][1], dissent, candidate[1][1]
        elif tiebreaker == candidate[1][1]:
            print("Tiebreaker " + tiebreaker)
            return candidate[1][1], dissent, candidate[0][1]
        else:
            print("Tie in spite of " + tiebreaker)
            win = random.choice([candidate[0][1], candidate[1][1]])
            if win == candidate[0][1]:
                runnerup = candidate[1][1]
            else:
                runnerup = candidate[0][1]

            return win, dissent, runnerup
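A minimal usage sketch, assuming SonicScrewdriver is importable as utils:

winner, dissent, runnerup = resolve_voting(['fic', 'fic', 'non'], 'non')
print(winner, runnerup)  # fic non: 'fic' has a clear majority, 'non' is the runner-up
print(dissent)           # 0.333...: share of the electorate that voted against the winner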
Example 15: get_vocabulary_and_counts
import csv
import os
from collections import Counter

import SonicScrewdriver as utils

def get_vocabulary_and_counts(metadata, positiveIDs, negativeIDs, sourcedir, n):
    ''' Gets the top n words by docfrequency in positiveIDs + negativeIDs, but also
    returns a dictionary of wordcounts so we don't have to read them again from the
    file when generating a feature dataframe.
    '''

    allIDs = positiveIDs + negativeIDs

    doc_freq = Counter()
    counts = dict()

    for docid in allIDs:
        counts[docid] = Counter()
        path = os.path.join(sourcedir, utils.clean_pairtree(docid) + '.csv')
        with open(path, encoding = 'utf-8') as f:
            reader = csv.DictReader(f)
            for row in reader:
                word = row['feature']
                if len(word) < 1:
                    continue

                ct = float(row['count'])

                if word.startswith('#header'):
                    word = word.replace('#header', '')
                # This debatable choice treats header words as equivalent
                # to occurrences in the body text. In practice, this seems
                # to slightly improve performance, at least when you're using
                # SVMs and relatively low numbers of features (140-300).
                # Otherwise header words are in practice just discarded, because
                # e.g. #headeract won't be one of the top 250 words.

                doc_freq[word] += 1
                counts[docid][word] += ct

    vocab = [x[0] for x in doc_freq.most_common(n)]
    print('Vocabulary constructed.')

    return vocab, counts
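A minimal usage sketch; the volume IDs, the empty metadata dict, and the feature directory are hypothetical, and each per-volume CSV is assumed to have 'feature' and 'count' columns as the reader above expects:

positives = ['uc1.b3342759']
negatives = ['mdp.39015012345678']
vocab, counts = get_vocabulary_and_counts({}, positives, negatives, '/path/to/feature_csvs', 250)
print(vocab[:10])                   # ten most document-frequent words across both sets
print(counts[positives[0]]['the'])  # raw count of 'the' in the first positive volume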