本文整理汇总了Python中SonicScrewdriver.pairtreelabel方法的典型用法代码示例。如果您正苦于以下问题:Python SonicScrewdriver.pairtreelabel方法的具体用法?Python SonicScrewdriver.pairtreelabel怎么用?Python SonicScrewdriver.pairtreelabel使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类SonicScrewdriver的用法示例。
在下文中一共展示了SonicScrewdriver.pairtreelabel方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: choose_cascade
# 需要导入模块: import SonicScrewdriver [as 别名]
# 或者: from SonicScrewdriver import pairtreelabel [as 别名]
def choose_cascade(htid):
    '''Reads metadata about this volume (MARC genre terms plus volume-level
    model predictions) and uses it to decide what metadata-level features
    should be assigned.

    Returns five booleans:
        probablybiography, probablydrama, probablyfiction, probablypoetry,
        maybefiction

    NOTE(review): indentation was lost when this example was scraped; this
    reconstruction places the model-prediction check at function level,
    mirroring censor()/metadata_check() elsewhere on this page -- confirm
    against the original source.
    '''
    global rowindices, columns, metadata, modelindices, modeldata
    probablydrama = False
    probablypoetry = False
    probablybiography = False
    probablyfiction = False
    maybefiction = False
    # Convert the clean pairtree filename into a dirty pairtree label for
    # metadata matching.
    htid = utils.pairtreelabel(htid)
    if htid not in rowindices:
        # We have no metadata for this volume.
        print("Volume missing from ExtractedMetadata.tsv: " + htid)
    else:
        # "genres" holds a semicolon-delimited list of MARC genre terms.
        genrestring = metadata["genres"][htid]
        genreinfo = genrestring.split(";")
        for info in genreinfo:
            if info == "Biography" or info == "Autobiography":
                probablybiography = True
            if info == "Fiction" or info == "Novel":
                probablyfiction = True
            if info == "Poetry" or info == "Poems":
                probablypoetry = True
            if info == "Drama" or info == "Tragedies" or info == "Comedies":
                probablydrama = True
    if htid in modelindices:
        title = metadata["title"][htid].lower()
        titlewords = title.split()
        # Index order in maxoption's argument tuple: 0 bio, 1 dra, 2 fic, 3 non, 4 poe.
        maxgenre = maxoption((modeldata["bio"][htid], modeldata["dra"][htid], modeldata["fic"][htid], modeldata["non"][htid], modeldata["poe"][htid]))
        # BUG FIX: the original condition was
        #   maxgenre == 4 and "poems" in titlewords or "poetical" in titlewords
        # which, by Python operator precedence, flagged poetry for ANY title
        # containing "poetical" even when the model's top genre was not poetry.
        # Parenthesize so the model gate applies to both title words.
        if maxgenre == 4 and ("poems" in titlewords or "poetical" in titlewords):
            probablypoetry = True
        if maxgenre == 1:
            probablydrama = True
        if maxgenre == 2:
            maybefiction = True
    return probablybiography, probablydrama, probablyfiction, probablypoetry, maybefiction
示例2: get_metadata_evidence
# 需要导入模块: import SonicScrewdriver [as 别名]
# 或者: from SonicScrewdriver import pairtreelabel [as 别名]
def get_metadata_evidence(htid, rowindices, columns, metadata):
    '''Reads metadata about this volume and uses it to decide what metadata-level features should be assigned.'''
    # Map each MARC genre term to the evidence flag it supports.
    term_to_flag = {
        "Biography": "biography",
        "Autobiography": "biography",
        "Fiction": "fiction",
        "Novel": "fiction",
        "Poetry": "poetry",
        "Poems": "poetry",
        "Drama": "drama",
        "Tragedies": "drama",
        "Comedies": "drama",
    }
    metadata_evidence = {flag: False for flag in ("drama", "poetry", "biography", "fiction")}
    # Convert the clean pairtree filename into a dirty pairtree label for
    # metadata matching.
    htid = utils.pairtreelabel(htid)
    if htid not in rowindices:
        # We have no metadata for this volume; every flag stays False.
        return metadata_evidence
    # "genres" is a semicolon-delimited list of genre terms.
    for term in metadata["genres"][htid].split(";"):
        if term in term_to_flag:
            metadata_evidence[term_to_flag[term]] = True
    return metadata_evidence
示例3: open
# 需要导入模块: import SonicScrewdriver [as 别名]
# 或者: from SonicScrewdriver import pairtreelabel [as 别名]
# Script fragment (excerpt): harvest word-context snippets around money-related
# terms from plain-text fiction volumes. NOTE(review): indentation was stripped
# when this example was captured; the lines after the for-statement originally
# formed its body. 'moneywords', 'rows', 'table', 'utils', 'os' and the
# wordcounter machinery are defined outside this excerpt.
# We can perhaps enumerate currency terms intuitively, but not these.
alltargetwords = moneywords
sourcedir = "/Volumes/TARDIS/work/moneytexts/"
filelist = os.listdir(sourcedir)
# Keep only plain-text volumes.
filelist = [x for x in filelist if x.endswith(".txt")]
contexts = []
# Number of words taken on each side of a target word when snipping context.
WINDOWRADIUS = 7
ctr = 0
for filename in filelist:
# Strip the '.fic.txt' suffix and convert to a dirty pairtree label for
# metadata lookup.
htid = utils.pairtreelabel(filename.replace('.fic.txt', ''))
if htid not in rows:
# No metadata row: report the id and skip this volume.
print(htid)
continue
else:
date = utils.simple_date(htid, table)
filepath = os.path.join(sourcedir, filename)
with open(filepath, encoding = 'utf-8') as f:
filelines = f.readlines()
pagelist = [filelines]
# The wordcounter module expects a list of pages, each of which is a list of lines.
# Ebooks have no pages -- at least as I currently receive them -- so we treat it
# all as one giant page.
示例4: passfilter
# 需要导入模块: import SonicScrewdriver [as 别名]
# 或者: from SonicScrewdriver import pairtreelabel [as 别名]
def passfilter(genrestring):
    '''Return True unless the semicolon-delimited genre string contains an
    exact "Biography" or "Autobiography" field.'''
    blocked = {"Autobiography", "Biography"}
    # The volume passes the filter only when no blocked term appears.
    return blocked.isdisjoint(genrestring.split(';'))
# Script fragment (excerpt, truncated mid-loop): build filtered metadata rows
# for volumes listed in roughfiction.txt, looking each id up first in the
# 19th-century metadata table and then in the 20th-century one.
# NOTE(review): indentation was stripped in capture, and the elif branch is
# cut off at the end of the excerpt.
rows19c, columns19c, table19c = utils.readtsv('/Volumes/TARDIS/work/metadata/19cMetadata.tsv')
rows20c, columns20c, table20c = utils.readtsv('/Volumes/TARDIS/work/metadata/20cMonographMetadata.tsv')
with open("/Users/tunder/Dropbox/GenreProject/python/piketty/roughfiction.txt", encoding = 'utf-8') as f:
filelines = f.readlines()
# First tab-separated field of each line is a volume id; convert each to a
# dirty pairtree label so it matches the metadata tables.
idlist = [utils.pairtreelabel(x.split('\t')[0]) for x in filelines]
filteredrows = list()
missing = 0
for anid in idlist:
if anid in rows19c:
genrestring = table19c["genres"][anid]
rowdict = dict()
for col in columns19c:
rowdict[col] = table19c[col][anid]
elif anid in rows20c:
genrestring = table20c["genres"][anid]
rowdict = dict()
for col in columns20c:
示例5: extractgenres
# 需要导入模块: import SonicScrewdriver [as 别名]
# 或者: from SonicScrewdriver import pairtreelabel [as 别名]
# NOTE(review): indentation was stripped when this example was captured, and
# the function is truncated at the omission marker below. The dictionary keys
# written as '[email protected]' look like scraper email-obfuscation damage to an
# original key such as 'precision@0.95' -- confirm against the real source.
def extractgenres(pathtotarfile, rows, columns, table):
''' Given a tarfile containing a bunch of jsons, this goes through all the jsons
and identifies the ones that belong in filtered subsets for
fiction, drama, and poetry. The cutoff is 95 percent precision, except for poetry,
where it's 93.9, because the 95-percent threshold is hard to reach.
We also write metadata for all jsons where maxgenre is drama, fiction, or poetry,
including those that didn't reach threshold.
'''
fiction = list()
drama = list()
poetry = list()
ficmeta = list()
drameta = list()
poemeta = list()
tar = tarfile.open(pathtotarfile, 'r:gz')
counter = 0
for tarinfo in tar:
counter += 1
if tarinfo.isreg():
# This is the name of a regular file rather than a directory.
tardata = tar.extractfile(tarinfo.name)
somebytes = tardata.read()
astring = somebytes.decode('utf-8', 'strict')
jobj = json.loads(astring)
meta = jobj['hathi_metadata']
stringdate = meta['inferred_date']
htid = meta['htid']
# Dirty pairtree label for metadata lookups; clean htid for filenames.
dirtyhtid = utils.pairtreelabel(htid)
filename = htid + '.json'
pathparts = tarinfo.name.split('/')
# Sanity check: the archive member name should match the volume id.
if filename != pathparts[1]:
print(filename)
print('Is anomalous, because not equal to ' + pathparts[1])
try:
intdate = int(stringdate)
except:
intdate = 0
print('Anomalous non-numeric date.')
# For each of the three genres: admit the volume to the filtered subset
# when model precision reaches threshold, and record metadata either way.
if 'drama' in jobj:
dramadata = jobj['drama']
precision = dramadata['[email protected]']
probability = dramadata['prob_dra>80precise']
if precision >= 0.95:
drama.append((intdate, filename, astring))
included = True
else:
included = False
if dirtyhtid in rows:
drameta.append(make_outrow(htid, dirtyhtid, probability, included, columns, table))
else:
print('Missing htid: ' + htid)
if 'fiction' in jobj:
ficdata = jobj['fiction']
precision = ficdata['[email protected]']
probability = ficdata['prob_fic>80precise']
if precision >= 0.95:
fiction.append((intdate, filename, astring))
included = True
else:
included = False
if dirtyhtid in rows:
ficmeta.append(make_outrow(htid, dirtyhtid, probability, included, columns, table))
else:
print('Missing htid: ' + htid)
if 'poetry' in jobj:
poedata = jobj['poetry']
precision = poedata['[email protected]']
probability = poedata['prob_poe>80precise']
# Poetry uses a lower threshold; see the docstring.
if precision >= 0.939:
poetry.append((intdate, filename, astring))
included = True
else:
included = False
if dirtyhtid in rows:
poemeta.append(make_outrow(htid, dirtyhtid, probability, included, columns, table))
tar.close()
# Append accumulated metadata rows to the per-genre CSVs.
with open('/Volumes/TARDIS/maps/drama/drama_metadata.csv', mode='a', encoding = 'utf-8') as f:
writer = csv.writer(f)
for row in drameta:
writer.writerow(row)
with open('/Volumes/TARDIS/maps/fiction/fiction_metadata.csv', mode='a', encoding = 'utf-8') as f:
#......... part of the code is omitted here .........
示例6: list
# 需要导入模块: import SonicScrewdriver [as 别名]
# 或者: from SonicScrewdriver import pairtreelabel [as 别名]
# Script fragment (excerpt, truncated mid-branch): tally volume-level
# true/false positives and negatives for fiction and drama by pairing each
# '.predict' file with its '.map' file from one of two source directories.
# NOTE(review): indentation was stripped in capture; 'predicts', 'firstmaps',
# 'secondmaps', 'firstsource', 'secondsource' and 'firstwordcounts' are
# defined before this excerpt (as, presumably, is the fictionTPs accumulator
# that precedes fictionFPs).
fictionFPs = list()
fictionTNs = list()
fictionFNs = list()
dramaTPs = list()
dramaFPs = list()
dramaTNs = list()
dramaFNs = list()
genrefeatures = dict()
genreprecisions = dict()
modeledvols = dict()
for filename in predicts:
mapname = filename.replace('.predict', '.map')
# Dirty pairtree label for metadata matching; clean id for local lookups.
labelid = utils.pairtreelabel(filename.replace('.predict', ''))
fileid = filename.replace('.predict', '')
if mapname in firstmaps:
firstpath = os.path.join(firstsource, mapname)
if os.path.isfile(firstpath):
with open(firstpath, encoding = 'utf-8') as f:
filelines = f.readlines()
success = True
wordcounts = firstwordcounts[fileid]
else:
success = False
elif mapname in secondmaps:
secondpath = os.path.join(secondsource, mapname)
if os.path.isfile(secondpath):
with open(secondpath, encoding = 'utf-8') as f:
示例7: list
# 需要导入模块: import SonicScrewdriver [as 别名]
# 或者: from SonicScrewdriver import pairtreelabel [as 别名]
# Script (excerpt, truncated): draw a random sample of 2000 volumes from the
# extracted metadata, then remove any volume whose page-feature file already
# exists, so the sample contains only unprocessed volumes. The final loop's
# body falls outside this excerpt. NOTE(review): indentation was stripped in
# capture.
import os, sys
import SonicScrewdriver as utils
import random
rowindices, columns, metadata = utils.readtsv("/Users/tunder/Dropbox/PythonScripts/hathimeta/ExtractedMetadata.tsv")
initialsample = random.sample(rowindices, 2000)
directorylist = os.listdir("/Users/tunder/Dropbox/pagedata/mixedtraining/pagefeatures")
existingfiles = list()
for filename in directorylist:
# Skip hidden/system entries.
if filename.startswith(".") or filename.startswith("_"):
continue
# Strip the 7-character suffix and convert to a dirty pairtree label.
htid = utils.pairtreelabel(filename[0:-7])
existingfiles.append(htid)
counter = 0
toremove = list()
# NOTE(review): membership tests against the 'existingfiles' list are O(n)
# per lookup; a set would scale better for large directories.
for htid in initialsample:
if htid in existingfiles:
counter +=1
toremove.append(htid)
print("Found " + str(counter) + " duplicates.")
for htid in toremove:
initialsample.remove(htid)
genresrepresented = set()
for htid in initialsample:
示例8: choose_cascade
# 需要导入模块: import SonicScrewdriver [as 别名]
# 或者: from SonicScrewdriver import pairtreelabel [as 别名]
# NOTE(review): indentation was stripped when this example was captured. If
# the LOC-probability section and the final return sit at function level,
# then litprob/bioprob are unbound (NameError) for volumes missing from
# rowindices -- confirm the original nesting before reuse.
def choose_cascade(htid):
'''Reads metadata about this volume and uses it to decide what metadata-level features should be assigned.'''
global rowindices, columns, metadata, litlocs, biolocs
probablydrama = False
probablypoetry = False
probablybiography = False
probablyfiction = False
htid = utils.pairtreelabel(htid)
# convert the clean pairtree filename into a dirty pairtree label for metadata matching
if htid not in rowindices:
# We have no metadata for this volume.
print("Volume missing from ExtractedMetadata.tsv: " + htid)
else:
genrestring = metadata["genres"][htid]
genreinfo = genrestring.split(";")
# It's a semicolon-delimited list of items.
for info in genreinfo:
if info == "Biography" or info == "Autobiography":
probablybiography = True
if info == "Fiction" or info == "Novel":
probablyfiction = True
if (info == "Poetry" or info == "Poems"):
probablypoetry = True
if (info == "Drama" or info == "Tragedies" or info == "Comedies"):
probablydrama = True
# Title words supply additional evidence of poetry.
title = metadata["title"][htid].lower()
titlewords = title.split()
if "poems" in titlewords or "ballads" in titlewords or "poetical" in titlewords:
probablypoetry = True
# Reduce the Library of Congress call number to its letter prefix and look
# up prior probabilities that such call numbers are literature / biography.
loc = metadata["LOCnum"][htid]
LC = letterpart(loc)
if LC in litlocs:
litprob = litlocs[LC]
print(LC + " lit: " + str(litprob))
else:
# Fallback prior when the call-number prefix is unknown; presumably
# probabilities here are scaled x1000 -- confirm against litlocs.
litprob = 120
print(LC)
if LC in biolocs:
bioprob = biolocs[LC]
print(LC + " bio: " + str(bioprob))
else:
bioprob = 120
print(LC)
return probablybiography, probablydrama, probablyfiction, probablypoetry, litprob, bioprob
示例9: comparelists
# 需要导入模块: import SonicScrewdriver [as 别名]
# 或者: from SonicScrewdriver import pairtreelabel [as 别名]
# Script fragment (excerpt): accumulate inter-rater agreement between human
# genre maps, then write per-volume agreement percentages and a confusion
# matrix. The head of the enclosing loop (binding 'htid', 'truegenres',
# 'potentialcomparisons', 'listoftuples', etc.) falls before this excerpt;
# indentation was stripped in capture.
for reading in listoftuples:
# Each tuple pairs a reader/era tag with that reader's page-genre list;
# 'readera' is unused within this excerpt.
readera = reading[0]
predictedgenres = reading[1]
divergence = comparelists(predictedgenres, truegenres, genremistakes, correctbygenre, wordcounts)
totaldivergence += divergence
# Agreement is the complement of divergence, normalized per volume.
agreement = (potentialcomparisons - totaldivergence)
agreementpercent = agreement / potentialcomparisons
volumepercents[htid] = agreementpercent
overallcomparisons += potentialcomparisons
overallagreement += agreement
print("Average human agreement: " + str(overallagreement / overallcomparisons))
with open("/Users/tunder/Dropbox/pagedata/interrater/HumanDissensus.tsv", mode="w", encoding = "utf-8") as f:
f.write("htid\tagreement\n")
for key, value in volumepercents.items():
# Keys are clean ids; write dirty pairtree labels to the TSV.
outline = utils.pairtreelabel(key) + "\t" + str(value) + "\n"
f.write(outline)
import ConfusionMatrix
ConfusionMatrix.confusion_matrix(correctbygenre, genremistakes)
示例10: open
# 需要导入模块: import SonicScrewdriver [as 别名]
# 或者: from SonicScrewdriver import pairtreelabel [as 别名]
# Script fragment (excerpt, truncated): extract context snippets around money
# terms from normalized text files, using the tokenizer module. 'moneywords',
# 'rows', 'table', 'utils', 'os' and 'tokenizer' are defined before this
# excerpt; indentation was stripped in capture.
# good reason.
alltargetwords = moneywords
sourcedir = "/Users/tunder/Dropbox/GenreProject/python/piketty2/anova/"
filelist = os.listdir(sourcedir)
# Keep only plain-text volumes.
filelist = [x for x in filelist if x.endswith(".txt")]
contexts = []
# Words taken on each side of a target word when snipping context.
WINDOWRADIUS = 12
ctr = 0
for filename in filelist:
# Strip the '.norm.txt' suffix and convert to a dirty pairtree label.
htid = utils.pairtreelabel(filename.replace('.norm.txt', ''))
if htid not in rows:
print(htid + ' MISSING')
continue
else:
date = utils.simple_date(htid, table)
filepath = os.path.join(sourcedir, filename)
with open(filepath, encoding = 'utf-8') as f:
filelines = f.readlines()
# One giant "page": the tokenizer expects a list of pages of lines.
pagelist = [filelines]
tokenstream = tokenizer.makestream(pagelist)
newcontexts = tokenizer.extract_snippets(tokenstream, WINDOWRADIUS, alltargetwords)
示例11: censor
# 需要导入模块: import SonicScrewdriver [as 别名]
# 或者: from SonicScrewdriver import pairtreelabel [as 别名]
# NOTE(review): indentation was stripped when this example was captured and the
# function is truncated at the omission marker below; in particular, which 'if'
# the bare "else: print('Skipped.')" belongs to must be confirmed against the
# original source.
def censor(htid, genresequence):
'''Checks page-level genre predictions against volume-level metadata (MARC
genre terms and a volume-level model) and records which agreement/denial
"symptoms" apply. Within this excerpt it returns the genresequence plus the
symptom report; the remainder of the function is omitted.'''
htid = utils.pairtreelabel(htid)
# convert the htid into a dirty pairtree label for metadata matching
# Create a dictionary with entries for all possible conditions, initially set negative.
symptoms = ["weakconfirmation", "weakdenial", "strongconfirmation", "strongdenial", "modelagrees", "modeldisagrees"]
reported = dict()
for symptom in symptoms:
reported[symptom] = 0
couldbefiction = True
# Now we need to assess the largest genre in this volume.
genrecounts = dict()
genrecounts['fic'] = 0
genrecounts['poe'] = 0
genrecounts['dra'] = 0
genrecounts['non'] = 0
for page in genresequence:
indexas = page
# For this purpose, we treat biography and indexes as equivalent to nonfiction.
if page == "bio" or page == "index" or page == "back":
indexas = "non"
utils.addtodict(indexas, 1, genrecounts)
# Convert the dictionary of counts into a sorted list, and take the max.
genretuples = utils.sortkeysbyvalue(genrecounts, whethertoreverse = True)
maxgenre = genretuples[0][1]
# No metadata and no model prediction: return predictions unchanged.
if htid not in rowindices and htid not in modelindices:
return genresequence, reported
if htid in rowindices:
genrestring = metadata["genres"][htid]
genreinfo = genrestring.split(";")
# It's a semicolon-delimited list of items.
# Each term either confirms or denies the dominant page-level genre:
# "biog?"/"Not fiction" flags count as weak evidence, explicit genre
# strings as strong evidence.
for info in genreinfo:
if info == "Biography" or info == "Autobiography":
couldbefiction = False
if info == "biog?" and maxgenre == "non":
reported["weakconfirmation"] = 1
if info == "biog?" and maxgenre != "non":
reported["weakdenial"] = 1
if info == "Not fiction" and maxgenre == "non":
reported["weakconfirmation"] = 1
if info == "Not fiction" and maxgenre == "fic":
reported["weakdenial"] = 1
if (info == "Fiction" or info == "Novel") and maxgenre == "fic":
reported["strongconfirmation"] = 1
if (info == "Fiction" or info == "Novel") and maxgenre != "fic":
reported["strongdenial"] = 1
if info == "Biography" and maxgenre == "non":
reported["strongconfirmation"] = 1
if info == "Biography" and maxgenre != "non":
reported["strongdenial"] = 1
if info == "Autobiography" and maxgenre == "non":
reported["strongconfirmation"] = 1
if info == "Autobiography" and maxgenre != "non":
reported["strongdenial"] = 1
if (info == "Poetry" or info == "Poems") and maxgenre == "poe":
reported["strongconfirmation"] = 1
if (info == "Poetry" or info == "Poems") and maxgenre != "poe":
reported["strongdenial"] = 1
if (info == "Drama" or info == "Tragedies" or info == "Comedies") and maxgenre == "dra":
reported["strongconfirmation"] = 1
if (info == "Drama" or info == "Tragedies" or info == "Comedies") and maxgenre != "dra":
reported["strongdenial"] = 1
if (info == "Catalog" or info == "Dictionary" or info=="Bibliographies") and maxgenre == "non":
reported["strongconfirmation"] = 1
couldbefiction = False
if (info == "Catalog" or info == "Dictionary" or info=="Bibliographies") and maxgenre != "non":
reported["strongdenial"] = 1
else:
print("Skipped.")
if htid in modelindices:
# Compare against the volume-level probabilistic model: take the genre
# with the highest predicted probability.
modelpredictions = dict()
for genre, genrecolumn in modeldata.items():
if not genre in options:
# this column is not a genre!
continue
modelpredictions[genre] = float(genrecolumn[htid])
predictionlist = utils.sortkeysbyvalue(modelpredictions, whethertoreverse = True)
modelprediction = predictionlist[0][1]
#......... part of the code is omitted here .........
示例12: choose_cascade
# 需要导入模块: import SonicScrewdriver [as 别名]
# 或者: from SonicScrewdriver import pairtreelabel [as 别名]
def choose_cascade(htid, pagepredictions):
    '''Combine volume metadata with the overall thrust of page-level
    predictions to decide which correcting models (cascades), if any,
    should be applied to this volume.

    Returns five booleans:
        mostlydrapoe -- volume is mostly drama and/or poetry
        probablybiography -- metadata suggests biography
        probablyfiction -- metadata suggests fiction
        notdrama -- drama pages present but outweighed by another genre
        notfiction -- fiction pages present but scarce
    More than one flag can be True at once; callers then treat the evidence
    as inconsistent/untrustworthy and apply no cascade.
    '''
    global rowindices, columns, metadata
    # Work on a defensive copy of the incoming predictions, then tally genres.
    pages = [page for page in pagepredictions]
    genrecounts, maxgenre = sequence_to_counts(pages)
    numpages = len(pages)
    ficpages = genrecounts['fic']
    drapages = genrecounts['dra']
    poepages = genrecounts['poe']
    # A sprinkling of fiction pages (under a third of the volume) is suspect.
    notfiction = 0 < ficpages < numpages / 3
    # Drama pages are suspect when some other genre dominates the volume.
    notdrama = drapages > 0 and (genrecounts['non'] > numpages / 2 or ficpages > numpages / 2 or poepages > numpages * .9)
    # Is the volume more than half drama and/or poetry?
    mostlydrapoe = (drapages + poepages) > numpages / 2
    # The remaining flags are governed by existing metadata.
    probablyfiction = False
    probablybiography = False
    # Convert the clean pairtree filename into a dirty pairtree label for
    # metadata matching.
    htid = utils.pairtreelabel(htid)
    if htid not in rowindices:
        # We have no metadata for this volume.
        print("Volume missing from ExtractedMetadata.tsv: " + htid)
    else:
        # "genres" is a semicolon-delimited list of MARC genre terms.
        for info in metadata["genres"][htid].split(";"):
            if info in ("Biography", "Autobiography"):
                probablybiography = True
            if info in ("Fiction", "Novel"):
                probablyfiction = True
            if info in ("Poetry", "Poems", "Drama", "Tragedies", "Comedies"):
                mostlydrapoe = True
        # Title words can also tip the drama/poetry decision.
        titlewords = metadata["title"][htid].lower().split()
        if not {"poems", "ballads", "poetical", "comedy", "tragedy", "plays"}.isdisjoint(titlewords):
            mostlydrapoe = True
    return mostlydrapoe, probablybiography, probablyfiction, notdrama, notfiction
示例13: metadata_check
# 需要导入模块: import SonicScrewdriver [as 别名]
# 或者: from SonicScrewdriver import pairtreelabel [as 别名]
# NOTE(review): indentation was stripped when this example was captured and the
# function is truncated at the omission marker below.
def metadata_check(htid, inputsequence):
# NOTE(review): because the string literal below follows the 'global'
# statement, it is not actually this function's docstring; it is evaluated
# and discarded. Moving it above 'global' would make it a real docstring.
global options, rowindices, columns, metadata, modelindices, modelcolumns, modeldata
'''Assesses whether previous metadata tend to deny or confirm the
thrust of page-level genre predictions. For this purpose we use both
genre codes extracted from the MARC record and the predictions of a volume-
level probabilistic model.
Returns two parameters: 1) a dictionary of "confirmations" that indicate
whether metadata aligns with page-level predictions in six specific ways.
2) The "maxgenre" or genre most commonly predicted at the page level.'''
genresequence = [x for x in inputsequence]
# make a defensive copy of incoming parameter
htid = utils.pairtreelabel(htid)
# convert the htid into a dirty pairtree label for metadata matching
# Create a dictionary with entries for all possible conditions, initially set negative.
symptoms = ["weakconfirmation", "weakdenial", "strongconfirmation", "strongdenial", "modelagrees", "modeldisagrees"]
# The first four of these symptoms reflect metadata extracted from the MARC record. Weakconfirmation and
# weakdenial are based on flags extracted from controlfield 008 which I find are not very reliable as guides.
# Strongconfirmation and strongdenial are based on strings extracted from other fields that are more
# specific and reliable as indications of genre. Modelagrees and modeldisagrees reflect the alignment of
# page-level predictions with an earlier volume-level model of the corpus.
confirmations = dict()
for symptom in symptoms:
confirmations[symptom] = 0
genrecounts, maxgenre = sequence_to_counts(genresequence)
# No metadata and no model prediction: nothing to confirm or deny.
if htid not in rowindices and htid not in modelindices:
return confirmations
if htid in rowindices:
genrestring = metadata["genres"][htid]
genreinfo = genrestring.split(";")
# It's a semicolon-delimited list of items.
for info in genreinfo:
# if info == "biog?" and maxgenre == "non":
# confirmations["weakconfirmation"] = 1
# if info == "biog?" and maxgenre != "non":
# confirmations["weakdenial"] = 1
if info == "Not fiction" and maxgenre == "non":
confirmations["weakconfirmation"] = 1
if info == "Not fiction" and maxgenre == "fic":
confirmations["weakdenial"] = 1
if (info == "Fiction" or info == "Novel") and maxgenre == "fic":
confirmations["strongconfirmation"] = 1
if (info == "Fiction" or info == "Novel") and maxgenre != "fic":
confirmations["strongdenial"] = 1
if info == "Biography" and maxgenre == "non":
confirmations["strongconfirmation"] = 1
if info == "Biography" and maxgenre != "non":
confirmations["strongdenial"] = 1
if info == "Autobiography" and maxgenre == "non":
confirmations["strongconfirmation"] = 1
if info == "Autobiography" and maxgenre != "non":
confirmations["strongdenial"] = 1
if (info == "Poetry" or info == "Poems") and maxgenre == "poe":
confirmations["strongconfirmation"] = 1
if (info == "Poetry" or info == "Poems") and maxgenre != "poe":
confirmations["strongdenial"] = 1
if (info == "Drama" or info == "Tragedies" or info == "Comedies") and maxgenre == "dra":
confirmations["strongconfirmation"] = 1
if (info == "Drama" or info == "Tragedies" or info == "Comedies") and maxgenre != "dra":
confirmations["strongdenial"] = 1
if (info == "Catalog" or info == "Dictionary" or info=="Bibliographies") and maxgenre == "non":
confirmations["strongconfirmation"] = 1
# NOTE(review): 'couldbefiction' is never initialized in this function
# (unlike the similar censor() on this page) and is unused within this
# excerpt -- likely dead or buggy; confirm against the original source.
couldbefiction = False
if (info == "Catalog" or info == "Dictionary" or info=="Bibliographies") and maxgenre != "non":
confirmations["strongdenial"] = 1
else:
print("Skipped.")
if htid in modelindices:
# Compare against the volume-level model: take the top two predictions.
modelpredictions = dict()
for genre, genrecolumn in modeldata.items():
if not genre in options:
# this column is not a genre!
continue
modelpredictions[genre] = float(genrecolumn[htid])
predictionlist = utils.sortkeysbyvalue(modelpredictions, whethertoreverse = True)
modelprediction = predictionlist[0][1]
modelconfidence = predictionlist[0][0]
nextclosest = predictionlist[1][0]
# Take the top prediction.
# For purposes of this routine, treat biography as nonfiction:
#......... part of the code is omitted here .........
示例14: logitpredict
# 需要导入模块: import SonicScrewdriver [as 别名]
# 或者: from SonicScrewdriver import pairtreelabel [as 别名]
# Script fragment (excerpt, truncated): run a logit model over 'data', sort
# volumes by predicted probability, and index the metadata file by volume id
# so rows can be re-emitted in prediction order. 'logitpredict', 'parameters'
# and 'data' (which has a .index attribute -- presumably a pandas DataFrame;
# confirm) are defined before this excerpt; indentation was stripped in
# capture.
predictions = logitpredict(parameters, data)
# with open("/Volumes/TARDIS/output/models/results.txt", mode ="w") as f:
# for idx, prediction in enumerate(predictions):
# f.write(str(idx) + '\t' + data.index[idx] + '\t' + str(prediction) + '\n')
# This will also do it more easily:
# with open("/Volumes/TARDIS/output/models/PredictAccuracy.p", mode = "r+b") as f:
# model = pickle.load(f)
# otherpredictions = model.predict(data)
import SonicScrewdriver as utils
# Convert clean volume ids to dirty pairtree labels for metadata matching.
indices = [utils.pairtreelabel(x) for x in data.index]
# Decorate-sort-undecorate: order volume ids by their predicted probability.
decorated = [x for x in zip(predictions, indices)]
decorated.sort()
sortedpredictions, sortedindices = zip(*decorated)
with open("/Users/tunder/Dropbox/PythonScripts/hathimeta/ExtractedMetadata.tsv", mode = "r", encoding = "utf-8") as f:
filelines = f.readlines()
# Map each volume id (first column) to the rest of its metadata row,
# skipping the header line.
linedict=dict()
for line in filelines[1:]:
line = line.rstrip()
fields = line.split('\t')
headlessline = '\t'.join(fields[1:])
linedict[fields[0]] = headlessline
示例15: list
# 需要导入模块: import SonicScrewdriver [as 别名]
# 或者: from SonicScrewdriver import pairtreelabel [as 别名]
import os, sys
import SonicScrewdriver as utils

# Build a training-metadata table: for every page-feature file in sourcedir,
# emit one tab-separated metadata row keyed by its (dirty) pairtree label.
rowindices, columns, metadata = utils.readtsv("/Users/tunder/Dropbox/pagedata/metascrape/EnrichedMetadata.tsv")
sourcedir = "/Users/tunder/Dropbox/pagedata/newfeatures/oldfeatures/"
dirlist = os.listdir(sourcedir)
htids = list()
ctr = 0
with open("/Users/tunder/Dropbox/pagedata/trainingmeta.tsv", mode="w", encoding="utf-8") as f:
    for filename in dirlist:
        # Skip hidden entries and names too short to carry the
        # seven-character feature-file suffix.
        if filename.startswith(".") or len(filename) <= 7:
            continue
        # Drop the suffix, then convert to a dirty pairtree label.
        htid = utils.pairtreelabel(filename[:-7])
        # Each metadata column value is followed by a tab, then a newline
        # terminates the row (matching the original trailing-tab format).
        outline = "".join(metadata[column][htid] + "\t" for column in columns)
        f.write(outline + "\n")