This page collects typical usage examples of the Python method SonicScrewdriver.readtsv. If you are unsure what SonicScrewdriver.readtsv does or how to call it, the curated examples below should help; you can also explore the other methods of the SonicScrewdriver module.
The page presents 15 code examples of SonicScrewdriver.readtsv, sorted by popularity by default. You can upvote the examples you find useful, and your votes help the system recommend better Python examples.
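All 15 examples unpack the return value of readtsv the same way, which suggests the following interface: the method takes the path of a tab-separated metadata file and returns row IDs, column names, and a column-oriented table. Here is a minimal sketch of that inferred usage; the file name and the 'author' column are illustrative assumptions, not documented API:
import SonicScrewdriver as utils
# Inferred from the examples below (an assumption, not documented API):
#   rows    - list of row identifiers (HathiTrust volume IDs)
#   columns - list of column names from the header line
#   table   - dict mapping column name -> {row ID: cell value}
rows, columns, table = utils.readtsv('metadata.tsv')  # hypothetical file
for volid in rows[:5]:
    print(volid, table['author'][volid])  # 'author' column assumed present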
Example 1: open
# Required import: import SonicScrewdriver [as alias]
# Or: from SonicScrewdriver import readtsv [as alias]
# sort_anovaset.py
import SonicScrewdriver as utils
import csv
rows, columns, table = utils.readtsv('/Volumes/TARDIS/work/metadata/19cmetadata.tsv')
with open('anovaset.txt', encoding='utf-8') as f:
    filelines = f.readlines()
wholeset = [x.rstrip() for x in filelines]
the19c = list()
the20c = list()
for anid in wholeset:
    if anid in rows:
        the19c.append(anid)
    else:
        the20c.append(anid)
with open('anova19c.txt', mode='w', encoding='utf-8') as f:
    for anid in the19c:
        f.write(anid + '\n')
with open('anova20c.txt', mode='w', encoding='utf-8') as f:
    for anid in the20c:
        f.write(anid + '\n')
Example 2: passfilter
# Required import: import SonicScrewdriver [as alias]
# Or: from SonicScrewdriver import readtsv [as alias]
# refine fiction
import SonicScrewdriver as utils
def passfilter(genrestring):
    fields = genrestring.split(';')
    if "Autobiography" in fields or "Biography" in fields:
        return False
    else:
        return True
rows19c, columns19c, table19c = utils.readtsv('/Volumes/TARDIS/work/metadata/19cMetadata.tsv')
rows20c, columns20c, table20c = utils.readtsv('/Volumes/TARDIS/work/metadata/20cMonographMetadata.tsv')
with open("/Users/tunder/Dropbox/GenreProject/python/piketty/roughfiction.txt", encoding='utf-8') as f:
    filelines = f.readlines()
idlist = [utils.pairtreelabel(x.split('\t')[0]) for x in filelines]
filteredrows = list()
missing = 0
for anid in idlist:
    if anid in rows19c:
        genrestring = table19c["genres"][anid]
        rowdict = dict()
        for col in columns19c:
            rowdict[col] = table19c[col][anid]
    elif anid in rows20c:
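The listing is cut off at the elif. By symmetry with the 19c branch above, the continuation presumably mirrors it against the 20c tables; the lines below are a hedged guess, not the author's code:
    elif anid in rows20c:
        genrestring = table20c["genres"][anid]
        rowdict = dict()
        for col in columns20c:
            rowdict[col] = table20c[col][anid]
    else:
        missing += 1  # guessed: IDs found in neither table are counted as missing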
Example 3: print
# Required import: import SonicScrewdriver [as alias]
# Or: from SonicScrewdriver import readtsv [as alias]
# This excerpt assumes context established earlier in the script: the
# variables smoothaccuracy, coalaccuracy, accuracies, dissentperfile,
# metadatatable, and firstdir, plus imports of os, pandas as pd,
# statsmodels.api as sm, and scipy.stats.pearsonr.
print()
# print("ROUGH MICROACCURACY:")
# print(roughaccuracy)
print("SMOOTHED MICROACCURACY:")
print(smoothaccuracy)
print("COALESCED MICROACCURACY:")
print(coalaccuracy)
with open("/Users/tunder/Dropbox/pagedata/interrater/ActualAccuracies.tsv", mode="w", encoding="utf-8") as f:
    f.write("htid\taccuracy\n")
    for key, value in accuracies.items():
        outline = key + "\t" + str(value) + "\n"
        f.write(outline)
metadatapath = os.path.join(firstdir, "predictionMetadata.tsv")
rowindices, columns, metadata = utils.readtsv(metadatapath)
metadatatable['maxprob'] = metadata['maxprob']
metadatatable['gap'] = metadata['gap']
metadatatable['accuracy'] = accuracies
metadatatable['dissent'] = dissentperfile
# Fit a logistic regression predicting per-volume accuracy from the
# metadata features, then check how well predictions track reality.
data = pd.DataFrame(metadatatable, dtype="float")
data['intercept'] = 1.0
train_cols = data.columns[1:]
logit = sm.Logit(data['accuracy'], data[train_cols])
result = logit.fit()
print(result.summary())
predictions = result.predict(data[train_cols])
print(pearsonr(data['accuracy'], predictions))
Example 4: open
# Required import: import SonicScrewdriver [as alias]
# Or: from SonicScrewdriver import readtsv [as alias]
# Generate Cotraining Set
# This script uses a set of volumes already classified and sorted by a model
# in order to generate additional training data for a new model.
import SonicScrewdriver as utils
from shutil import copyfile
indices, columns, metadata = utils.readtsv("/Volumes/TARDIS/work/cotrain/sortedcotrain.tsv")
toget = indices[-200:]
toget = [utils.pairtreefile(x) for x in toget]
genredir = "/Volumes/TARDIS/work/cotrain/top200/genremaps/"
featuredir = "/Volumes/TARDIS/work/cotrain/top200/pagefeatures/"
for htid in toget:
    featuresource = "/Volumes/TARDIS/work/cotrain/pagefeatures/" + htid + ".pg.tsv"
    featuredestination = "/Volumes/TARDIS/work/cotrain/top200/pagefeatures/" + htid + ".pg.tsv"
    copyfile(featuresource, featuredestination)
    genresource = "/Volumes/TARDIS/work/cotrain/predictions/" + htid + ".predict"
    genredestination = "/Volumes/TARDIS/work/cotrain/top200/genremaps/" + htid + ".map"
    with open(genresource, mode="r", encoding="utf-8") as f:
        filelines = f.readlines()
    with open(genredestination, mode="w", encoding="utf-8") as f:
        for line in filelines:
            line = line.rstrip()
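Several examples on this page move back and forth between two forms of a HathiTrust volume ID with utils.pairtreefile and utils.pairtreelabel. Their implementations are not shown here; the sketch below follows the usual HathiTrust pairtree convention and is an assumption about what these helpers do:
def pairtreefile(htid):
    # Assumed behavior: make a catalog ID filename-safe by replacing
    # the two characters that are illegal in pairtree filenames.
    return htid.replace(':', '+').replace('/', '=')
def pairtreelabel(filename_id):
    # Assumed inverse: recover the catalog ("dirty") form of the ID.
    return filename_id.replace('+', ':').replace('=', '/')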
Example 5: add_counts
# Required import: import SonicScrewdriver [as alias]
# Or: from SonicScrewdriver import readtsv [as alias]
def add_counts(wordcounts, year, word, count):
    if year in wordcounts:
        if word in wordcounts[year]:
            wordcounts[year][word] += count
        else:
            wordcounts[year][word] = count
    else:
        wordcounts[year] = dict()
        wordcounts[year][word] = count
metafile = '/Users/tunder/Dropbox/GenreProject/metadata/filteredfiction.tsv'
rows, columns, table = utils.readtsv(metafile)
dateindex = dict()
for volid in rows:
    startdate = table["startdate"][volid]
    enddate = table["enddate"][volid]
    textdate = table["textdate"][volid]
    intdate = infer_date(startdate, enddate, textdate)
    if 1750 <= intdate <= 1950:
        if intdate in dateindex:
            dateindex[intdate].append(volid)
        else:
            dateindex[intdate] = [volid]
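The excerpt calls infer_date, which is defined elsewhere in the script and not shown. A plausible reconstruction, under the assumption that an explicit text date wins and the start/end range is otherwise averaged, would be:
def infer_date(startdate, enddate, textdate):
    # Hypothetical helper, not the author's code: prefer the explicit
    # text date, fall back to the midpoint of the start/end range,
    # and return 0 (out of range) when nothing parses.
    try:
        return int(textdate)
    except ValueError:
        pass
    try:
        return (int(startdate) + int(enddate)) // 2
    except ValueError:
        return 0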
Example 6: main
# Required import: import SonicScrewdriver [as alias]
# Or: from SonicScrewdriver import readtsv [as alias]
def main():
    # This excerpt assumes module-level context: the globals named below,
    # plus os, delim, multiprocessing.Pool, get_metadata_evidence, and
    # process_a_file, all defined elsewhere in the script.
    global testrun, datapath, slicepath, metadatapath, current_working, metaoutpath, errorpath, pagevocabset
    if testrun:
        filelist = os.listdir(datapath)
        HTIDs = set()
        for afilename in filelist:
            if not (afilename.startswith(".") or afilename.startswith("_")):
                HTIDs.add(afilename)
    else:
        with open(slicepath, encoding="utf-8") as file:
            HTIDlist = file.readlines()
        HTIDs = set([x.rstrip() for x in HTIDlist])
        del HTIDlist
    ## discard bad volume IDs
    with open(metadatapath + "badIDs.txt", encoding='utf-8') as file:
        filelines = file.readlines()
    for line in filelines:
        line = line.rstrip()
        line = line.split(delim)
        if line[0] in HTIDs:
            HTIDs.discard(line[0])
    if not os.path.isfile(metaoutpath):
        with open(metaoutpath, 'w', encoding='utf-8') as f:
            f.write("volID\ttotalwords\tprematched\tpreenglish\tpostmatched\tpostenglish\n")
    print(len(HTIDs))
    # Let's get some metadata to create metadata features.
    if testrun:
        rowindices, columns, metadata = utils.readtsv("/Users/tunder/Dropbox/PythonScripts/hathimeta/ExtractedMetadata.tsv")
    else:
        rowindices, columns, metadata = utils.readtsv("/projects/ichass/usesofscale/hathimeta/ExtractedMetadata.tsv")
    metadata_clues = list()
    for aHTID in HTIDs:
        evidence = get_metadata_evidence(aHTID, rowindices, columns, metadata)
        metadata_clues.append(evidence)
    assert len(HTIDs) == len(metadata_clues)
    file_tuples = zip(HTIDs, metadata_clues)
    pool = Pool(processes=12)
    res = pool.map_async(process_a_file, file_tuples)
    # After all files are processed, write metadata, errorlog, and counts of phrases.
    res.wait()
    resultlist = res.get()
    processedmeta = list()
    errorlog = list()
    phrasecount = dict()
    for file_dict in resultlist:
        processedmeta.append(file_dict["metadata"])
        errorlog.extend(file_dict["errors"])
        htid = file_dict["htid"]
    # Metadata.
    with open(metaoutpath, mode='a', encoding='utf-8') as file:
        for metatuple in processedmeta:
            outlist = [x for x in metatuple]
            outline = delim.join(outlist) + '\n'
            file.write(outline)
    # Write the errorlog.
    if len(errorlog) > 0:
        with open(errorpath, mode='w', encoding='utf-8') as file:
            for line in errorlog:
                file.write(line + '\n')
    # Write phrase counts.
    # with open(phrasecountpath, mode="w", encoding="utf-8") as file:
    #     j = json.dumps(phrasecount)
    #     file.write(j)
    print("Done.")
    pool.close()
    pool.join()
Example 7: loadwordcounts
# Required import: import SonicScrewdriver [as alias]
# Or: from SonicScrewdriver import readtsv [as alias]
# The excerpt begins in the tail of loadwordcounts; the function's
# opening lines are not shown in this listing.
        if htid in wordcountsbyfile:
            wordcountsbyfile[htid].append(count)
        else:
            wordcountsbyfile[htid] = [count]
    return wordcountsbyfile
# Begin main script.
TOL = 0.1
THRESH = 0.80
genrestocheck = ['fic', 'poe', 'dra']
metadatapath = '/Volumes/TARDIS/work/metadata/MergedMonographs.tsv'
rows, columns, table = utils.readtsv(metadatapath)
firstsource = "/Users/tunder/Dropbox/pagedata/to1923features/genremaps/"
secondsource = "/Users/tunder/Dropbox/pagedata/seventhfeatures/genremaps/"
firstmaps = os.listdir(firstsource)
secondmaps = os.listdir(secondsource)
firstwordcounts = loadwordcounts(firstsource)
secondwordcounts = loadwordcounts(secondsource)
predictsource = '/Users/tunder/Dropbox/pagedata/production/crosspredicts/'
predicts = os.listdir(predictsource)
predicts = [x for x in predicts if not x.startswith('.')]
Example 8: censor
# Required import: import SonicScrewdriver [as alias]
# Or: from SonicScrewdriver import readtsv [as alias]
# Uses metadata to help assess degrees
import os, sys
import SonicScrewdriver as utils
rowindices, columns, metadata = utils.readtsv("/Users/tunder/Dropbox/PythonScripts/hathimeta/ExtractedMetadata.tsv")
modelindices, modelcolumns, modeldata = utils.readtsv("/Users/tunder/Dropbox/PythonScripts/hathimeta/newgenretable.txt")
options = ["non", "bio", "poe", "dra", "fic"]
def censor(htid, genresequence):
    htid = utils.pairtreelabel(htid)
    # convert the htid into a dirty pairtree label for metadata matching
    # Create a dictionary with entries for all possible conditions, initially set negative.
    symptoms = ["weakconfirmation", "weakdenial", "strongconfirmation", "strongdenial", "modelagrees", "modeldisagrees"]
    reported = dict()
    for symptom in symptoms:
        reported[symptom] = 0
    couldbefiction = True
    # Now we need to assess the largest genre in this volume.
    genrecounts = dict()
    genrecounts['fic'] = 0
    genrecounts['poe'] = 0
    genrecounts['dra'] = 0
    genrecounts['non'] = 0
Example 9: keywithmaxval
# Required import: import SonicScrewdriver [as alias]
# Or: from SonicScrewdriver import readtsv [as alias]
# Figures out what call numbers mean for genre
import os, sys
import SonicScrewdriver as utils
rowindices, columns, metadata = utils.readtsv("/Users/tunder/Dropbox/pagedata/metascrape/EnrichedMetadata.tsv")
options = ["non", "bio", "poe", "dra", "fic"]
modelindices, modelcolumns, modeldata = utils.readtsv("/Users/tunder/Dropbox/PythonScripts/hathimeta/newgenretable.txt")
def keywithmaxval(dictionary):
    maxval = 0
    maxkey = ""
    for key, value in dictionary.items():
        if value > maxval:
            maxval = value
            maxkey = key
    return maxkey
def sequence_to_counts(genresequence):
    '''Converts a sequence of page-level predictions to
    a dictionary of counts reflecting the number of pages
    assigned to each genre. Also reports the largest genre.'''
    genrecounts = dict()
    genrecounts['fic'] = 0
    genrecounts['poe'] = 0
    genrecounts['dra'] = 0
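The listing breaks off inside sequence_to_counts. A completion consistent with its docstring (a guess, not the author's code) might read:
    genrecounts['non'] = 0
    for genre in genresequence:
        genrecounts[genre] = genrecounts.get(genre, 0) + 1
    # Report the largest genre using the helper defined above.
    return genrecounts, keywithmaxval(genrecounts)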
Example 10: open
# Required import: import SonicScrewdriver [as alias]
# Or: from SonicScrewdriver import readtsv [as alias]
# This excerpt assumes earlier context: import csv, plus the containers
# selecteddates (dict) and selected (set) initialized before this point.
reviews = '/Users/tunder/Dropbox/ted/reception/reviewed/lists/ReviewedTitles1880-1899_200.csv'
with open(reviews) as f:
    reader = csv.reader(f)
    for fields in reader:
        htid = fields[0]
        if htid == "HTid":
            continue  # skip the header row
        jgenre = fields[13]
        date = int(fields[1])
        if jgenre == 'poe':
            selecteddates[htid] = date
            selected.add(htid)
rows, columns, table = utils.readtsv('/Users/tunder/Dropbox/GenreProject/metadata/filteredpoetry.tsv')
bydate = dict()
for row in rows:
    if row in selected:
        continue
    date = utils.simple_date(row, table)
    if date in bydate:
        bydate[date].append(row)
    else:
        bydate[date] = [row]
controlset = set()
Example 11: open
# Required import: import SonicScrewdriver [as alias]
# Or: from SonicScrewdriver import readtsv [as alias]
# Uses metadata to help assess degrees
import os, sys
import SonicScrewdriver as utils
rowindices, columns, metadata = utils.readtsv("/Users/tunder/Dropbox/pagedata/metascrape/EnrichedMetadata.tsv")
options = ["non", "bio", "poe", "dra", "fic"]
with open("/Users/tunder/Dropbox/pagedata/litlocs.tsv", encoding="utf-8") as f:
    filelines = f.readlines()
litlocs = dict()
for line in filelines:
    line = line.strip()
    fields = line.split('\t')
    litlocs[fields[0]] = int(round(1000 * float(fields[1])))
with open("/Users/tunder/Dropbox/pagedata/biolocs.tsv", encoding="utf-8") as f:
    filelines = f.readlines()
biolocs = dict()
for line in filelines:
    line = line.strip()
    fields = line.split('\t')
    biolocs[fields[0]] = int(round(1000 * float(fields[1])))
def letterpart(locnum):
    if locnum == "<blank>":
        return "<blank>"
    letterstring = ""
    for char in locnum:
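letterpart is cut off mid-loop. Given that Library of Congress call numbers begin with an alphabetic class prefix, a plausible completion (an assumption, not the original) is:
def letterpart(locnum):
    # Hypothetical completion: keep the leading alphabetic class
    # prefix of a Library of Congress call number.
    if locnum == "<blank>":
        return "<blank>"
    letterstring = ""
    for char in locnum:
        if char.isalpha():
            letterstring += char
        else:
            break
    return letterstring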
Example 12: list
# Required import: import SonicScrewdriver [as alias]
# Or: from SonicScrewdriver import readtsv [as alias]
##
import os, sys
import SonicScrewdriver as utils
import random
rowindices, columns, metadata = utils.readtsv("/Users/tunder/Dropbox/PythonScripts/hathimeta/ExtractedMetadata.tsv")
initialsample = random.sample(rowindices, 2000)
directorylist = os.listdir("/Users/tunder/Dropbox/pagedata/mixedtraining/pagefeatures")
existingfiles = list()
for filename in directorylist:
    if filename.startswith(".") or filename.startswith("_"):
        continue
    htid = utils.pairtreelabel(filename[0:-7])
    existingfiles.append(htid)
counter = 0
toremove = list()
for htid in initialsample:
    if htid in existingfiles:
        counter += 1
        toremove.append(htid)
print("Found " + str(counter) + " duplicates.")
for htid in toremove:
    initialsample.remove(htid)
Example 13: count_words
# Required import: import SonicScrewdriver [as alias]
# Or: from SonicScrewdriver import readtsv [as alias]
# The excerpt opens mid-loop; earlier lines (not shown) read rows into
# fields, set idcode, counter, sourcedir, and outtable, and defined
# count_words.
    title = fields[2]
    author = fields[3] + ', ' + fields[4]
    date = fields[8]
    filename = idcode + '.txt'
    filepath = os.path.join(sourcedir, filename)
    if os.path.isfile(filepath):
        tokencount, wordcount = count_words(filepath)
    else:
        print("Missing file: " + filepath)
        sys.exit(0)
    newrow = [idcode, date, tokencount, wordcount, author, title]
    outtable.append(newrow)
    print(counter)
    counter += 1
rows, columns, table = utils.readtsv('/Users/tunder/Dropbox/GenreProject/metadata/topicmodelingsample.tsv')
sourcedir = "/Volumes/TARDIS/work/moneytexts/"
for row in rows:
    filename = utils.pairtreefile(row) + ".fic.txt"
    filepath = os.path.join(sourcedir, filename)
    if os.path.isfile(filepath):
        tokencount, wordcount = count_words(filepath)
    else:
        print("Missing file: " + filepath)
        sys.exit(0)
    idcode = table["HTid"][row]
    date = str(utils.simple_date(row, table))
    author = table["author"][row]
Example 14: open
# Required import: import SonicScrewdriver [as alias]
# Or: from SonicScrewdriver import readtsv [as alias]
# This excerpt assumes an earlier "import pickle".
modelfolder = "/Volumes/TARDIS/work/moneycontext/"
modelpath = modelfolder + "logisticmodel.p"
with open(modelpath, mode='rb') as f:
    logisticmodel = pickle.load(f)
standardizerpath = modelfolder + 'standardizer.p'
with open(standardizerpath, mode='rb') as f:
    standardizer = pickle.load(f)
featurepath = modelfolder + 'featurelist.p'
with open(featurepath, mode='rb') as f:
    features = pickle.load(f)
# Now load HathiTrust metadata.
rows, columns, table = utils.readtsv('/Volumes/TARDIS/work/metadata/MergedMonographs.tsv')
ambiguouswords = {'crown', 'crowns', 'guinea', 'guineas', 'nickel', 'sovereign', 'sovereigns', 'pound', 'pounds', 'quid'}
moneywords = {'dollar', 'dollars', 'dime', 'dimes', 'nickel', 'nickels', 'pound', 'pounds', 'shilling', 'shillings', 'sovereign', 'sovereigns','cent', 'cents', 'centime', 'centimes', 'crown', 'crowns', 'halfcrown', 'half-crown','penny', 'pennies', 'pence', 'farthing', 'farthings', 'franc', 'francs', 'guilder', 'guilders', 'florin', 'florins', 'guinea', 'guineas', "ha'penny", 'tuppence', 'twopence', 'sixpence', '|arabicprice|', '|price|', 'quid'}
# Words I explicitly decided not to include: 'quarter', 'quarters', 'mark', 'marks.' Monetary uses
# seemed rare enough relative to others that they'd be more likely to introduce noise than to help.
# |arabicprice| is a code the tokenizer in modelingcounter produces whenever it encounters
# a number connected to £, $, ¢, s, or d. In the output we convert that to |price|, for no very
# good reason.
wealthwords = {'fortune', 'fortunes', 'wealth', 'rich', 'riches', 'money', 'moneys', 'fund', 'funds', 'sum', 'sums', 'price', 'prices', 'priced'}
# This is by no means an exhaustive list. Owe, loan, borrowed, etc.
# If we really want to get at the full range of words potentially
Example 15: open
# Required import: import SonicScrewdriver [as alias]
# Or: from SonicScrewdriver import readtsv [as alias]
# make fiction subset
import SonicScrewdriver as utils
rows, columns, table = utils.readtsv("/Users/tunder/Dropbox/bookNLP/metadata/enrichedmetadataDec6.tsv")
datedict = dict()
selected = []
for row in rows:
    date = int(table["date"][row])
    if date in datedict:
        datedict[date] += 1
    else:
        datedict[date] = 1
    if datedict[date] > 3:
        continue
    else:
        selected.append(row)
with open("/Users/tunder/Dropbox/GenreProject/python/piketty/fictionsubset.txt", mode='w', encoding='utf-8') as f:
    for line in selected:
        f.write(line + '\n')