当前位置: 首页>>代码示例>>Python>>正文


Python SonicScrewdriver.readtsv方法代码示例

本文整理汇总了Python中SonicScrewdriver.readtsv方法的典型用法代码示例。如果您正苦于以下问题:Python SonicScrewdriver.readtsv方法的具体用法?Python SonicScrewdriver.readtsv怎么用?Python SonicScrewdriver.readtsv使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在SonicScrewdriver的用法示例。


在下文中一共展示了SonicScrewdriver.readtsv方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: open

# 需要导入模块: import SonicScrewdriver [as 别名]
# 或者: from SonicScrewdriver import readtsv [as 别名]
# sort_anovaset.py
#
# Split the volume IDs listed in anovaset.txt into two output files:
# IDs present in the 19th-century metadata table go to anova19c.txt,
# everything else to anova20c.txt.

import SonicScrewdriver as utils
import csv

rows, columns, table = utils.readtsv('/Volumes/TARDIS/work/metadata/19cmetadata.tsv')

# One volume ID per line; strip trailing whitespace/newlines.
with open('anovaset.txt', encoding = 'utf-8') as f:
    wholeset = [line.rstrip() for line in f.readlines()]

# Partition by membership in the 19c metadata row IDs.
the19c = [volid for volid in wholeset if volid in rows]
the20c = [volid for volid in wholeset if volid not in rows]

with open('anova19c.txt', mode = 'w', encoding = 'utf-8') as f:
    f.writelines(volid + '\n' for volid in the19c)

with open('anova20c.txt', mode = 'w', encoding = 'utf-8') as f:
    f.writelines(volid + '\n' for volid in the20c)



开发者ID:tedunderwood,项目名称:GenreProject,代码行数:29,代码来源:sort_anovaset.py

示例2: passfilter

# 需要导入模块: import SonicScrewdriver [as 别名]
# 或者: from SonicScrewdriver import readtsv [as 别名]
# refine fiction

import SonicScrewdriver as utils

def passfilter(genrestring):
	'''Return False when the semicolon-delimited genre string contains
	"Autobiography" or "Biography" as an exact field; True otherwise.'''
	fields = genrestring.split(';')
	return not ("Autobiography" in fields or "Biography" in fields)

# Metadata tables for 19c and 20c monographs. As used in this script,
# readtsv returns (row IDs, column names, table) where table[column][rowID]
# yields a cell value.
rows19c, columns19c, table19c = utils.readtsv('/Volumes/TARDIS/work/metadata/19cMetadata.tsv')

rows20c, columns20c, table20c = utils.readtsv('/Volumes/TARDIS/work/metadata/20cMonographMetadata.tsv')

# The rough fiction list has one tab-separated record per line; the first
# field is a volume ID, converted to a pairtree label so it matches the
# metadata row indices.
with open("/Users/tunder/Dropbox/GenreProject/python/piketty/roughfiction.txt", encoding = 'utf-8') as f:
	filelines = f.readlines()

idlist = [utils.pairtreelabel(x.split('\t')[0]) for x in filelines]

# Accumulators for the filtering loop that follows (truncated in this view).
filteredrows = list()

missing = 0

for anid in idlist:
	if anid in rows19c:
		genrestring = table19c["genres"][anid]
		rowdict = dict()
		for col in columns19c:
			rowdict[col] = table19c[col][anid]
	elif anid in rows20c:
开发者ID:tedunderwood,项目名称:GenreProject,代码行数:33,代码来源:refine_fiction.py

示例3: print

# 需要导入模块: import SonicScrewdriver [as 别名]
# 或者: from SonicScrewdriver import readtsv [as 别名]
# Tail of an evaluation script: the earlier (truncated) code defines
# smoothaccuracy, coalaccuracy, accuracies, dissentperfile, metadatatable,
# firstdir, utils, pd, sm and pearsonr — confirm against the full source.
print()
# print("ROUGH MICROACCURACY:")
# print(roughaccuracy)
print("SMOOTHED MICROACCURACY:")
print(smoothaccuracy)
print("COALESCED MICROACCURACY:")
print(coalaccuracy)

# Dump per-volume accuracies as a two-column TSV with a header row.
with open("/Users/tunder/Dropbox/pagedata/interrater/ActualAccuracies.tsv", mode = "w", encoding="utf-8") as f:
	f.write("htid\taccuracy\n")
	for key, value in accuracies.items():
		outline = key + "\t" + str(value) + "\n"
		f.write(outline)

metadatapath = os.path.join(firstdir, "predictionMetadata.tsv")
rowindices, columns, metadata = utils.readtsv(metadatapath)

# Merge prediction metadata columns with the accuracy/dissent measures.
metadatatable['maxprob']= metadata['maxprob']
metadatatable['gap'] = metadata['gap']
metadatatable['accuracy'] = accuracies
metadatatable['dissent'] = dissentperfile

data = pd.DataFrame(metadatatable, dtype = "float")

# Logistic regression predicting per-volume accuracy from the remaining
# columns; 'intercept' supplies the constant term. train_cols skips the
# DataFrame's first column — presumably the accuracy column; verify
# against metadatatable's layout.
data['intercept'] = 1.0
train_cols = data.columns[1:]
logit = sm.Logit(data['accuracy'], data[train_cols])
result = logit.fit()
print(result.summary())
predictions = result.predict(data[train_cols])
print(pearsonr(data['accuracy'], predictions))
开发者ID:deepfriedrabbit,项目名称:genre,代码行数:33,代码来源:JsonEnsemble.py

示例4: open

# 需要导入模块: import SonicScrewdriver [as 别名]
# 或者: from SonicScrewdriver import readtsv [as 别名]
# Generate Cotraining Set

# This script uses a set of volumes already classified and sorted by a model
# in order to generate additional training data for a new model.

import SonicScrewdriver as utils
from shutil import copyfile

# sortedcotrain.tsv is presumably ordered by model confidence, so the
# last 200 row IDs are the volumes chosen for cotraining — TODO confirm
# the sort direction against the script that produces this file.
indices, columns, metadata = utils.readtsv("/Volumes/TARDIS/work/cotrain/sortedcotrain.tsv")

toget = indices[-200:]

# Convert row IDs to pairtree-style filenames for locating files on disk.
toget = [utils.pairtreefile(x) for x in toget]

# Destination directories for the copied genre maps and page features.
genredir = "/Volumes/TARDIS/work/cotrain/top200/genremaps/"
featuredir = "/Volumes/TARDIS/work/cotrain/top200/pagefeatures/"
for htid in toget:

	featuresource = "/Volumes/TARDIS/work/cotrain/pagefeatures/" + htid + ".pg.tsv"
	featuredestination = "/Volumes/TARDIS/work/cotrain/top200/pagefeatures/" + htid + ".pg.tsv"
	copyfile(featuresource, featuredestination)

	genresource = "/Volumes/TARDIS/work/cotrain/predictions/" + htid + ".predict"
	genredestination = "/Volumes/TARDIS/work/cotrain/top200/genremaps/" + htid + ".map"
	with open(genresource, mode="r", encoding = "utf-8") as f:
		filelines = f.readlines()

	with open(genredestination, mode="w", encoding = "utf-8") as f:
		for line in filelines:
			line = line.rstrip()
开发者ID:tedunderwood,项目名称:HathiGenreTrainingset,代码行数:33,代码来源:GenerateCotrainingSet.py

示例5: add_counts

# 需要导入模块: import SonicScrewdriver [as 别名]
# 或者: from SonicScrewdriver import readtsv [as 别名]
def add_counts(wordcounts, year, word, count):
	'''Accumulate count into the nested wordcounts mapping, keyed first
	by year and then by word. Missing year/word entries are created;
	the mapping is mutated in place and nothing is returned.'''
	yeardict = wordcounts.get(year)
	if yeardict is None:
		wordcounts[year] = {word: count}
	else:
		yeardict[word] = yeardict.get(word, 0) + count


# Index fiction volumes by inferred publication date.
metafile = '/Users/tunder/Dropbox/GenreProject/metadata/filteredfiction.tsv'
rows, columns, table = utils.readtsv(metafile)

# Maps an integer year -> list of volume IDs assigned to that year.
dateindex = dict()

for volid in rows:
	startdate = table["startdate"][volid]
	enddate = table["enddate"][volid]
	textdate = table["textdate"][volid]

	# infer_date (defined elsewhere in this script) reconciles the three
	# date fields into a single integer year.
	intdate = infer_date(startdate, enddate, textdate)

	# Keep only volumes dated 1750-1950 inclusive.
	if intdate >= 1750 and intdate <= 1950:
		if intdate in dateindex:
			dateindex[intdate].append(volid)
		else:
			dateindex[intdate] = [volid]
开发者ID:tedunderwood,项目名称:GenreProject,代码行数:32,代码来源:FictionSample.py

示例6: main

# 需要导入模块: import SonicScrewdriver [as 别名]
# 或者: from SonicScrewdriver import readtsv [as 别名]
def main():
	"""Process a batch of HathiTrust volumes in parallel.

	Builds the set of volume IDs to work on (from a test directory or a
	slice file), removes known-bad IDs, attaches metadata evidence to each
	volume, and fans the per-file work out to a process pool. Results are
	appended to the metadata output file; errors go to an error log.

	Relies on module-level configuration (testrun, datapath, slicepath,
	metadatapath, metaoutpath, errorpath, delim) and on helpers defined
	elsewhere in the file (get_metadata_evidence, process_a_file).
	"""
	global testrun, datapath, slicepath, metadatapath, current_working,  metaoutpath, errorpath, pagevocabset

	# Test run: volume IDs are simply the filenames in datapath, skipping
	# hidden and underscore-prefixed entries.
	if testrun:
		filelist = os.listdir(datapath)
		HTIDs = set()
		for afilename in filelist:
			if not (afilename.startswith(".") or afilename.startswith("_")):
				HTIDs.add(afilename)

	else:
		# Production run: the slice file lists one volume ID per line.
		with open(slicepath, encoding="utf-8") as file:
			HTIDlist = file.readlines()

		HTIDs = set([x.rstrip() for x in HTIDlist])
		del HTIDlist

	## discard bad volume IDs

	with open(metadatapath + "badIDs.txt", encoding = 'utf-8') as file:
		filelines = file.readlines()

	# badIDs.txt is delim-separated; only the first field (the ID) matters.
	for line in filelines:
		line = line.rstrip()
		line = line.split(delim)
		if line[0] in HTIDs:
			HTIDs.discard(line[0])

	# Create the output file with a header row only if it does not yet
	# exist; results are appended below, so reruns accumulate in one file.
	if not os.path.isfile(metaoutpath):
		with open(metaoutpath, 'w', encoding = 'utf-8') as f:
			f.write("volID\ttotalwords\tprematched\tpreenglish\tpostmatched\tpostenglish\n")

	print(len(HTIDs))

	# Let's get some metadata to create metadata features.

	if testrun:
		rowindices, columns, metadata = utils.readtsv("/Users/tunder/Dropbox/PythonScripts/hathimeta/ExtractedMetadata.tsv")
	else:
		rowindices, columns, metadata = utils.readtsv("/projects/ichass/usesofscale/hathimeta/ExtractedMetadata.tsv")

	metadata_clues = list()
	for aHTID in HTIDs:
		evidence = get_metadata_evidence(aHTID, rowindices, columns, metadata)
		metadata_clues.append(evidence)

	# Pairing below relies on set iteration order being stable between the
	# loop above and the zip here (the set is not mutated in between).
	assert len(HTIDs) == len(metadata_clues)
	file_tuples = zip(HTIDs, metadata_clues)

	pool = Pool(processes = 12)
	res = pool.map_async(process_a_file, file_tuples)

	# After all files are processed, write metadata, errorlog, and counts of phrases.
	res.wait()
	resultlist = res.get()

	processedmeta = list()
	errorlog = list()
	phrasecount = dict()

	# NOTE(review): htid is assigned but never used afterwards, and
	# phrasecount is never populated (its output below is commented out).
	for file_dict in resultlist:
		processedmeta.append(file_dict["metadata"])
		errorlog.extend(file_dict["errors"])
		htid = file_dict["htid"]

	# Metadata.

	with open(metaoutpath, mode = 'a', encoding = 'utf-8') as file:
		for metatuple in processedmeta:
			outlist = [x for x in metatuple]
			outline = delim.join(outlist) + '\n'
			file.write(outline)

	# Write the errorlog.

	if len(errorlog) > 0:
		with open(errorpath, mode = 'w', encoding = 'utf-8') as file:
			for line in errorlog:
				file.write(line + '\n')

	# Write phrase counts.

	# with open(phrasecountpath, mode="w", encoding = "utf-8") as file:
	#     j = json.dumps(phrasecount)
	#     file.write(j)

	print("Done.")
	pool.close()
	pool.join()
开发者ID:cmchurch,项目名称:DataMunging,代码行数:91,代码来源:MultiNormalizeOCR.py

示例7: loadwordcounts

# 需要导入模块: import SonicScrewdriver [as 别名]
# 或者: from SonicScrewdriver import readtsv [as 别名]
        if htid in wordcountsbyfile:
            wordcountsbyfile[htid].append(count)
        else:
            wordcountsbyfile[htid] = [count]

    return wordcountsbyfile

# Begin main script.

# TOL and THRESH appear to be tolerance / agreement-threshold parameters
# for the comparison below — confirm against the truncated code that
# consumes them.
TOL = 0.1
THRESH = 0.80

genrestocheck = ['fic', 'poe', 'dra']

metadatapath = '/Volumes/TARDIS/work/metadata/MergedMonographs.tsv'
rows, columns, table = utils.readtsv(metadatapath)

# Two independent sets of page-level genre maps to compare.
firstsource = "/Users/tunder/Dropbox/pagedata/to1923features/genremaps/"
secondsource = "/Users/tunder/Dropbox/pagedata/seventhfeatures/genremaps/"

firstmaps = os.listdir(firstsource)
secondmaps = os.listdir(secondsource)

# loadwordcounts is defined earlier in this script (truncated in this view).
firstwordcounts = loadwordcounts(firstsource)
secondwordcounts = loadwordcounts(secondsource)

predictsource = '/Users/tunder/Dropbox/pagedata/production/crosspredicts/'

# Skip hidden files such as .DS_Store.
predicts = os.listdir(predictsource)
predicts = [x for x in predicts if not x.startswith('.')]
开发者ID:deepfriedrabbit,项目名称:genre,代码行数:32,代码来源:logisticconfidence.py

示例8: censor

# 需要导入模块: import SonicScrewdriver [as 别名]
# 或者: from SonicScrewdriver import readtsv [as 别名]
# Uses metadata to help assess degrees

import os, sys
import SonicScrewdriver as utils

# Extracted HathiTrust metadata, plus a second table — presumably volume-
# level predictions from an earlier genre model; confirm against
# newgenretable.txt's producer. Both are keyed by volume ID.
rowindices, columns, metadata = utils.readtsv("/Users/tunder/Dropbox/PythonScripts/hathimeta/ExtractedMetadata.tsv")

modelindices, modelcolumns, modeldata = utils.readtsv("/Users/tunder/Dropbox/PythonScripts/hathimeta/newgenretable.txt")

# The five genre codes used throughout this project:
# nonfiction, biography, poetry, drama, fiction.
options = ["non", "bio", "poe", "dra", "fic"]

def censor(htid, genresequence):

	htid = utils.pairtreelabel(htid)
	# convert the htid into a dirty pairtree label for metadata matching

	# Create a dictionary with entries for all possible conditions, initially set negative.
	symptoms = ["weakconfirmation", "weakdenial", "strongconfirmation", "strongdenial", "modelagrees", "modeldisagrees"]
	reported = dict()
	for symptom in symptoms:
		reported[symptom] = 0

	couldbefiction = True

	# Now we need to assess the largest genre in this volume.
	genrecounts = dict()
	genrecounts['fic'] = 0
	genrecounts['poe'] = 0
	genrecounts['dra'] = 0
	genrecounts['non'] = 0
开发者ID:tedunderwood,项目名称:HathiGenreTrainingset,代码行数:32,代码来源:MetadataCensor.py

示例9: keywithmaxval

# 需要导入模块: import SonicScrewdriver [as 别名]
# 或者: from SonicScrewdriver import readtsv [as 别名]
# Figures out what call numbers mean for genre

import os, sys
import SonicScrewdriver as utils

# Enriched HathiTrust metadata keyed by volume ID.
rowindices, columns, metadata = utils.readtsv("/Users/tunder/Dropbox/pagedata/metascrape/EnrichedMetadata.tsv")

# The five genre codes used throughout this project:
# nonfiction, biography, poetry, drama, fiction.
options = ["non", "bio", "poe", "dra", "fic"]

# Presumably volume-level predictions from an earlier genre model —
# confirm against newgenretable.txt's producer.
modelindices, modelcolumns, modeldata = utils.readtsv("/Users/tunder/Dropbox/PythonScripts/hathimeta/newgenretable.txt")

def keywithmaxval(dictionary):
    '''Return the key whose value is largest, considering only strictly
    positive values. Returns "" when the dictionary is empty or no value
    exceeds zero. Ties go to the first such key in iteration order.'''
    positive = {k: v for k, v in dictionary.items() if v > 0}
    if not positive:
        return ""
    return max(positive, key=positive.get)

def sequence_to_counts(genresequence):
    '''Converts a sequence of page-level predictions to
    a dictionary of counts reflecting the number of pages
    assigned to each genre. Also reports the largest genre.'''

    genrecounts = dict()
    genrecounts['fic'] = 0
    genrecounts['poe'] = 0
    genrecounts['dra'] = 0
开发者ID:tedunderwood,项目名称:HathiGenreTrainingset,代码行数:33,代码来源:MetadataSorter.py

示例10: open

# 需要导入模块: import SonicScrewdriver [as 别名]
# 或者: from SonicScrewdriver import readtsv [as 别名]
# Gather reviewed poetry volumes, then group the remaining corpus by date
# so a control set can be drawn. NOTE(review): selecteddates and selected
# are created earlier in the script (truncated here); selected behaves as
# a set of reviewed volume IDs.
reviews = '/Users/tunder/Dropbox/ted/reception/reviewed/lists/ReviewedTitles1880-1899_200.csv'
with open(reviews) as f:
    reader = csv.reader(f)
    for fields in reader:
        htid = fields[0]
        # Skip the header row.
        if htid == "HTid":
            continue
        jgenre = fields[13]
        date = int(fields[1])

        # Keep only rows whose genre column marks them as poetry.
        if jgenre == 'poe':
            selecteddates[htid] = date
            selected.add(htid)

rows, columns, table = utils.readtsv('/Users/tunder/Dropbox/GenreProject/metadata/filteredpoetry.tsv')

# Group the not-yet-selected volumes by publication date.
bydate = dict()

for row in rows:
    if row in selected:
        continue

    date = utils.simple_date(row, table)

    if date in bydate:
        bydate[date].append(row)
    else:
        bydate[date] = [row]

controlset = set()
开发者ID:tedunderwood,项目名称:GenreProject,代码行数:32,代码来源:select_poetry_corpus.py

示例11: open

# 需要导入模块: import SonicScrewdriver [as 别名]
# 或者: from SonicScrewdriver import readtsv [as 别名]
# Uses metadata to help assess degrees

import os, sys
import SonicScrewdriver as utils

rowindices, columns, metadata = utils.readtsv("/Users/tunder/Dropbox/pagedata/metascrape/EnrichedMetadata.tsv")

# The five genre codes used throughout this project.
options = ["non", "bio", "poe", "dra", "fic"]

# litlocs.tsv maps a key (presumably a call-number prefix — confirm
# against its producer) to a float in column 2, which is scaled by 1000
# and stored as an int.
with open("/Users/tunder/Dropbox/pagedata/litlocs.tsv", encoding="utf-8") as f:
    filelines = f.readlines()
litlocs = dict()
for line in filelines:
    line = line.strip()
    fields = line.split('\t')
    litlocs[fields[0]] = int(round(1000 * float(fields[1])))

# Same two-column format for biolocs.tsv.
with open("/Users/tunder/Dropbox/pagedata/biolocs.tsv", encoding="utf-8") as f:
    filelines = f.readlines()
biolocs = dict()
for line in filelines:
    line = line.strip()
    fields = line.split('\t')
    biolocs[fields[0]] = int(round(1000 * float(fields[1])))

def letterpart(locnum):
    if locnum == "<blank>":
        return "<blank>"

    letterstring = ""
    for char in locnum:
开发者ID:tedunderwood,项目名称:HathiGenreTrainingset,代码行数:33,代码来源:MetadataFeatures.py

示例12: list

# 需要导入模块: import SonicScrewdriver [as 别名]
# 或者: from SonicScrewdriver import readtsv [as 别名]
## Draw a random sample of 2000 volumes from the extracted metadata and
## drop any that already have page features on disk.

import os, sys
import SonicScrewdriver as utils
import random

rowindices, columns, metadata = utils.readtsv("/Users/tunder/Dropbox/PythonScripts/hathimeta/ExtractedMetadata.tsv")

# 2000 distinct volume IDs drawn without replacement from the metadata rows.
initialsample = random.sample(rowindices, 2000)

directorylist = os.listdir("/Users/tunder/Dropbox/pagedata/mixedtraining/pagefeatures")
existingfiles = list()

for filename in directorylist:
	if filename.startswith(".") or filename.startswith("_"):
		continue

	# Strip the 7-character ".pg.tsv" suffix to recover the volume ID, then
	# convert it to a pairtree label so it matches the metadata row indices.
	htid = utils.pairtreelabel(filename[0:-7])
	existingfiles.append(htid)

# Fix: membership tests against a list inside the loop were O(n*m);
# a set makes each lookup O(1) without changing the result.
existingset = set(existingfiles)

counter = 0
toremove = list()
for htid in initialsample:
	if htid in existingset:
		counter += 1
		toremove.append(htid)

print("Found " + str(counter) + " duplicates.")

# Fix: calling list.remove() once per duplicate rescanned the sample each
# time (quadratic). Rebuilding the list preserves order and is linear;
# sample IDs are unique, so filtering is equivalent to remove().
removeset = set(toremove)
initialsample = [htid for htid in initialsample if htid not in removeset]
开发者ID:tedunderwood,项目名称:HathiGenreTrainingset,代码行数:32,代码来源:RandomSample.py

示例13: count_words

# 需要导入模块: import SonicScrewdriver [as 别名]
# 或者: from SonicScrewdriver import readtsv [as 别名]
        title = fields[2]
        author = fields[3] + ', ' + fields[4]
        date = fields[8]
        filename = idcode + '.txt'
        filepath = os.path.join(sourcedir, filename)
        if os.path.isfile(filepath):
            tokencount, wordcount = count_words(filepath)
        else:
            print("Missing file: " + filepath)
            sys.exit(0)
        newrow = [idcode, date, tokencount, wordcount, author, title]
        outtable.append(newrow)
        print(counter)
        counter += 1

# Metadata for the topic-modeling sample; rows/table feed the word-counting
# loop that follows (truncated in this view).
rows, columns, table = utils.readtsv('/Users/tunder/Dropbox/GenreProject/metadata/topicmodelingsample.tsv')

# Directory holding the fiction text files (<pairtree id>.fic.txt).
sourcedir = "/Volumes/TARDIS/work/moneytexts/"

for row in rows:
    filename = utils.pairtreefile(row) + ".fic.txt"
    filepath = os.path.join(sourcedir, filename)
    if os.path.isfile(filepath):
        tokencount, wordcount = count_words(filepath)
    else:
        print("Missing file: " + filepath)
        sys.exit(0)

    idcode = table["HTid"][row]
    date = str(utils.simple_date(row, table))
    author = table["author"][row]
开发者ID:tedunderwood,项目名称:GenreProject,代码行数:33,代码来源:better_metadata_maker.py

示例14: open

# 需要导入模块: import SonicScrewdriver [as 别名]
# 或者: from SonicScrewdriver import readtsv [as 别名]
# Load the pickled logistic model plus its standardizer and feature list.
# NOTE(review): pickle.load is acceptable here only because these are
# locally produced model files; never unpickle untrusted data.
modelfolder = "/Volumes/TARDIS/work/moneycontext/"
modelpath = modelfolder + "logisticmodel.p"
with open(modelpath, mode = 'rb') as f:
    logisticmodel = pickle.load(f)

standardizerpath = modelfolder + 'standardizer.p'
with open(standardizerpath, mode = 'rb') as f:
    standardizer = pickle.load(f)

featurepath = modelfolder + 'featurelist.p'
with open(featurepath, mode = 'rb') as f:
    features = pickle.load(f)

# Now load HathiTrust metadata.

rows, columns, table = utils.readtsv('/Volumes/TARDIS/work/metadata/MergedMonographs.tsv')

# Currency words whose monetary sense is ambiguous in running text.
ambiguouswords = {'crown', 'crowns', 'guinea', 'guineas', 'nickel', 'sovereign', 'sovereigns', 'pound', 'pounds', 'quid'}

# Full vocabulary of money-denoting tokens, including the tokenizer's
# |arabicprice| / |price| placeholder codes (explained in the comments
# that follow this block).
moneywords = {'dollar', 'dollars', 'dime', 'dimes', 'nickel', 'nickels', 'pound', 'pounds', 'shilling', 'shillings', 'sovereign', 'sovereigns','cent', 'cents', 'centime', 'centimes', 'crown', 'crowns', 'halfcrown', 'half-crown','penny', 'pennies', 'pence', 'farthing', 'farthings', 'franc', 'francs', 'guilder', 'guilders', 'florin', 'florins', 'guinea', 'guineas', "ha'penny", 'tuppence', 'twopence', 'sixpence', '|arabicprice|', '|price|', 'quid'}

# Words I explicitly decided not to include: 'quarter', 'quarters', 'mark', 'marks.' Monetary uses
# seemed rare enough relative to others that they'd be more likely to introduce noise than to help.
# |arabicprice| is a code the tokenizer in modelingcounter produces whenever it encounters
# a number connected to £, $, ¢, s, or d. In the output we convert that to |price|, for no very
# good reason.

wealthwords = {'fortune', 'fortunes', 'wealth', 'rich', 'riches', 'money', 'moneys', 'fund', 'funds', 'sum', 'sums', 'price', 'prices', 'priced'}

# This is by no means an exhaustive list. Owe, loan, borrowed, etc.
# If we really want to get at the full range of words potentially
开发者ID:tedunderwood,项目名称:GenreProject,代码行数:33,代码来源:fifteenwordsnippets.py

示例15: open

# 需要导入模块: import SonicScrewdriver [as 别名]
# 或者: from SonicScrewdriver import readtsv [as 别名]
# make fiction subset
#
# Select at most three volumes per publication date from the enriched
# metadata table and write their row IDs to fictionsubset.txt, one per line.

import SonicScrewdriver as utils

rows, columns, table = utils.readtsv("/Users/tunder/Dropbox/bookNLP/metadata/enrichedmetadataDec6.tsv")

# Running count of volumes seen per date.
datedict = dict()

selected = []

for volid in rows:
	year = int(table["date"][volid])
	datedict[year] = datedict.get(year, 0) + 1

	# Keep only the first three volumes encountered for each date.
	if datedict[year] <= 3:
		selected.append(volid)

with open("/Users/tunder/Dropbox/GenreProject/python/piketty/fictionsubset.txt", mode='w', encoding = 'utf-8') as f:
	f.writelines(volid + '\n' for volid in selected)



开发者ID:tedunderwood,项目名称:GenreProject,代码行数:28,代码来源:make_fiction_subset.py


注:本文中的SonicScrewdriver.readtsv方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。