This article collects typical usage examples of the Python function maxCommon.iterTsvRows. If you are wondering what exactly iterTsvRows does, how to call it, or how it is used in practice, the hand-picked examples below may help.
Shown below are 15 code examples of iterTsvRows, sorted by popularity by default.
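Before the examples, here is a minimal usage sketch. It assumes, based on the examples below, that iterTsvRows reads a tab-separated file with a header line and yields one record per data row, with the header columns available as attributes; the file name journals.tab and the columns eIssn and title are made up for illustration.

import maxCommon

# each yielded row is a record whose attributes are named after the header columns
for row in maxCommon.iterTsvRows("journals.tab", encoding="utf8"):
    print("%s\t%s" % (row.eIssn, row.title))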
Example 1: parseTabPublisherFile
def parseTabPublisherFile(fname):
    " parse a file with columns eIssn, publisher (optional) and urls into a list of records "
    logging.info("Parsing %s" % fname)
    journals = list(maxCommon.iterTsvRows(fname, encoding="latin1"))

    # modify publisher field
    datasetName = splitext(basename(fname))[0]
    headers = list(journals[0]._fields)
    addPubField = False
    if "publisher" not in headers:
        headers.insert(0, "publisher")
        addPubField = True
    JRec = collections.namedtuple("Journal", headers)

    newJournals = []
    for j in journals:
        if j.eIssn.lower()=="print only" or j.eIssn.lower()=="unknown":
            logging.debug("Skipping journal %s, no eIssn" % j.title)
            continue
        if addPubField:
            newJ = [datasetName]
            newJ.extend(j)
            newJRec = JRec(*newJ)
        else:
            newJRec = j
        newJournals.append(newJRec)
    return newJournals
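A hypothetical call of the function above, assuming a latin1-encoded journal table whose header contains at least eIssn, title and urls; the file name is invented.

journals = parseTabPublisherFile("highwire_journals.tab")
for j in journals:
    # every record carries a "publisher" field, filled from the file name if it was missing
    print("%s\t%s" % (j.publisher, j.eIssn))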
Example 2: updatePmids
def updatePmids(medlineDir, crawlDir, updateIds, minYear=None):
    """ go over subdirs of crawlDir, for each: read the ISSNs, and add new
    PMIDs we have in medlineDir to subdir/pmids.txt
    We never remove a PMID from pmids.txt.
    """
    logging.info("Now updating crawler directories with the new PMIDs")
    eIssnToPIssn = getEIssnToPIssn(pubConf.publisherIssnTable)
    issnToPmid, issnToJournal = getIssnPmidDict(medlineDir, updateIds, minYear)
    for subdir in getSubdirs(crawlDir):
        pmidFname = join(crawlDir, subdir, "pmids.txt")
        issnFname = join(crawlDir, subdir, "issns.tab")
        if not isfile(issnFname) or not isfile(pmidFname):
            continue
        logging.debug("reading subdir %s: %s and %s" % (subdir, pmidFname, issnFname))
        issns = [row.issn.strip() for row in maxCommon.iterTsvRows(issnFname)]
        logging.debug("ISSNs: %s" % ",".join(issns))

        # read old pmids
        oldPmids = set([int(line.rstrip()) for line in open(pmidFname)])
        newPmids = set()

        # add new pmids, for each issn
        for issn in issns:
            if issn not in issnToPmid:
                if issn in eIssnToPIssn:
                    logging.debug("Looks like eISSN, mapped to printISSN %s" % issn)
                    issn = eIssnToPIssn[issn]
                else:
                    logging.debug("No PMIDs for ISSN %s and no eISSN mapping for it" % issn)
            issnPmids = issnToPmid.get(issn, None)
            if issnPmids is None:
                logging.debug("No PMIDs for ISSN %s" % issn)
                continue
            logging.debug("ISSN %s, %d PMIDs" % (issn, len(issnPmids)))
            newPmids.update(issnPmids)

        # get some counts and report them to the user
        oldCount = len(oldPmids)
        updateCount = len(newPmids)
        oldPmids.update(newPmids) # faster to add new to old set than old to new set
        pmids = oldPmids
        newCount = len(pmids)
        addCount = newCount - oldCount
        logging.info("crawl dir %s: old PMID count %d, update has %d, new total %d, added %d" % \
            (subdir, oldCount, updateCount, newCount, addCount))

        # write new pmids
        pmids = [str(x) for x in pmids]
        # randomize order, to distribute errors
        random.shuffle(pmids)
        # write all pmids to a tmp file
        pmidTmpFname = pmidFname+".new"
        pmidFh = open(pmidTmpFname, "w")
        pmidFh.write("\n".join(pmids))
        pmidFh.close()
        # keep a copy of the original pmid file
        shutil.copy(pmidFname, pmidFname+".bak")
        # rename the tmp file to the original file
        # to make sure that an intact pmid file always exists
        os.rename(pmidTmpFname, pmidFname)
Example 3: iterArticleDataDir
def iterArticleDataDir(textDir, type="articles", filterFname=None, updateIds=None):
    """ yields all articleData from all files in textDir
    Can filter to yield only a set of filenames or files for a
    given list of updateIds.
    """
    fcount = 0
    if type=="articles":
        baseMask = "*.articles.gz"
    elif type=="files":
        baseMask = "*.files.gz"
    elif type=="annots":
        baseMask = "*.tab.gz"
    else:
        logging.error("Article type %s not valid" % type)
        sys.exit(1)

    if isfile(textDir):
        fileNames = [textDir]
        logging.debug("Found 1 file, %s" % textDir)
    else:
        fileMask = os.path.join(textDir, baseMask)
        fileNames = glob.glob(fileMask)
        logging.debug("Looking for all fulltext files in %s, found %d files" % \
            (fileMask, len(fileNames)))
        if updateIds is not None and len(updateIds)!=0:
            logging.debug("Restricting fulltext files to updateIds %s" % str(updateIds))
            filteredFiles = []
            for updateId in updateIds:
                for fname in fileNames:
                    if basename(fname).startswith(str(updateId)+"_"):
                        filteredFiles.append(fname)
                logging.debug("Update Id %s, %d files" % (str(updateId), len(filteredFiles)))
            fileNames = list(filteredFiles)
        logging.debug("Found %d files in input dir %s" % (len(fileNames), textDir))

    pm = maxCommon.ProgressMeter(len(fileNames), stepCount=100)
    for textFname in fileNames:
        if filterFname is not None and filterFname not in textFname:
            logging.warn("Skipping %s, because file filter is set" % textFname)
            continue
        reader = PubReaderFile(textFname)
        logging.debug("Reading %s, %d files left" % (textFname, len(fileNames)-fcount))
        fcount+=1
        if type=="articles":
            for articleData in reader.articleRows:
                if "publisher" not in articleData._fields: # XX temporary bugfix as I have some old files
                    articleData = list(articleData)
                    articleData.insert(2, "")
                    articleData[3] = ""
                yield articleData
        elif type=="files":
            for fileData in reader.fileRows:
                yield fileData
        elif type=="annots":
            for row in maxCommon.iterTsvRows(textFname):
                yield row
        else:
            assert(False) # illegal type parameter
        pm.taskCompleted()
Example 4: parseHighwire
def parseHighwire():
    """ create two dicts
    printIssn -> url to pmidlookup-cgi of highwire
    and
    publisherName -> top-level hostnames
    >>> temps, domains = parseHighwire()
    >>> temps['0270-6474']
    u'http://www.jneurosci.org/cgi/pmidlookup?view=long&pmid=%(pmid)s'
    >>> domains["Society for Neuroscience"]
    set([u'jneurosci.org'])
    >>> domains["American Society for Biochemistry and Molecular Biology"]
    set([u'jbc.org', u'mcponline.org', u'jlr.org'])
    >>> temps["1535-9476"]
    u'http://www.mcponline.org/cgi/pmidlookup?view=long&pmid=%(pmid)s'
    """
    templates = {}
    domains = {}
    pubFname = pubConf.publisherIssnTable
    logging.info("Parsing %s to find highwire ISSNs/webservers" % pubFname)
    for row in maxCommon.iterTsvRows(pubFname):
        if not row.pubName.startswith("HIGHWIRE"):
            continue
        pubName = row.pubName.replace("HIGHWIRE ","")
        issns = [i.strip() for i in row.journalIssns.split("|")]
        servers = row.webservers.split("|")
        for issn, server in zip(issns, servers):
            template = "http://www."+server+"/cgi/pmidlookup?view=long&pmid=%(pmid)s"
            templates[issn] = template
            domains.setdefault(pubName, set()).add(server)
            #logging.debug("HIGHWIRE CONFIG %s, %s, %s" % (pubName, template, domains[pubName]))
    return templates, domains
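A short sketch of how the two returned dictionaries might be used, based only on the doctest above; the PMID value is made up.

templates, domains = parseHighwire()
template = templates["0270-6474"]
# the template contains a %(pmid)s placeholder, so a lookup URL can be built like this
url = template % {"pmid": "12345678"}
print(url)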
Example 5: __init__
def __init__(self, fname):
    " fname can end in .articles.gz, reader will still read both articles and files "
    logging.debug("Reading data from file with prefix %s (.articles.gz, .files.gz)" % fname)
    baseDir = dirname(fname)
    base = basename(fname).split('.')[0]
    articleFn = join(baseDir, base+".articles.gz")
    fileFn = join(baseDir, base+".files.gz")
    logging.debug("Reading %s and %s" % (articleFn, fileFn))

    self.articleRows = None
    if isfile(articleFn) and getsize(articleFn)!=0:
        self.articleRows = maxCommon.iterTsvRows(articleFn, encoding="utf8")

    self.fileRows = None
    if isfile(fileFn) and getsize(fileFn)!=0:
        self.fileRows = maxCommon.iterTsvRows(fileFn, encoding="utf8")
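A sketch of how such a reader object might be consumed, assuming the enclosing class is PubReaderFile (the name used in example 3) and that articleRows is either None or an iterator of namedtuple rows; the file path and the articleId field name are illustrative.

reader = PubReaderFile("textDir/0_00000.articles.gz")
if reader.articleRows is not None:
    for articleData in reader.articleRows:
        # field names come from the .articles.gz header; articleId is assumed here
        print(articleData.articleId)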
Example 6: __init__
def __init__(self, taxId):
    " open db files, compile patterns, parse input as far as possible "
    mutDataDir = pubConf.varDataDir
    geneDataDir = pubConf.geneDataDir
    if mutDataDir is None:
        return
    self.mutDataDir = mutDataDir
    self.entrez2sym, self.entrez2refprots = parseEntrez(join(geneDataDir, "entrez.tab"))

    # refseq sequences
    fname = join(mutDataDir, "seqs")
    logging.info("opening %s" % fname)
    seqs = pubKeyVal.SqliteKvDb(fname)
    self.seqs = seqs

    # refprot to refseqId
    # refseq to CDS start
    fname = join(mutDataDir, "refseqInfo.tab")
    logging.debug("Reading %s" % fname)
    self.refProtToRefSeq = {}
    self.refSeqCds = {}
    for row in maxCommon.iterTsvRows(fname):
        self.refProtToRefSeq[row.refProt] = row.refSeq
        self.refSeqCds[row.refSeq] = int(row.cdsStart)-1 # NCBI is 1-based

    # refseq to genome
    self.pslCache = {}
    self.refGenePsls = openIndexedPsls(mutDataDir, "refGenePsls.9606")

    # dbsnp db
    fname = join(self.mutDataDir, "dbSnp.sqlite")
    self.snpDb = sqlite3.connect(fname)
    logging.info("Reading of data finished")
Example 7: readArticleChunkAssignment
def readArticleChunkAssignment(inDir, updateIds):
    " read the assignment of articleId -> chunkId from the text directory "
    if updateIds is None:
        inFiles = glob.glob(os.path.join(inDir, "*_index.tab"))
    else:
        inFiles = []
        for updateId in updateIds:
            updateId = str(updateId)
            # build the full path before the existence check
            indexFname = os.path.join(inDir, "%s_index.tab" % updateId)
            if isfile(indexFname):
                inFiles.append(indexFname)

    if len(inFiles) == 0:
        logging.warn("No article chunk assignment")
        return None
    logging.debug("Input files for article -> chunk assignment: %s" % inFiles)

    articleChunks = {}
    for inFile in inFiles:
        logging.info("Parsing %s" % inFile)
        for row in maxCommon.iterTsvRows(inFile):
            chunkId = int(row.chunkId.split("_")[1])
            articleChunks[int(row.articleId)] = int(chunkId)
    return articleChunks
Example 8: updatePmids
def updatePmids(medlineDir, crawlDir, updateIds, minYear=None):
    """ go over subdirs of crawlDir, for each: read the ISSNs, and add new
    PMIDs we have in medlineDir to subdir/pmids.txt
    We never remove a PMID from pmids.txt.
    """
    logging.info("Now updating crawler directories with the new PMIDs")
    eIssnToPIssn = getEIssnToPIssn(pubConf.publisherIssnTable)
    subDirs = getSubdirs(crawlDir)
    con, cur = pubStore.openArticleDb("medline", mustOpen=True, useRamdisk=True)
    for subdir in subDirs:
        if subdir.endswith(".tmp"):
            continue
        subPath = join(crawlDir, subdir)
        logging.info("Processing subdirectory %s" % subPath)
        if isfile(pubCrawlLib.getLockFname(subPath)):
            logging.warn("Found lockfile, looks like a crawl is going on in %s, skipping" % subPath)
            continue

        pmidFname = join(crawlDir, subdir, "pmids.txt")
        issnFname = join(crawlDir, subdir, "issns.tab")
        if not isfile(issnFname) or not isfile(pmidFname):
            logging.info("Skipping %s, ISSN or docId file not found" % subPath)
            continue
        logging.debug("reading subdir %s: %s and %s" % (subdir, pmidFname, issnFname))
        issns = [row.issn.strip() for row in maxCommon.iterTsvRows(issnFname)]
        logging.debug("ISSNs: %s" % ",".join(issns))

        # read old pmids
        oldPmids = set([int(line.rstrip()) for line in open(pmidFname)])
        #newPmids = set()

        # add new pmids, for each issn
        newPmids = getPmidsForIssns(con, cur, issns, minYear)
        logging.debug("%d PMIDs" % (len(newPmids)))

        oldCount = len(oldPmids)
        updateCount = len(newPmids)
        oldPmids.update(newPmids) # faster to add new to old set than old to new set
        pmids = oldPmids
        newCount = len(pmids)
        addCount = newCount - oldCount
        logging.info("crawl dir %s: old PMID count %d, update has %d, new total %d, added %d" % \
            (subdir, oldCount, updateCount, newCount, addCount))

        # write new pmids
        pmids = [str(x) for x in pmids]
        # randomize order, to distribute errors
        random.shuffle(pmids)
        # write all pmids to a tmp file
        pmidTmpFname = pmidFname+".new"
        pmidFh = open(pmidTmpFname, "w")
        pmidFh.write("\n".join(pmids))
        pmidFh.close()
        # keep a copy of the original pmid file
        shutil.copy(pmidFname, pmidFname+".bak")
        # atomic rename of the tmp file to the original file
        # to make sure that an intact pmid file always exists
        os.rename(pmidTmpFname, pmidFname)
Example 9: getAllBatchIds
def getAllBatchIds(outDir):
    """ parse batches.tab and return all available batchIds
    """
    batchIds = []
    for row in maxCommon.iterTsvRows(join(outDir, "batches.tab")):
        batchIds.append(row.batchId)
    logging.debug("Found batchIds %s in directory %s" % (batchIds, outDir))
    return batchIds
Example 10: loadTsvSqlite
def loadTsvSqlite(dbFname, tableName, tsvFnames, headers=None, intFields=[], \
        primKey=None, idxFields=[], dropTable=True):
    " load a tab-separated file into an sqlite db table "
    # if tsvFnames is a single string, wrap it in a list
    if len(tsvFnames)==0:
        logging.debug("No filenames to load")
        return
    if isinstance(tsvFnames, basestring):
        tsvFnames = [tsvFnames]

    if os.path.isfile(dbFname):
        lockDb = False
        finalDbFname = None
    else:
        lockDb = True
        finalDbFname = dbFname
        dbFname = pubGeneric.getFastUniqueTempFname()
        logging.info("writing first to db on ramdisk %s" % dbFname)
    con, cur = openSqlite(dbFname, lockDb=lockDb)

    # drop old table
    if dropTable:
        logging.debug("dropping old sqlite table")
        cur.execute('DROP TABLE IF EXISTS %s;'% tableName)
        con.commit()

    # create table
    createSql, idxSqls = makeTableCreateStatement(tableName, headers, \
        intFields=intFields, idxFields=idxFields, primKey=primKey)
    logging.log(5, "creating table with %s" % createSql)
    cur.execute(createSql)
    con.commit()

    logging.info("Loading data into table")
    tp = maxCommon.ProgressMeter(len(tsvFnames))
    sql = "INSERT INTO %s (%s) VALUES (%s)" % (tableName, ", ".join(headers), ", ".join(["?"]*len(headers)))
    for tsvName in tsvFnames:
        logging.debug("Importing %s" % tsvName)
        if os.path.getsize(tsvName)==0:
            logging.debug("Skipping %s, zero size" % tsvName)
            continue
        rows = list(maxCommon.iterTsvRows(tsvName))
        logging.log(5, "Running Sql %s against %d rows" % (sql, len(rows)))
        cur.executemany(sql, rows)
        con.commit()
        tp.taskCompleted()

    logging.info("Adding indexes to table")
    for idxSql in idxSqls:
        cur.execute(idxSql)
        con.commit()
    con.close()

    if finalDbFname is not None:
        logging.info("moving over ramdisk db to %s" % finalDbFname)
        shutil.move(dbFname, finalDbFname)
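A hypothetical invocation of the loader above; the database path, table name, input files and column names are invented, but the keyword arguments follow the signature in the example.

loadTsvSqlite("articles.db", "articles",
    ["articles1.tab", "articles2.tab"],
    headers=["articleId", "pmid", "title"],
    intFields=["articleId", "pmid"],
    primKey="articleId",
    idxFields=["pmid"])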
Example 11: parseDoneIds
def parseDoneIds(fname):
    " parse all already converted identifiers (DOIs) from fname "
    doneIds = set()
    if os.path.getsize(fname) == 0:
        return doneIds

    for row in maxCommon.iterTsvRows(fname):
        doneIds.add(row.doi)
    logging.info("Found %d identifiers of already parsed files" % len(doneIds))
    return doneIds
Example 12: convertOneChunk
def convertOneChunk(inIndexFile, outFile):
    """
    get files from inIndexFile, parse Xml,
    write everything to outfile in ascii format
    """
    store = pubStore.PubWriterFile(outFile)
    i = 0
    inRows = list(maxCommon.iterTsvRows(inIndexFile))
    doi2pmid = None
    logging.info("Converting %d files" % len(inRows))
    convCount = 0
    for row in inRows:
        # read line
        i+=1
        articleId, baseDir = row.articleId, row.baseDir
        zipFilename, filename = row.zipFilename, row.filename
        articleId=int(articleId)

        # open file from zipfile
        fullZipPath = join(baseDir, zipFilename)
        zipFile = zipfile.ZipFile(fullZipPath)
        logging.debug("Parsing %s, file %s, %d files left" % (fullZipPath, filename, len(inRows)-i))
        if doi2pmid is None:
            doi2pmid = parseDoi2Pmid(baseDir)
        xmlString = zipFile.open(filename).read()
        xmlTree = pubXml.etreeFromXml(xmlString)

        # parse xml
        articleData = pubStore.createEmptyArticleDict(publisher="elsevier")
        articleData = parseElsevier(xmlTree, articleData)
        if articleData is None:
            logging.warn("Parser got no data for %s" % filename)
            continue
        articleData["origFile"]="consyn://"+zipFilename+"/"+filename
        if articleData["doi"] in doi2pmid:
            articleData["pmid"] = doi2pmid[articleData["doi"]]
        pii = splitext(basename(filename))[0]
        articleData["externalId"]="PII"+pii
        articleData["fulltextUrl"]="http://www.sciencedirect.com/science/svapps/pii/"+pii

        # convert to ascii
        asciiString, mimeType = treeToAscii_Elsevier(xmlTree)
        if asciiString is None:
            logging.warn("No ASCII for %s / %s" % (zipFilename, filename))
            continue
        store.writeArticle(articleId, articleData)

        # write to output
        fileData = createFileData(articleData, mimeType, asciiString)
        store.writeFile(articleId, (1000*(articleId))+1, fileData, externalId=articleData["externalId"])
        convCount += 1
    logging.info("Converted %d files" % convCount)
    store.close()
Example 13: convertOneChunk
def convertOneChunk(gzDir, idFname, inIndexFile, outFile):
    # for each row in index:
    store = pubStore.PubWriterFile(outFile)
    donePiis = pubGeneric.parseDoneIds(idFname)

    # log to file
    outBase = join(dirname(outFile), basename(outFile).split(".")[0])
    logFname = outBase+".log"
    pubGeneric.setupLogging(__file__, None, logFileName=logFname)

    idFname = outBase+"_ids.tab"
    logging.debug("Writing ids to %s" % idFname)
    idFh = open(idFname, "w")
    idFh.write("#articleId\texternalId\n")

    lastTsvFname = None
    tsvFile = None
    pmidFinder = pubCompare.PmidFinder()
    for row in maxCommon.iterTsvRows(inIndexFile, encoding=None):
        # open file and seek, if necessary
        if tsvFile is None or lastTsvFname!=row.tsvFile:
            logging.debug("Seeking to %s in tsvfile %s" % (row.offset, row.tsvFile))
            tsvFile = gzip.open(join(gzDir, row.tsvFile))
            tsvFile.seek(int(row.offset))
            lastTsvFname = row.tsvFile
        line = tsvFile.readline()

        if row.url.startswith("!"):
            logging.info("Ignoring %s, marked as duplicated" % row.url)
            continue

        # fields are: ["articleId", "tsvFile", "url", "offset"]
        fields = line.split("\t")
        url = fields[0]
        logging.debug("Replacing weird bing chars")
        content = fields[-1]
        assert(url==row.url)
        assert(len(content)!=0)
        url = url.decode("utf8")

        logging.debug("Converting to text")
        content = convertMicrosoft(content)
        artDict, fileDict = convertHtmlToDicts(url, content)
        if artDict is None:
            artDict, fileDict = minimalHtmlToDicts(url, content)
        if artDict is None:
            continue
        artDict["pmid"] = pmidFinder.lookupPmid(artDict)

        # write file
        articleId = int(row.articleId)
        fileId = articleId*1000
        store.writeFile(articleId, fileId, fileDict)
        store.writeArticle(articleId, artDict)
    store.close()
Example 14: parseHighwire
def parseHighwire():
    """ create two dicts
    printIssn -> url to pmidlookup-cgi of highwire
    and
    publisherName -> top-level hostnames
    >>> temps, domains = parseHighwire()
    >>> temps['0270-6474']
    u'http://www.jneurosci.org/cgi/pmidlookup?view=long&pmid=%(pmid)s'
    >>> domains["Society for Neuroscience"]
    set([u'jneurosci'])
    """
    # highwire's publisher names are not resolved ("SAGE", "SAGE Pub", etc)
    # so: first get dict printIssn -> resolved publisherName from publishers.tab
    pubFname = join(pubConf.publisherDir, "publishers.tab")
    pIssnToPub = {}
    for row in maxCommon.iterTsvRows(pubFname):
        if not row.pubName.startswith("HIGHWIRE"):
            continue
        for issn in row.journalIssns.split("|"):
            issn = issn.rstrip(" ")
            pIssnToPub[issn] = row.pubName.replace("HIGHWIRE ","").strip()

    # go over highwire table and make dict pubName -> issn -> templates
    # and dict pubName -> domains
    fname = join(pubConf.journalListDir, "highwire.tab")
    templates = {}
    domains = {}
    for row in maxCommon.iterTsvRows(fname, encoding="latin1"):
        if row.eIssn.strip()=="Unknown":
            continue
        pubName = pIssnToPub[row.pIssn.strip()].strip()
        templates.setdefault(pubName, {})
        templates[row.pIssn.strip()] = row.urls.strip()+"/cgi/pmidlookup?view=long&pmid=%(pmid)s"
        host = urlparse.urlparse(row.urls).hostname
        domain = ".".join(host.split('.')[-2:]).strip()
        domains.setdefault(pubName, set()).add(domain)
    return templates, domains
Example 15: startup
def startup(paramDict):
    global geneIds
    fname = join(dirname(__file__), "data", "wormFinder", "wormIds.tab.gz")
    geneCount = 0
    for row in maxCommon.iterTsvRows(fname):
        if row.locus!="":
            geneIds[row.locus] = row.geneId
        if row.seqId!="":
            geneIds[row.seqId] = row.geneId
        geneCount +=1
        #if row.geneId!="":
        #    geneIds[row.geneId] = row.geneId
    logging.info("Loaded %d words mapped to %d genes" % (len(geneIds), geneCount))