This article collects typical usage examples of the Python method pymodule.PassingData.jobDataLs. If you are wondering how exactly to use PassingData.jobDataLs in Python, the curated code examples below may help. You can also read more about the containing class, pymodule.PassingData.
In total, 15 code examples of the PassingData.jobDataLs method are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python examples.
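Before the examples, note how PassingData is used throughout: it behaves as a simple attribute bag, constructed from arbitrary keyword arguments and accepting new attributes (such as jobDataLs) afterwards. The minimal sketch below mimics that behavior as inferred from the examples on this page; the name PassingDataSketch is hypothetical and this is an illustration, not the actual pymodule implementation.

class PassingDataSketch(object):
    """A stand-in illustrating how PassingData is used in the examples:
    an attribute bag initialized from keyword arguments."""
    def __init__(self, **keywords):
        #every keyword argument becomes an instance attribute
        self.__dict__.update(keywords)

returnData = PassingDataSketch(no_of_jobs=0)
returnData.jobDataLs = []   #attach a job-data list afterwards, as the examples do
returnData.jobDataLs.append(PassingDataSketch(jobLs=[], file=None, fileLs=[]))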
Example 1: preReduce
# Required import: from pymodule import PassingData [as alias]
# Or: from pymodule.PassingData import jobDataLs [as alias]
def preReduce(self, workflow=None, passingData=None, transferOutput=True, **keywords):
    """
    2013.2.10
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []
    return returnData
Example 2: reduceAfterEachAlignment
# Required import: from pymodule import PassingData [as alias]
# Or: from pymodule.PassingData import jobDataLs [as alias]
def reduceAfterEachAlignment(self, workflow=None, passingData=None, mapEachChromosomeDataLs=None,
        reduceAfterEachChromosomeDataLs=None,
        transferOutput=True, **keywords):
    """
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []
    returnData.mapEachChromosomeDataLs = mapEachChromosomeDataLs
    returnData.reduceAfterEachChromosomeDataLs = reduceAfterEachChromosomeDataLs
    return returnData
Example 3: reduceEachVCF
# Required import: from pymodule import PassingData [as alias]
# Or: from pymodule.PassingData import jobDataLs [as alias]
def reduceEachVCF(self, workflow=None, chromosome=None, passingData=None, mapEachIntervalDataLs=None,
        transferOutput=True, **keywords):
    """
    2013.05.01
    #. concatenate all the sub-VCFs into one
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []
    returnData.mapEachIntervalDataLs = mapEachIntervalDataLs
    refineGenotypeJobLs = [pdata.refineGenotypeJob for pdata in mapEachIntervalDataLs]
    mergeVCFReplicateColumnsJobLs = [pdata.mergeVCFReplicateColumnsJob for pdata in mapEachIntervalDataLs]
    realInputVolume = passingData.jobData.file.noOfIndividuals * passingData.jobData.file.noOfLoci
    baseInputVolume = 200*2000000
    #base is 4X coverage in 20Mb region => 120 minutes
    walltime = self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume,
        baseInputVolume=baseInputVolume, baseJobPropertyValue=60,
        minJobPropertyValue=60, maxJobPropertyValue=500).value
    #base is 4X => 5000M
    job_max_memory = self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume,
        baseInputVolume=baseInputVolume, baseJobPropertyValue=2000,
        minJobPropertyValue=2000, maxJobPropertyValue=8000).value
    self.concatenateOverlapIntervalsIntoOneVCFSubWorkflow(passingData=passingData,
        intervalJobLs=[pdata.beagleJob for pdata in mapEachIntervalDataLs],
        outputDirJob=self.beagleReduceDirJob,
        transferOutput=True, job_max_memory=job_max_memory, walltime=walltime,
        **keywords)
    self.concatenateOverlapIntervalsIntoOneVCFSubWorkflow(passingData=passingData,
        intervalJobLs=refineGenotypeJobLs, outputDirJob=self.replicateVCFDirJob,
        transferOutput=True, job_max_memory=job_max_memory, walltime=walltime,
        **keywords)
    self.concatenateOverlapIntervalsIntoOneVCFSubWorkflow(passingData=passingData,
        intervalJobLs=mergeVCFReplicateColumnsJobLs, outputDirJob=self.reduceOutputDirJob,
        transferOutput=True, job_max_memory=job_max_memory, walltime=walltime,
        **keywords)
    for pdata in mapEachIntervalDataLs:
        #add this output to the union job
        """
        self.addInputToStatMergeJob(statMergeJob=self.reduceBeaglePhaseReplicateConcordanceJob_AllSites,
            parentJobLs=[pdata.beaglePhasedReplicateConcordanceJob])
        self.addInputToStatMergeJob(statMergeJob=self.reduceBeaglePhaseReplicateConcordanceJob_HomoOnly,
            parentJobLs=[pdata.beaglePhasedReplicateConcordanceJob])
        """
        self.addInputToStatMergeJob(statMergeJob=self.reduceTrioCallerReplicateConcordanceJob_AllSites,
            parentJobLs=[pdata.trioCallerReplicateConcordanceJob])
        self.addInputToStatMergeJob(statMergeJob=self.reduceTrioCallerReplicateConcordanceJob_HomoOnly,
            parentJobLs=[pdata.trioCallerReplicateConcordanceJob])
    return returnData
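Example 3 gauges walltime and memory through self.scaleJobWalltimeOrMemoryBasedOnInput, whose implementation is not shown on this page. Below is a minimal sketch of its likely behavior, assuming the base value scales linearly with the ratio of real to base input volume and is then clamped to the given bounds; the function name scale_job_property is hypothetical.

def scale_job_property(realInputVolume, baseInputVolume, baseJobPropertyValue,
        minJobPropertyValue, maxJobPropertyValue):
    #scale the base value by how much larger the real input is than the base input
    scaled = baseJobPropertyValue * float(realInputVolume) / baseInputVolume
    #clamp into [min, max]; the real method returns an object whose .value holds this number
    return min(max(scaled, minJobPropertyValue), maxJobPropertyValue)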
Example 4: mapEachInterval
# Required import: from pymodule import PassingData [as alias]
# Or: from pymodule.PassingData import jobDataLs [as alias]
def mapEachInterval(self, workflow=None, alignmentData=None, intervalData=None,
        VCFJobData=None, passingData=None, transferOutput=True, **keywords):
    """
    2012.9.17
    """
    if workflow is None:
        workflow = self
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []
    topOutputDirJob = passingData.topOutputDirJob
    alignment = alignmentData.alignment
    parentJobLs = alignmentData.jobLs
    bamF = alignmentData.bamF
    baiF = alignmentData.baiF
    bamFnamePrefix = passingData.bamFnamePrefix
    if intervalData.file:
        mpileupInterval = intervalData.interval
        bcftoolsInterval = intervalData.file
    else:
        mpileupInterval = intervalData.interval
        bcftoolsInterval = intervalData.interval
    intervalFileBasenameSignature = intervalData.intervalFileBasenameSignature
    overlapInterval = intervalData.overlapInterval
    overlapFileBasenameSignature = intervalData.overlapIntervalFnameSignature
    VCFFile = VCFJobData.file
    annotationName = passingData.annotationName
    outputFile = File(os.path.join(topOutputDirJob.output, '%s_%s.%s.vcf'%(bamFnamePrefix, overlapFileBasenameSignature, annotationName)))
    variantAnnotatorJob = self.addGATKVariantAnnotatorJob(workflow, executable=workflow.annotateVariantJava,
        GenomeAnalysisTKJar=workflow.GenomeAnalysisTKJar, bamFile=bamF,
        VCFFile=VCFFile, annotationName=annotationName, interval=bcftoolsInterval, outputFile=outputFile,
        refFastaFList=passingData.refFastaFList, parentJobLs=[topOutputDirJob]+parentJobLs,
        extraDependentInputLs=[baiF, VCFFile.tbi_F],
        transferOutput=False,
        extraArguments=None, job_max_memory=4000)
    outputFile = File(os.path.join(topOutputDirJob.output, '%s_%s.%s.tsv'%(bamFnamePrefix, overlapFileBasenameSignature, annotationName)))
    extractInfoJob = self.addGenericJob(workflow=workflow, executable=workflow.ExtractInfoFromVCF, inputFile=variantAnnotatorJob.output,
        inputArgumentOption="-i",
        outputFile=outputFile, outputArgumentOption="-o",
        parentJobLs=[variantAnnotatorJob], extraDependentInputLs=None, extraOutputLs=None, transferOutput=False,
        extraArguments="-k %s"%(annotationName), extraArgumentList=None, job_max_memory=2000, sshDBTunnel=None,
        key2ObjectForJob=None)
    returnData.jobDataLs.append(PassingData(jobLs=[variantAnnotatorJob, extractInfoJob], file=variantAnnotatorJob.output,
        fileLs=[variantAnnotatorJob.output, extractInfoJob.output]))
    returnData.variantAnnotatorJob = variantAnnotatorJob
    returnData.extractInfoJob = extractInfoJob
    #add the sub-alignment to the alignment merge job
    self.no_of_jobs += 2
    return returnData
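The entries that Example 4 appends to returnData.jobDataLs share a fixed shape: jobLs (the producing jobs), file (the primary output), and fileLs (all outputs). Downstream code depends on that shape; Example 5 below, for instance, reads jobData.jobLs[0]. A short consumer loop for illustration only, assuming entries shaped as in Example 4:

for jobData in returnData.jobDataLs:
    firstProducerJob = jobData.jobLs[0]  #e.g. the variantAnnotatorJob above
    primaryOutputFile = jobData.file     #the entry's primary output file
    allOutputFiles = jobData.fileLs      #every output file the entry exposes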
Example 5: linkMapToReduce
# Required import: from pymodule import PassingData [as alias]
# Or: from pymodule.PassingData import jobDataLs [as alias]
def linkMapToReduce(self, workflow=None, mapEachIntervalData=None, preReduceReturnData=None, passingData=None, transferOutput=True, **keywords):
    """
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []
    for jobData in mapEachIntervalData.jobDataLs:
        calculaJob = jobData.jobLs[0]
        self.addInputToStatMergeJob(workflow, statMergeJob=preReduceReturnData.aggregateAndHClusterDistanceMatrixJob,
            inputF=calculaJob.output,
            parentJobLs=[calculaJob])
    return returnData
Example 6: addAllJobs
# Required import: from pymodule import PassingData [as alias]
# Or: from pymodule.PassingData import jobDataLs [as alias]
def addAllJobs(self, workflow=None, db_250k=None, association_result_ls=None,
        data_dir=None, min_MAF=None,
        neighbor_distance=None, max_neighbor_distance=None,
        min_score_ls=None, min_overlap_ratio_ls=None, ground_score=None,
        peakPadding=None, tax_id=None,
        outputDirPrefix="", transferOutput=True, job_max_memory=2000, **keywords):
    """
    2013.2.27
    run ms
    estimate parameters from ms
    forward simulator with estimated ms-parameters or take the output of ms as input
    """
    if workflow is None:
        workflow = self
    sys.stderr.write("Adding jobs for pop-gen & pedigree sequence simulation #jobs=%s... \n"%(self.no_of_jobs))
    returnData = PassingData()
    returnData.jobDataLs = []
    passingData = PassingData(fileBasenamePrefix=None,
        outputDirPrefix=outputDirPrefix,
        jobData=None,
        preReduceReturnData=None,
        association_group_key2orderIndex={},
        association_group_key2resultList={},
        association_group_key2reduceAssociationPeakJobMatrix={},
        association_group_key2countAssociationLocusJobList={},
        resultID2defineLandscapeJobData={},
        )
    preReduceReturnData = self.preReduce(workflow=workflow, outputDirPrefix=outputDirPrefix,
        passingData=passingData, transferOutput=False,
        **keywords)
    mapDirJob = preReduceReturnData.mapDirJob
    plotOutputDirJob = preReduceReturnData.plotOutputDirJob
    countAssociationLocusOutputDirJob = preReduceReturnData.countAssociationLocusOutputDirJob
    reduceOutputDirJob = preReduceReturnData.reduceOutputDirJob
    passingData.preReduceReturnData = preReduceReturnData
    #add output pedigree job
    for i in xrange(self.noOfReplicates):
        popGenSimulationFolderJob = self.addMkDirJob(outputDir=os.path.join(mapDirJob.output, 'popGenSim%s'%(i)),
            parentJobLs=[mapDirJob])
        popSimulationJob = self.addPopGenSimulationJob()
#......... the rest of the code is omitted .........
Example 7: addJobs
# Required import: from pymodule import PassingData [as alias]
# Or: from pymodule.PassingData import jobDataLs [as alias]
def addJobs(self, workflow=None, inputData=None, db_vervet=None, genotypeMethodShortName=None, commit=None,
        data_dir=None, checkEmptyVCFByReading=False, transferOutput=True,
        maxContigID=None, outputDirPrefix="", needSSHDBTunnel=False):
    """
    2012.5.9
    """
    sys.stderr.write("Adding VCF2DB jobs for %s vcf files ... "%(len(inputData.jobDataLs)))
    topOutputDir = "%sVCF2DB"%(outputDirPrefix)
    topOutputDirJob = yh_pegasus.addMkDirJob(workflow, mkdir=workflow.mkdirWrap, outputDir=topOutputDir)
    firstVCFFile = inputData.jobDataLs[0].vcfFile
    logFile = File(os.path.join(topOutputDir, 'AddGenotypeMethod2DB.log'))
    addGM2DBJob = self.addAddGenotypeMethod2DBJob(executable=self.AddGenotypeMethod2DB, inputFile=firstVCFFile,
        genotypeMethodShortName=genotypeMethodShortName,
        logFile=logFile, data_dir=data_dir, commit=commit, parentJobLs=[], extraDependentInputLs=[], transferOutput=True,
        extraArguments=None, job_max_memory=10, sshDBTunnel=needSSHDBTunnel)
    updateGMlogFile = File(os.path.join(topOutputDir, 'updateGM.log'))
    updateGMNoOfLociJob = self.addUpdateGenotypeMethodNoOfLociJob(executable=self.UpdateGenotypeMethodNoOfLoci,
        genotypeMethodShortName=genotypeMethodShortName,
        logFile=updateGMlogFile, data_dir=data_dir, commit=commit, parentJobLs=[topOutputDirJob],
        extraDependentInputLs=[], transferOutput=True,
        extraArguments=None, job_max_memory=20, sshDBTunnel=needSSHDBTunnel)
    returnData = PassingData()
    returnData.jobDataLs = []
    for jobData in inputData.jobDataLs:
        inputF = jobData.vcfFile
        if maxContigID:
            contig_id = self.getContigIDFromFname(inputF.name)
            try:
                contig_id = int(contig_id)
                if contig_id > maxContigID:  #skip the small contigs
                    continue
            except:
                sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
                import traceback
                traceback.print_exc()
        logFile = File(os.path.join(topOutputDir, 'AddVCFFile2DB_%s.log'%(self.getChrFromFname(inputF.name))))
        addVCFJob = self.addAddVCFFile2DBJob(executable=self.AddVCFFile2DB, inputFile=inputF, genotypeMethodShortName=genotypeMethodShortName,
            logFile=logFile, format="VCF", data_dir=data_dir, checkEmptyVCFByReading=checkEmptyVCFByReading, commit=commit,
            parentJobLs=[addGM2DBJob]+jobData.jobLs, extraDependentInputLs=[], transferOutput=True,
            extraArguments=None, job_max_memory=1000, sshDBTunnel=needSSHDBTunnel)
        workflow.depends(parent=addVCFJob, child=updateGMNoOfLociJob)
    sys.stderr.write("%s jobs.\n"%(self.no_of_jobs))
    #include the tfam (outputList[1]) into the fileLs
    returnData.jobDataLs.append(PassingData(jobLs=[updateGMNoOfLociJob], file=updateGMlogFile,
        fileLs=[updateGMlogFile]))
    return returnData
Example 8: reduceBeforeEachAlignment
# Required import: from pymodule import PassingData [as alias]
# Or: from pymodule.PassingData import jobDataLs [as alias]
def reduceBeforeEachAlignment(self, workflow=None, passingData=None, preReduceReturnData=None, transferOutput=True, **keywords):
    """
    2012.9.17
    add a merge variant annotation job, GW plot job
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []
    outputDirPrefix = passingData.outputDirPrefix
    statOutputDirJob = preReduceReturnData.statOutputDirJob
    plotOutputDirJob = preReduceReturnData.plotOutputDirJob
    mergeOutputF = File(os.path.join(statOutputDirJob.output, '%s_%s.tsv'%(passingData.bamFnamePrefix, passingData.annotationName)))
    mergeJob = self.addStatMergeJob(workflow, statMergeProgram=workflow.mergeSameHeaderTablesIntoOne,
        outputF=mergeOutputF, transferOutput=transferOutput, parentJobLs=[statOutputDirJob],)
    returnData.jobDataLs.append(PassingData(jobLs=[mergeJob], file=mergeJob.output, fileLs=[mergeJob.output], mergeJob=mergeJob))
    self.no_of_jobs += 1
    outputFnamePrefix = os.path.join(plotOutputDirJob.output, '%s_%s_Plot'%(passingData.bamFnamePrefix, passingData.annotationName))
    #whichColumnPlotLabel and xColumnPlotLabel must not contain spaces or parentheses because they would disrupt the shell command line
    self.addPlotVCFtoolsStatJob(executable=workflow.PlotVCFtoolsStat, inputFileList=[mergeOutputF],
        outputFnamePrefix=outputFnamePrefix,
        whichColumn=None, whichColumnHeader=passingData.annotationName, whichColumnPlotLabel=passingData.annotationName,
        need_svg=False,
        logY=0, valueForNonPositiveYValue=-1,
        xColumnPlotLabel="position", chrLengthColumnHeader=None, chrColumnHeader="CHROM",
        minChrLength=None, xColumnHeader="POS", minNoOfTotal=50,
        figureDPI=100, ylim_type=2, samplingRate=0.01,
        parentJobLs=[mergeJob, plotOutputDirJob],
        extraDependentInputLs=None,
        extraArguments=None, transferOutput=True, sshDBTunnel=self.needSSHDBTunnel)
    self.no_of_jobs += 1
    outputFile = File(os.path.join(plotOutputDirJob.output, '%s_%s_Hist.png'%(passingData.bamFnamePrefix, passingData.annotationName)))
    #no spaces, parentheses, or other shell-vulnerable characters in the x or y axis labels (whichColumnPlotLabel, xColumnPlotLabel)
    self.addDrawHistogramJob(workflow=workflow, executable=workflow.DrawHistogram, inputFileList=[mergeJob.output],
        outputFile=outputFile,
        whichColumn=None, whichColumnHeader=passingData.annotationName, whichColumnPlotLabel=passingData.annotationName,
        logY=None, logCount=True, valueForNonPositiveYValue=-1,
        minNoOfTotal=10,
        figureDPI=100, samplingRate=0.1,
        parentJobLs=[plotOutputDirJob, mergeJob],
        extraDependentInputLs=None,
        extraArguments=None, transferOutput=True, job_max_memory=2000)
    self.no_of_jobs += 1
    return returnData
Example 9: preReduce
# Required import: from pymodule import PassingData [as alias]
# Or: from pymodule.PassingData import jobDataLs [as alias]
def preReduce(self, workflow=None, outputDirPrefix="", passingData=None, transferOutput=True, **keywords):
    """
    2012.9.17
    """
    parentPreReduceData = AbstractVervetWorkflow.preReduce(self, workflow=workflow, outputDirPrefix=outputDirPrefix, passingData=passingData,
        transferOutput=transferOutput, **keywords)
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []
    callOutputDir = "call"
    callOutputDirJob = self.addMkDirJob(workflow, mkdir=workflow.mkdirWrap, outputDir=callOutputDir)
    passingData.callOutputDirJob = callOutputDirJob
    matrixDir = "pairwiseDistMatrix"
    matrixDirJob = self.addMkDirJob(workflow, mkdir=workflow.mkdirWrap, outputDir=matrixDir)
    passingData.matrixDirJob = matrixDirJob
    reduceOutputDirJob = passingData.reduceOutputDirJob
    #2012.10.9 reduceOutputDirJob was added to passingData during AbstractVCFWorkflow.preReduce()
    #reduceOutputDir = "aggregateData"
    #reduceOutputDirJob = yh_pegasus.addMkDirJob(workflow, mkdir=workflow.mkdirWrap, outputDir=reduceOutputDir)
    #passingData.reduceOutputDirJob = reduceOutputDirJob
    figureFnamePrefix = os.path.join(reduceOutputDirJob.output, 'aggregateDistanceMatrix')
    aggregateDistanceMatrixOutputF = File('%s.tsv'%(figureFnamePrefix))
    PCAFile = File('%s_PCA.tsv'%(figureFnamePrefix))
    aggregateAndHClusterDistanceMatrixJob = self.addStatMergeJob(workflow, statMergeProgram=workflow.AggregateAndHClusterDistanceMatrix,
        outputF=aggregateDistanceMatrixOutputF,
        parentJobLs=[reduceOutputDirJob], extraOutputLs=[PCAFile, File('%s.png'%(figureFnamePrefix)),
            File('%s.svg'%(figureFnamePrefix))],
        extraDependentInputLs=[], transferOutput=True, extraArguments="-f %s"%(figureFnamePrefix))
    returnData.aggregateAndHClusterDistanceMatrixJob = aggregateAndHClusterDistanceMatrixJob
    #2012.9.5 add the job to append meta info (country, sex, latitude, etc. of each monkey)
    outputF = File('%s_withMetaInfo.tsv'%(figureFnamePrefix))
    appendInfo2PCAOutputJob = self.addGenericDBJob(executable=self.AppendInfo2SmartPCAOutput, inputFile=PCAFile,
        outputFile=outputF,
        parentJobLs=[aggregateAndHClusterDistanceMatrixJob], extraDependentInputLs=None,
        extraOutputLs=None,
        transferOutput=True,
        extraArgumentList=None, extraArguments=None, sshDBTunnel=self.needSSHDBTunnel,
        key2ObjectForJob=None, job_max_memory=2000)
    return returnData
Example 10: preReduce
# Required import: from pymodule import PassingData [as alias]
# Or: from pymodule.PassingData import jobDataLs [as alias]
def preReduce(self, workflow=None, passingData=None, transferOutput=True, **keywords):
    """
    2012.9.17
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []
    outputDirPrefix = passingData.outputDirPrefix
    #pass it along
    passingData.annotationName = 'HaplotypeScore'
    statOutputDir = "%sstat"%(outputDirPrefix)
    statOutputDirJob = self.addMkDirJob(outputDir=statOutputDir)
    plotOutputDir = "%splot"%(outputDirPrefix)
    plotOutputDirJob = self.addMkDirJob(outputDir=plotOutputDir)
    self.no_of_jobs += 2
    returnData.plotOutputDirJob = plotOutputDirJob
    returnData.statOutputDirJob = statOutputDirJob
    return returnData
Example 11: mapEachInterval
# Required import: from pymodule import PassingData [as alias]
# Or: from pymodule.PassingData import jobDataLs [as alias]
def mapEachInterval(self, workflow=None,
        VCFJobData=None, passingData=None, transferOutput=False, **keywords):
    """
    2012.9.22
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []
    topOutputDirJob = passingData.topOutputDirJob
    intervalFileBasenamePrefix = passingData.intervalFileBasenamePrefix
    jobData = passingData.jobData
    callOutputDirJob = passingData.callOutputDirJob
    splitVCFJob = passingData.mapEachVCFData.splitVCFJob
    genotypeCallOutputFname = os.path.join(callOutputDirJob.output, '%s.call'%(intervalFileBasenamePrefix))
    genotypeCallOutput = File(genotypeCallOutputFname)
    genotypeCallByCoverage_job = self.addVCF2MatrixJob(workflow, executable=self.GenotypeCallByCoverage,
        inputVCF=VCFJobData.file, outputFile=genotypeCallOutput,
        refFastaF=None, run_type=3, numberOfReadGroups=10, minDepth=self.minDepth,
        parentJobLs=[callOutputDirJob, splitVCFJob]+jobData.jobLs, extraDependentInputLs=[], transferOutput=False,
        extraArguments=None, job_max_memory=2000)
    matrixDirJob = passingData.matrixDirJob
    calculaOutputFname = os.path.join(matrixDirJob.output, '%s.pairwiseDist.convertHetero2NA%s.minMAF%.2f.maxNA%.2f.tsv'%(intervalFileBasenamePrefix,
        self.convertHetero2NA, self.min_MAF, self.max_NA_rate))
    calculaOutput = File(calculaOutputFname)
    calculaJob = self.addCalculatePairwiseDistanceFromSNPXStrainMatrixJob(workflow,
        executable=self.CalculatePairwiseDistanceOutOfSNPXStrainMatrix,
        inputFile=genotypeCallOutput, outputFile=calculaOutput,
        min_MAF=self.min_MAF, max_NA_rate=self.max_NA_rate, convertHetero2NA=self.convertHetero2NA,
        hetHalfMatchDistance=self.hetHalfMatchDistance,
        parentJobLs=[genotypeCallByCoverage_job, matrixDirJob], extraDependentInputLs=[], transferOutput=False,
        extraArguments=None, job_max_memory=2000)
    returnData.jobDataLs.append(PassingData(jobLs=[calculaJob], file=calculaJob.output,
        fileLs=[calculaJob.output]))
    returnData.calculaJob = calculaJob
    return returnData
Example 12: reduceEachChromosome
# Required import: from pymodule import PassingData [as alias]
# Or: from pymodule.PassingData import jobDataLs [as alias]
def reduceEachChromosome(self, workflow=None, chromosome=None, passingData=None, mapEachVCFDataLs=None,
        reduceEachVCFDataLs=None,
        transferOutput=True,
        **keywords):
    """
    2012.10.3
    #. first merge all VCF-level reduce-job output from one chromosome (passingData.reduceEachVCFDataLs) into one file,
    #  taking the input jobs of each reduceEachVCFData as input of this per-chromosome reduce job
    #. don't use passingData.mapEachVCFDataLsLs because it's empty
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []
    topOutputDirJob = passingData.topOutputDirJob
    reduceOutputDirJob = passingData.reduceOutputDirJob
    chromosome = passingData.chromosome
    fnamePrefix = os.path.join(reduceOutputDirJob.output, '%s_frequency_juxtapose'%(chromosome))
    outputFile = File('%s.tsv'%(fnamePrefix))
    reduceEachChromosomeJob = self.addStatMergeJob(workflow,
        statMergeProgram=self.mergeSameHeaderTablesIntoOne,
        outputF=outputFile,
        parentJobLs=[reduceOutputDirJob], extraOutputLs=[],
        extraDependentInputLs=[], transferOutput=transferOutput,)
    #2012.10.7 don't add it to returnData.jobDataLs unless it needs to be gzipped and transferred out
    #returnData.jobDataLs.append(PassingData(jobLs=[reduceEachChromosomeJob], file=reduceEachChromosomeJob.output,
    #    fileLs=[reduceEachChromosomeJob.output]))
    returnData.reduceEachChromosomeJob = reduceEachChromosomeJob
    for reduceEachVCFData in reduceEachVCFDataLs:
        for mapEachIntervalData in reduceEachVCFData.mapEachIntervalDataLs:
            juxtaposeAFJob = mapEachIntervalData.juxtaposeAFJob
            self.addInputToStatMergeJob(workflow, statMergeJob=reduceEachChromosomeJob,
                parentJobLs=[juxtaposeAFJob])
    return returnData
Developer: mjmontague, Project: vervet-web, Lines: 38, Source: CompareAlleleFrequencyOfTwoPopulationFromOneVCFFolder.py
Example 13: reduce
# Required import: from pymodule import PassingData [as alias]
# Or: from pymodule.PassingData import jobDataLs [as alias]
def reduce(self, workflow=None, passingData=None, reduceEachChromosomeDataLs=None,
        transferOutput=True, **keywords):
    """
    2012.10.3
    #. reduce all previous jobs (passingData.reduceEachChromosomeDataLs) into one final output
    #. merge all the output and run Draw2DHistogramOfMatrix.py
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []
    reduceOutputDirJob = passingData.reduceOutputDirJob
    fnamePrefix = os.path.join(reduceOutputDirJob.output, 'frequency_juxtapose_%s_vs_%s'%(self.pop1Header, self.pop2Header))
    outputFile = File('%s.tsv'%(fnamePrefix))
    reduceJob = self.addStatMergeJob(workflow,
        statMergeProgram=self.MergeSameHeaderTablesIntoOne,
        outputF=outputFile,
        parentJobLs=[reduceOutputDirJob], extraOutputLs=[],
        extraDependentInputLs=[], transferOutput=transferOutput,)
    returnData.jobDataLs.append(PassingData(jobLs=[reduceJob], file=reduceJob.output,
        fileLs=[reduceJob.output]))
    for reduceEachChromosomeData in reduceEachChromosomeDataLs:
        parentJob = reduceEachChromosomeData.reduceEachChromosomeJob
        self.addInputToStatMergeJob(workflow, statMergeJob=reduceJob,
            parentJobLs=[parentJob])
    #add a Draw2DHistogramOfMatrix.py job
    outputFile = File('%s.png'%(fnamePrefix))
    drawJob = self.addDraw2DHistogramOfMatrixJob(workflow=workflow, executable=self.Draw2DHistogramOfMatrix,
        inputFileList=None, inputFile=reduceJob.output, outputFile=outputFile,
        outputFnamePrefix=None, whichColumn=None, whichColumnHeader=self.pop1Header, whichColumnPlotLabel=None,
        valueForNonPositiveYValue=-1,
        missingDataNotation='NA',
        xColumnHeader=self.pop2Header, xColumnPlotLabel=None,
        minNoOfTotal=100,
        figureDPI=300, formatString='.', samplingRate=1, need_svg=False,
        zColumnHeader=None, logX=False, logY=False, logZ=False,
        parentJobLs=[reduceJob],
        extraDependentInputLs=None,
        extraArgumentList=None, extraArguments=None, transferOutput=True, job_max_memory=2000)
    returnData.drawJob = drawJob
    #2012.10.15 add an EstimateOutliersIn2DData job
    extraArgumentList = ['--minAbsDelta %s'%(self.minAbsDelta)]
    outputFile = File('%s_outlierStat_minAbsDelta%s.tsv'%(fnamePrefix, self.minAbsDelta))
    estimateOutlierJob = self.addAbstractPlotJob(workflow=workflow, executable=self.EstimateOutliersIn2DData,
        inputFileList=None, inputFile=reduceJob.output, outputFile=outputFile,
        outputFnamePrefix=None, whichColumn=None, whichColumnHeader=self.pop1Header, whichColumnPlotLabel=None,
        logY=False, valueForNonPositiveYValue=-1,
        missingDataNotation='NA',
        xColumnHeader=self.pop2Header, xColumnPlotLabel=None,
        minNoOfTotal=0,
        samplingRate=1,
        parentJob=reduceJob, parentJobLs=None,
        extraDependentInputLs=None,
        extraArgumentList=extraArgumentList, extraArguments=None, transferOutput=transferOutput, job_max_memory=2000)
    returnData.jobDataLs.append(PassingData(jobLs=[estimateOutlierJob], file=estimateOutlierJob.output,
        fileLs=[estimateOutlierJob.output]))
    returnData.estimateOutlierJob = estimateOutlierJob
    return returnData
Developer: mjmontague, Project: vervet-web, Lines: 66, Source: CompareAlleleFrequencyOfTwoPopulationFromOneVCFFolder.py
Example 14: mapEachInterval
# Required import: from pymodule import PassingData [as alias]
# Or: from pymodule.PassingData import jobDataLs [as alias]
def mapEachInterval(self, workflow=None, VCFJobData=None, chromosome=None, intervalData=None,
        mapEachChromosomeData=None, passingData=None, transferOutput=False,
        **keywords):
    """
    2012.9.22
    argument VCFJobData looks like PassingData(file=splitVCFFile, vcfFile=splitVCFFile, fileLs=[splitVCFFile],
        job=splitVCFJob, jobLs=[splitVCFJob], tbi_F=None)
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []
    #no-op attribute accesses; they only document what passingData carries at this point
    passingData.intervalFileBasenamePrefix
    passingData.splitVCFFile
    passingData.unitNumber
    """
    passingData.fileBasenamePrefix
    ## 2013.06.19 structures available from passingData, specific to the interval
    passingData.splitVCFFile = splitVCFFile
    #VCFJobData.file is same as passingData.splitVCFFile
    passingData.unitNumber = unitNumber
    passingData.intervalFileBasenamePrefix = '%s_%s_splitVCF_u%s'%(chromosome, commonPrefix, unitNumber)
    passingData.noOfIndividuals = jobData.file.noOfIndividuals
    passingData.span = self.intervalSize + self.intervalOverlapSize*2 #2013.06.19 for memory/walltime gauging
    """
    #2013.12.04 VCF combine
    #a job that combines VCFs horizontally
    #2011-9-22 union of all GATK intervals for one contig
    combineCallOutputFname = os.path.join(self.mapDirJob.output, '%s.vcf.gz'%passingData.intervalFileBasenamePrefix)
    combineCallOutputF = File(combineCallOutputFname)
    returnData.combineCallJob = self.addGATKCombineVariantsJob(executable=self.CombineVariantsJavaInReduce,
        refFastaFList=self.registerReferenceData.refFastaFList,
        outputFile=combineCallOutputF,
        genotypeMergeOptions='UNSORTED',
        parentJobLs=[self.mapDirJob],
        extraArguments=None, extraArgumentList=None, extraDependentInputLs=None,
        transferOutput=False, job_max_memory=10000, walltime=180)
    #add a job to calculate the missing fraction per locus in medium- and higher-coverage individuals
    outputFile = File(os.path.join(self.reduceStatDirJob.output, '%s_missingFractionPerLocus.tsv.gz'%(passingData.intervalFileBasenamePrefix)))
    missingFractionPerLocusJob = self.addStatMergeJob(statMergeProgram=self.ReduceMatrixBySumSameKeyColsAndThenDivide,
        outputF=outputFile, extraArgumentList=["--keyColumnLs 1", "--valueColumnLs 5", "--operatorType 3",
            "--fixedValueDenominator %s"%(self.noOfMediumCoverageIndividuals)],
        parentJobLs=[self.reduceStatDirJob], extraOutputLs=None,
        extraDependentInputLs=None, transferOutput=False)
    returnData.markGenotypeMissingJobLs = []
    #for each sample from the genotype method
    for alignmentData in self.alignmentDataLs:
        if not alignmentData.newAlignment:
            continue
        individual_alignment = alignmentData.alignment
        sampleID = individual_alignment.read_group
        #select that sample into its own VCF
        outputFile = File(os.path.join(self.mapDirJob.output, '%s_sample_%s.vcf.gz'%(passingData.intervalFileBasenamePrefix, sampleID)))
        selectSampleJob = self.addSelectVariantsJob(
            inputF=passingData.splitVCFFile, outputF=outputFile,
            interval=None,
            refFastaFList=self.registerReferenceData.refFastaFList, sampleIDKeepFile=None,
            snpIDKeepFile=None, sampleIDExcludeFile=None,
            parentJobLs=[self.mapDirJob] + VCFJobData.jobLs, extraDependentInputLs=None, transferOutput=False,
            extraArguments="--sample_name %s"%(alignmentData.alignment.read_group),
            extraArgumentList=None, job_max_memory=2000, walltime=None)
        #mark genotypes missing: given a single-sample VCF file, an alignment file, and the median depth,
        #output two files, one VCF and one missing-genotype stat file
        genotypeMissingStatFile = File(os.path.join(self.mapDirJob.output, '%s_sample_%s_genotypeMissingStat.tsv.gz'%(passingData.intervalFileBasenamePrefix,
            sampleID)))
        outputFile = File(os.path.join(self.mapDirJob.output, '%s_sample_%s_markedMissing.vcf.gz'%(passingData.intervalFileBasenamePrefix,
            sampleID)))
        alignmentFile = alignmentData.newAlignment.file
        if individual_alignment.individual_sequence.individual.target_coverage >= self.minMediumCoverageThreshold:
            alignmentDepthFold = 2
        else:
            alignmentDepthFold = 10
        markGenotypeMissingJob = self.addGenericJob(executable=self.MarkGenotypeMissingByAlignmentQuality,
            inputFile=selectSampleJob.output, inputArgumentOption="-i",
            outputFile=outputFile, outputArgumentOption="-o",
            parentJob=None, parentJobLs=[self.mapDirJob, selectSampleJob],
            extraDependentInputLs=alignmentData.newAlignment.fileLs, extraOutputLs=[genotypeMissingStatFile],
            extraArgumentList=["--alignmentFilename", alignmentFile, "--missingStatFname", genotypeMissingStatFile,
                "--alignmentMedianDepth %s"%(alignmentData.newAlignment.median_depth),
                "--alignmentDepthFold %s"%(alignmentDepthFold),
                "--minMapQGoodRead 2", "--minFractionOfGoodRead 0.9",
                "--sampleID %s"%(sampleID)],
            transferOutput=False, sshDBTunnel=None,
            key2ObjectForJob=None, objectWithDBArguments=None, objectWithDBGenomeArguments=None,
            no_of_cpus=None, job_max_memory=2000, walltime=180,
            max_walltime=None)
        markGenotypeMissingJob.genotypeMissingStatFile = genotypeMissingStatFile
        returnData.markGenotypeMissingJobLs.append(markGenotypeMissingJob)
        if individual_alignment.individual_sequence.individual.target_coverage >= self.minMediumCoverageThreshold:
            #missing fraction only from medium- or high-coverage individuals
            self.addInputToStatMergeJob(statMergeJob=missingFractionPerLocusJob, parentJobLs=[markGenotypeMissingJob],
                inputF=markGenotypeMissingJob.genotypeMissingStatFile)
#......... the rest of the code is omitted .........
Example 15: addTrioCallerJobsONVCFFiles
# Required import: from pymodule import PassingData [as alias]
# Or: from pymodule.PassingData import jobDataLs [as alias]
def addTrioCallerJobsONVCFFiles(self, workflow=None, alignmentLs=None, inputData=None, samtools=None,
        genotyperJava=None, SelectVariantsJava=None, GenomeAnalysisTKJar=None,
        addOrReplaceReadGroupsJava=None, AddOrReplaceReadGroupsJar=None,
        CreateSequenceDictionaryJava=None, CreateSequenceDictionaryJar=None,
        MergeSamFilesJar=None,
        BuildBamIndexFilesJava=None, BuildBamIndexJar=None,
        mv=None, CallVariantBySamtools=None,
        trioCallerPath=None, trioCallerWrapper=None,
        replicateIndividualTag="copy", treatEveryOneIndependent=False,
        bgzip_tabix=None, vcf_convert=None, vcf_isec=None, vcf_concat=None,
        concatGATK=None, concatSamtools=None, ligateVcf=None, ligateVcfExecutableFile=None,
        registerReferenceData=None,
        namespace='workflow', version="1.0", site_handler=None, input_site_handler=None,
        needFastaIndexJob=False, needFastaDictJob=False,
        intervalSize=2000000, intervalOverlapSize=100000, site_type=1, data_dir=None, no_of_gatk_threads=1,
        outputDirPrefix="",
        maxSNPMissingRate=None, alnStatForFilterF=None, onlyKeepBiAllelicSNP=True,
        cumulativeMedianDepth=5000, job_max_memory=2000, vcf_job_max_memory=1000,
        run_type=2, transferOutput=True, **keywords):
    """
    2012.12.5 added argument run_type (same as self.run_type). 2: TrioCaller; 3: polymutt
    2012.8.15
    """
    sys.stderr.write("Adding trioCaller jobs for %s vcf files ..."%(len(inputData.jobDataLs)))
    if workflow is None:
        workflow = self
    refFastaFList = registerReferenceData.refFastaFList
    refFastaF = refFastaFList[0]
    if needFastaDictJob or registerReferenceData.needPicardFastaDictJob:
        fastaDictJob = self.addRefFastaDictJob(workflow, CreateSequenceDictionaryJava=CreateSequenceDictionaryJava,
            refFastaF=refFastaF)
        refFastaDictF = fastaDictJob.refFastaDictF
    else:
        fastaDictJob = None
        refFastaDictF = registerReferenceData.refPicardFastaDictF
    if needFastaIndexJob or registerReferenceData.needSAMtoolsFastaIndexJob:
        fastaIndexJob = self.addRefFastaFaiIndexJob(workflow, samtools=samtools, refFastaF=refFastaF)
        refFastaIndexF = fastaIndexJob.refFastaIndexF
    else:
        fastaIndexJob = None
        refFastaIndexF = registerReferenceData.refSAMtoolsFastaIndexF
    trioCallerOutputDir = "%sRefinedCalls"%(outputDirPrefix)
    trioCallerOutputDirJob = self.addMkDirJob(outputDir=trioCallerOutputDir)
    round1CallDir = "%sPreRefinedCalls"%(outputDirPrefix)
    round1CallDirJob = self.addMkDirJob(outputDir=round1CallDir)
    outputPedigreeJob = None
    #add merge jobs for every reference
    returnData = PassingData()
    returnData.jobDataLs = []
    for i in xrange(len(inputData.jobDataLs)):
        jobData = inputData.jobDataLs[i]
        inputF = jobData.vcfFile
        inputFBaseName = os.path.basename(inputF.name)
        chr_id = self.getChrFromFname(inputFBaseName)
        commonPrefix = inputFBaseName.split('.')[0]
        overlapInterval = chr_id
        #split VCF job
        outputFnamePrefix = os.path.join(round1CallDirJob.folder, '%s_splitVCF'%commonPrefix)
        splitVCFJob = self.addSplitVCFFileJob(executable=self.SplitVCFFile, inputFile=inputF, outputFnamePrefix=outputFnamePrefix,
            noOfOverlappingSites=intervalOverlapSize, noOfSitesPerUnit=intervalSize, noOfTotalSites=inputF.noOfLoci,
            parentJobLs=jobData.jobLs+[round1CallDirJob],
            extraDependentInputLs=[jobData.tbi_F],
            extraArguments=None, transferOutput=False, job_max_memory=job_max_memory)
        #ligate-vcf job (joins the different segments of a chromosome into one chromosome)
        concatTrioCallerOutputFname = os.path.join(trioCallerOutputDirJob.folder, '%s.vcf'%chr_id)
        concatTrioCallerOutputF = File(concatTrioCallerOutputFname)
        trioCallerWholeContigConcatJob = self.addLigateVcfJob(executable=ligateVcf, ligateVcfExecutableFile=ligateVcfExecutableFile,
            outputFile=concatTrioCallerOutputF,
            parentJobLs=[trioCallerOutputDirJob], extraDependentInputLs=[], transferOutput=False,
            extraArguments=None, job_max_memory=vcf_job_max_memory)
        #bgzip and tabix the TrioCaller output
        bgzip_concatTrioCallerOutputF = File("%s.gz"%concatTrioCallerOutputFname)
        bgzip_tabix_concatTrioCallerOutput_job = self.addBGZIP_tabix_Job(workflow, bgzip_tabix=bgzip_tabix,
            parentJob=trioCallerWholeContigConcatJob, inputF=concatTrioCallerOutputF, outputF=bgzip_concatTrioCallerOutputF,
            transferOutput=transferOutput)
        returnData.jobDataLs.append(PassingData(vcfFile=bgzip_concatTrioCallerOutputF, jobLs=[bgzip_tabix_concatTrioCallerOutput_job]))
        #self.addRefFastaJobDependency(workflow, wholeRefUnionOfIntersectionJob, refFastaF=refFastaF, fastaDictJob=fastaDictJob,
        #    refFastaDictF=refFastaDictF, fastaIndexJob=fastaIndexJob, refFastaIndexF=refFastaIndexF)
        noOfUnits = max(1, utils.getNoOfUnitsNeededToCoverN(N=inputF.noOfLoci, s=intervalSize, o=intervalOverlapSize)-1)
        for unitNumber in xrange(1, noOfUnits+1):
            splitVCFFile = getattr(splitVCFJob, 'unit%sFile'%(unitNumber))
            #2012.4.2
            tranferIntermediateFilesForDebug = False
            overlapIntervalFnameSignature = '%s_%s'%(commonPrefix, unitNumber)
            #selectVariants would generate AC and AF so that TrioCaller could read them (samtools uses 'AC1' instead of AC, 'AF1' instead of AF)
            round1_VCF4OutputFname = os.path.join(round1CallDirJob.folder, '%s.niceformat.vcf'%overlapIntervalFnameSignature)
            round1_VCF4OutputF = File(round1_VCF4OutputFname)
            round1_vcf_convert_job = self.addSelectVariantsJob(SelectVariantsJava=SelectVariantsJava,
#......... the rest of the code is omitted .........
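Example 15 relies on utils.getNoOfUnitsNeededToCoverN(N=..., s=intervalSize, o=intervalOverlapSize) to decide how many overlapping units a VCF is split into. That utility is not shown on this page; the sketch below is a plausible reconstruction, under the assumption that the first unit covers s sites and each subsequent unit advances by s - o sites because of the overlap.

import math

def no_of_units_needed_to_cover_n(N, s, o):
    #hypothetical reconstruction of utils.getNoOfUnitsNeededToCoverN
    if N <= s:
        return 1
    #after the first unit, each additional unit covers (s - o) new sites
    return 1 + int(math.ceil((N - s) / float(s - o)))

Example 15 then takes max(1, noOfUnits - 1) and fetches each unit's file with getattr(splitVCFJob, 'unit%sFile'%(unitNumber)).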