

Python PassingData.jobDataLs Method Code Examples

This article collects representative usage examples of the Python method pymodule.PassingData.jobDataLs, gathered from open-source projects. If you are unsure what PassingData.jobDataLs does, or how and where to use it, the curated examples below should help. You can also explore other usage examples of the containing class, pymodule.PassingData.


The 15 code examples of the PassingData.jobDataLs method below are sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system surface better Python code examples.
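All 15 examples follow the same pattern: a PassingData object serves as the return container, and jobDataLs is attached to it as a plain list carrying per-job records (job handles plus their output files) between workflow stages. The sketch below illustrates that pattern with a minimal stand-in class; it mimics, but is not, the real pymodule.PassingData:

class PassingData(object):
	"""Minimal stand-in for pymodule.PassingData: an attribute bag whose
	constructor turns keyword arguments into instance attributes."""
	def __init__(self, **keywords):
		for name, value in keywords.items():
			setattr(self, name, value)

returnData = PassingData(no_of_jobs=0)	#attributes can be set at construction ...
returnData.jobDataLs = []	#... or attached afterwards, as every example below does
#each element is itself a PassingData bundling a job with its output files
returnData.jobDataLs.append(PassingData(jobLs=[], file=None, fileLs=[]))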

Example 1: preReduce

# Required import: from pymodule import PassingData [as alias]
# Or: from pymodule.PassingData import jobDataLs [as alias]
	def preReduce(self, workflow=None, passingData=None, transferOutput=True, **keywords):
		"""
		2013.2.10
		"""
		returnData = PassingData(no_of_jobs = 0)
		returnData.jobDataLs = []
		
		return returnData
Developer: mjmontague, Project: vervet-web, Lines: 10, Source: PSMCOnAlignmentWorkflow.py

Example 2: reduceAfterEachAlignment

# Required import: from pymodule import PassingData [as alias]
# Or: from pymodule.PassingData import jobDataLs [as alias]
	def reduceAfterEachAlignment(self, workflow=None, passingData=None, mapEachChromosomeDataLs=None,\
								reduceAfterEachChromosomeDataLs=None,\
								transferOutput=True, **keywords):
		"""
		"""
		returnData = PassingData(no_of_jobs = 0)
		returnData.jobDataLs = []
		returnData.mapEachChromosomeDataLs = mapEachChromosomeDataLs
		returnData.reduceAfterEachChromosomeDataLs = reduceAfterEachChromosomeDataLs
		return returnData
Developer: mjmontague, Project: vervet-web, Lines: 12, Source: InspectAlignmentPipeline.py

Example 3: reduceEachVCF

# Required import: from pymodule import PassingData [as alias]
# Or: from pymodule.PassingData import jobDataLs [as alias]
	def reduceEachVCF(self, workflow=None, chromosome=None, passingData=None, mapEachIntervalDataLs=None,\
					transferOutput=True, **keywords):
		"""
		2013.05.01
			#. concatenate all the sub-VCFs into one
		"""
		returnData = PassingData(no_of_jobs = 0)
		returnData.jobDataLs = []
		returnData.mapEachIntervalDataLs = mapEachIntervalDataLs
		
		refineGenotypeJobLs = [pdata.refineGenotypeJob for pdata in mapEachIntervalDataLs]
		mergeVCFReplicateColumnsJobLs = [pdata.mergeVCFReplicateColumnsJob for pdata in mapEachIntervalDataLs]
		
		
		realInputVolume = passingData.jobData.file.noOfIndividuals * passingData.jobData.file.noOfLoci
		baseInputVolume = 200*2000000
		#base is 4X coverage in 20Mb region => 120 minutes
		walltime = self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume, \
							baseInputVolume=baseInputVolume, baseJobPropertyValue=60, \
							minJobPropertyValue=60, maxJobPropertyValue=500).value
		#base is 4X, => 5000M
		job_max_memory = self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume, \
							baseInputVolume=baseInputVolume, baseJobPropertyValue=2000, \
							minJobPropertyValue=2000, maxJobPropertyValue=8000).value
		self.concatenateOverlapIntervalsIntoOneVCFSubWorkflow(passingData=passingData, \
						intervalJobLs=[pdata.beagleJob for pdata in mapEachIntervalDataLs],\
						outputDirJob=self.beagleReduceDirJob, \
						transferOutput=True, job_max_memory=job_max_memory, walltime=walltime,\
						**keywords)
		
		self.concatenateOverlapIntervalsIntoOneVCFSubWorkflow(passingData=passingData, \
						intervalJobLs=refineGenotypeJobLs, outputDirJob=self.replicateVCFDirJob, \
						transferOutput=True, job_max_memory=job_max_memory, walltime=walltime, \
						**keywords)
		
		self.concatenateOverlapIntervalsIntoOneVCFSubWorkflow(passingData=passingData, \
						intervalJobLs=mergeVCFReplicateColumnsJobLs, outputDirJob=self.reduceOutputDirJob, \
						transferOutput=True, job_max_memory=job_max_memory, walltime=walltime,\
						**keywords)
		
		
		for pdata in mapEachIntervalDataLs:
			#add this output to the union job
			"""
			self.addInputToStatMergeJob(statMergeJob=self.reduceBeaglePhaseReplicateConcordanceJob_AllSites, \
							parentJobLs=[pdata.beaglePhasedReplicateConcordanceJob])
			self.addInputToStatMergeJob(statMergeJob=self.reduceBeaglePhaseReplicateConcordanceJob_HomoOnly, \
							parentJobLs=[pdata.beaglePhasedReplicateConcordanceJob])
			"""
			self.addInputToStatMergeJob(statMergeJob=self.reduceTrioCallerReplicateConcordanceJob_AllSites, \
							parentJobLs=[pdata.trioCallerReplicateConcordanceJob])
			self.addInputToStatMergeJob(statMergeJob=self.reduceTrioCallerReplicateConcordanceJob_HomoOnly, \
							parentJobLs=[pdata.trioCallerReplicateConcordanceJob])
		
		return returnData
Developer: mjmontague, Project: vervet-web, Lines: 57, Source: BeagleAndTrioCallerOnVCFWorkflow.py
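Example 3 above relies on self.scaleJobWalltimeOrMemoryBasedOnInput() to grow walltime and memory requests with input size (number of individuals × number of loci). That method lives in the pymodule workflow base classes and is not shown on this page; the sketch below is only a plausible reconstruction of the contract its call sites imply — linear scaling against a baseline input volume, clamped to a [min, max] range, with the result read through a .value attribute:

#hypothetical reconstruction, not the real pymodule implementation;
#reuses the PassingData stand-in from the sketch near the top of this page
def scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=None, baseInputVolume=None, \
			baseJobPropertyValue=None, minJobPropertyValue=None, maxJobPropertyValue=None):
	scaled = float(realInputVolume)/baseInputVolume * baseJobPropertyValue
	clamped = min(max(scaled, minJobPropertyValue), maxJobPropertyValue)
	return PassingData(value=int(clamped))

#mirroring Example 3's walltime call: a baseline-sized input stays at the 60-minute floor
walltime = scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=200*2000000, \
			baseInputVolume=200*2000000, baseJobPropertyValue=60, \
			minJobPropertyValue=60, maxJobPropertyValue=500).value	#-> 60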

Example 4: mapEachInterval

# Required import: from pymodule import PassingData [as alias]
# Or: from pymodule.PassingData import jobDataLs [as alias]
	def mapEachInterval(self, workflow=None, alignmentData=None, intervalData=None,\
			VCFJobData=None, passingData=None, transferOutput=True, **keywords):
		"""
		2012.9.17
		"""
		if workflow is None:
			workflow = self
		returnData = PassingData(no_of_jobs = 0)
		returnData.jobDataLs = []
		
		topOutputDirJob = passingData.topOutputDirJob
		
		alignment = alignmentData.alignment
		parentJobLs = alignmentData.jobLs
		bamF = alignmentData.bamF
		baiF = alignmentData.baiF
		bamFnamePrefix = passingData.bamFnamePrefix
		
		
		if intervalData.file:
			mpileupInterval = intervalData.interval
			bcftoolsInterval = intervalData.file
		else:
			mpileupInterval = intervalData.interval
			bcftoolsInterval = intervalData.interval
		intervalFileBasenameSignature = intervalData.intervalFileBasenameSignature
		overlapInterval = intervalData.overlapInterval
		overlapFileBasenameSignature = intervalData.overlapIntervalFnameSignature
		VCFFile = VCFJobData.file
		annotationName = passingData.annotationName
		outputFile = File(os.path.join(topOutputDirJob.output, '%s_%s.%s.vcf'%(bamFnamePrefix, overlapFileBasenameSignature, annotationName)))
		variantAnnotatorJob = self.addGATKVariantAnnotatorJob(workflow, executable=workflow.annotateVariantJava, \
								GenomeAnalysisTKJar=workflow.GenomeAnalysisTKJar, bamFile=bamF, \
								VCFFile=VCFFile, annotationName=annotationName, interval=bcftoolsInterval, outputFile=outputFile, \
								refFastaFList=passingData.refFastaFList, parentJobLs=[topOutputDirJob]+parentJobLs, 
								extraDependentInputLs=[baiF, VCFFile.tbi_F], \
								transferOutput=False, \
								extraArguments=None, job_max_memory=4000)
		
		outputFile = File(os.path.join(topOutputDirJob.output, '%s_%s.%s.tsv'%(bamFnamePrefix, overlapFileBasenameSignature, annotationName)))
		extractInfoJob = self.addGenericJob(workflow=workflow, executable=workflow.ExtractInfoFromVCF, inputFile=variantAnnotatorJob.output, \
						inputArgumentOption="-i", \
						outputFile=outputFile, outputArgumentOption="-o", \
						parentJobLs=[variantAnnotatorJob], extraDependentInputLs=None, extraOutputLs=None, transferOutput=False, \
						extraArguments="-k %s"%(annotationName), extraArgumentList=None, job_max_memory=2000,  sshDBTunnel=None, \
						key2ObjectForJob=None)
		
		returnData.jobDataLs.append(PassingData(jobLs=[variantAnnotatorJob, extractInfoJob], file=variantAnnotatorJob.output, \
											fileLs=[variantAnnotatorJob.output, extractInfoJob.output]))
		returnData.variantAnnotatorJob=variantAnnotatorJob
		returnData.extractInfoJob=extractInfoJob
		#add the sub-alignment to the alignment merge job
		self.no_of_jobs += 2
		return returnData
Developer: mjmontague, Project: vervet-web, Lines: 56, Source: HaplotypeScoreWorkflow.py

Example 5: linkMapToReduce

# Required import: from pymodule import PassingData [as alias]
# Or: from pymodule.PassingData import jobDataLs [as alias]
	def linkMapToReduce(self, workflow=None, mapEachIntervalData=None, preReduceReturnData=None, passingData=None, transferOutput=True, **keywords):
		"""
		"""
		returnData = PassingData(no_of_jobs = 0)
		returnData.jobDataLs = []
		
		for jobData in mapEachIntervalData.jobDataLs:
			calculaJob = jobData.jobLs[0]
			self.addInputToStatMergeJob(workflow, statMergeJob=preReduceReturnData.aggregateAndHClusterDistanceMatrixJob, \
						inputF=calculaJob.output, \
						parentJobLs=[calculaJob])
		return returnData
Developer: mjmontague, Project: vervet-web, Lines: 14, Source: CalculateDistanceMatrixFromVCFPipe.py

Example 6: addAllJobs

# Required import: from pymodule import PassingData [as alias]
# Or: from pymodule.PassingData import jobDataLs [as alias]
	def addAllJobs(self, workflow=None, db_250k=None, association_result_ls=None, \
				data_dir=None, min_MAF=None, \
				neighbor_distance=None, max_neighbor_distance=None, \
				min_score_ls=None, min_overlap_ratio_ls=None, ground_score=None,\
				peakPadding=None, tax_id=None, \
				outputDirPrefix="", transferOutput=True, job_max_memory=2000, **keywords):
		"""
		2013.2.27
			run ms
			estimate parameters from ms
			forward simulator with estimated ms-parameters or take the output of ms as input
			
			
		"""
		if workflow is None:
			workflow = self
		
		sys.stderr.write("Adding jobs for pop-gen & pedigree sequence simulation #jobs=%s... \n"%\
							(self.no_of_jobs))
		
		returnData = PassingData()
		returnData.jobDataLs = []
		
		passingData = PassingData(fileBasenamePrefix=None, \
					outputDirPrefix=outputDirPrefix, \
					jobData=None,\
					preReduceReturnData=None,\
					association_group_key2orderIndex = {},\
					association_group_key2resultList = {},\
					association_group_key2reduceAssociationPeakJobMatrix = {},\
					association_group_key2countAssociationLocusJobList = {},\
					resultID2defineLandscapeJobData = {},
					)
		
		preReduceReturnData = self.preReduce(workflow=workflow, outputDirPrefix=outputDirPrefix, \
									passingData=passingData, transferOutput=False,\
									**keywords)
		
		mapDirJob = preReduceReturnData.mapDirJob
		plotOutputDirJob = preReduceReturnData.plotOutputDirJob
		countAssociationLocusOutputDirJob = preReduceReturnData.countAssociationLocusOutputDirJob
		reduceOutputDirJob = preReduceReturnData.reduceOutputDirJob
		
		passingData.preReduceReturnData = preReduceReturnData
		
		#add output pedigree job
		
		for i in xrange(self.noOfReplicates):
			popGenSimulationFolderJob = self.addMkDirJob(outputDir=os.path.join(mapDirJob.output, 'popGenSim%s'%(i)), \
														parentJobLs=[mapDirJob])
			popSimulationJob = self.addPopGenSimulationJob()
Developer: mjmontague, Project: vervet-web, Lines: 53, Source: SimulatePedigreeSequenceWorkflow.py

Example 7: addJobs

# Required import: from pymodule import PassingData [as alias]
# Or: from pymodule.PassingData import jobDataLs [as alias]
	def addJobs(self, workflow=None, inputData=None, db_vervet=None, genotypeMethodShortName=None, commit=None,\
			data_dir=None, checkEmptyVCFByReading=False, transferOutput=True,\
			maxContigID=None, outputDirPrefix="", needSSHDBTunnel=False):
		"""
		2012.5.9
		"""
		sys.stderr.write("Adding VCF2DB jobs for %s vcf files ... "%(len(inputData.jobDataLs)))
		
		
		topOutputDir = "%sVCF2DB"%(outputDirPrefix)
		topOutputDirJob = yh_pegasus.addMkDirJob(workflow, mkdir=workflow.mkdirWrap, outputDir=topOutputDir)
		
		firstVCFFile = inputData.jobDataLs[0].vcfFile
		logFile = File(os.path.join(topOutputDir, 'AddGenotypeMethod2DB.log'))
		addGM2DBJob = self.addAddGenotypeMethod2DBJob(executable=self.AddGenotypeMethod2DB, inputFile=firstVCFFile, \
								genotypeMethodShortName=genotypeMethodShortName,\
								logFile=logFile, data_dir=data_dir, commit=commit, parentJobLs=[], extraDependentInputLs=[], transferOutput=True, \
								extraArguments=None, job_max_memory=10, sshDBTunnel=needSSHDBTunnel)
		updateGMlogFile = File(os.path.join(topOutputDir, 'updateGM.log'))
		updateGMNoOfLociJob = self.addUpdateGenotypeMethodNoOfLociJob(executable=self.UpdateGenotypeMethodNoOfLoci, \
																	genotypeMethodShortName=genotypeMethodShortName,\
								logFile=updateGMlogFile, data_dir=data_dir, commit=commit, parentJobLs=[topOutputDirJob], \
								extraDependentInputLs=[], transferOutput=True, \
								extraArguments=None, job_max_memory=20, sshDBTunnel=needSSHDBTunnel)
		
		returnData = PassingData()
		returnData.jobDataLs = []
		for jobData in inputData.jobDataLs:
			inputF = jobData.vcfFile
			if maxContigID:
				contig_id = self.getContigIDFromFname(inputF.name)
				try:
					contig_id = int(contig_id)
					if contig_id>maxContigID:	#skip the small contigs
						continue
				except:
					sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
					import traceback
					traceback.print_exc()
			logFile = File(os.path.join(topOutputDir, 'AddVCFFile2DB_%s.log'%(self.getChrFromFname(inputF.name))))
			addVCFJob = self.addAddVCFFile2DBJob(executable=self.AddVCFFile2DB, inputFile=inputF, genotypeMethodShortName=genotypeMethodShortName,\
						logFile=logFile, format="VCF", data_dir=data_dir, checkEmptyVCFByReading=checkEmptyVCFByReading, commit=commit, \
						parentJobLs=[addGM2DBJob]+jobData.jobLs, extraDependentInputLs=[], transferOutput=True, \
						extraArguments=None, job_max_memory=1000, sshDBTunnel=needSSHDBTunnel)
			workflow.depends(parent=addVCFJob, child=updateGMNoOfLociJob)
		sys.stderr.write("%s jobs.\n"%(self.no_of_jobs))
		#include the tfam (outputList[1]) into the fileLs
		returnData.jobDataLs.append(PassingData(jobLs=[updateGMNoOfLociJob], file=updateGMlogFile, \
											fileLs=[updateGMlogFile]))
		return returnData
Developer: mjmontague, Project: vervet-web, Lines: 52, Source: AddVCFFolder2DBWorkflow.py

Example 8: reduceBeforeEachAlignment

# Required import: from pymodule import PassingData [as alias]
# Or: from pymodule.PassingData import jobDataLs [as alias]
	def reduceBeforeEachAlignment(self, workflow=None, passingData=None, preReduceReturnData=None, transferOutput=True, **keywords):
		"""
		2012.9.17
			add a merge variant annotation job, GW plot job
			
		"""
		returnData = PassingData(no_of_jobs = 0)
		returnData.jobDataLs = []
		outputDirPrefix = passingData.outputDirPrefix
		
		statOutputDirJob = preReduceReturnData.statOutputDirJob
		plotOutputDirJob = preReduceReturnData.plotOutputDirJob
		
		mergeOutputF = File(os.path.join(statOutputDirJob.output, '%s_%s.tsv'%(passingData.bamFnamePrefix, passingData.annotationName)))
		mergeJob = self.addStatMergeJob(workflow, statMergeProgram=workflow.mergeSameHeaderTablesIntoOne, \
							outputF=mergeOutputF, transferOutput=transferOutput, parentJobLs=[statOutputDirJob],)
		returnData.jobDataLs.append(PassingData(jobLs=[mergeJob ], file=mergeJob.output, fileLs=[mergeJob.output], mergeJob=mergeJob))
		self.no_of_jobs += 1
		
		outputFnamePrefix = os.path.join(plotOutputDirJob.output, '%s_%s_Plot'%(passingData.bamFnamePrefix, passingData.annotationName))
		# whichColumnPlotLabel and xColumnPlotLabel should not contain spaces or ( or ). because they will disrupt shell commandline
		self.addPlotVCFtoolsStatJob(executable=workflow.PlotVCFtoolsStat, inputFileList=[mergeOutputF], \
							outputFnamePrefix=outputFnamePrefix, \
							whichColumn=None, whichColumnHeader=passingData.annotationName, whichColumnPlotLabel=passingData.annotationName, \
							need_svg=False, \
							logY=0, valueForNonPositiveYValue=-1, \
							xColumnPlotLabel="position", chrLengthColumnHeader=None, chrColumnHeader="CHROM", \
							minChrLength=None, xColumnHeader="POS", minNoOfTotal=50,\
							figureDPI=100, ylim_type=2, samplingRate=0.01,\
							parentJobLs=[mergeJob, plotOutputDirJob], \
							extraDependentInputLs=None, \
							extraArguments=None, transferOutput=True, sshDBTunnel=self.needSSHDBTunnel)
		self.no_of_jobs += 1
		
		outputFile = File( os.path.join(plotOutputDirJob.output, '%s_%s_Hist.png'%(passingData.bamFnamePrefix, passingData.annotationName)))
		#no spaces or parenthesis or any other shell-vulnerable letters in the x or y axis labels (whichColumnPlotLabel, xColumnPlotLabel)
		self.addDrawHistogramJob(workflow=workflow, executable=workflow.DrawHistogram, inputFileList=[mergeJob.output], \
					outputFile=outputFile, \
					whichColumn=None, whichColumnHeader=passingData.annotationName, whichColumnPlotLabel=passingData.annotationName, \
					logY=None, logCount=True, valueForNonPositiveYValue=-1,\
					minNoOfTotal=10,\
					figureDPI=100, samplingRate=0.1,\
					parentJobLs=[plotOutputDirJob, mergeJob], \
					extraDependentInputLs=None, \
					extraArguments=None, transferOutput=True,  job_max_memory=2000)
		self.no_of_jobs += 1
		
		return returnData
Developer: mjmontague, Project: vervet-web, Lines: 50, Source: HaplotypeScoreWorkflow.py

Example 9: preReduce

# Required import: from pymodule import PassingData [as alias]
# Or: from pymodule.PassingData import jobDataLs [as alias]
	def preReduce(self, workflow=None, outputDirPrefix="", passingData=None, transferOutput=True, **keywords):
		"""
		2012.9.17
		"""
		parentPreReduceData = AbstractVervetWorkflow.preReduce(self, workflow=workflow, outputDirPrefix=outputDirPrefix, passingData=passingData, \
							transferOutput=transferOutput, **keywords)
		returnData = PassingData(no_of_jobs = 0)
		returnData.jobDataLs = []
		
		callOutputDir = "call"
		callOutputDirJob = self.addMkDirJob(workflow, mkdir=workflow.mkdirWrap, outputDir=callOutputDir)
		passingData.callOutputDirJob = callOutputDirJob
		
		matrixDir = "pairwiseDistMatrix"
		matrixDirJob = self.addMkDirJob(workflow, mkdir=workflow.mkdirWrap, outputDir=matrixDir)
		passingData.matrixDirJob = matrixDirJob
		
		reduceOutputDirJob = passingData.reduceOutputDirJob
		#2012.10.9 reduceOutputDirJob was added to passingData during AbstractVCFWorkflow.preReduce()
		
		#reduceOutputDir = "aggregateData"
		#reduceOutputDirJob = yh_pegasus.addMkDirJob(workflow, mkdir=workflow.mkdirWrap, outputDir=reduceOutputDir)
		#passingData.reduceOutputDirJob = reduceOutputDirJob
		
		figureFnamePrefix = os.path.join(reduceOutputDirJob.output, 'aggregateDistanceMatrix')
		aggregateDistanceMatrixOutputF = File('%s.tsv'%(figureFnamePrefix))
		PCAFile = File('%s_PCA.tsv'%(figureFnamePrefix))
		aggregateAndHClusterDistanceMatrixJob = self.addStatMergeJob(workflow, statMergeProgram=workflow.AggregateAndHClusterDistanceMatrix, \
									outputF=aggregateDistanceMatrixOutputF, \
									parentJobLs=[reduceOutputDirJob],extraOutputLs=[PCAFile, File('%s.png'%(figureFnamePrefix)), \
																				File('%s.svg'%(figureFnamePrefix))], \
									extraDependentInputLs=[], transferOutput=True, extraArguments="-f %s"%(figureFnamePrefix))
		returnData.aggregateAndHClusterDistanceMatrixJob = aggregateAndHClusterDistanceMatrixJob
		
		#2012.9.5 add the job to append meta info (country, sex, latitude, etc. of each monkey)
		outputF = File('%s_withMetaInfo.tsv'%(figureFnamePrefix))
		appendInfo2PCAOutputJob = self.addGenericDBJob(executable=self.AppendInfo2SmartPCAOutput, inputFile=PCAFile, \
				outputFile=outputF, \
				parentJobLs=[aggregateAndHClusterDistanceMatrixJob], extraDependentInputLs=None, \
				extraOutputLs=None,\
				transferOutput=True, \
				extraArgumentList=None, extraArguments=None, sshDBTunnel=self.needSSHDBTunnel, \
				key2ObjectForJob=None, job_max_memory=2000)
		
		
		return returnData
Developer: mjmontague, Project: vervet-web, Lines: 48, Source: CalculateDistanceMatrixFromVCFPipe.py

Example 10: preReduce

# Required import: from pymodule import PassingData [as alias]
# Or: from pymodule.PassingData import jobDataLs [as alias]
	def preReduce(self, workflow=None, passingData=None, transferOutput=True, **keywords):
		"""
		2012.9.17
		"""
		returnData = PassingData(no_of_jobs = 0)
		returnData.jobDataLs = []
		outputDirPrefix = passingData.outputDirPrefix
		
		#pass it along
		passingData.annotationName = 'HaplotypeScore'
		
		statOutputDir = "%sstat"%(outputDirPrefix)
		statOutputDirJob = self.addMkDirJob(outputDir=statOutputDir)
		
		plotOutputDir = "%splot"%(outputDirPrefix)
		plotOutputDirJob = self.addMkDirJob(outputDir=plotOutputDir)
		self.no_of_jobs += 2
		
		returnData.plotOutputDirJob = plotOutputDirJob
		returnData.statOutputDirJob = statOutputDirJob
		return returnData
Developer: mjmontague, Project: vervet-web, Lines: 23, Source: HaplotypeScoreWorkflow.py

Example 11: mapEachInterval

# Required import: from pymodule import PassingData [as alias]
# Or: from pymodule.PassingData import jobDataLs [as alias]
	def mapEachInterval(self, workflow=None, \
					VCFJobData=None, passingData=None, transferOutput=False, **keywords):
		"""
		2012.9.22
		"""
		
		returnData = PassingData(no_of_jobs = 0)
		returnData.jobDataLs = []
		
		topOutputDirJob = passingData.topOutputDirJob
		intervalFileBasenamePrefix = passingData.intervalFileBasenamePrefix
		jobData = passingData.jobData
		callOutputDirJob = passingData.callOutputDirJob
		
		splitVCFJob = passingData.mapEachVCFData.splitVCFJob
		
		genotypeCallOutputFname = os.path.join(callOutputDirJob.output, '%s.call'%(intervalFileBasenamePrefix))
		genotypeCallOutput = File(genotypeCallOutputFname)
		genotypeCallByCoverage_job = self.addVCF2MatrixJob(workflow, executable=self.GenotypeCallByCoverage, \
											inputVCF=VCFJobData.file, outputFile=genotypeCallOutput, \
					refFastaF=None, run_type=3, numberOfReadGroups=10, minDepth=self.minDepth,\
					parentJobLs=[callOutputDirJob, splitVCFJob]+jobData.jobLs, extraDependentInputLs=[], transferOutput=False, \
					extraArguments=None, job_max_memory=2000)
		
		matrixDirJob = passingData.matrixDirJob
		calculaOutputFname =os.path.join(matrixDirJob.output, '%s.pairwiseDist.convertHetero2NA%s.minMAF%.2f.maxNA%.2f.tsv'%(intervalFileBasenamePrefix, \
							self.convertHetero2NA, self.min_MAF, self.max_NA_rate))
		calculaOutput = File(calculaOutputFname)
		calculaJob = self.addCalculatePairwiseDistanceFromSNPXStrainMatrixJob(workflow, \
										executable=self.CalculatePairwiseDistanceOutOfSNPXStrainMatrix, \
										inputFile=genotypeCallOutput, outputFile=calculaOutput, \
					min_MAF=self.min_MAF, max_NA_rate=self.max_NA_rate, convertHetero2NA=self.convertHetero2NA, \
					hetHalfMatchDistance=self.hetHalfMatchDistance,\
					parentJobLs=[genotypeCallByCoverage_job, matrixDirJob], extraDependentInputLs=[], transferOutput=False, \
					extraArguments=None, job_max_memory=2000)
		returnData.jobDataLs.append(PassingData(jobLs=[calculaJob], file=calculaJob.output, \
											fileLs=[calculaJob.output]))
		returnData.calculaJob = calculaJob
		return returnData
Developer: mjmontague, Project: vervet-web, Lines: 41, Source: CalculateDistanceMatrixFromVCFPipe.py

Example 12: reduceEachChromosome

# Required import: from pymodule import PassingData [as alias]
# Or: from pymodule.PassingData import jobDataLs [as alias]
	def reduceEachChromosome(self, workflow=None, chromosome=None, passingData=None, mapEachVCFDataLs=None,\
					reduceEachVCFDataLs=None, \
					transferOutput=True, \
					**keywords):
		"""
		2012.10.3
			#. merge all VCF-level reduce job (from one chromosome) output (passingData.reduceEachVCFDataLs) into one first
				#taking the input jobs of each reduceEachVCFData as input of this per-chromosome reduce job.
			#. don't use passingData.mapEachVCFDataLsLs, cuz it's empty.
			
		"""
		returnData = PassingData(no_of_jobs = 0)
		returnData.jobDataLs = []
		
		topOutputDirJob = passingData.topOutputDirJob
		reduceOutputDirJob = passingData.reduceOutputDirJob
		chromosome = passingData.chromosome
		
		fnamePrefix = os.path.join(reduceOutputDirJob.output, '%s_frequency_juxtapose'%(chromosome))
		outputFile = File('%s.tsv'%(fnamePrefix))
		reduceEachChromosomeJob = self.addStatMergeJob(workflow, \
									statMergeProgram=self.mergeSameHeaderTablesIntoOne, \
									outputF=outputFile, \
									parentJobLs=[reduceOutputDirJob],extraOutputLs=[], \
									extraDependentInputLs=[], transferOutput=transferOutput,)
		#2012.10.7 don't add it to returnData.jobDataLs unless it needs to be gzipped and transferred out
		#returnData.jobDataLs.append(PassingData(jobLs=[reduceEachChromosomeJob], file=reduceEachChromosomeJob.output, \
		#									fileLs=[reduceEachChromosomeJob.output]))
		returnData.reduceEachChromosomeJob = reduceEachChromosomeJob
		
		for reduceEachVCFData in reduceEachVCFDataLs:
			for mapEachIntervalData in reduceEachVCFData.mapEachIntervalDataLs:
				juxtaposeAFJob = mapEachIntervalData.juxtaposeAFJob
				self.addInputToStatMergeJob(workflow, statMergeJob=reduceEachChromosomeJob, \
						parentJobLs=[juxtaposeAFJob])
		return returnData
Developer: mjmontague, Project: vervet-web, Lines: 38, Source: CompareAlleleFrequencyOfTwoPopulationFromOneVCFFolder.py

Example 13: reduce

# Required import: from pymodule import PassingData [as alias]
# Or: from pymodule.PassingData import jobDataLs [as alias]
	def reduce(self, workflow=None, passingData=None, reduceEachChromosomeDataLs=None,\
			transferOutput=True, **keywords):
		"""
		2012.10.3
			#. reduce all previous jobs (passingData.reduceEachChromosomeDataLs) into one final output
			#. merge all the output and run Draw2DHistogramOfMatrix.py
		
		"""
		returnData = PassingData(no_of_jobs = 0)
		returnData.jobDataLs = []
		
		reduceOutputDirJob = passingData.reduceOutputDirJob
		
		fnamePrefix = os.path.join(reduceOutputDirJob.output, 'frequency_juxtapose_%s_vs_%s'%(self.pop1Header, self.pop2Header))
		outputFile = File('%s.tsv'%(fnamePrefix))
		reduceJob = self.addStatMergeJob(workflow, \
									statMergeProgram=self.MergeSameHeaderTablesIntoOne, \
									outputF=outputFile, \
									parentJobLs=[reduceOutputDirJob],extraOutputLs=[], \
									extraDependentInputLs=[], transferOutput=transferOutput,)
		returnData.jobDataLs.append(PassingData(jobLs=[reduceJob], file=reduceJob.output, \
											fileLs=[reduceJob.output]))
		
		for reduceEachChromosomeData in reduceEachChromosomeDataLs:
			parentJob = reduceEachChromosomeData.reduceEachChromosomeJob
			self.addInputToStatMergeJob(workflow, statMergeJob=reduceJob, \
						parentJobLs=[parentJob])
		
		#add a Draw2DHistogramOfMatrix.py job
		outputFile = File('%s.png'%(fnamePrefix))
		drawJob = self.addDraw2DHistogramOfMatrixJob(workflow=workflow, executable=self.Draw2DHistogramOfMatrix, \
											inputFileList=None, inputFile=reduceJob.output, outputFile=outputFile, \
				outputFnamePrefix=None, whichColumn=None, whichColumnHeader=self.pop1Header, whichColumnPlotLabel=None, \
				valueForNonPositiveYValue=-1, \
				missingDataNotation='NA',\
				xColumnHeader=self.pop2Header, xColumnPlotLabel=None, \
				minNoOfTotal=100,\
				figureDPI=300, formatString='.', samplingRate=1, need_svg=False, \
				zColumnHeader=None, logX=False, logY=False, logZ=False,\
				parentJobLs=[reduceJob], \
				extraDependentInputLs=None, \
				extraArgumentList=None, extraArguments=None, transferOutput=True,  job_max_memory=2000)
		returnData.drawJob = drawJob
		
		#2012.10.15 add a EstimateOutliersIn2DData job
		extraArgumentList= ['--minAbsDelta %s'%(self.minAbsDelta)]
		outputFile = File('%s_outlierStat_minAbsDelta%s.tsv'%(fnamePrefix, self.minAbsDelta))
		estimateOutlierJob = self.addAbstractPlotJob(workflow=workflow, executable=self.EstimateOutliersIn2DData, \
					inputFileList=None, inputFile=reduceJob.output, outputFile=outputFile, \
					outputFnamePrefix=None, whichColumn=None, whichColumnHeader=self.pop1Header, whichColumnPlotLabel=None, \
					logY=False, valueForNonPositiveYValue=-1, \
					missingDataNotation='NA',\
					xColumnHeader=self.pop2Header, xColumnPlotLabel=None, \
					minNoOfTotal=0,\
					samplingRate=1, \
					parentJob=reduceJob, parentJobLs=None, \
					extraDependentInputLs=None, \
					extraArgumentList=extraArgumentList, extraArguments=None, transferOutput=transferOutput,  job_max_memory=2000)
		
		returnData.jobDataLs.append(PassingData(jobLs=[estimateOutlierJob], file=estimateOutlierJob.output, \
											fileLs=[estimateOutlierJob.output]))
		returnData.estimateOutlierJob = estimateOutlierJob
		
		return returnData
Developer: mjmontague, Project: vervet-web, Lines: 66, Source: CompareAlleleFrequencyOfTwoPopulationFromOneVCFFolder.py

Example 14: mapEachInterval

# Required import: from pymodule import PassingData [as alias]
# Or: from pymodule.PassingData import jobDataLs [as alias]
	def mapEachInterval(self, workflow=None, VCFJobData=None, chromosome=None,intervalData=None,\
					mapEachChromosomeData=None, passingData=None, transferOutput=False, \
					**keywords):
		"""
		2012.9.22
			argument VCFJobData looks like PassingData(file=splitVCFFile, vcfFile=splitVCFFile, fileLs=[splitVCFFile], \
																		job=splitVCFJob, jobLs=[splitVCFJob], tbi_F=None)
		"""
		
		returnData = PassingData(no_of_jobs = 0)
		returnData.jobDataLs = []
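		#the bare attribute lookups below do nothing except document (and fail fast on) fields that passingData must carry at this point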
		passingData.intervalFileBasenamePrefix
		passingData.splitVCFFile
		passingData.unitNumber
		"""
		passingData.fileBasenamePrefix
		## 2013.06.19 structures available from passingData, specific to the interval
		passingData.splitVCFFile = splitVCFFile
		#VCFJobData.file is same as passingData.splitVCFFile 
		passingData.unitNumber = unitNumber
		passingData.intervalFileBasenamePrefix = '%s_%s_splitVCF_u%s'%(chromosome, commonPrefix, unitNumber)
		passingData.noOfIndividuals = jobData.file.noOfIndividuals
		passingData.span = self.intervalSize + self.intervalOverlapSize*2 	#2013.06.19 for memory/walltime gauging
		"""
		
		
		# 2013.12.04 VCF combine
		#a combine VCF horizontally job
		#2011-9-22 union of all GATK intervals for one contig
		combineCallOutputFname = os.path.join(self.mapDirJob.output, '%s.vcf.gz'%passingData.intervalFileBasenamePrefix)
		combineCallOutputF = File(combineCallOutputFname)
		returnData.combineCallJob = self.addGATKCombineVariantsJob(executable=self.CombineVariantsJavaInReduce, \
					refFastaFList=self.registerReferenceData.refFastaFList, \
					outputFile=combineCallOutputF, \
					genotypeMergeOptions='UNSORTED', \
					parentJobLs=[self.mapDirJob], \
					extraArguments=None, extraArgumentList=None, extraDependentInputLs=None,\
					transferOutput=False, job_max_memory=10000, walltime=180)
		
		#add a job to calculate missing fraction per locus in medium and higher coverage individuals
		outputFile = File(os.path.join(self.reduceStatDirJob.output, '%s_missingFractionPerLocus.tsv.gz'%(passingData.intervalFileBasenamePrefix)))
		missingFractionPerLocusJob = self.addStatMergeJob(statMergeProgram=self.ReduceMatrixBySumSameKeyColsAndThenDivide, \
									outputF=outputFile, extraArgumentList=["--keyColumnLs 1", "--valueColumnLs 5", "--operatorType 3",\
												"--fixedValueDenominator %s"%(self.noOfMediumCoverageIndividuals)],\
									parentJobLs=[self.reduceStatDirJob],extraOutputLs=None, \
									extraDependentInputLs=None, transferOutput=False)
		
		returnData.markGenotypeMissingJobLs = []
		#for each sample from the genotype method
		for alignmentData in self.alignmentDataLs:
			if not alignmentData.newAlignment:
				continue
			individual_alignment = alignmentData.alignment
			sampleID = individual_alignment.read_group
			
			# select that sample into VCF
			outputFile = File(os.path.join(self.mapDirJob.output, '%s_sample_%s.vcf.gz'%(passingData.intervalFileBasenamePrefix, sampleID)))
			selectSampleJob = self.addSelectVariantsJob(\
					inputF=passingData.splitVCFFile, outputF=outputFile, \
					interval=None,\
					refFastaFList=self.registerReferenceData.refFastaFList, sampleIDKeepFile=None, \
					snpIDKeepFile=None, sampleIDExcludeFile=None, \
					parentJobLs=[self.mapDirJob] + VCFJobData.jobLs, extraDependentInputLs=None, transferOutput=False, \
					extraArguments="--sample_name %s"%(alignmentData.alignment.read_group), \
					extraArgumentList=None, job_max_memory=2000, walltime=None)
			
			# mask genotype missing
			# given single-sample VCF file, alignment file, median-depth
			# output two files, one is a VCF , one is a missing genotype stat file
			genotypeMissingStatFile = File(os.path.join(self.mapDirJob.output, '%s_sample_%s_genotypeMissingStat.tsv.gz'%(passingData.intervalFileBasenamePrefix,\
																	sampleID)))
			outputFile = File(os.path.join(self.mapDirJob.output, '%s_sample_%s_markedMissing.vcf.gz'%(passingData.intervalFileBasenamePrefix,\
																	sampleID)))
			alignmentFile = alignmentData.newAlignment.file
			if individual_alignment.individual_sequence.individual.target_coverage>=self.minMediumCoverageThreshold:
				alignmentDepthFold=2
			else:
				alignmentDepthFold=10
			markGenotypeMissingJob = self.addGenericJob(executable=self.MarkGenotypeMissingByAlignmentQuality, \
							inputFile=selectSampleJob.output, inputArgumentOption="-i", \
					outputFile=outputFile, outputArgumentOption="-o", \
					parentJob=None, parentJobLs=[self.mapDirJob, selectSampleJob], \
					extraDependentInputLs=alignmentData.newAlignment.fileLs, extraOutputLs=[genotypeMissingStatFile], \
					extraArgumentList=["--alignmentFilename", alignmentFile, "--missingStatFname", genotypeMissingStatFile, \
									"--alignmentMedianDepth %s"%(alignmentData.newAlignment.median_depth), \
									"--alignmentDepthFold %s"%(alignmentDepthFold), \
									"--minMapQGoodRead 2", "--minFractionOfGoodRead 0.9",\
									"--sampleID %s"%(sampleID)], \
					transferOutput=False, sshDBTunnel=None, \
					key2ObjectForJob=None, objectWithDBArguments=None, objectWithDBGenomeArguments=None,\
					no_of_cpus=None, job_max_memory=2000, walltime=180, \
					max_walltime=None)
			markGenotypeMissingJob.genotypeMissingStatFile = genotypeMissingStatFile
			returnData.markGenotypeMissingJobLs.append(markGenotypeMissingJob)
			
			if individual_alignment.individual_sequence.individual.target_coverage>=self.minMediumCoverageThreshold:
				#missing fraction only from medium or high coverage individuals
				self.addInputToStatMergeJob(statMergeJob=missingFractionPerLocusJob, parentJobLs=[markGenotypeMissingJob],\
									inputF=markGenotypeMissingJob.genotypeMissingStatFile)
			
#......... (remaining code omitted) .........
Developer: mjmontague, Project: vervet-web, Lines: 103, Source: MarkGenotypeMissingByAlignmentQualityWorkflow.py

Example 15: addTrioCallerJobsONVCFFiles

# Required import: from pymodule import PassingData [as alias]
# Or: from pymodule.PassingData import jobDataLs [as alias]
	def addTrioCallerJobsONVCFFiles(self, workflow=None, alignmentLs=None, inputData=None, samtools=None, \
				genotyperJava=None, SelectVariantsJava=None, GenomeAnalysisTKJar=None, \
				addOrReplaceReadGroupsJava=None, AddOrReplaceReadGroupsJar=None, \
				CreateSequenceDictionaryJava=None, CreateSequenceDictionaryJar=None, \
				MergeSamFilesJar=None, \
				BuildBamIndexFilesJava=None, BuildBamIndexJar=None,\
				mv=None, CallVariantBySamtools=None,\
				trioCallerPath=None, trioCallerWrapper=None, \
				replicateIndividualTag="copy", treatEveryOneIndependent=False,\
				bgzip_tabix=None, vcf_convert=None, vcf_isec=None, vcf_concat=None, \
				concatGATK=None, concatSamtools=None, ligateVcf=None, ligateVcfExecutableFile=None,\
				registerReferenceData=None, \
				namespace='workflow', version="1.0", site_handler=None, input_site_handler=None,\
				needFastaIndexJob=False, needFastaDictJob=False, \
				intervalSize=2000000, intervalOverlapSize=100000, site_type=1, data_dir=None, no_of_gatk_threads = 1, \
				outputDirPrefix="", \
				maxSNPMissingRate=None, alnStatForFilterF=None, onlyKeepBiAllelicSNP=True, \
				cumulativeMedianDepth=5000, job_max_memory = 2000, vcf_job_max_memory = 1000,\
				run_type=2, transferOutput=True, **keywords):
		"""
		2012.12.5 added argument run_type (same as self.run_type) 2: TrioCaller; 3: polymutt
		2012.8.15
		"""
		sys.stderr.write("Adding trioCaller jobs for  %s vcf files ..."%(len(inputData.jobDataLs)))
		if workflow is None :
			workflow = self
		refFastaFList = registerReferenceData.refFastaFList
		refFastaF = refFastaFList[0]
		
		if needFastaDictJob or registerReferenceData.needPicardFastaDictJob:
			fastaDictJob = self.addRefFastaDictJob(workflow, CreateSequenceDictionaryJava=CreateSequenceDictionaryJava, \
												refFastaF=refFastaF)
			refFastaDictF = fastaDictJob.refFastaDictF
		else:
			fastaDictJob = None
			refFastaDictF = registerReferenceData.refPicardFastaDictF
		
		if needFastaIndexJob or registerReferenceData.needSAMtoolsFastaIndexJob:
			fastaIndexJob = self.addRefFastaFaiIndexJob(workflow, samtools=samtools, refFastaF=refFastaF)
			refFastaIndexF = fastaIndexJob.refFastaIndexF
		else:
			fastaIndexJob = None
			refFastaIndexF = registerReferenceData.refSAMtoolsFastaIndexF
		
		trioCallerOutputDir = "%sRefinedCalls"%(outputDirPrefix)
		trioCallerOutputDirJob = self.addMkDirJob(outputDir=trioCallerOutputDir)
		round1CallDir = "%sPreRefinedCalls"%(outputDirPrefix)
		round1CallDirJob = self.addMkDirJob(outputDir=round1CallDir)
		
		outputPedigreeJob = None
		
		# add merge jobs for every reference
		returnData = PassingData()
		returnData.jobDataLs = []
		for i in xrange(len(inputData.jobDataLs)):
			jobData = inputData.jobDataLs[i]
			inputF = jobData.vcfFile
			inputFBaseName = os.path.basename(inputF.name)
			chr_id = self.getChrFromFname(inputFBaseName)
			commonPrefix = inputFBaseName.split('.')[0]
			
			overlapInterval = chr_id
			#split VCF job
			outputFnamePrefix = os.path.join(round1CallDirJob.folder, '%s_splitVCF'%commonPrefix)
			splitVCFJob = self.addSplitVCFFileJob(executable=self.SplitVCFFile, inputFile=inputF, outputFnamePrefix=outputFnamePrefix, \
					noOfOverlappingSites=intervalOverlapSize, noOfSitesPerUnit=intervalSize, noOfTotalSites=inputF.noOfLoci, \
					parentJobLs=jobData.jobLs+[round1CallDirJob], \
					extraDependentInputLs=[jobData.tbi_F], \
					extraArguments=None, transferOutput=False, job_max_memory=job_max_memory)
			
			#ligate vcf job (different segments of a chromosome into one chromosome)
			concatTrioCallerOutputFname = os.path.join(trioCallerOutputDirJob.folder, '%s.vcf'%chr_id)
			concatTrioCallerOutputF = File(concatTrioCallerOutputFname)
			trioCallerWholeContigConcatJob = self.addLigateVcfJob(executable=ligateVcf, ligateVcfExecutableFile=ligateVcfExecutableFile, \
										outputFile=concatTrioCallerOutputF, \
										parentJobLs=[trioCallerOutputDirJob], extraDependentInputLs=[], transferOutput=False, \
										extraArguments=None, job_max_memory=vcf_job_max_memory)
			
			#bgzip and tabix the trio caller output
			bgzip_concatTrioCallerOutputF = File("%s.gz"%concatTrioCallerOutputFname)
			bgzip_tabix_concatTrioCallerOutput_job = self.addBGZIP_tabix_Job(workflow, bgzip_tabix=bgzip_tabix, \
					parentJob=trioCallerWholeContigConcatJob, inputF=concatTrioCallerOutputF, outputF=bgzip_concatTrioCallerOutputF, \
					transferOutput=transferOutput)
			
			returnData.jobDataLs.append(PassingData(vcfFile=bgzip_concatTrioCallerOutputF, jobLs=[bgzip_tabix_concatTrioCallerOutput_job]))
			#self.addRefFastaJobDependency(workflow, wholeRefUnionOfIntersectionJob, refFastaF=refFastaF, fastaDictJob=fastaDictJob, \
			#							refFastaDictF=refFastaDictF, fastaIndexJob = fastaIndexJob, refFastaIndexF = refFastaIndexF)
			
			noOfUnits = max(1, utils.getNoOfUnitsNeededToCoverN(N=inputF.noOfLoci, s=intervalSize, o=intervalOverlapSize)-1)
			for unitNumber in xrange(1, noOfUnits+1):
				splitVCFFile = getattr(splitVCFJob, 'unit%sFile'%(unitNumber))
				
				#2012.4.2
				tranferIntermediateFilesForDebug=False
				overlapIntervalFnameSignature = '%s_%s'%(commonPrefix, unitNumber)
				
				#selectVariants would generate AC, AF so that TrioCaller could read it. (samtools uses 'AC1' instead of AC, 'AF1' instead of AF.
				round1_VCF4OutputFname = os.path.join(round1CallDirJob.folder, '%s.niceformat.vcf'%overlapIntervalFnameSignature)
				round1_VCF4OutputF = File(round1_VCF4OutputFname)
				round1_vcf_convert_job = self.addSelectVariantsJob(SelectVariantsJava=SelectVariantsJava, \
#......... (remaining code omitted) .........
Developer: mjmontague, Project: vervet-web, Lines: 103, Source: AlignmentToTrioCallPipeline.py


Note: the pymodule.PassingData.jobDataLs method examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are selected from open-source projects contributed by their respective authors; copyright remains with the original authors, and distribution or use should follow each project's License. Do not repost without permission.