本文整理匯總了Python中oncotator.utils.GenericTsvReader.GenericTsvReader類的典型用法代碼示例。如果您正苦於以下問題：Python GenericTsvReader類的具體用法？Python GenericTsvReader怎麼用？Python GenericTsvReader使用的例子？那麼，這裡精選的類代碼示例或許可以為您提供幫助。
在下文中一共展示了GenericTsvReader類的15個代碼示例，這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚，您的評價將有助於系統推薦出更棒的Python代碼示例。
示例1: test_full_seg_file_annotations
def test_full_seg_file_annotations(self):
    """Annotate a seg file in segment mode and verify the SIMPLE_TSV output.

    The rendered file must contain the Sample, Num_Probes and Segment_Mean
    columns, and every row must have non-blank ``start`` and ``end`` values
    plus a ``genes`` entry.
    """
    seg_path = "testdata/seg/Patient0.seg.txt"
    tsv_path = "out/test_full_seg_file_annotations.tsv"
    datasource_dir = self.config.get('DEFAULT', "dbDir")

    # Remove any output left over from a previous run.
    if os.path.exists(tsv_path):
        os.remove(tsv_path)

    annotator = Annotator()
    spec = RunSpecificationFactory.create_run_spec(
        "SEG_FILE", "SIMPLE_TSV", seg_path, tsv_path,
        datasourceDir=datasource_dir,
        annotating_type=RunSpecification.ANNOTATE_SEGMENTS)
    annotator.initialize(spec)
    annotator.annotate()

    # Now check the output
    reader = GenericTsvReader(tsv_path)
    field_names = reader.getFieldNames()
    for required in ["Sample", "Num_Probes", "Segment_Mean"]:
        self.assertTrue(required in field_names)

    for row in reader:
        for coord in ('start', 'end'):
            self.assertTrue(row[coord] is not None)
            self.assertTrue(row[coord].strip() != "")
        self.assertTrue("genes" in row.keys())
        # NOTE(review): str.split always returns at least one element, so
        # this length check cannot fail as written.
        self.assertTrue(len(row["genes"].split(",")) > 0)
示例2: sortFile
def sortFile(self, filename, func, length=50000):
    """
    Sort the input file (``self.readfilename``) and write the sorted result to ``filename``.

    The comments and the header line of the input are written first. The data rows that follow are
    whatever ``self._merge`` yields for the partitions built by ``self._yieldPartitions``.

    :param filename: sorted filename
    :param func: function that converts each row of the input file to an unique, sortable key
    :param length: maximum number of lines in a partition
    """
    reader = GenericTsvReader(filename=self.readfilename, commentPrepend=self.commentPrepend,
                              delimiter=self.delimiter)
    comments = reader.getComments()

    # A reader that reports no field names is handled as a file with zero columns.
    fieldnames = reader.getFieldNames()
    if fieldnames is None:
        fieldnames = []

    # Maps each column name to its position in the header.
    fieldnameIndexes = collections.OrderedDict((name, pos) for pos, name in enumerate(fieldnames))

    iterable = iter(reader.getInputContentFP())
    partitions = self._yieldPartitions(iterable, func, fieldnameIndexes, length)

    # open() arguments are positional: its first parameter is called `name` in Python 2 but `file` in Python 3.
    with open(filename, 'wb', 64 * 1024) as writer:
        writer.write(comments)
        writer.write(self.delimiter.join(fieldnames) + "\n")
        writer.writelines(self._merge(partitions))  # generators are allowed as inputs to writelines function
示例3: test_basic_rendering
def test_basic_rendering(self):
    """Test that we can render a basic seg file as a gene list.

    Every output row must carry non-blank segment_start / segment_end values,
    a non-empty gene, a segment_num_probes value that parses as a non-zero
    float, and the sample name "Patient0".
    """
    inputFilename = "testdata/seg/Patient0.seg.txt"
    output_filename = "out/test_basic_rendering.gene_list.tsv"
    db_dir = self.config.get('DEFAULT', "dbDir")

    # Remove any output left over from a previous run.
    if os.path.exists(output_filename):
        os.remove(output_filename)

    annotator = Annotator()
    run_spec = RunSpecificationFactory.create_run_spec(
        "SEG_FILE", "GENE_LIST", inputFilename, output_filename,
        datasourceDir=db_dir, annotating_type=RunSpecification.ANNOTATE_SEGMENTS)
    annotator.initialize(run_spec)
    annotator.annotate()

    # Now check the output
    output_reader = GenericTsvReader(output_filename)
    for line_dict in output_reader:
        for boundary in ('segment_start', 'segment_end'):
            self.assertTrue(line_dict[boundary] is not None)
            self.assertTrue(line_dict[boundary].strip() != "")
        self.assertTrue("gene" in line_dict)
        self.assertTrue(len(line_dict["gene"]) > 0)
        # Passes only when the value parses as a float and is not zero.
        self.assertTrue(float(line_dict["segment_num_probes"]))
        self.assertTrue(line_dict['sample'] == "Patient0")
示例4: test_simple_seg_file_input
def test_simple_seg_file_input(self):
    """Test that we can read in a seg file, do no annotation, and output as SIMPLE_TSV.

    First checks that the input yields exactly 27 segments, then renders the
    segments and verifies the required columns and non-blank start/end values
    in the output.
    """
    inputFilename = "testdata/seg/Patient0.seg.txt"
    output_filename = "out/test_simple_seg_file_input.tsv"

    # Remove any output left over from a previous run.
    if os.path.exists(output_filename):
        os.remove(output_filename)

    # Count the segments directly so that an empty input is reported as 0.
    ic = MafliteInputMutationCreator(inputFilename, 'configs/seg_file_input.config')
    num_segs = sum(1 for _ in ic.createMutations())
    self.assertTrue(num_segs == 27, "Found %d segments when there should have been 27." % num_segs)

    # The segments were consumed by the count above, so build a fresh creator for rendering.
    ic = MafliteInputMutationCreator(inputFilename, 'configs/seg_file_input.config')
    segs = ic.createMutations()
    outputRenderer = SimpleOutputRenderer(output_filename, '')
    outputRenderer.renderMutations(segs)

    # Now check the output
    output_reader = GenericTsvReader(output_filename)
    required_cols = ["Sample", "Num_Probes", "Segment_Mean"]
    headers = output_reader.getFieldNames()
    for rcol in required_cols:
        self.assertTrue(rcol in headers)

    for line_dict in output_reader:
        for coord in ('start', 'end'):
            self.assertTrue(line_dict[coord] is not None)
            self.assertTrue(line_dict[coord].strip() != "")
示例5: testDuplicateAnnotation
def testDuplicateAnnotation(self):
    """
    Tests that the duplicate annotations are parsed correctly.

    Renders example.duplicate_annotation.vcf as a simple TSV and checks that
    both variant_status and sample_variant_status appear in the header and in
    the first row, with the values "2" and "0" respectively.
    """
    vcf_path = os.path.join("testdata", "vcf", "example.duplicate_annotation.vcf")
    tsv_path = os.path.join("out", "example.duplicate_annotation.out.tsv")

    creator = VcfInputMutationCreator(vcf_path)
    creator.createMutations()
    renderer = SimpleOutputRenderer(tsv_path)

    annotator = Annotator()
    annotator.setInputCreator(creator)
    annotator.setOutputRenderer(renderer)
    annotator.annotate()

    reader = GenericTsvReader(tsv_path)
    header = reader.getFieldNames()
    self.assertTrue("variant_status" in header, "variant_status field is missing in the header.")
    self.assertTrue("sample_variant_status" in header, "sample_variant_status is missing in the header.")

    # Only the first data row is inspected.
    first_row = reader.next()
    self.assertTrue("variant_status" in first_row, "variant_status field is missing in the row.")
    self.assertTrue("sample_variant_status" in first_row, "sample_variant_status is missing in the row.")

    self.assertEqual("2", first_row["variant_status"], "Incorrect value of variant_status.")
    self.assertEqual("0", first_row["sample_variant_status"], "Incorrect value of sample_variant_status")
示例6: _create_test_ds
def _create_test_ds(self, input_tsv, dir_name, index_cols):
    """Create a SNP LevelDB test datasource from a tsv file and load it.

    :param input_tsv: tsv file ("%" is used as the comment marker) that provides the data
    :param dir_name: directory under which the ``test_snp_leveldb`` datasource directory is recreated
    :param index_cols: list of index column names; these are excluded from the annotation columns
    :return: the datasource that DatasourceFactory builds from the generated config file
    """
    base_name = "test_snp_leveldb"
    full_name = dir_name + "/" + base_name
    # Single source of truth for the config path: the file written below is the file loaded at the end.
    config_filename = full_name + "/" + base_name + ".config"

    # Always start from an empty datasource directory.
    if os.path.exists(full_name):
        shutil.rmtree(full_name)
    os.makedirs(full_name)

    tsv_reader = GenericTsvReader(input_tsv, commentPrepend="%")
    # Work on a copy so the reader's own field-name list is not modified.
    annotation_cols = copy.copy(tsv_reader.getFieldNames())
    for icol in index_cols:
        if icol in annotation_cols:
            annotation_cols.remove(icol)

    ds_creator = SnpOnlyLevelDbDatasourceCreator()
    ds_creator.createDatasource(full_name, input_tsv, ",".join(index_cols), config_filename, "snp_leveldb",
                                base_name, "TEST", "exact", annotation_cols, [])

    return DatasourceFactory.createDatasource(os.path.abspath(config_filename), os.path.dirname(config_filename))
示例7: createDatasource
def createDatasource(self, destDir, ds_file, index_column_names, configFilename, ds_type, ds_name, ds_version,
                     ds_match_mode, annotation_column_names, indexCols):
    """
    Build a LevelDB database, plus its config file, from the rows of a tsv file.

    Each row is stored under a hash of its chrom/start/end/ref/alt values; the stored value is the
    comma-joined annotation columns of that row.

    :param destDir: directory in which ``<ds_name>.leveldb`` is created
    :param ds_file: tsv file to read
    :param index_column_names: comma-separated index column names; the first five are read, in order, as
        chrom, start, end, ref and alt
    :param configFilename: handed to ``self._createConfigFile``
    :param ds_type: not used in this method
    :param ds_name: datasource name; also the base name of the leveldb directory
    :param ds_version: handed to ``self._createConfigFile``
    :param ds_match_mode: not used in this method
    :param annotation_column_names: If None, assume all in the tsv (minus the index columns)
    :param indexCols: not used in this method
    """
    index_column_names = index_column_names.split(",")
    output_filename = destDir + "/" + ds_name + ".leveldb"
    src_file = os.path.basename(output_filename)
    db = leveldb.LevelDB(output_filename, create_if_missing=True)

    # When an index header itself starts with "#", read the file with "%" as the comment marker instead
    # (presumably so that the header line is not taken for a comment).
    has_hash_header = any(name.startswith("#") for name in index_column_names)
    comment_prepend = "%" if has_hash_header else "#"

    tsv_reader = GenericTsvReader(ds_file, commentPrepend=comment_prepend)

    if annotation_column_names is None:
        annotation_column_names = copy.copy(tsv_reader.getFieldNames())
        for index_name in index_column_names:
            if index_name in annotation_column_names:
                annotation_column_names.remove(index_name)

    log = logging.getLogger(__name__)
    log.info("Creating SNP LevelDB for the following index headers: " + str(index_column_names))
    log.info("Creating SNP LevelDB for the following data headers: " + str(annotation_column_names))

    # Create the config file
    self._createConfigFile(configFilename, src_file, ds_name, ds_version, index_column_names,
                           annotation_columns=annotation_column_names)

    batch = leveldb.WriteBatch()
    for row_num, row in enumerate(tsv_reader):
        chrom, start, end, ref, alt = [row[index_column_names[pos]] for pos in range(5)]
        key = SnpOnlyLevelDbDatasource.generate_hash(chrom, start, end, ref, alt)

        if row_num % 5000 == 0:
            log.info("Rendering %d entries" % (row_num))

        # Columns missing from a row are stored as empty strings.
        values = [row.get(col, "") for col in annotation_column_names]
        db.Put(key, ",".join(values))

    # NOTE(review): rows are stored with db.Put above and nothing is ever added to `batch`, so this call
    # writes an empty batch with sync=True. Confirm whether the rows were meant to go through the batch.
    db.Write(batch, sync=True)
示例8: testExposedColumns
def testExposedColumns(self):
    """Test that columns listed in the config file as exposed do not get the i_ prepend"""
    maf_path = self._annotateTest('testdata/maflite/tiny_maflite.maf.txt', "out/testExposedCols.maf.tsv",
                                  self._determine_db_dir())

    # Sanity checks to make sure that the generated maf file is not junk.
    self._validateTcgaMafContents(maf_path)

    # Check the columns, since the input has a couple of exposed columns.
    field_names = GenericTsvReader(maf_path).getFieldNames()
    for exposed in ('t_alt_count', 't_ref_count'):
        prefixed = "i_" + exposed
        self.assertFalse(prefixed in field_names, "i_ was prepended to " + exposed)
        self.assertTrue(exposed in field_names, exposed + " not found.")
示例9: _renderSortedTsv
def _renderSortedTsv(self, templateFilename, vcfFilename, tsvFilename, sampleNames, dataManager, inferGenotypes):
    """
    Turn a sorted tsv into a VCF

    Consecutive tsv rows are folded into one VCF record until ``self._isNewVcfRecordNeeded`` reports that
    the chromosome / position / reference allele of the next row requires a new record.

    :param templateFilename: basic VCF to model output VCF.
    :param vcfFilename: output VCF filename
    :param tsvFilename: input sorted tsv
    :param sampleNames: sample names that should be used in output
    :param dataManager: dataManager instance used in creating pyvcf records.
    :param inferGenotypes: whether we should try to infer the genotypes, since we may not have add GT explicitly
        on input
    """
    tempVcfReader = vcf.Reader(filename=templateFilename, strict_whitespace=True)
    pointer = open(vcfFilename, "w")
    tsvReader = GenericTsvReader(tsvFilename, delimiter=self.delimiter)
    vcfWriter = vcf.Writer(pointer, tempVcfReader, self.lineterminator)

    index = 0  # number of records written so far
    nrecords = 1000  # log progress and flush every `nrecords` written records
    chrom = None
    pos = None
    refAllele = None
    recordBuilder = None

    try:
        for m in tsvReader:
            if self._isNewVcfRecordNeeded(chrom, m["chr"], pos, m["start"], refAllele, m["ref_allele"]):
                # Emit the record accumulated so far before starting the next one.
                if recordBuilder is not None:
                    vcfWriter.write_record(recordBuilder.createRecord())
                    index += 1
                    if index % nrecords == 0:
                        self.logger.info("Rendered " + str(index) + " vcf records.")
                        vcfWriter.flush()

                chrom = m["chr"]
                pos = m["start"]
                refAllele = m["ref_allele"]
                recordBuilder = RecordBuilder(chrom, int(pos), refAllele, sampleNames)

            recordBuilder = self._parseRecordBuilder(m, recordBuilder, dataManager, inferGenotypes)

        # The last record is still pending when the input is exhausted; count it
        # so the summary below matches the number of records written.
        if recordBuilder is not None:
            vcfWriter.write_record(recordBuilder.createRecord())
            index += 1
    finally:
        # Release the output stream and the tsv reader even if rendering fails.
        vcfWriter.close()
        tsvReader.close()

    self.logger.info("Rendered all " + str(index) + " vcf records.")
示例10: _validateTcgaMafContents
def _validateTcgaMafContents(self, filename):
    """Unit-test helper giving a semblance that a valid TCGA MAF (2.4 config) file was created.

    Checks that the file is non-empty, that its comments contain '#version', and for every row: allele
    consistency, no carriage returns, no "__UNKNOWN__" values, the "i_" prefix on columns not listed in the
    output config, and "-" alleles for deletions and insertions.

    Note: This method has nothing to do with the TCGA validator.

    :param filename: path of the generated MAF file
    """
    configFile = ConfigUtils.createConfigParser(os.path.join("configs", "tcgaMAF2.4_output.config"))
    statinfo = os.stat(filename)
    self.assertTrue(statinfo.st_size > 0, "Generated MAF file (" + filename + ") is empty.")

    tsvReader = GenericTsvReader(filename)
    self.assertTrue(tsvReader.getComments().find('#version') != -1, "First line did not specify a version number")

    # The column lists do not change between rows, so read them once.
    # NOTE(review): if these config values are plain strings, the `in` tests below are substring matches
    # rather than exact column-name matches -- confirm this is intended.
    requiredColumns = configFile.get("general", "requiredColumns")
    optionalColumns = configFile.get("general", "optionalColumns")
    exposedColumns = configFile.get("general", "exposedColumns")

    ctr = 1
    for lineDict in tsvReader:

        # TODO: Re-enable when GENCODE and HGNC datasources are concordant (or Entrez_Gene_ID is in the gencode gtf)
        # if lineDict['Entrez_Gene_Id'] == "0":
        #     self.assertTrue(lineDict['Hugo_Symbol'] == "Unknown", "Entrez_Gene_Id was zero, but Hugo Symbol was not 'Unknown'.  Line: " + str(ctr))

        unknownKeys = []
        self.assertTrue(lineDict["Tumor_Seq_Allele1"] != lineDict["Tumor_Seq_Allele2"], "Reference and alternate were equal in TCGA MAF output on line %d (%s)" % (ctr, lineDict["Tumor_Seq_Allele1"]))
        self.assertTrue(lineDict["Tumor_Seq_Allele1"] == lineDict["Reference_Allele"], "Reference Allele should match Tumor_Seq_Allele1 on line " + str(ctr))

        # NOTE(review): this counter is reset for every row, so the "< 10" check below can never fail;
        # it may have been meant to accumulate across rows -- confirm before changing.
        uniprot_aa_xform_counter = 0

        for k, value in lineDict.items():
            if value == "__UNKNOWN__":
                unknownKeys.append(k)

            self.assertTrue('\r' not in value, "Carriage return character found in an annotation value.")

            if (k not in requiredColumns) and (k not in optionalColumns) and (k not in exposedColumns):
                self.assertTrue(k.startswith("i_"), "Internal column was not prepended with 'i_'")

        if lineDict['UniProt_AApos'] == "0":
            uniprot_aa_xform_counter += 1

        if lineDict["Variant_Type"] == VariantClassification.VT_DEL:
            self.assertTrue(lineDict["Tumor_Seq_Allele2"] == "-")

        if lineDict["Variant_Type"] == VariantClassification.VT_INS:
            self.assertTrue(lineDict["Reference_Allele"] == "-")

        unknownKeys.sort()
        self.assertTrue(len(unknownKeys) == 0, "__UNKNOWN__ values (" + str(len(unknownKeys)) + ") seen on line " + str(ctr) + ", in fields: " + ", ".join(unknownKeys))
        self.assertTrue(uniprot_aa_xform_counter < 10, "Too many uniprot aa xform values are zero (" + str(uniprot_aa_xform_counter) + ").  This is probably an error.")

        ctr += 1
示例11: _validateTcgaMafContents
def _validateTcgaMafContents(self, filename):
    """
    This is a utility, private method for unit tests to get a semblance that a valid maf file was created.

    Checks that the file is non-empty, that its comments contain '#version', and for every row: the
    Hugo symbol is "Unknown" when the Entrez gene id is "0", no carriage returns, no "__UNKNOWN__" values,
    and the "i_" prefix on columns not listed in the MAF 2.3 output config.

    Note: This method has nothing to do with the TCGA validator.

    TODO: This is code duplication from TCGA MAF Output RendererTest.  This should be refactored into a base class
    (to preserve self.assertTrue, etc).

    :param filename: path of the generated MAF file
    """
    statinfo = os.stat(filename)
    self.assertTrue(statinfo.st_size > 0, "Generated MAF file (" + filename + ") is empty.")

    tsvReader = GenericTsvReader(filename)
    self.assertTrue(tsvReader.getComments().find('#version') != -1, "First line did not specify a version number")

    # Parse the output config once; it does not change between rows or columns.
    # NOTE(review): if these config values are plain strings, the `in` tests below are substring matches
    # rather than exact column-name matches -- confirm this is intended.
    configFile = ConfigUtils.createConfigParser('configs/tcgaMAF2.3_output.config')
    requiredColumns = configFile.get("general", "requiredColumns")
    optionalColumns = configFile.get("general", "optionalColumns")

    ctr = 1
    for lineDict in tsvReader:
        if lineDict['Entrez_Gene_Id'] == "0":
            self.assertTrue(lineDict['Hugo_Symbol'] == "Unknown",
                            "Entrez_Gene_Id was zero, but Hugo Symbol was not 'Unknown'.  Line: " + str(ctr))

        unknownKeys = []
        for k, value in lineDict.items():
            if value == "__UNKNOWN__":
                unknownKeys.append(k)

            self.assertTrue('\r' not in value, "Carriage return character found in an annotation value.")

            if (k not in requiredColumns) and (k not in optionalColumns):
                self.assertTrue(k.startswith("i_"), "Internal column was not prepended with 'i_'")

        unknownKeys.sort()
        self.assertTrue(len(unknownKeys) == 0,
                        "__UNKNOWN__ values (" + str(len(unknownKeys)) + ") seen on line " + str(
                            ctr) + ", in fields: " + ", ".join(unknownKeys))

        ctr += 1
示例12: testBasicAnnotation
def testBasicAnnotation(self):
    """ Test annotation from a generic TSV based on a transcript annotation.  Only confirms the proper headers of the output. """
    # We need a gaf data source to annotate gene
    gene_ds = TestUtils.createTranscriptProviderDatasource(config=self.config)
    transcript_ds = DatasourceFactory.createDatasource(
        "testdata/small_transcript_tsv_ds/small_transcript_tsv_ds.config",
        "testdata/small_transcript_tsv_ds/")
    requested_path = "out/genericTranscriptTest.out.tsv"

    annotator = Annotator()
    annotator.setInputCreator(MafliteInputMutationCreator("testdata/maflite/Patient0.snp.maf.txt"))
    annotator.setOutputRenderer(SimpleOutputRenderer(requested_path))
    for datasource in (gene_ds, transcript_ds):
        annotator.addDatasource(datasource)

    # The file to inspect is whatever annotate() returns.
    rendered_path = annotator.annotate()

    field_names = GenericTsvReader(rendered_path).getFieldNames()
    self.assertTrue("refseq_test_mRNA_Id" in field_names, "refseq_test_mRNA_Id not found in headers: " + str(field_names))
    self.assertTrue("refseq_test_prot_Id" in field_names, "refseq_test_prot_Id not found in headers: " + str(field_names))
示例13: testTCGAMAFAsInputAndQuickAnnotate
def testTCGAMAFAsInputAndQuickAnnotate(self):
    """ Test that we can take in a TCGA MAF (using MAFLITE), do annotating, and still render it properly """
    maf_in = "testdata/maf/Patient0.maf.annotated"
    creator = MafliteInputMutationCreator(maf_in, 'configs/maflite_input.config')
    maf_out = "out/testTCGAMAFAsInputAndQuickAnnotate.tsv"
    renderer = TcgaMafOutputRenderer(maf_out, 'configs/tcgaMAF2.4_output.config')

    annotator = Annotator()
    annotator.setInputCreator(creator)
    annotator.setOutputRenderer(renderer)
    annotator.addDatasource(
        DatasourceFactory.createDatasource("testdata/thaga_janakari_gene_ds/hg19/tj_data.config",
                                           "testdata/thaga_janakari_gene_ds/hg19/"))
    annotator.annotate()

    self.assertTrue(os.stat(maf_out).st_size > 0, "Generated MAF file (" + maf_out + ") is empty.")

    reader_in = GenericTsvReader(maf_in)
    reader_out = GenericTsvReader(maf_out)

    self.assertTrue(reader_out.getComments().find('#version') != -1, "First line did not specify a version number")
    self.assertTrue("i_TJ_Data_Why" in reader_out.getFieldNames(), "New field missing (i_TJ_Data_Why) from header")
    self.assertTrue("i_TJ_Data_Who" in reader_out.getFieldNames(), "New field missing (i_TJ_Data_Who) from header")

    # Line totals are data rows plus comment lines, for the output and the input respectively.
    ctrOut = sum(1 for _ in reader_out)
    ctrIn = sum(1 for _ in reader_in)
    ctrIn = ctrIn + len(reader_in.getCommentsAsList())
    ctrOut = ctrOut + len(reader_out.getCommentsAsList())

    self.assertTrue(ctrOut == (ctrIn + 2), "Output file should have same number of lines plus two (for maf version and Oncotator version comments) as input file.  (In,Out): " + str(ctrIn) + ", " + str(ctrOut))
示例14: testAnnotationWithExampleVcf
def testAnnotationWithExampleVcf(self):
    """
    Tests whether parsed annotations match the actual annotations in a simple TSV.  Missing format fields yield -->""  ".,." --> ","

    The rendered output is compared against example.expected.out.tsv: same column names, same number of
    rows, and equal values in every column (cells that are null in both files count as equal).
    """
    inputFilename = os.path.join("testdata", "vcf", "example.vcf")
    outputFilename = os.path.join("out", "example.out.tsv")
    expectedOutputFilename = os.path.join("testdata", "vcf", "example.expected.out.tsv")

    creator = VcfInputMutationCreator(inputFilename)
    creator.createMutations()
    renderer = SimpleOutputRenderer(outputFilename)
    annotator = Annotator()
    annotator.setInputCreator(creator)
    annotator.setOutputRenderer(renderer)
    annotator.annotate()

    # The header row of the output sits right after its comment lines.
    tsvReader = GenericTsvReader(outputFilename)
    current = pandas.read_csv(outputFilename, sep='\t', header=len(tsvReader.getCommentsAsList()))
    expected = pandas.read_csv(expectedOutputFilename, sep='\t')

    currentColNames = set(current.columns)
    expectedColNames = set(expected.columns)

    # Compare by value: `is 0` only worked by accident of small-int caching.
    self.assertTrue(len(currentColNames.symmetric_difference(expectedColNames)) == 0,
                    "Should have the same columns")
    self.assertTrue(len(current.index) == len(expected.index), "Should have the same number of rows")

    for colName in currentColNames:
        both_null = pandas.isnull(current[colName]) & pandas.isnull(expected[colName])
        matches = (current[colName] == expected[colName]) | both_null
        self.assertTrue(sum(matches) == len(current.index),
                        "Should have the same values in column " + colName + ": \n" +
                        str(current[colName]) + "\nvs\n" + str(expected[colName]))
示例15: testMissingFilter
def testMissingFilter(self):
    """
    Tests that the missing FILTER fields are parsed correctly.

    The rendered output is compared against example.expected.missing_filters.out.tsv: same column names,
    same number of rows, and equal values in every column (cells that are null in both files count as equal).
    """
    inputFilename = os.path.join("testdata", "vcf", "example.missing_filters.vcf")
    outputFilename = os.path.join("out", "example.missing_filters.out.tsv")
    expectedOutputFilename = os.path.join("testdata", "vcf", "example.expected.missing_filters.out.tsv")

    creator = VcfInputMutationCreator(inputFilename)
    creator.createMutations()
    renderer = SimpleOutputRenderer(outputFilename)
    annotator = Annotator()
    annotator.setInputCreator(creator)
    annotator.setOutputRenderer(renderer)
    annotator.annotate()

    # The header row of the output sits right after its comment lines.
    tsvReader = GenericTsvReader(outputFilename)
    current = pandas.read_csv(outputFilename, sep='\t', header=len(tsvReader.getCommentsAsList()))
    expected = pandas.read_csv(expectedOutputFilename, sep='\t')

    currentColNames = set(current.columns)
    expectedColNames = set(expected.columns)

    # Compare by value: `is 0` only worked by accident of small-int caching.
    self.assertTrue(len(currentColNames.symmetric_difference(expectedColNames)) == 0,
                    "Should have the same columns")
    self.assertTrue(len(current.index) == len(expected.index), "Should have the same number of rows")

    for colName in currentColNames:
        both_null = pandas.isnull(current[colName]) & pandas.isnull(expected[colName])
        matches = (current[colName] == expected[colName]) | both_null
        self.assertTrue(sum(matches) == len(current.index), "Should have the same values in column " + colName)