本文整理汇总了Python中rgt.GenomicRegionSet.GenomicRegionSet.read方法的典型用法代码示例。如果您正苦于以下问题:Python GenomicRegionSet.read方法的具体用法?Python GenomicRegionSet.read怎么用?Python GenomicRegionSet.read使用的例子?那么, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类rgt.GenomicRegionSet.GenomicRegionSet
的用法示例。
在下文中一共展示了GenomicRegionSet.read方法的12个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: estimate_bias_vom
# 需要导入模块: from rgt.GenomicRegionSet import GenomicRegionSet [as 别名]
# 或者: from rgt.GenomicRegionSet.GenomicRegionSet import read [as 别名]
def estimate_bias_vom(args):
regions = GenomicRegionSet("regions")
regions.read(args.regions_file)
create_signal(args, regions)
hmm_data = HmmData()
learn_dependency_model = hmm_data.get_dependency_model()
slim_dimont_predictor = hmm_data.get_slim_dimont_predictor()
test_fa = hmm_data.get_default_test_fa()
shutil.copy(test_fa, args.output_location)
os.chdir(args.output_location)
print(os.getcwd())
output_fname_f_obs = os.path.join(args.output_location, "{}_f_obs.fa".format(str(args.k_nb)))
output_fname_f_exp = os.path.join(args.output_location, "{}_f_exp.fa".format(str(args.k_nb)))
output_fname_r_obs = os.path.join(args.output_location, "{}_r_obs.fa".format(str(args.k_nb)))
output_fname_r_exp = os.path.join(args.output_location, "{}_r_exp.fa".format(str(args.k_nb)))
infix = "{}_f_obs".format(str(args.k_nb))
create_model(args, output_fname_f_obs, infix, learn_dependency_model, slim_dimont_predictor)
infix = "{}_f_exp".format(str(args.k_nb))
create_model(args, output_fname_f_exp, infix, learn_dependency_model, slim_dimont_predictor)
infix = "{}_r_obs".format(str(args.k_nb))
create_model(args, output_fname_r_obs, infix, learn_dependency_model, slim_dimont_predictor)
infix = "{}_r_exp".format(str(args.k_nb))
create_model(args, output_fname_r_exp, infix, learn_dependency_model, slim_dimont_predictor)
os.remove(os.path.join(args.output_location, "test.fa"))
compute_bias(args)
示例2: get_bc_signal
# 需要导入模块: from rgt.GenomicRegionSet import GenomicRegionSet [as 别名]
# 或者: from rgt.GenomicRegionSet.GenomicRegionSet import read [as 别名]
def get_bc_signal(arguments):
(mpbs_name, mpbs_file1, mpbs_file2, reads_file1, reads_file2, organism,
window_size, forward_shift, reverse_shift, bias_table1, bias_table2) = arguments
mpbs1 = GenomicRegionSet("Motif Predicted Binding Sites of Condition1")
mpbs1.read(mpbs_file1)
mpbs2 = GenomicRegionSet("Motif Predicted Binding Sites of Condition2")
mpbs2.read(mpbs_file2)
mpbs = mpbs1.combine(mpbs2, output=True)
mpbs.sort()
bam1 = Samfile(reads_file1, "rb")
bam2 = Samfile(reads_file2, "rb")
genome_data = GenomeData(organism)
fasta = Fastafile(genome_data.get_genome())
signal_1 = np.zeros(window_size)
signal_2 = np.zeros(window_size)
motif_len = None
pwm = dict([("A", [0.0] * window_size), ("C", [0.0] * window_size),
("G", [0.0] * window_size), ("T", [0.0] * window_size),
("N", [0.0] * window_size)])
mpbs_regions = mpbs.by_names([mpbs_name])
num_motif = len(mpbs_regions)
# Fetch bias corrected signal
for region in mpbs_regions:
if motif_len is None:
motif_len = region.final - region.initial
mid = (region.final + region.initial) / 2
p1 = mid - window_size / 2
p2 = mid + window_size / 2
if p1 <= 0:
continue
# Fetch raw signal
signal1 = bias_correction(chrom=region.chrom, start=p1, end=p2, bam=bam1,
bias_table=bias_table1, genome_file_name=genome_data.get_genome(),
forward_shift=forward_shift, reverse_shift=reverse_shift)
signal2 = bias_correction(chrom=region.chrom, start=p1, end=p2, bam=bam2,
bias_table=bias_table2, genome_file_name=genome_data.get_genome(),
forward_shift=forward_shift, reverse_shift=reverse_shift)
if len(signal1) != len(signal_1) or len(signal2) != len(signal_2):
continue
# smooth the signal
signal_1 = np.add(signal_1, np.array(signal1))
signal_2 = np.add(signal_2, np.array(signal2))
update_pwm(pwm, fasta, region, p1, p2)
return signal_1, signal_2, motif_len, pwm, num_motif
示例3: test_fisher_table
# 需要导入模块: from rgt.GenomicRegionSet import GenomicRegionSet [as 别名]
# 或者: from rgt.GenomicRegionSet.GenomicRegionSet import read [as 别名]
def test_fisher_table(self):
regions = GenomicRegionSet("regions")
regions.read(os.path.join(os.path.dirname(__file__), "test.bed"))
mpbs = GenomicRegionSet("mpbs")
mpbs.read(os.path.join(os.path.dirname(__file__), "test_mpbs.bed"))
i, ni, gs, ms = fisher_table("GGT1", regions, mpbs)
self.assertEqual(i, 0)
self.assertEqual(ni, 36)
i, ni, gs, ms = fisher_table("HIC2", regions, mpbs)
self.assertEqual(i, 8)
self.assertEqual(ni, 28)
i, ni, gs, ms = fisher_table("RAC2", regions, mpbs, gene_set=True, mpbs_set=True)
self.assertEqual(len(gs), 0)
self.assertEqual(len(ms), 0)
示例4: get_raw_tracks
# 需要导入模块: from rgt.GenomicRegionSet import GenomicRegionSet [as 别名]
# 或者: from rgt.GenomicRegionSet.GenomicRegionSet import read [as 别名]
def get_raw_tracks(args):
# Initializing Error Handler
err = ErrorHandler()
if len(args.input_files) != 2:
err.throw_error("ME_FEW_ARG", add_msg="You must specify reads and regions file.")
output_fname = os.path.join(args.output_location, "{}.wig".format(args.output_prefix))
bam = Samfile(args.input_files[0], "rb")
regions = GenomicRegionSet("Interested regions")
regions.read(args.input_files[1])
regions.merge()
reads_file = GenomicSignal()
with open(output_fname, "a") as output_f:
for region in regions:
# Raw counts
signal = [0.0] * (region.final - region.initial)
for read in bam.fetch(region.chrom, region.initial, region.final):
if not read.is_reverse:
cut_site = read.pos + args.forward_shift
if region.initial <= cut_site < region.final:
signal[cut_site - region.initial] += 1.0
else:
cut_site = read.aend + args.reverse_shift - 1
if region.initial <= cut_site < region.final:
signal[cut_site - region.initial] += 1.0
if args.norm:
signal = reads_file.boyle_norm(signal)
perc = scoreatpercentile(signal, 98)
std = np.std(signal)
signal = reads_file.hon_norm_atac(signal, perc, std)
output_f.write("fixedStep chrom=" + region.chrom + " start=" + str(region.initial + 1) + " step=1\n" +
"\n".join([str(e) for e in np.nan_to_num(signal)]) + "\n")
output_f.close()
if args.bigWig:
genome_data = GenomeData(args.organism)
chrom_sizes_file = genome_data.get_chromosome_sizes()
bw_filename = os.path.join(args.output_location, "{}.bw".format(args.output_prefix))
os.system(" ".join(["wigToBigWig", output_fname, chrom_sizes_file, bw_filename, "-verbose=0"]))
os.remove(output_fname)
示例5: test_subtract_exact
# 需要导入模块: from rgt.GenomicRegionSet import GenomicRegionSet [as 别名]
# 或者: from rgt.GenomicRegionSet.GenomicRegionSet import read [as 别名]
def test_subtract_exact(self):
reference = GenomicRegionSet("reference")
reference.read(os.path.join(os.path.dirname(__file__), "test_result.bed"))
background = GenomicRegionSet("background")
background.read(os.path.join(os.path.dirname(__file__), "test_background.bed"))
target = GenomicRegionSet("target")
target.read(os.path.join(os.path.dirname(__file__), "test_target.bed"))
background_tmp = background.subtract(target, exact=True)
reference.sort()
self.assertEqual(len(background_tmp.sequences), len(reference.sequences))
for region, region_ref in zip(background_tmp.sequences, reference.sequences):
self.assertEqual(region.__cmp__(region_ref), 0)
示例6: get_raw_signal
# 需要导入模块: from rgt.GenomicRegionSet import GenomicRegionSet [as 别名]
# 或者: from rgt.GenomicRegionSet.GenomicRegionSet import read [as 别名]
def get_raw_signal(arguments):
(mpbs_name, mpbs_file1, mpbs_file2, reads_file1, reads_file2, organism,
window_size, forward_shift, reverse_shift) = arguments
mpbs1 = GenomicRegionSet("Motif Predicted Binding Sites of Condition1")
mpbs1.read(mpbs_file1)
mpbs2 = GenomicRegionSet("Motif Predicted Binding Sites of Condition2")
mpbs2.read(mpbs_file2)
mpbs = mpbs1.combine(mpbs2, output=True)
mpbs.sort()
bam1 = Samfile(reads_file1, "rb")
bam2 = Samfile(reads_file2, "rb")
genome_data = GenomeData(organism)
fasta = Fastafile(genome_data.get_genome())
signal_1 = np.zeros(window_size)
signal_2 = np.zeros(window_size)
motif_len = None
pwm = dict([("A", [0.0] * window_size), ("C", [0.0] * window_size),
("G", [0.0] * window_size), ("T", [0.0] * window_size),
("N", [0.0] * window_size)])
mpbs_regions = mpbs.by_names([mpbs_name])
num_motif = len(mpbs_regions)
for region in mpbs_regions:
if motif_len is None:
motif_len = region.final - region.initial
mid = (region.final + region.initial) / 2
p1 = mid - window_size / 2
p2 = mid + window_size / 2
if p1 <= 0:
continue
# Fetch raw signal
for read in bam1.fetch(region.chrom, p1, p2):
# check if the read is unmapped, according to issue #112
if read.is_unmapped:
continue
if not read.is_reverse:
cut_site = read.pos + forward_shift
if p1 <= cut_site < p2:
signal_1[cut_site - p1] += 1.0
else:
cut_site = read.aend + reverse_shift - 1
if p1 <= cut_site < p2:
signal_1[cut_site - p1] += 1.0
for read in bam2.fetch(region.chrom, p1, p2):
# check if the read is unmapped, according to issue #112
if read.is_unmapped:
continue
if not read.is_reverse:
cut_site = read.pos + forward_shift
if p1 <= cut_site < p2:
signal_2[cut_site - p1] += 1.0
else:
cut_site = read.aend + reverse_shift - 1
if p1 <= cut_site < p2:
signal_2[cut_site - p1] += 1.0
update_pwm(pwm, fasta, region, p1, p2)
return signal_1, signal_2, motif_len, pwm, num_motif
示例7: diff_analysis_run
# 需要导入模块: from rgt.GenomicRegionSet import GenomicRegionSet [as 别名]
# 或者: from rgt.GenomicRegionSet.GenomicRegionSet import read [as 别名]
def diff_analysis_run(args):
# Initializing Error Handler
err = ErrorHandler()
output_location = os.path.join(args.output_location, "Lineplots")
try:
if not os.path.isdir(output_location):
os.makedirs(output_location)
except Exception:
err.throw_error("MM_OUT_FOLDER_CREATION")
# Check if the index file exists
base_name1 = "{}.bai".format(args.reads_file1)
if not os.path.exists(base_name1):
pysam.index(args.reads_file1)
base_name2 = "{}.bai".format(args.reads_file2)
if not os.path.exists(base_name2):
pysam.index(args.reads_file2)
mpbs1 = GenomicRegionSet("Motif Predicted Binding Sites of Condition1")
mpbs1.read(args.mpbs_file1)
mpbs2 = GenomicRegionSet("Motif Predicted Binding Sites of Condition2")
mpbs2.read(args.mpbs_file2)
mpbs = mpbs1.combine(mpbs2, output=True)
mpbs.sort()
mpbs.remove_duplicates()
mpbs_name_list = list(set(mpbs.get_names()))
signal_dict_by_tf_1 = dict()
signal_dict_by_tf_2 = dict()
motif_len_dict = dict()
motif_num_dict = dict()
pwm_dict_by_tf = dict()
pool = Pool(processes=args.nc)
# differential analysis using bias corrected signal
if args.bc:
hmm_data = HmmData()
table_F = hmm_data.get_default_bias_table_F_ATAC()
table_R = hmm_data.get_default_bias_table_R_ATAC()
bias_table1 = BiasTable().load_table(table_file_name_F=table_F, table_file_name_R=table_R)
bias_table2 = BiasTable().load_table(table_file_name_F=table_F, table_file_name_R=table_R)
mpbs_list = list()
for mpbs_name in mpbs_name_list:
mpbs_list.append((mpbs_name, args.mpbs_file1, args.mpbs_file2, args.reads_file1, args.reads_file2,
args.organism, args.window_size, args.forward_shift, args.reverse_shift,
bias_table1, bias_table2))
try:
res = pool.map(get_bc_signal, mpbs_list)
except Exception:
logging.exception("get bias corrected signal failed")
# differential analysis using raw signal
else:
mpbs_list = list()
for mpbs_name in mpbs_name_list:
mpbs_list.append((mpbs_name, args.mpbs_file1, args.mpbs_file2, args.reads_file1, args.reads_file2,
args.organism, args.window_size, args.forward_shift, args.reverse_shift))
try:
res = pool.map(get_raw_signal, mpbs_list)
except Exception:
logging.exception("get raw signal failed")
for idx, mpbs_name in enumerate(mpbs_name_list):
signal_dict_by_tf_1[mpbs_name] = res[idx][0]
signal_dict_by_tf_2[mpbs_name] = res[idx][1]
motif_len_dict[mpbs_name] = res[idx][2]
pwm_dict_by_tf[mpbs_name] = res[idx][3]
motif_num_dict[mpbs_name] = res[idx][4]
if args.factor1 is None or args.factor2 is None:
args.factor1, args.factor2 = compute_factors(signal_dict_by_tf_1, signal_dict_by_tf_2)
output_factor(args, args.factor1, args.factor2)
if args.output_profiles:
output_profiles(mpbs_name_list, signal_dict_by_tf_1, output_location, args.condition1)
output_profiles(mpbs_name_list, signal_dict_by_tf_2, output_location, args.condition2)
ps_tc_results_by_tf = dict()
plots_list = list()
for mpbs_name in mpbs_name_list:
plots_list.append((mpbs_name, motif_num_dict[mpbs_name], signal_dict_by_tf_1[mpbs_name],
signal_dict_by_tf_2[mpbs_name], args.factor1, args.factor2, args.condition1,
args.condition2, pwm_dict_by_tf[mpbs_name], output_location, args.window_size,
args.standardize))
pool.map(line_plot, plots_list)
for mpbs_name in mpbs_name_list:
res = get_ps_tc_results(signal_dict_by_tf_1[mpbs_name], signal_dict_by_tf_2[mpbs_name],
args.factor1, args.factor2, motif_num_dict[mpbs_name], motif_len_dict[mpbs_name])
#
# # only use the factors whose protection scores are greater than 0
# if res[0] > 0 and res[1] < 0:
ps_tc_results_by_tf[mpbs_name] = res
#.........这里部分代码省略.........
示例8: test_get_fisher_dict
# 需要导入模块: from rgt.GenomicRegionSet import GenomicRegionSet [as 别名]
# 或者: from rgt.GenomicRegionSet.GenomicRegionSet import read [as 别名]
def test_get_fisher_dict(self):
motif_names = ["TFIP11", "ACO2", "HIC2", "HAT5"]
regions = GenomicRegionSet("regions")
regions.read(os.path.join(os.path.dirname(__file__), "test.bed"))
mpbs = GenomicRegionSet("mpbs")
mpbs.read(os.path.join(os.path.dirname(__file__), "test_mpbs.bed"))
regions2 = GenomicRegionSet("regions2")
regions2.read(os.path.join(os.path.dirname(__file__), "test2.bed"))
mpbs2 = GenomicRegionSet("mpbs2")
mpbs2.read(os.path.join(os.path.dirname(__file__), "test2_mpbs.bed"))
regions3 = GenomicRegionSet("regions3")
regions3.read(os.path.join(os.path.dirname(__file__), "test3.bed"))
mpbs3 = GenomicRegionSet("mpbs3")
mpbs3.read(os.path.join(os.path.dirname(__file__), "test3_mpbs.bed"))
result = get_fisher_dict(motif_names, regions, mpbs)
intersecting = result[0]
not_intersecting = result[1]
for mn in ["TFIP11", "ACO2", "HAT5"]:
self.assertEqual(intersecting[mn], 0)
self.assertEqual(intersecting["HIC2"], 8)
for mn in motif_names:
self.assertEqual(intersecting[mn]+not_intersecting[mn], 36)
result = get_fisher_dict(motif_names, regions, mpbs, gene_set=True, mpbs_set=True)
gs = result[2]
ms = result[3]
for mn in ["TFIP11", "ACO2", "HAT5"]:
self.assertEqual(len(gs[mn]), 0)
self.assertEqual(len(gs["HIC2"]), 8)
self.assertEqual(len(ms["HIC2"]), 8)
示例9: chip_evaluate
# 需要导入模块: from rgt.GenomicRegionSet import GenomicRegionSet [as 别名]
# 或者: from rgt.GenomicRegionSet.GenomicRegionSet import read [as 别名]
def chip_evaluate(args):
# Evaluate Statistics
fpr = dict()
tpr = dict()
roc_auc_1 = dict()
roc_auc_10 = dict()
roc_auc_50 = dict()
roc_auc_100 = dict()
recall = dict()
precision = dict()
prc_auc_1 = dict()
prc_auc_10 = dict()
prc_auc_50 = dict()
prc_auc_100 = dict()
footprint_file = args.footprint_file.split(",")
footprint_name = args.footprint_name.split(",")
footprint_type = args.footprint_type.split(",")
max_score = 0
if "SEG" in footprint_type:
mpbs_regions = GenomicRegionSet("TFBS")
mpbs_regions.read(args.tfbs_file)
# Verifying the maximum score of the MPBS file
for region in iter(mpbs_regions):
score = int(region.data.split("\t")[0])
if score > max_score:
max_score = score
max_score += 1
max_points = []
for i in range(len(footprint_file)):
footprints_regions = GenomicRegionSet("Footprints Prediction")
footprints_regions.read(footprint_file[i])
footprints_regions.sort()
if footprint_type[i] == "SEG":
# Increasing the score of MPBS entry once if any overlaps found in the predicted footprints.
increased_score_mpbs_regions = GenomicRegionSet("Increased Regions")
intersect_regions = mpbs_regions.intersect(footprints_regions, mode=OverlapType.ORIGINAL)
for region in iter(intersect_regions):
region.data = str(int(region.data.split("\t")[0]) + max_score)
increased_score_mpbs_regions.add(region)
# Keep the score of remained MPBS entry unchanged
without_intersect_regions = mpbs_regions.subtract(footprints_regions, whole_region=True)
for region in iter(without_intersect_regions):
increased_score_mpbs_regions.add(region)
increased_score_mpbs_regions.sort_score()
fpr[i], tpr[i], roc_auc_1[i], roc_auc_10[i], roc_auc_50[i], roc_auc_100[i] = \
roc_curve(increased_score_mpbs_regions)
recall[i], precision[i], prc_auc_1[i], prc_auc_10[i], prc_auc_50[i], prc_auc_100[i] = \
precision_recall_curve(increased_score_mpbs_regions)
max_points.append(len(intersect_regions))
elif footprint_type[i] == "SC":
footprints_regions.sort_score()
fpr[i], tpr[i], roc_auc_1[i], roc_auc_10[i], roc_auc_50[i], roc_auc_100[i] = \
roc_curve(footprints_regions)
recall[i], precision[i], prc_auc_1[i], prc_auc_10[i], prc_auc_50[i], prc_auc_100[i] = \
precision_recall_curve(footprints_regions)
max_points.append(len(footprints_regions))
# Output the statistics results into text
stats_fname = os.path.join(args.output_location, "{}_stats.txt".format(args.output_prefix))
stats_header = ["METHOD", "AUC_100", "AUC_50", "AUC_10", "AUC_1", "AUPR_100", "AUPR_50", "AUPR_10", "AUPR_1"]
with open(stats_fname, "w") as stats_file:
stats_file.write("\t".join(stats_header) + "\n")
for i in range(len(footprint_name)):
stats_file.write(footprint_name[i] + "\t" +
str(roc_auc_100[i]) + "\t" + str(roc_auc_50[i]) + "\t" + str(roc_auc_10[i]) + "\t" +
str(roc_auc_1[i]) + "\t" + str(prc_auc_100[i]) + "\t" + str(prc_auc_50[i]) + "\t" +
str(prc_auc_10[i]) + "\t" + str(prc_auc_1[i]) + "\n")
# Output the curves
if args.print_roc_curve:
label_x = "False Positive Rate"
label_y = "True Positive Rate"
curve_name = "ROC"
plot_curve(footprint_name, args.output_location, fpr, tpr, roc_auc_100, label_x, label_y, args.output_prefix,
curve_name, max_points=max_points)
if args.print_pr_curve:
label_x = "Recall"
label_y = "Precision"
curve_name = "PRC"
plot_curve(footprint_name, args.output_location, recall, precision, prc_auc_100, label_x, label_y,
args.output_prefix, curve_name, max_points=max_points)
output_points(footprint_name, args.output_location, args.output_prefix, fpr, tpr, recall, precision)
示例10: get_bc_tracks
# 需要导入模块: from rgt.GenomicRegionSet import GenomicRegionSet [as 别名]
# 或者: from rgt.GenomicRegionSet.GenomicRegionSet import read [as 别名]
def get_bc_tracks(args):
# Initializing Error Handler
err = ErrorHandler()
if len(args.input_files) != 2:
err.throw_error("ME_FEW_ARG", add_msg="You must specify reads and regions file.")
regions = GenomicRegionSet("Interested regions")
regions.read(args.input_files[1])
regions.merge()
reads_file = GenomicSignal()
bam = Samfile(args.input_files[0], "rb")
genome_data = GenomeData(args.organism)
fasta = Fastafile(genome_data.get_genome())
hmm_data = HmmData()
if args.bias_table:
bias_table_list = args.bias_table.split(",")
bias_table = BiasTable().load_table(table_file_name_F=bias_table_list[0],
table_file_name_R=bias_table_list[1])
else:
table_F = hmm_data.get_default_bias_table_F_ATAC()
table_R = hmm_data.get_default_bias_table_R_ATAC()
bias_table = BiasTable().load_table(table_file_name_F=table_F,
table_file_name_R=table_R)
if args.strand_specific:
fname_forward = os.path.join(args.output_location, "{}_forward.wig".format(args.output_prefix))
fname_reverse = os.path.join(args.output_location, "{}_reverse.wig".format(args.output_prefix))
f_forward = open(fname_forward, "a")
f_reverse = open(fname_reverse, "a")
for region in regions:
signal_f, signal_r = reads_file.get_bc_signal_by_fragment_length(
ref=region.chrom, start=region.initial, end=region.final, bam=bam, fasta=fasta, bias_table=bias_table,
forward_shift=args.forward_shift, reverse_shift=args.reverse_shift, min_length=None, max_length=None,
strand=True)
if args.norm:
signal_f = reads_file.boyle_norm(signal_f)
perc = scoreatpercentile(signal_f, 98)
std = np.std(signal_f)
signal_f = reads_file.hon_norm_atac(signal_f, perc, std)
signal_r = reads_file.boyle_norm(signal_r)
perc = scoreatpercentile(signal_r, 98)
std = np.std(signal_r)
signal_r = reads_file.hon_norm_atac(signal_r, perc, std)
f_forward.write("fixedStep chrom=" + region.chrom + " start=" + str(region.initial + 1) + " step=1\n" +
"\n".join([str(e) for e in np.nan_to_num(signal_f)]) + "\n")
f_reverse.write("fixedStep chrom=" + region.chrom + " start=" + str(region.initial + 1) + " step=1\n" +
"\n".join([str(-e) for e in np.nan_to_num(signal_r)]) + "\n")
f_forward.close()
f_reverse.close()
if args.bigWig:
genome_data = GenomeData(args.organism)
chrom_sizes_file = genome_data.get_chromosome_sizes()
bw_filename = os.path.join(args.output_location, "{}_forward.bw".format(args.output_prefix))
os.system(" ".join(["wigToBigWig", fname_forward, chrom_sizes_file, bw_filename, "-verbose=0"]))
os.remove(fname_forward)
bw_filename = os.path.join(args.output_location, "{}_reverse.bw".format(args.output_prefix))
os.system(" ".join(["wigToBigWig", fname_reverse, chrom_sizes_file, bw_filename, "-verbose=0"]))
os.remove(fname_reverse)
else:
output_fname = os.path.join(args.output_location, "{}.wig".format(args.output_prefix))
with open(output_fname, "a") as output_f:
for region in regions:
signal = reads_file.get_bc_signal_by_fragment_length(ref=region.chrom, start=region.initial,
end=region.final,
bam=bam, fasta=fasta, bias_table=bias_table,
forward_shift=args.forward_shift,
reverse_shift=args.reverse_shift,
min_length=None, max_length=None, strand=False)
if args.norm:
signal = reads_file.boyle_norm(signal)
perc = scoreatpercentile(signal, 98)
std = np.std(signal)
signal = reads_file.hon_norm_atac(signal, perc, std)
output_f.write("fixedStep chrom=" + region.chrom + " start=" + str(region.initial + 1) + " step=1\n" +
"\n".join([str(e) for e in np.nan_to_num(signal)]) + "\n")
output_f.close()
if args.bigWig:
genome_data = GenomeData(args.organism)
chrom_sizes_file = genome_data.get_chromosome_sizes()
bw_filename = os.path.join(args.output_location, "{}.bw".format(args.output_prefix))
os.system(" ".join(["wigToBigWig", output_fname, chrom_sizes_file, bw_filename, "-verbose=0"]))
os.remove(output_fname)
示例11: estimate_bias_kmer
# 需要导入模块: from rgt.GenomicRegionSet import GenomicRegionSet [as 别名]
# 或者: from rgt.GenomicRegionSet.GenomicRegionSet import read [as 别名]
def estimate_bias_kmer(args):
# Parameters
maxDuplicates = 100
pseudocount = 1.0
# Initializing bam and fasta
bamFile = Samfile(args.reads_file, "rb")
genome_data = GenomeData(args.organism)
fastaFile = Fastafile(genome_data.get_genome())
regions = GenomicRegionSet("regions")
regions.read(args.regions_file)
# Initializing dictionaries
obsDictF = dict()
obsDictR = dict()
expDictF = dict()
expDictR = dict()
ct_reads_r = 0
ct_reads_f = 0
ct_kmers = 0
# Iterating on HS regions
for region in regions:
# Initialization
prevPos = -1
trueCounter = 0
# Evaluating observed frequencies ####################################
# Fetching reads
for r in bamFile.fetch(region.chrom, region.initial, region.final):
# Calculating positions
if not r.is_reverse:
cut_site = r.pos + args.forward_shift - 1
p1 = cut_site - int(floor(args.k_nb / 2))
else:
cut_site = r.aend + args.reverse_shift + 1
p1 = cut_site - int(floor(args.k_nb / 2))
p2 = p1 + args.k_nb
# Verifying PCR artifacts
if p1 == prevPos:
trueCounter += 1
else:
prevPos = p1
trueCounter = 0
if trueCounter > maxDuplicates: continue
# Fetching k-mer
try:
currStr = str(fastaFile.fetch(region.chrom, p1, p2)).upper()
except Exception:
continue
if r.is_reverse: currStr = AuxiliaryFunctions.revcomp(currStr)
# Counting k-mer in dictionary
if not r.is_reverse:
ct_reads_f += 1
try:
obsDictF[currStr] += 1
except Exception:
obsDictF[currStr] = 1
else:
ct_reads_r += 1
try:
obsDictR[currStr] += 1
except Exception:
obsDictR[currStr] = 1
# Evaluating expected frequencies ####################################
# Fetching whole sequence
try:
currStr = str(fastaFile.fetch(region.chrom, region.initial, region.final)).upper()
except Exception:
continue
currRevComp = AuxiliaryFunctions.revcomp(currStr)
# Iterating on each sequence position
for i in range(0, len(currStr) - args.k_nb):
ct_kmers += 1
# Counting k-mer in dictionary
s = currStr[i:i + args.k_nb]
try:
expDictF[s] += 1
except Exception:
expDictF[s] = 1
# Counting k-mer in dictionary for reverse complement
s = currRevComp[i:i + args.k_nb]
try:
expDictR[s] += 1
except Exception:
expDictR[s] = 1
# Closing files
bamFile.close()
fastaFile.close()
#.........这里部分代码省略.........
示例12: estimate_bias_pwm
# 需要导入模块: from rgt.GenomicRegionSet import GenomicRegionSet [as 别名]
# 或者: from rgt.GenomicRegionSet.GenomicRegionSet import read [as 别名]
def estimate_bias_pwm(args):
# Parameters
max_duplicates = 100
# Initializing bam and fasta
bamFile = Samfile(args.reads_file, "rb")
genome_data = GenomeData(args.organism)
fastaFile = Fastafile(genome_data.get_genome())
regions = GenomicRegionSet("regions")
regions.read(args.regions_file)
obs_f_pwm_dict = dict([("A", [0.0] * args.k_nb), ("C", [0.0] * args.k_nb),
("G", [0.0] * args.k_nb), ("T", [0.0] * args.k_nb), ("N", [0.0] * args.k_nb)])
exp_f_pwm_dict = dict([("A", [0.0] * args.k_nb), ("C", [0.0] * args.k_nb),
("G", [0.0] * args.k_nb), ("T", [0.0] * args.k_nb), ("N", [0.0] * args.k_nb)])
obs_r_pwm_dict = dict([("A", [0.0] * args.k_nb), ("C", [0.0] * args.k_nb),
("G", [0.0] * args.k_nb), ("T", [0.0] * args.k_nb), ("N", [0.0] * args.k_nb)])
exp_r_pwm_dict = dict([("A", [0.0] * args.k_nb), ("C", [0.0] * args.k_nb),
("G", [0.0] * args.k_nb), ("T", [0.0] * args.k_nb), ("N", [0.0] * args.k_nb)])
# Iterating on HS regions
for region in regions:
# Initialization
prev_pos = -1
true_counter = 0
# Evaluating observed frequencies
# Fetching reads
for r in bamFile.fetch(region.chrom, region.initial, region.final):
# Calculating positions
if not r.is_reverse:
cut_site = r.pos + args.forward_shift - 1
p1 = cut_site - int(floor(args.k_nb / 2))
else:
cut_site = r.aend + args.reverse_shift + 1
p1 = cut_site - int(floor(args.k_nb / 2))
p2 = p1 + args.k_nb
# Verifying PCR artifacts
if p1 == prev_pos:
true_counter += 1
else:
prev_pos = p1
true_counter = 0
if true_counter > max_duplicates: continue
# Fetching k-mer
try:
currStr = str(fastaFile.fetch(region.chrom, p1, p2)).upper()
except Exception:
continue
if r.is_reverse: currStr = AuxiliaryFunctions.revcomp(currStr)
# Counting k-mer in dictionary
if not r.is_reverse:
for i in range(0, len(currStr)):
obs_f_pwm_dict[currStr[i]][i] += 1
else:
for i in range(0, len(currStr)):
obs_r_pwm_dict[currStr[i]][i] += 1
# Evaluating expected frequencies
# Fetching whole sequence
try:
currStr = str(fastaFile.fetch(region.chrom, region.initial, region.final)).upper()
except Exception:
continue
# Iterating on each sequence position
s = None
for i in range(0, len(currStr) - args.k_nb):
# Counting k-mer in dictionary
s = currStr[i:i + args.k_nb]
for i in range(0, len(s)):
exp_f_pwm_dict[s[i]][i] += 1
# Counting k-mer in dictionary for reverse complement
s = AuxiliaryFunctions.revcomp(s)
for i in range(0, len(s)):
exp_r_pwm_dict[s[i]][i] += 1
# Closing files
bamFile.close()
fastaFile.close()
# Output pwms
os.system("mkdir -p " + os.path.join(args.output_location, "pfm"))
pwm_dict_list = [obs_f_pwm_dict, obs_r_pwm_dict, exp_f_pwm_dict, exp_r_pwm_dict]
pwm_file_list = []
pwm_obs_f = os.path.join(args.output_location, "pfm", "obs_{}_f.pfm".format(str(args.k_nb)))
pwm_obs_r = os.path.join(args.output_location, "pfm", "obs_{}_r.pfm".format(str(args.k_nb)))
pwm_exp_f = os.path.join(args.output_location, "pfm", "exp_{}_f.pfm".format(str(args.k_nb)))
pwm_exp_r = os.path.join(args.output_location, "pfm", "exp_{}_r.pfm".format(str(args.k_nb)))
pwm_file_list.append(pwm_obs_f)
pwm_file_list.append(pwm_obs_r)
pwm_file_list.append(pwm_exp_f)
pwm_file_list.append(pwm_exp_r)
for i in range(len(pwm_dict_list)):
#.........这里部分代码省略.........