当前位置: 首页>>代码示例>>Python>>正文


Python GenomicRegionSet.read方法代码示例

本文整理汇总了Python中rgt.GenomicRegionSet.GenomicRegionSet.read方法的典型用法代码示例。如果您正苦于以下问题:Python GenomicRegionSet.read方法的具体用法?Python GenomicRegionSet.read怎么用?Python GenomicRegionSet.read使用的例子?那么, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在rgt.GenomicRegionSet.GenomicRegionSet的用法示例。


在下文中一共展示了GenomicRegionSet.read方法的12个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: estimate_bias_vom

# 需要导入模块: from rgt.GenomicRegionSet import GenomicRegionSet [as 别名]
# 或者: from rgt.GenomicRegionSet.GenomicRegionSet import read [as 别名]
def estimate_bias_vom(args):
    regions = GenomicRegionSet("regions")
    regions.read(args.regions_file)
    create_signal(args, regions)

    hmm_data = HmmData()

    learn_dependency_model = hmm_data.get_dependency_model()
    slim_dimont_predictor = hmm_data.get_slim_dimont_predictor()
    test_fa = hmm_data.get_default_test_fa()

    shutil.copy(test_fa, args.output_location)
    os.chdir(args.output_location)
    print(os.getcwd())

    output_fname_f_obs = os.path.join(args.output_location, "{}_f_obs.fa".format(str(args.k_nb)))
    output_fname_f_exp = os.path.join(args.output_location, "{}_f_exp.fa".format(str(args.k_nb)))
    output_fname_r_obs = os.path.join(args.output_location, "{}_r_obs.fa".format(str(args.k_nb)))
    output_fname_r_exp = os.path.join(args.output_location, "{}_r_exp.fa".format(str(args.k_nb)))

    infix = "{}_f_obs".format(str(args.k_nb))
    create_model(args, output_fname_f_obs, infix, learn_dependency_model, slim_dimont_predictor)

    infix = "{}_f_exp".format(str(args.k_nb))
    create_model(args, output_fname_f_exp, infix, learn_dependency_model, slim_dimont_predictor)

    infix = "{}_r_obs".format(str(args.k_nb))
    create_model(args, output_fname_r_obs, infix, learn_dependency_model, slim_dimont_predictor)

    infix = "{}_r_exp".format(str(args.k_nb))
    create_model(args, output_fname_r_exp, infix, learn_dependency_model, slim_dimont_predictor)

    os.remove(os.path.join(args.output_location, "test.fa"))

    compute_bias(args)
开发者ID:CostaLab,项目名称:reg-gen,代码行数:37,代码来源:Estimation.py

示例2: get_bc_signal

# 需要导入模块: from rgt.GenomicRegionSet import GenomicRegionSet [as 别名]
# 或者: from rgt.GenomicRegionSet.GenomicRegionSet import read [as 别名]
def get_bc_signal(arguments):
    (mpbs_name, mpbs_file1, mpbs_file2, reads_file1, reads_file2, organism,
     window_size, forward_shift, reverse_shift, bias_table1, bias_table2) = arguments

    mpbs1 = GenomicRegionSet("Motif Predicted Binding Sites of Condition1")
    mpbs1.read(mpbs_file1)

    mpbs2 = GenomicRegionSet("Motif Predicted Binding Sites of Condition2")
    mpbs2.read(mpbs_file2)

    mpbs = mpbs1.combine(mpbs2, output=True)
    mpbs.sort()

    bam1 = Samfile(reads_file1, "rb")
    bam2 = Samfile(reads_file2, "rb")

    genome_data = GenomeData(organism)
    fasta = Fastafile(genome_data.get_genome())

    signal_1 = np.zeros(window_size)
    signal_2 = np.zeros(window_size)
    motif_len = None
    pwm = dict([("A", [0.0] * window_size), ("C", [0.0] * window_size),
                ("G", [0.0] * window_size), ("T", [0.0] * window_size),
                ("N", [0.0] * window_size)])

    mpbs_regions = mpbs.by_names([mpbs_name])
    num_motif = len(mpbs_regions)

    # Fetch bias corrected signal
    for region in mpbs_regions:
        if motif_len is None:
            motif_len = region.final - region.initial

        mid = (region.final + region.initial) / 2
        p1 = mid - window_size / 2
        p2 = mid + window_size / 2

        if p1 <= 0:
            continue
        # Fetch raw signal
        signal1 = bias_correction(chrom=region.chrom, start=p1, end=p2, bam=bam1,
                                  bias_table=bias_table1, genome_file_name=genome_data.get_genome(),
                                  forward_shift=forward_shift, reverse_shift=reverse_shift)

        signal2 = bias_correction(chrom=region.chrom, start=p1, end=p2, bam=bam2,
                                  bias_table=bias_table2, genome_file_name=genome_data.get_genome(),
                                  forward_shift=forward_shift, reverse_shift=reverse_shift)

        if len(signal1) != len(signal_1) or len(signal2) != len(signal_2):
            continue

        # smooth the signal
        signal_1 = np.add(signal_1, np.array(signal1))
        signal_2 = np.add(signal_2, np.array(signal2))

        update_pwm(pwm, fasta, region, p1, p2)

    return signal_1, signal_2, motif_len, pwm, num_motif
开发者ID:CostaLab,项目名称:reg-gen,代码行数:61,代码来源:DifferentialAnalysis.py

示例3: test_fisher_table

# 需要导入模块: from rgt.GenomicRegionSet import GenomicRegionSet [as 别名]
# 或者: from rgt.GenomicRegionSet.GenomicRegionSet import read [as 别名]
    def test_fisher_table(self):
        regions = GenomicRegionSet("regions")
        regions.read(os.path.join(os.path.dirname(__file__), "test.bed"))
        mpbs = GenomicRegionSet("mpbs")
        mpbs.read(os.path.join(os.path.dirname(__file__), "test_mpbs.bed"))

        i, ni, gs, ms = fisher_table("GGT1", regions, mpbs)
        self.assertEqual(i, 0)
        self.assertEqual(ni, 36)

        i, ni, gs, ms = fisher_table("HIC2", regions, mpbs)
        self.assertEqual(i, 8)
        self.assertEqual(ni, 28)

        i, ni, gs, ms = fisher_table("RAC2", regions, mpbs, gene_set=True, mpbs_set=True)
        self.assertEqual(len(gs), 0)
        self.assertEqual(len(ms), 0)
开发者ID:CostaLab,项目名称:reg-gen,代码行数:19,代码来源:test_Statistics.py

示例4: get_raw_tracks

# 需要导入模块: from rgt.GenomicRegionSet import GenomicRegionSet [as 别名]
# 或者: from rgt.GenomicRegionSet.GenomicRegionSet import read [as 别名]
def get_raw_tracks(args):
    # Initializing Error Handler
    err = ErrorHandler()

    if len(args.input_files) != 2:
        err.throw_error("ME_FEW_ARG", add_msg="You must specify reads and regions file.")

    output_fname = os.path.join(args.output_location, "{}.wig".format(args.output_prefix))

    bam = Samfile(args.input_files[0], "rb")
    regions = GenomicRegionSet("Interested regions")
    regions.read(args.input_files[1])
    regions.merge()
    reads_file = GenomicSignal()

    with open(output_fname, "a") as output_f:
        for region in regions:
            # Raw counts
            signal = [0.0] * (region.final - region.initial)
            for read in bam.fetch(region.chrom, region.initial, region.final):
                if not read.is_reverse:
                    cut_site = read.pos + args.forward_shift
                    if region.initial <= cut_site < region.final:
                        signal[cut_site - region.initial] += 1.0
                else:
                    cut_site = read.aend + args.reverse_shift - 1
                    if region.initial <= cut_site < region.final:
                        signal[cut_site - region.initial] += 1.0

            if args.norm:
                signal = reads_file.boyle_norm(signal)
                perc = scoreatpercentile(signal, 98)
                std = np.std(signal)
                signal = reads_file.hon_norm_atac(signal, perc, std)

            output_f.write("fixedStep chrom=" + region.chrom + " start=" + str(region.initial + 1) + " step=1\n" +
                           "\n".join([str(e) for e in np.nan_to_num(signal)]) + "\n")
    output_f.close()

    if args.bigWig:
        genome_data = GenomeData(args.organism)
        chrom_sizes_file = genome_data.get_chromosome_sizes()
        bw_filename = os.path.join(args.output_location, "{}.bw".format(args.output_prefix))
        os.system(" ".join(["wigToBigWig", output_fname, chrom_sizes_file, bw_filename, "-verbose=0"]))
        os.remove(output_fname)
开发者ID:CostaLab,项目名称:reg-gen,代码行数:47,代码来源:Tracks.py

示例5: test_subtract_exact

# 需要导入模块: from rgt.GenomicRegionSet import GenomicRegionSet [as 别名]
# 或者: from rgt.GenomicRegionSet.GenomicRegionSet import read [as 别名]
    def test_subtract_exact(self):
        reference = GenomicRegionSet("reference")
        reference.read(os.path.join(os.path.dirname(__file__), "test_result.bed"))
        background = GenomicRegionSet("background")
        background.read(os.path.join(os.path.dirname(__file__), "test_background.bed"))
        target = GenomicRegionSet("target")
        target.read(os.path.join(os.path.dirname(__file__), "test_target.bed"))

        background_tmp = background.subtract(target, exact=True)

        reference.sort()

        self.assertEqual(len(background_tmp.sequences), len(reference.sequences))

        for region, region_ref in zip(background_tmp.sequences, reference.sequences):
            self.assertEqual(region.__cmp__(region_ref), 0)
开发者ID:CostaLab,项目名称:reg-gen,代码行数:18,代码来源:test_Enrichment.py

示例6: get_raw_signal

# 需要导入模块: from rgt.GenomicRegionSet import GenomicRegionSet [as 别名]
# 或者: from rgt.GenomicRegionSet.GenomicRegionSet import read [as 别名]
def get_raw_signal(arguments):
    (mpbs_name, mpbs_file1, mpbs_file2, reads_file1, reads_file2, organism,
     window_size, forward_shift, reverse_shift) = arguments

    mpbs1 = GenomicRegionSet("Motif Predicted Binding Sites of Condition1")
    mpbs1.read(mpbs_file1)

    mpbs2 = GenomicRegionSet("Motif Predicted Binding Sites of Condition2")
    mpbs2.read(mpbs_file2)

    mpbs = mpbs1.combine(mpbs2, output=True)
    mpbs.sort()

    bam1 = Samfile(reads_file1, "rb")
    bam2 = Samfile(reads_file2, "rb")

    genome_data = GenomeData(organism)
    fasta = Fastafile(genome_data.get_genome())

    signal_1 = np.zeros(window_size)
    signal_2 = np.zeros(window_size)
    motif_len = None
    pwm = dict([("A", [0.0] * window_size), ("C", [0.0] * window_size),
                ("G", [0.0] * window_size), ("T", [0.0] * window_size),
                ("N", [0.0] * window_size)])

    mpbs_regions = mpbs.by_names([mpbs_name])
    num_motif = len(mpbs_regions)

    for region in mpbs_regions:
        if motif_len is None:
            motif_len = region.final - region.initial

        mid = (region.final + region.initial) / 2
        p1 = mid - window_size / 2
        p2 = mid + window_size / 2

        if p1 <= 0:
            continue

        # Fetch raw signal
        for read in bam1.fetch(region.chrom, p1, p2):
            # check if the read is unmapped, according to issue #112
            if read.is_unmapped:
                continue

            if not read.is_reverse:
                cut_site = read.pos + forward_shift
                if p1 <= cut_site < p2:
                    signal_1[cut_site - p1] += 1.0
            else:
                cut_site = read.aend + reverse_shift - 1
                if p1 <= cut_site < p2:
                    signal_1[cut_site - p1] += 1.0

        for read in bam2.fetch(region.chrom, p1, p2):
            # check if the read is unmapped, according to issue #112
            if read.is_unmapped:
                continue

            if not read.is_reverse:
                cut_site = read.pos + forward_shift
                if p1 <= cut_site < p2:
                    signal_2[cut_site - p1] += 1.0
            else:
                cut_site = read.aend + reverse_shift - 1
                if p1 <= cut_site < p2:
                    signal_2[cut_site - p1] += 1.0

        update_pwm(pwm, fasta, region, p1, p2)

    return signal_1, signal_2, motif_len, pwm, num_motif
开发者ID:CostaLab,项目名称:reg-gen,代码行数:74,代码来源:DifferentialAnalysis.py

示例7: diff_analysis_run

# 需要导入模块: from rgt.GenomicRegionSet import GenomicRegionSet [as 别名]
# 或者: from rgt.GenomicRegionSet.GenomicRegionSet import read [as 别名]
def diff_analysis_run(args):
    # Initializing Error Handler
    err = ErrorHandler()

    output_location = os.path.join(args.output_location, "Lineplots")
    try:
        if not os.path.isdir(output_location):
            os.makedirs(output_location)
    except Exception:
        err.throw_error("MM_OUT_FOLDER_CREATION")

    # Check if the index file exists
    base_name1 = "{}.bai".format(args.reads_file1)
    if not os.path.exists(base_name1):
        pysam.index(args.reads_file1)

    base_name2 = "{}.bai".format(args.reads_file2)
    if not os.path.exists(base_name2):
        pysam.index(args.reads_file2)

    mpbs1 = GenomicRegionSet("Motif Predicted Binding Sites of Condition1")
    mpbs1.read(args.mpbs_file1)

    mpbs2 = GenomicRegionSet("Motif Predicted Binding Sites of Condition2")
    mpbs2.read(args.mpbs_file2)

    mpbs = mpbs1.combine(mpbs2, output=True)
    mpbs.sort()
    mpbs.remove_duplicates()
    mpbs_name_list = list(set(mpbs.get_names()))

    signal_dict_by_tf_1 = dict()
    signal_dict_by_tf_2 = dict()
    motif_len_dict = dict()
    motif_num_dict = dict()
    pwm_dict_by_tf = dict()

    pool = Pool(processes=args.nc)
    # differential analysis using bias corrected signal
    if args.bc:
        hmm_data = HmmData()
        table_F = hmm_data.get_default_bias_table_F_ATAC()
        table_R = hmm_data.get_default_bias_table_R_ATAC()
        bias_table1 = BiasTable().load_table(table_file_name_F=table_F, table_file_name_R=table_R)
        bias_table2 = BiasTable().load_table(table_file_name_F=table_F, table_file_name_R=table_R)

        mpbs_list = list()
        for mpbs_name in mpbs_name_list:
            mpbs_list.append((mpbs_name, args.mpbs_file1, args.mpbs_file2, args.reads_file1, args.reads_file2,
                              args.organism, args.window_size, args.forward_shift, args.reverse_shift,
                              bias_table1, bias_table2))
        try:
            res = pool.map(get_bc_signal, mpbs_list)
        except Exception:
            logging.exception("get bias corrected signal failed")

    # differential analysis using raw signal
    else:
        mpbs_list = list()
        for mpbs_name in mpbs_name_list:
            mpbs_list.append((mpbs_name, args.mpbs_file1, args.mpbs_file2, args.reads_file1, args.reads_file2,
                              args.organism, args.window_size, args.forward_shift, args.reverse_shift))
        try:
            res = pool.map(get_raw_signal, mpbs_list)
        except Exception:
            logging.exception("get raw signal failed")

    for idx, mpbs_name in enumerate(mpbs_name_list):
        signal_dict_by_tf_1[mpbs_name] = res[idx][0]
        signal_dict_by_tf_2[mpbs_name] = res[idx][1]
        motif_len_dict[mpbs_name] = res[idx][2]
        pwm_dict_by_tf[mpbs_name] = res[idx][3]
        motif_num_dict[mpbs_name] = res[idx][4]

    if args.factor1 is None or args.factor2 is None:
        args.factor1, args.factor2 = compute_factors(signal_dict_by_tf_1, signal_dict_by_tf_2)
        output_factor(args, args.factor1, args.factor2)

    if args.output_profiles:
        output_profiles(mpbs_name_list, signal_dict_by_tf_1, output_location, args.condition1)
        output_profiles(mpbs_name_list, signal_dict_by_tf_2, output_location, args.condition2)

    ps_tc_results_by_tf = dict()

    plots_list = list()
    for mpbs_name in mpbs_name_list:
        plots_list.append((mpbs_name, motif_num_dict[mpbs_name], signal_dict_by_tf_1[mpbs_name],
                           signal_dict_by_tf_2[mpbs_name], args.factor1, args.factor2, args.condition1,
                           args.condition2, pwm_dict_by_tf[mpbs_name], output_location, args.window_size,
                           args.standardize))

    pool.map(line_plot, plots_list)

    for mpbs_name in mpbs_name_list:
        res = get_ps_tc_results(signal_dict_by_tf_1[mpbs_name], signal_dict_by_tf_2[mpbs_name],
                                args.factor1, args.factor2, motif_num_dict[mpbs_name], motif_len_dict[mpbs_name])
        #
        #     # only use the factors whose protection scores are greater than 0
        #     if res[0] > 0 and res[1] < 0:
        ps_tc_results_by_tf[mpbs_name] = res
#.........这里部分代码省略.........
开发者ID:CostaLab,项目名称:reg-gen,代码行数:103,代码来源:DifferentialAnalysis.py

示例8: test_get_fisher_dict

# 需要导入模块: from rgt.GenomicRegionSet import GenomicRegionSet [as 别名]
# 或者: from rgt.GenomicRegionSet.GenomicRegionSet import read [as 别名]
    def test_get_fisher_dict(self):
        motif_names = ["TFIP11", "ACO2", "HIC2", "HAT5"]

        regions = GenomicRegionSet("regions")
        regions.read(os.path.join(os.path.dirname(__file__), "test.bed"))
        mpbs = GenomicRegionSet("mpbs")
        mpbs.read(os.path.join(os.path.dirname(__file__), "test_mpbs.bed"))

        regions2 = GenomicRegionSet("regions2")
        regions2.read(os.path.join(os.path.dirname(__file__), "test2.bed"))
        mpbs2 = GenomicRegionSet("mpbs2")
        mpbs2.read(os.path.join(os.path.dirname(__file__), "test2_mpbs.bed"))

        regions3 = GenomicRegionSet("regions3")
        regions3.read(os.path.join(os.path.dirname(__file__), "test3.bed"))
        mpbs3 = GenomicRegionSet("mpbs3")
        mpbs3.read(os.path.join(os.path.dirname(__file__), "test3_mpbs.bed"))

        result = get_fisher_dict(motif_names, regions, mpbs)
        intersecting = result[0]
        not_intersecting = result[1]

        for mn in ["TFIP11", "ACO2", "HAT5"]:
            self.assertEqual(intersecting[mn], 0)

        self.assertEqual(intersecting["HIC2"], 8)

        for mn in motif_names:
            self.assertEqual(intersecting[mn]+not_intersecting[mn], 36)

        result = get_fisher_dict(motif_names, regions, mpbs, gene_set=True, mpbs_set=True)
        gs = result[2]
        ms = result[3]

        for mn in ["TFIP11", "ACO2", "HAT5"]:
            self.assertEqual(len(gs[mn]), 0)

        self.assertEqual(len(gs["HIC2"]), 8)
        self.assertEqual(len(ms["HIC2"]), 8)
开发者ID:CostaLab,项目名称:reg-gen,代码行数:41,代码来源:test_Statistics.py

示例9: chip_evaluate

# 需要导入模块: from rgt.GenomicRegionSet import GenomicRegionSet [as 别名]
# 或者: from rgt.GenomicRegionSet.GenomicRegionSet import read [as 别名]
def chip_evaluate(args):
    # Evaluate Statistics
    fpr = dict()
    tpr = dict()
    roc_auc_1 = dict()
    roc_auc_10 = dict()
    roc_auc_50 = dict()
    roc_auc_100 = dict()
    recall = dict()
    precision = dict()
    prc_auc_1 = dict()
    prc_auc_10 = dict()
    prc_auc_50 = dict()
    prc_auc_100 = dict()

    footprint_file = args.footprint_file.split(",")
    footprint_name = args.footprint_name.split(",")
    footprint_type = args.footprint_type.split(",")

    max_score = 0
    if "SEG" in footprint_type:
        mpbs_regions = GenomicRegionSet("TFBS")
        mpbs_regions.read(args.tfbs_file)

        # Verifying the maximum score of the MPBS file
        for region in iter(mpbs_regions):
            score = int(region.data.split("\t")[0])
            if score > max_score:
                max_score = score
        max_score += 1

    max_points = []
    for i in range(len(footprint_file)):
        footprints_regions = GenomicRegionSet("Footprints Prediction")
        footprints_regions.read(footprint_file[i])
        footprints_regions.sort()

        if footprint_type[i] == "SEG":
            # Increasing the score of MPBS entry once if any overlaps found in the predicted footprints.
            increased_score_mpbs_regions = GenomicRegionSet("Increased Regions")
            intersect_regions = mpbs_regions.intersect(footprints_regions, mode=OverlapType.ORIGINAL)
            for region in iter(intersect_regions):
                region.data = str(int(region.data.split("\t")[0]) + max_score)
                increased_score_mpbs_regions.add(region)

            # Keep the score of remained MPBS entry unchanged
            without_intersect_regions = mpbs_regions.subtract(footprints_regions, whole_region=True)
            for region in iter(without_intersect_regions):
                increased_score_mpbs_regions.add(region)

            increased_score_mpbs_regions.sort_score()

            fpr[i], tpr[i], roc_auc_1[i], roc_auc_10[i], roc_auc_50[i], roc_auc_100[i] = \
                roc_curve(increased_score_mpbs_regions)
            recall[i], precision[i], prc_auc_1[i], prc_auc_10[i], prc_auc_50[i], prc_auc_100[i] = \
                precision_recall_curve(increased_score_mpbs_regions)

            max_points.append(len(intersect_regions))

        elif footprint_type[i] == "SC":
            footprints_regions.sort_score()
            fpr[i], tpr[i], roc_auc_1[i], roc_auc_10[i], roc_auc_50[i], roc_auc_100[i] = \
                roc_curve(footprints_regions)
            recall[i], precision[i], prc_auc_1[i], prc_auc_10[i], prc_auc_50[i], prc_auc_100[i] = \
                precision_recall_curve(footprints_regions)

            max_points.append(len(footprints_regions))

    # Output the statistics results into text
    stats_fname = os.path.join(args.output_location, "{}_stats.txt".format(args.output_prefix))
    stats_header = ["METHOD", "AUC_100", "AUC_50", "AUC_10", "AUC_1", "AUPR_100", "AUPR_50", "AUPR_10", "AUPR_1"]
    with open(stats_fname, "w") as stats_file:
        stats_file.write("\t".join(stats_header) + "\n")
        for i in range(len(footprint_name)):
            stats_file.write(footprint_name[i] + "\t" +
                             str(roc_auc_100[i]) + "\t" + str(roc_auc_50[i]) + "\t" + str(roc_auc_10[i]) + "\t" +
                             str(roc_auc_1[i]) + "\t" + str(prc_auc_100[i]) + "\t" + str(prc_auc_50[i]) + "\t" +
                             str(prc_auc_10[i]) + "\t" + str(prc_auc_1[i]) + "\n")

    # Output the curves
    if args.print_roc_curve:
        label_x = "False Positive Rate"
        label_y = "True Positive Rate"
        curve_name = "ROC"
        plot_curve(footprint_name, args.output_location, fpr, tpr, roc_auc_100, label_x, label_y, args.output_prefix,
                   curve_name, max_points=max_points)
    if args.print_pr_curve:
        label_x = "Recall"
        label_y = "Precision"
        curve_name = "PRC"
        plot_curve(footprint_name, args.output_location, recall, precision, prc_auc_100, label_x, label_y,
                   args.output_prefix, curve_name, max_points=max_points)

    output_points(footprint_name, args.output_location, args.output_prefix, fpr, tpr, recall, precision)
开发者ID:CostaLab,项目名称:reg-gen,代码行数:96,代码来源:Evaluation.py

示例10: get_bc_tracks

# 需要导入模块: from rgt.GenomicRegionSet import GenomicRegionSet [as 别名]
# 或者: from rgt.GenomicRegionSet.GenomicRegionSet import read [as 别名]
def get_bc_tracks(args):
    # Initializing Error Handler
    err = ErrorHandler()

    if len(args.input_files) != 2:
        err.throw_error("ME_FEW_ARG", add_msg="You must specify reads and regions file.")

    regions = GenomicRegionSet("Interested regions")
    regions.read(args.input_files[1])
    regions.merge()

    reads_file = GenomicSignal()

    bam = Samfile(args.input_files[0], "rb")
    genome_data = GenomeData(args.organism)
    fasta = Fastafile(genome_data.get_genome())

    hmm_data = HmmData()
    if args.bias_table:
        bias_table_list = args.bias_table.split(",")
        bias_table = BiasTable().load_table(table_file_name_F=bias_table_list[0],
                                            table_file_name_R=bias_table_list[1])
    else:
        table_F = hmm_data.get_default_bias_table_F_ATAC()
        table_R = hmm_data.get_default_bias_table_R_ATAC()
        bias_table = BiasTable().load_table(table_file_name_F=table_F,
                                            table_file_name_R=table_R)

    if args.strand_specific:
        fname_forward = os.path.join(args.output_location, "{}_forward.wig".format(args.output_prefix))
        fname_reverse = os.path.join(args.output_location, "{}_reverse.wig".format(args.output_prefix))

        f_forward = open(fname_forward, "a")
        f_reverse = open(fname_reverse, "a")
        for region in regions:
            signal_f, signal_r = reads_file.get_bc_signal_by_fragment_length(
                ref=region.chrom, start=region.initial, end=region.final, bam=bam, fasta=fasta, bias_table=bias_table,
                forward_shift=args.forward_shift, reverse_shift=args.reverse_shift, min_length=None, max_length=None,
                strand=True)

            if args.norm:
                signal_f = reads_file.boyle_norm(signal_f)
                perc = scoreatpercentile(signal_f, 98)
                std = np.std(signal_f)
                signal_f = reads_file.hon_norm_atac(signal_f, perc, std)

                signal_r = reads_file.boyle_norm(signal_r)
                perc = scoreatpercentile(signal_r, 98)
                std = np.std(signal_r)
                signal_r = reads_file.hon_norm_atac(signal_r, perc, std)

            f_forward.write("fixedStep chrom=" + region.chrom + " start=" + str(region.initial + 1) + " step=1\n" +
                            "\n".join([str(e) for e in np.nan_to_num(signal_f)]) + "\n")

            f_reverse.write("fixedStep chrom=" + region.chrom + " start=" + str(region.initial + 1) + " step=1\n" +
                            "\n".join([str(-e) for e in np.nan_to_num(signal_r)]) + "\n")

        f_forward.close()
        f_reverse.close()

        if args.bigWig:
            genome_data = GenomeData(args.organism)
            chrom_sizes_file = genome_data.get_chromosome_sizes()

            bw_filename = os.path.join(args.output_location, "{}_forward.bw".format(args.output_prefix))
            os.system(" ".join(["wigToBigWig", fname_forward, chrom_sizes_file, bw_filename, "-verbose=0"]))
            os.remove(fname_forward)

            bw_filename = os.path.join(args.output_location, "{}_reverse.bw".format(args.output_prefix))
            os.system(" ".join(["wigToBigWig", fname_reverse, chrom_sizes_file, bw_filename, "-verbose=0"]))
            os.remove(fname_reverse)

    else:
        output_fname = os.path.join(args.output_location, "{}.wig".format(args.output_prefix))
        with open(output_fname, "a") as output_f:
            for region in regions:
                signal = reads_file.get_bc_signal_by_fragment_length(ref=region.chrom, start=region.initial,
                                                                     end=region.final,
                                                                     bam=bam, fasta=fasta, bias_table=bias_table,
                                                                     forward_shift=args.forward_shift,
                                                                     reverse_shift=args.reverse_shift,
                                                                     min_length=None, max_length=None, strand=False)

                if args.norm:
                    signal = reads_file.boyle_norm(signal)
                    perc = scoreatpercentile(signal, 98)
                    std = np.std(signal)
                    signal = reads_file.hon_norm_atac(signal, perc, std)

                output_f.write("fixedStep chrom=" + region.chrom + " start=" + str(region.initial + 1) + " step=1\n" +
                               "\n".join([str(e) for e in np.nan_to_num(signal)]) + "\n")
        output_f.close()

        if args.bigWig:
            genome_data = GenomeData(args.organism)
            chrom_sizes_file = genome_data.get_chromosome_sizes()
            bw_filename = os.path.join(args.output_location, "{}.bw".format(args.output_prefix))
            os.system(" ".join(["wigToBigWig", output_fname, chrom_sizes_file, bw_filename, "-verbose=0"]))
            os.remove(output_fname)
开发者ID:CostaLab,项目名称:reg-gen,代码行数:101,代码来源:Tracks.py

示例11: estimate_bias_kmer

# 需要导入模块: from rgt.GenomicRegionSet import GenomicRegionSet [as 别名]
# 或者: from rgt.GenomicRegionSet.GenomicRegionSet import read [as 别名]
def estimate_bias_kmer(args):
    # Parameters
    maxDuplicates = 100
    pseudocount = 1.0

    # Initializing bam and fasta
    bamFile = Samfile(args.reads_file, "rb")
    genome_data = GenomeData(args.organism)
    fastaFile = Fastafile(genome_data.get_genome())
    regions = GenomicRegionSet("regions")
    regions.read(args.regions_file)

    # Initializing dictionaries
    obsDictF = dict()
    obsDictR = dict()
    expDictF = dict()
    expDictR = dict()

    ct_reads_r = 0
    ct_reads_f = 0
    ct_kmers = 0

    # Iterating on HS regions
    for region in regions:

        # Initialization
        prevPos = -1
        trueCounter = 0

        # Evaluating observed frequencies ####################################
        # Fetching reads
        for r in bamFile.fetch(region.chrom, region.initial, region.final):

            # Calculating positions
            if not r.is_reverse:
                cut_site = r.pos + args.forward_shift - 1
                p1 = cut_site - int(floor(args.k_nb / 2))
            else:
                cut_site = r.aend + args.reverse_shift + 1
                p1 = cut_site - int(floor(args.k_nb / 2))
            p2 = p1 + args.k_nb

            # Verifying PCR artifacts
            if p1 == prevPos:
                trueCounter += 1
            else:
                prevPos = p1
                trueCounter = 0
            if trueCounter > maxDuplicates: continue

            # Fetching k-mer
            try:
                currStr = str(fastaFile.fetch(region.chrom, p1, p2)).upper()
            except Exception:
                continue
            if r.is_reverse: currStr = AuxiliaryFunctions.revcomp(currStr)

            # Counting k-mer in dictionary
            if not r.is_reverse:
                ct_reads_f += 1
                try:
                    obsDictF[currStr] += 1
                except Exception:
                    obsDictF[currStr] = 1
            else:
                ct_reads_r += 1
                try:
                    obsDictR[currStr] += 1
                except Exception:
                    obsDictR[currStr] = 1

        # Evaluating expected frequencies ####################################
        # Fetching whole sequence
        try:
            currStr = str(fastaFile.fetch(region.chrom, region.initial, region.final)).upper()
        except Exception:
            continue
        currRevComp = AuxiliaryFunctions.revcomp(currStr)

        # Iterating on each sequence position
        for i in range(0, len(currStr) - args.k_nb):
            ct_kmers += 1
            # Counting k-mer in dictionary
            s = currStr[i:i + args.k_nb]
            try:
                expDictF[s] += 1
            except Exception:
                expDictF[s] = 1

            # Counting k-mer in dictionary for reverse complement
            s = currRevComp[i:i + args.k_nb]
            try:
                expDictR[s] += 1
            except Exception:
                expDictR[s] = 1

    # Closing files
    bamFile.close()
    fastaFile.close()

#.........这里部分代码省略.........
开发者ID:CostaLab,项目名称:reg-gen,代码行数:103,代码来源:Estimation.py

示例12: estimate_bias_pwm

# 需要导入模块: from rgt.GenomicRegionSet import GenomicRegionSet [as 别名]
# 或者: from rgt.GenomicRegionSet.GenomicRegionSet import read [as 别名]
def estimate_bias_pwm(args):
    # Parameters
    max_duplicates = 100

    # Initializing bam and fasta
    bamFile = Samfile(args.reads_file, "rb")
    genome_data = GenomeData(args.organism)
    fastaFile = Fastafile(genome_data.get_genome())
    regions = GenomicRegionSet("regions")
    regions.read(args.regions_file)

    obs_f_pwm_dict = dict([("A", [0.0] * args.k_nb), ("C", [0.0] * args.k_nb),
                           ("G", [0.0] * args.k_nb), ("T", [0.0] * args.k_nb), ("N", [0.0] * args.k_nb)])
    exp_f_pwm_dict = dict([("A", [0.0] * args.k_nb), ("C", [0.0] * args.k_nb),
                           ("G", [0.0] * args.k_nb), ("T", [0.0] * args.k_nb), ("N", [0.0] * args.k_nb)])
    obs_r_pwm_dict = dict([("A", [0.0] * args.k_nb), ("C", [0.0] * args.k_nb),
                           ("G", [0.0] * args.k_nb), ("T", [0.0] * args.k_nb), ("N", [0.0] * args.k_nb)])
    exp_r_pwm_dict = dict([("A", [0.0] * args.k_nb), ("C", [0.0] * args.k_nb),
                           ("G", [0.0] * args.k_nb), ("T", [0.0] * args.k_nb), ("N", [0.0] * args.k_nb)])

    # Iterating on HS regions
    for region in regions:
        # Initialization
        prev_pos = -1
        true_counter = 0

        # Evaluating observed frequencies
        # Fetching reads
        for r in bamFile.fetch(region.chrom, region.initial, region.final):
            # Calculating positions
            if not r.is_reverse:
                cut_site = r.pos + args.forward_shift - 1
                p1 = cut_site - int(floor(args.k_nb / 2))
            else:
                cut_site = r.aend + args.reverse_shift + 1
                p1 = cut_site - int(floor(args.k_nb / 2))
            p2 = p1 + args.k_nb

            # Verifying PCR artifacts
            if p1 == prev_pos:
                true_counter += 1
            else:
                prev_pos = p1
                true_counter = 0
            if true_counter > max_duplicates: continue

            # Fetching k-mer
            try:
                currStr = str(fastaFile.fetch(region.chrom, p1, p2)).upper()
            except Exception:
                continue
            if r.is_reverse: currStr = AuxiliaryFunctions.revcomp(currStr)

            # Counting k-mer in dictionary
            if not r.is_reverse:
                for i in range(0, len(currStr)):
                    obs_f_pwm_dict[currStr[i]][i] += 1
            else:
                for i in range(0, len(currStr)):
                    obs_r_pwm_dict[currStr[i]][i] += 1

        # Evaluating expected frequencies
        # Fetching whole sequence
        try:
            currStr = str(fastaFile.fetch(region.chrom, region.initial, region.final)).upper()
        except Exception:
            continue

        # Iterating on each sequence position
        s = None
        for i in range(0, len(currStr) - args.k_nb):
            # Counting k-mer in dictionary
            s = currStr[i:i + args.k_nb]
            for i in range(0, len(s)):
                exp_f_pwm_dict[s[i]][i] += 1

            # Counting k-mer in dictionary for reverse complement
            s = AuxiliaryFunctions.revcomp(s)
            for i in range(0, len(s)):
                exp_r_pwm_dict[s[i]][i] += 1

    # Closing files
    bamFile.close()
    fastaFile.close()

    # Output pwms
    os.system("mkdir -p " + os.path.join(args.output_location, "pfm"))
    pwm_dict_list = [obs_f_pwm_dict, obs_r_pwm_dict, exp_f_pwm_dict, exp_r_pwm_dict]
    pwm_file_list = []
    pwm_obs_f = os.path.join(args.output_location, "pfm", "obs_{}_f.pfm".format(str(args.k_nb)))
    pwm_obs_r = os.path.join(args.output_location, "pfm", "obs_{}_r.pfm".format(str(args.k_nb)))
    pwm_exp_f = os.path.join(args.output_location, "pfm", "exp_{}_f.pfm".format(str(args.k_nb)))
    pwm_exp_r = os.path.join(args.output_location, "pfm", "exp_{}_r.pfm".format(str(args.k_nb)))

    pwm_file_list.append(pwm_obs_f)
    pwm_file_list.append(pwm_obs_r)
    pwm_file_list.append(pwm_exp_f)
    pwm_file_list.append(pwm_exp_r)

    for i in range(len(pwm_dict_list)):
#.........这里部分代码省略.........
开发者ID:CostaLab,项目名称:reg-gen,代码行数:103,代码来源:Estimation.py


注:本文中的rgt.GenomicRegionSet.GenomicRegionSet.read方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。