Python GraphUtils.addSynonym方法代码示例

本文整理汇总了Python中dipper.utils.GraphUtils.GraphUtils.addSynonym方法的典型用法代码示例。如果您正苦于以下问题：Python GraphUtils.addSynonym方法的具体用法？Python GraphUtils.addSynonym怎么用？Python GraphUtils.addSynonym使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类dipper.utils.GraphUtils.GraphUtils的用法示例。

在下文中一共展示了GraphUtils.addSynonym方法的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: _get_gene_history

# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import addSynonym [as 别名]
    def _get_gene_history(self, limit):
        """
        Loop through the gene_history file and add each discontinued gene id
        as a deprecated class whose replacement is the current gene id.
        The discontinued gene symbol is also added as a synonym of the
        current gene.

        A row is skipped when either gene id is '-', when its taxon is not in
        ``self.tax_ids``, or (in test mode) when its gene is not in
        ``self.gene_ids``.

        :param limit: outside of test mode, stop once more than this many
            rows have been accepted; None means no limit
        :return: None
        """
        gu = GraphUtils(curie_map.get())
        g = self.testgraph if self.testMode else self.graph

        logger.info("Processing Gene records")
        line_counter = 0
        myfile = '/'.join((self.rawdir, self.files['gene_history']['file']))
        logger.info("FILE: %s", myfile)
        with gzip.open(myfile, 'rb') as f:
            for line in f:
                line = line.decode().strip()
                # skip blank lines (they cannot be unpacked) and comments
                if not line or line.startswith('#'):
                    continue
                (tax_num, gene_num, discontinued_num, discontinued_symbol,
                 _discontinued_date) = line.split('\t')

                # '-' marks a missing id; nothing can be linked for this row
                if gene_num == '-' or discontinued_num == '-':
                    continue

                if self.testMode and int(gene_num) not in self.gene_ids:
                    continue

                if int(tax_num) not in self.tax_ids:
                    continue

                # only rows that pass every filter count toward the limit
                line_counter += 1
                gene_id = ':'.join(('NCBIGene', gene_num))
                discontinued_gene_id = ':'.join(('NCBIGene', discontinued_num))

                # add the two genes
                gu.addClassToGraph(g, gene_id, None)
                gu.addClassToGraph(g, discontinued_gene_id, discontinued_symbol)

                # add the new gene id to replace the old gene id
                gu.addDeprecatedClass(g, discontinued_gene_id, [gene_id])

                # also add the old symbol as a synonym of the new gene
                gu.addSynonym(g, gene_id, discontinued_symbol)

                if (not self.testMode) and (limit is not None and line_counter > limit):
                    break

        return

开发者ID:d3borah，项目名称:dipper，代码行数:62，代码来源:NCBIGene.py

示例2: _process_straininfo

# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import addSynonym [as 别名]
    def _process_straininfo(self, limit):
        """
        Read the comma-delimited straininfo file and add one individual per
        strain, typed with the mouse taxon (NCBITaxon:10090).

        For each strain this also adds the MPD short name as a synonym
        (when non-blank), records the strain label in ``self.idlabel_hash``,
        links the strain to its vendor stock identifier, and adds the panel
        name as a description.

        :param limit: accepted for interface parity; not used by this method
        :return: None
        """
        graph = self.testgraph if self.testMode else self.graph

        logger.info("Processing measurements ...")
        raw = '/'.join((self.rawdir, self.files['straininfo']['file']))

        tax_id = 'NCBITaxon:10090'

        gu = GraphUtils(curie_map.get())

        with open(raw, 'r') as f:
            reader = csv.reader(f, delimiter=',', quotechar='\"')
            f.readline()  # consume the header row before the reader sees it
            for row in reader:
                # example row:
                # C57BL/6J,J,000664,,7,IN,225,17,,http://jaxmice.jax.org/strain/000664.html
                (strain_name, vendor, stocknum, panel, mpd_strainid,
                 _straintype, _n_proj, _n_snp_datasets, mpdshortname,
                 url) = row

                if self.testMode and \
                        'MPD:' + str(mpd_strainid) not in self.test_ids:
                    continue

                # create the strain as an instance of the taxon
                strain_id = 'MPD-strain:' + str(mpd_strainid)
                gu.addIndividualToGraph(graph, strain_id, strain_name, tax_id)

                short_name = mpdshortname.strip()
                if short_name != '':
                    gu.addSynonym(graph, strain_id, short_name)
                self.idlabel_hash[strain_id] = strain_name

                # make it equivalent to the vendor+stock
                if stocknum != '':
                    if vendor == 'J':
                        gu.addSameIndividual(
                            graph, strain_id, 'JAX:' + stocknum)
                    elif vendor == 'Rbrc':
                        # stock numbers from this vendor may already carry
                        # the RBRC prefix, so drop it before re-adding it
                        gu.addSameIndividual(
                            graph, strain_id,
                            'RBRC:' + stocknum.replace('RBRC', ''))
                    else:
                        # any other vendor: fall back to cross-references
                        if url != '':
                            gu.addXref(graph, strain_id, url, True)
                        if vendor != '':
                            vendor_stock = ':'.join((vendor, stocknum))
                            gu.addXref(graph, strain_id, vendor_stock, True)

                # add the panel information
                if panel != '':
                    gu.addDescription(graph, strain_id, panel + ' [panel]')

                # TODO make the panels as a resource collection

        return

开发者ID:JervenBolleman，项目名称:dipper，代码行数:57，代码来源:MPD.py

示例3: _process_ortholog_classes

# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import addSynonym [as 别名]
    def _process_ortholog_classes(self, limit=None):
        """
        Add the KEGG orthology classes to the graph.

        Each row of the tab-delimited file holds a class id and a
        ';'-separated list of names.  The first name becomes the class label.
        When there is more than one name, every name after the first is added
        as a synonym and the last name is also added as the description.

        Triples created:
        <orthology_class_id> is a class
        <orthology_class_id> has label <first name>
        <orthology_class_id> has synonym <each remaining name>
        <orthology_class_id> has description <last name>

        :param limit: outside of test mode, stop once more than this many
            rows have been read; None means no limit
        :return: None
        """

        logger.info("Processing ortholog classes")
        g = self.testgraph if self.testMode else self.graph
        line_counter = 0
        gu = GraphUtils(curie_map.get())
        raw = '/'.join((self.rawdir, self.files['ortholog_classes']['file']))
        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                (orthology_class_id, orthology_class_name) = row

                if self.testMode and orthology_class_id not in self.test_ids['ortholog_classes']:
                    continue

                # FIXME: What's the proper route for this?
                # The orthology class is essentially a KEGG gene ID that is species agnostic.
                # Add the ID and label as a class. Would it be considered a gene as well?

                other_labels = orthology_class_name.split(';')
                orthology_label = other_labels[0]  # the first one is the label we'll use

                orthology_class_id = 'KEGG-'+orthology_class_id.strip()

                orthology_type = OrthologyAssoc.terms['gene_family']
                gu.addClassToGraph(g, orthology_class_id, orthology_label, orthology_type)
                if len(other_labels) > 1:
                    # add the rest as synonyms; the first entry is already
                    # the class label, so it is not repeated as a synonym
                    for s in other_labels[1:]:
                        gu.addSynonym(g, orthology_class_id, s)

                    # add the last one as the description
                    gu.addDescription(g, orthology_class_id, other_labels[-1])

                if (not self.testMode) and (limit is not None and line_counter > limit):
                    break

        logger.info("Done with ortholog classes")
        return

开发者ID:d3borah，项目名称:dipper，代码行数:56，代码来源:KEGG.py

示例4: process_gene_ids

# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import addSynonym [as 别名]
    def process_gene_ids(self, limit):
        """
        Add the genes listed in the gzipped, comma-delimited gene_ids file.

        Every gene becomes a class typed as a gene, is tied to its taxon,
        is flagged as deprecated when its status column reads 'Dead', and
        receives its synonym column (when non-empty) as a synonym.

        :param limit: outside of test mode, stop once more than this many
            rows have been read; None means no limit
        :return: None
        """
        raw = '/'.join((self.rawdir, self.files['gene_ids']['file']))

        g = self.testgraph if self.testMode else self.graph

        gu = GraphUtils(curie_map.get())

        logger.info("Processing Gene IDs")
        geno = Genotype(g)
        with gzip.open(raw, 'rb') as csvfile:
            text_stream = io.TextIOWrapper(csvfile, newline="")
            filereader = csv.reader(
                text_stream, delimiter=',', quotechar='\"')
            for line_counter, row in enumerate(filereader, start=1):
                # example row:
                # 6239,WBGene00000001,aap-1,Y110A7A.10,Live
                taxon_num, gene_num, gene_symbol, gene_synonym, live = row

                if self.testMode and gene_num not in self.test_ids['gene']:
                    continue

                taxon_id = 'NCBITaxon:' + taxon_num
                gene_id = 'WormBase:' + gene_num

                # label preference: symbol, then synonym, then no label
                label = gene_symbol or gene_synonym or None
                gu.addClassToGraph(
                    g, gene_id, label, Genotype.genoparts['gene'])

                if live == 'Dead':
                    gu.addDeprecatedClass(g, gene_id)
                geno.addTaxon(taxon_id, gene_id)

                if gene_synonym:
                    gu.addSynonym(g, gene_id, gene_synonym)

                if not self.testMode \
                        and limit is not None and line_counter > limit:
                    break

        return

开发者ID:JervenBolleman，项目名称:dipper，代码行数:46，代码来源:WormBase.py

示例5: _get_titles

# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import addSynonym [as 别名]
    def _get_titles(self, limit):
        """
        Read the tab-delimited titles file and register every book.

        After the header row, each row is unpacked into three columns, in
        this order: short name, title, NBK number.
        NOTE(review): the previous docstring showed the idmap layout
        (NBK_id, GR_shortname, OMIM), which does not match the unpacking
        below; confirm the real column layout against the source file.

        Every NBK number is recorded in ``self.book_ids`` regardless of the
        limit.  Only while the row counter is below the limit is the book
        added to ``self.graph`` as a class labelled with its title and given
        its short name as a synonym.

        :param limit: rows whose 1-based line number is at or above this
            value are not added to the graph; None means no limit
        :return: None
        """
        raw = '/'.join((self.rawdir, self.files['titles']['file']))
        gu = GraphUtils(curie_map.get())
        with open(raw, 'r', encoding='latin-1') as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for line_counter, row in enumerate(filereader, start=1):
                if line_counter == 1:  # skip header
                    continue
                shortname, title, nbk_num = row
                gr_id = 'GeneReviews:' + nbk_num

                # a global set of the book nums; filled for every row so
                # that later steps see all books even when a limit is set
                self.book_ids.add(nbk_num)

                if limit is not None and line_counter >= limit:
                    continue
                gu.addClassToGraph(self.graph, gr_id, title)
                gu.addSynonym(self.graph, gr_id, shortname)

        return

开发者ID:JervenBolleman，项目名称:dipper，代码行数:40，代码来源:GeneReviews.py

示例6: _get_gene_info

# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import addSynonym [as 别名]
    def _get_gene_info(self, limit):
        """
        Currently loops through the gene_info file and creates the genes as classes, typed with SO.  It will add their
        label, any alternate labels as synonyms, alternate ids as equivlaent classes.  HPRDs get added as
        protein products.  The chromosome and chr band get added as blank node regions, and the gene is faldo:located
        on the chr band.
        :param limit:
        :return:
        """
        gu = GraphUtils(curie_map.get())

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        geno = Genotype(g)

        # not unzipping the file
        logger.info("Processing Gene records")
        line_counter = 0
        myfile = '/'.join((self.rawdir, self.files['gene_info']['file']))
        logger.info("FILE: %s", myfile)

        # Add taxa and genome classes for those in our filter
        for tax_num in self.tax_ids:
            tax_id = ':'.join(('NCBITaxon', str(tax_num)))
            geno.addGenome(tax_id, str(tax_num))   # tax label can get added elsewhere
            gu.addClassToGraph(g, tax_id, None)   # label added elsewhere
        with gzip.open(myfile, 'rb') as f:
            for line in f:
                # skip comments
                line = line.decode().strip()
                if re.match('^#', line):
                    continue
                (tax_num, gene_num, symbol, locustag,
                 synonyms, xrefs, chr, map_loc, desc,
                 gtype, authority_symbol, name,
                 nomenclature_status, other_designations, modification_date) = line.split('\t')

                ##### set filter=None in init if you don't want to have a filter
                #if self.filter is not None:
                #    if ((self.filter == 'taxids' and (int(tax_num) not in self.tax_ids))
                #            or (self.filter == 'geneids' and (int(gene_num) not in self.gene_ids))):
                #        continue
                ##### end filter

                if self.testMode and int(gene_num) not in self.gene_ids:
                    continue

                if int(tax_num) not in self.tax_ids:
                    continue

                line_counter += 1

                gene_id = ':'.join(('NCBIGene', gene_num))
                tax_id = ':'.join(('NCBITaxon', tax_num))
                gene_type_id = self._map_type_of_gene(gtype)

                if symbol == 'NEWENTRY':
                    label = None
                else:
                    label = symbol

                # TODO might have to figure out if things aren't genes, and make them individuals
                gu.addClassToGraph(g, gene_id, label, gene_type_id, desc)

                # we have to do special things here for genes, because they're classes not individuals
                # f = Feature(gene_id,label,gene_type_id,desc)

                if name != '-':
                    gu.addSynonym(g, gene_id, name)
                if synonyms.strip() != '-':
                    for s in synonyms.split('|'):
                        gu.addSynonym(g, gene_id, s.strip(), Assoc.annotation_properties['hasRelatedSynonym'])
                if other_designations.strip() != '-':
                    for s in other_designations.split('|'):
                        gu.addSynonym(g, gene_id, s.strip(), Assoc.annotation_properties['hasRelatedSynonym'])

                # deal with the xrefs
                # MIM:614444|HGNC:HGNC:16851|Ensembl:ENSG00000136828|HPRD:11479|Vega:OTTHUMG00000020696
                if xrefs.strip() != '-':
                    for r in xrefs.strip().split('|'):
                        fixedr = self._cleanup_id(r)
                        if fixedr is not None and fixedr.strip() != '':
                            if re.match('HPRD', fixedr):
                                # proteins are not == genes.
                                gu.addTriple(g, gene_id, self.properties['has_gene_product'], fixedr)
                            else:
                                # skip some of these for now
                                if fixedr.split(':')[0] not in ['Vega', 'IMGT/GENE-DB']:
                                    gu.addEquivalentClass(g, gene_id, fixedr)

                # edge cases of id | symbol | chr | map_loc:
                # 263     AMD1P2    X|Y  with   Xq28 and Yq12
                # 438     ASMT      X|Y  with   Xp22.3 or Yp11.3    # in PAR
                # 419     ART3      4    with   4q21.1|4p15.1-p14   # no idea why there's two bands listed - possibly 2 assemblies
                # 28227   PPP2R3B   X|Y  Xp22.33; Yp11.3            # in PAR
                # 619538  OMS     10|19|3 10q26.3;19q13.42-q13.43;3p25.3   #this is of "unknown" type == susceptibility
                # 101928066       LOC101928066    1|Un    -         # unlocated scaffold
#.........这里部分代码省略.........

开发者ID:d3borah，项目名称:dipper，代码行数:103，代码来源:NCBIGene.py

示例7: Monochrom

# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import addSynonym [as 别名]

#.........这里部分代码省略.........
        for taxon in self.tax_ids:
            self._get_chrbands(limit, str(taxon))

        self.load_core_bindings()
        self.load_bindings()

        # using the full graph as the test here
        self.testgraph = self.graph
        logger.info("Found %d nodes", len(self.graph))
        logger.info("Done parsing files.")

        return

    def _get_chrbands(self, limit, taxon):
        """
        For the given taxon, it will fetch the chr band file.
        We will not deal with the coordinate information with this parser.
        Here, we only are concerned with building the partonomy.
        :param limit:
        :return:

        """
        line_counter = 0
        myfile = '/'.join((self.rawdir, self.files[taxon]['file']))
        logger.info("Processing Chr bands from FILE: %s", myfile)
        geno = Genotype(self.graph)

        # build the organism's genome from the taxon
        genome_label = self.files[taxon]['genome_label']
        taxon_id = 'NCBITaxon:'+taxon

        # add the taxon as a class.  adding the class label elsewhere
        self.gu.addClassToGraph(self.graph, taxon_id, None)
        self.gu.addSynonym(self.graph, taxon_id, genome_label)

        self.gu.loadObjectProperties(self.graph, Feature.object_properties)

        genome_id = geno.makeGenomeID(taxon_id)
        geno.addGenome(taxon_id, genome_label)
        self.gu.addOWLPropertyClassRestriction(
            self.graph, genome_id, Genotype.object_properties['in_taxon'],
            taxon_id)

        with gzip.open(myfile, 'rb') as f:
            for line in f:
                # skip comments
                line = line.decode().strip()
                if re.match(r'^#', line):
                    continue

                # chr13	4500000	10000000	p12	stalk
                (chrom, start, stop, band, rtype) = line.split('\t')
                line_counter += 1

                # NOTE
                # some less-finished genomes have placed and unplaced scaffolds
                # * Placed scaffolds:
                #    Scaffold has an oriented location within a chromosome.
                # * Unlocalized scaffolds:
                #     scaffold 's chromosome  is known,
                #     scaffold's position, orientation or both is not known.
                # *Unplaced scaffolds:
                #   it is not known which chromosome the scaffold belongs to.

                # find out if the thing is a full on chromosome, or a scaffold:
                # ex: unlocalized scaffold: chr10_KL568008v1_random

开发者ID:JervenBolleman，项目名称:dipper，代码行数:70，代码来源:Monochrom.py

示例8: _process_diseasegene

# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import addSynonym [as 别名]
    def _process_diseasegene(self, limit):
        """
        :param limit:
        :return:
        """
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        line_counter = 0
        geno = Genotype(g)
        gu = GraphUtils(curie_map.get())

        myfile = '/'.join((self.rawdir, self.files['disease-gene']['file']))

        # PYLINT complains iterparse deprecated,
        # but as of py 3.4 only the optional & unsupplied parse arg is.
        for event, elem in ET.iterparse(myfile):
            if elem.tag == 'Disorder':
                # get the element name and id, ignoreS element name
                # id = elem.get('id') # some internal identifier
                disorder_num = elem.find('OrphaNumber').text

                disorder_id = 'Orphanet:'+str(disorder_num)

                if self.testMode and \
                        disorder_id not in \
                        config.get_config()['test_ids']['disease']:
                    continue

                disorder_label = elem.find('Name').text

                # make a hash of internal gene id to type for later lookup
                gene_iid_to_type = {}
                gene_list = elem.find('GeneList')
                for gene in gene_list.findall('Gene'):
                    gene_iid = gene.get('id')
                    gene_type = gene.find('GeneType').get('id')
                    gene_iid_to_type[gene_iid] = gene_type

                # assuming that these are in the ontology
                gu.addClassToGraph(g, disorder_id, disorder_label)

                assoc_list = elem.find('DisorderGeneAssociationList')
                for a in assoc_list.findall('DisorderGeneAssociation'):
                    gene_iid = a.find('.//Gene').get('id')
                    gene_name = a.find('.//Gene/Name').text
                    gene_symbol = a.find('.//Gene/Symbol').text
                    gene_num = a.find('./Gene/OrphaNumber').text
                    gene_id = 'Orphanet:'+str(gene_num)
                    gene_type_id = \
                        self._map_gene_type_id(gene_iid_to_type[gene_iid])
                    gu.addClassToGraph(
                        g, gene_id, gene_symbol, gene_type_id, gene_name)
                    syn_list = a.find('./Gene/SynonymList')
                    if int(syn_list.get('count')) > 0:
                        for s in syn_list.findall('./Synonym'):
                            gu.addSynonym(g, gene_id, s.text)

                    dgtype = a.find('DisorderGeneAssociationType').get('id')
                    rel_id = self._map_rel_id(dgtype)
                    dg_label = \
                        a.find('./DisorderGeneAssociationType/Name').text
                    if rel_id is None:
                        logger.warning(
                            "Cannot map association type (%s) to RO " +
                            "for association (%s | %s).  Skipping.",
                            dg_label, disorder_label, gene_symbol)
                        continue

                    alt_locus_id = '_'+gene_num+'-'+disorder_num+'VL'
                    alt_label = \
                        ' '.join(('some variant of', gene_symbol.strip(),
                                  'that is a', dg_label.lower(),
                                  disorder_label))
                    if self.nobnodes:
                        alt_locus_id = ':'+alt_locus_id
                    gu.addIndividualToGraph(g, alt_locus_id, alt_label,
                                            geno.genoparts['variant_locus'])
                    geno.addAlleleOfGene(alt_locus_id, gene_id)

                    # consider typing the gain/loss-of-function variants like:
                    # http://sequenceontology.org/browser/current_svn/term/SO:0002054
                    # http://sequenceontology.org/browser/current_svn/term/SO:0002053

                    # use "assessed" status to issue an evidence code
                    # FIXME I think that these codes are sub-optimal
                    status_code = \
                        a.find('DisorderGeneAssociationStatus').get('id')
                    # imported automatically asserted information
                    # used in automatic assertion
                    eco_id = 'ECO:0000323'
                    # Assessed
                    # TODO are these internal ids stable between releases?
                    if status_code == '17991':
                        # imported manually asserted information
                        # used in automatic assertion
                        eco_id = 'ECO:0000322'
                    # Non-traceable author statement ECO_0000034
                    # imported information in automatic assertion ECO_0000313
#.........这里部分代码省略.........

开发者ID:JervenBolleman，项目名称:dipper，代码行数:103，代码来源:Orphanet.py

示例9: OMIA

# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import addSynonym [as 别名]

#.........这里部分代码省略.........

        for event, elem in ET.iterparse(filereader):
            self.process_xml_table(
                elem, 'Article_Breed', self._process_article_breed_row, limit)
            self.process_xml_table(
                elem, 'Article_Phene', self._process_article_phene_row, limit)
            self.process_xml_table(
                elem, 'Breed_Phene', self._process_breed_phene_row, limit)
            self.process_xml_table(
                elem, 'Lida_Links', self._process_lida_links_row, limit)
            self.process_xml_table(
                elem, 'Phene_Gene', self._process_phene_gene_row, limit)
            self.process_xml_table(
                elem, 'Group_MPO', self._process_group_mpo_row, limit)

        f.close()

        return

    # ############ INDIVIDUAL TABLE-LEVEL PROCESSING FUNCTIONS ################

    def _process_species_table_row(self, row):
        # gb_species_id, sci_name, com_name, added_by, date_modified
        tax_id = 'NCBITaxon:'+str(row['gb_species_id'])
        sci_name = row['sci_name']
        com_name = row['com_name']

        if self.testMode and \
                (int(row['gb_species_id']) not in self.test_ids['taxon']):
            return

        self.gu.addClassToGraph(self.g, tax_id, sci_name)
        if com_name != '':
            self.gu.addSynonym(self.g, tax_id, com_name)
            self.label_hash[tax_id] = com_name  # for lookup later
        else:
            self.label_hash[tax_id] = sci_name

        return

    def _process_breed_row(self, row):

        # in test mode, keep all breeds of our test species
        if self.testMode and \
                (int(row['gb_species_id']) not in self.test_ids['taxon']):
            return

        # save the breed keys in the test_ids for later processing
        self.test_ids['breed'] += [int(row['breed_id'])]

        breed_id = self.make_breed_id(row['breed_id'])

        self.id_hash['breed'][row['breed_id']] = breed_id
        tax_id = 'NCBITaxon:'+str(row['gb_species_id'])
        breed_label = row['breed_name']
        species_label = self.label_hash.get(tax_id)
        if species_label is not None:
            breed_label = breed_label + ' ('+species_label+')'

        self.gu.addIndividualToGraph(self.g, breed_id, breed_label, tax_id)
        self.label_hash[breed_id] = breed_label

        return

    def _process_phene_row(self, row):

开发者ID:JervenBolleman，项目名称:dipper，代码行数:69，代码来源:OMIA.py

示例10: _get_equivids

# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import addSynonym [as 别名]
    def _get_equivids(self, limit):
        """
        Link GeneReviews books to OMIM entries using the idmap file.

        The file processed here is tab-delimited, with a header row:
        #NBK_id GR_shortname    OMIM
        NBK1103 trimethylaminuria       136132
        NBK1103 trimethylaminuria       602079
        NBK1104 cdls    122470
        Each row maps one GeneReviews id to one OMIM id; a book may map to
        many OMIM ids, and some of those are genes rather than diseases.

        First pass: every accepted row adds the book as a class with its
        short name as a synonym, and the book -> OMIM numbers mapping is
        collected.  Rows with an OMIM number longer than six characters are
        logged and skipped.

        The collected OMIM numbers are then passed to
        ``OMIM.process_entries`` with ``filter_keep_phenotype_entry_ids``.
        For every book in ``self.book_ids``, each mapped OMIM id found in
        the returned entries is added as a class and linked to the book via
        ``addSubclass``.  Every book is also linked to DOID:4 via
        ``addSubclass``.

        :param limit: outside of test mode, stop reading once the line
            number exceeds this value; also forwarded to the OMIM lookup.
            None means no limit
        :return: None

        """
        raw = '/'.join((self.rawdir, self.files['idmap']['file']))
        gu = GraphUtils(curie_map.get())

        # we look some stuff up in OMIM, so initialize here
        omim = OMIM()
        id_map = {}
        allomimids = set()
        with open(raw, 'r', encoding="utf8") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for line_counter, row in enumerate(filereader, start=1):
                if line_counter == 1:  # skip header
                    continue
                nbk_num, shortname, omim_num = row
                gr_id = 'GeneReviews:' + nbk_num
                omim_id = 'OMIM:' + omim_num

                # in test mode keep only the rows for our test ids
                if self.testMode and not (
                        len(self.test_ids) > 0 and
                        omim_id in self.test_ids):
                    continue

                # sometimes there's bad omim nums
                if len(omim_num) > 6:
                    logger.warning(
                        "OMIM number incorrectly formatted "
                        "in row %d; skipping:\n%s",
                        line_counter, '\t'.join(row))
                    continue

                # build up a hashmap of the mappings; then process later
                id_map.setdefault(nbk_num, set()).add(omim_num)

                # add the class along with the shortname
                gu.addClassToGraph(self.graph, gr_id, None)
                gu.addSynonym(self.graph, gr_id, shortname)

                allomimids.add(omim_num)

                if not self.testMode and \
                        limit is not None and line_counter > limit:
                    break

            # end looping through file

        # get the omim ids that are not genes
        entries_that_are_phenotypes = omim.process_entries(
            list(allomimids), filter_keep_phenotype_entry_ids,
            None, None, limit)

        total = len(allomimids)
        logger.info("Filtered out %d/%d entries that are genes or features",
                    total-len(entries_that_are_phenotypes),
                    total)

        for nbk_num in self.book_ids:
            gr_id = 'GeneReviews:' + nbk_num
            for omim_num in id_map.get(nbk_num, ()):
                omim_id = 'OMIM:' + omim_num
                # add the gene reviews as a superclass to the omim id,
                # but only if the omim id is not a gene
                if omim_id in entries_that_are_phenotypes:
                    gu.addClassToGraph(self.graph, omim_id, None)
                    gu.addSubclass(self.graph, gr_id, omim_id)
            # add this as a generic subclass of DOID:4
            gu.addSubclass(self.graph, 'DOID:4', gr_id)

        return

开发者ID:JervenBolleman，项目名称:dipper，代码行数:96，代码来源:GeneReviews.py

示例11: UCSCBands

# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import addSynonym [as 别名]

#.........这里部分代码省略.........
        self.load_bindings()

        # using the full graph as the test here
        self.testgraph = self.graph
        logger.info("Found %d nodes", len(self.graph))
        logger.info("Done parsing files.")

        return

    def _get_chrbands(self, limit, taxon):
        """
        Read the gzipped chromosome band file configured for one taxon and
        register the taxon, its genome and its reference build in the graph
        before walking through the band records.

        The file name, genome label and build number are all looked up in
        ``self.files[taxon]``.

        :param limit: not referenced in the code shown here (see the TODO
            below)
        :param taxon: key into ``self.files``; it is also appended to
            'NCBITaxon:' to build the taxon curie, so it must be a string
        :return:

        """

        # TODO PYLINT figure out what limit was for and why it is unused
        line_counter = 0
        # raw files live under self.rawdir; the path is joined with '/'
        myfile = '/'.join((self.rawdir, self.files[taxon]['file']))
        logger.info("Processing Chr bands from FILE: %s", myfile)
        geno = Genotype(self.graph)
        monochrom = Monochrom()

        # used to hold band definitions for a chr
        # in order to compute extent of encompasing bands

        mybands = {}
        # build the organism's genome from the taxon
        genome_label = self.files[taxon]['genome_label']
        taxon_id = 'NCBITaxon:'+taxon

        # add the taxon as a class.  adding the class label elsewhere
        # (the label argument is None; the genome label is attached
        # as a synonym instead)
        self.gu.addClassToGraph(self.graph, taxon_id, None)
        self.gu.addSynonym(self.graph, taxon_id, genome_label)

        # load the Feature object/data properties plus the general
        # properties into the graph before any features are written
        self.gu.loadObjectProperties(self.graph, Feature.object_properties)
        self.gu.loadProperties(self.graph, Feature.data_properties,
                               self.gu.DATAPROP)
        self.gu.loadAllProperties(self.graph)

        geno.addGenome(taxon_id, genome_label)

        # add the build and the taxon it's in
        build_num = self.files[taxon]['build_num']
        build_id = 'UCSC:'+build_num
        geno.addReferenceGenome(build_id, build_num, taxon_id)

        # process the bands
        # the file is opened in binary mode, so every line is bytes
        # and has to be decoded before it can be parsed
        with gzip.open(myfile, 'rb') as f:
            for line in f:
                # skip comments
                line = line.decode().strip()
                if re.match('^#', line):
                    continue

                # each record has exactly five tab-separated columns, e.g.
                # chr13	4500000	10000000	p12	stalk
                # (any other column count makes the unpacking below raise)
                (scaffold, start, stop, band_num, rtype) = line.split('\t')
                # counts data rows only; comment lines were skipped above
                line_counter += 1

                # NOTE some less-finished genomes have
                # placed and unplaced scaffolds
                # * Placed scaffolds:
                #       the scaffolds have been placed within a chromosome.
                # * Unlocalized scaffolds:
                #       although the chromosome within which the scaffold occurs
                #       is known, the scaffold's position or orientation

开发者ID:JervenBolleman，项目名称:dipper，代码行数:70，代码来源:UCSCBands.py

示例12: _get_variants

# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import addSynonym [as 别名]

#.........这里部分代码省略.........
                    chrinbuild_id = makeChromID(str(chr), assembly, 'MONARCH')

                seqalt_id = ':'.join(('ClinVarVariant', variant_num))
                gene_id = None
                if str(gene_num) != '-1' and str(gene_num) != 'more than 10':  # they use -1 to indicate unknown gene
                    gene_id = ':'.join(('NCBIGene', str(gene_num)))

                # FIXME there are some "variants" that are actually haplotypes
                # probably will get taken care of when we switch to processing the xml
                # for example, variant_num = 38562
                # but there's no way to tell if it's a haplotype in the csv data
                # so the dbsnp or dbvar should probably be primary, and the variant num be the vslc,
                # with each of the dbsnps being added to it

                # todo clinical significance needs to be mapped to a list of terms
                # first, make the variant:
                f = Feature(seqalt_id, allele_name, allele_type_id)

                if start != '-' and start.strip() != '':
                    f.addFeatureStartLocation(start, chrinbuild_id)
                if stop != '-' and stop.strip() != '':
                    f.addFeatureEndLocation(stop, chrinbuild_id)

                f.addFeatureToGraph(g)

                if bandinbuild_id is not None:
                    f.addSubsequenceOfFeature(g, bandinbuild_id)

                # CHECK - this makes the assumption that there is only one affected chromosome per variant
                # what happens with chromosomal rearrangement variants?  shouldn't both chromosomes be here?

                # add the hgvs as synonyms
                if hgvs_c != '-' and hgvs_c.strip() != '':
                    gu.addSynonym(g, seqalt_id, hgvs_c)
                if hgvs_p != '-' and hgvs_p.strip() != '':
                    gu.addSynonym(g, seqalt_id, hgvs_p)

                # add the dbsnp and dbvar ids as equivalent
                if dbsnp_num != '-' and int(dbsnp_num) != -1:
                    dbsnp_id = 'dbSNP:rs'+str(dbsnp_num)
                    gu.addIndividualToGraph(g, dbsnp_id, None)
                    gu.addSameIndividual(g, seqalt_id, dbsnp_id)
                if dbvar_num != '-':
                    dbvar_id = 'dbVar:'+dbvar_num
                    gu.addIndividualToGraph(g, dbvar_id, None)
                    gu.addSameIndividual(g, seqalt_id, dbvar_id)

                # TODO - not sure if this is right... add as xref?
                # the rcv is like the combo of the phenotype with the variant
                if rcv_nums != '-':
                    for rcv_num in re.split(';',rcv_nums):
                        rcv_id = 'ClinVar:'+rcv_num
                        gu.addIndividualToGraph(g, rcv_id, None)
                        gu.addXref(g, seqalt_id, rcv_id)

                if gene_id is not None:
                    # add the gene
                    gu.addClassToGraph(g, gene_id, gene_symbol)
                    # make a variant locus
                    vl_id = '_'+gene_num+'-'+variant_num
                    if self.nobnodes:
                        vl_id = ':'+vl_id
                    vl_label = allele_name
                    gu.addIndividualToGraph(g, vl_id, vl_label, geno.genoparts['variant_locus'])
                    geno.addSequenceAlterationToVariantLocus(seqalt_id, vl_id)
                    geno.addAlleleOfGene(vl_id, gene_id)

开发者ID:d3borah，项目名称:dipper，代码行数:70，代码来源:ClinVar.py

示例13: process_gaf

# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import addSynonym [as 别名]
    def process_gaf(self, file, limit, id_map=None):
        """
        Parse a gzipped, tab-delimited gene association file and add the
        genes it mentions to the graph.

        For every usable row the gene is added as a class labelled with its
        symbol, the gene name is added as a description, each '|'-separated
        synonym is added, the gene is linked to its taxon (unless more than
        one taxon is listed), and an association from the gene to the GO
        term is set up with its mapped evidence code.

        A row is skipped when
          * it starts with '!' (comment),
          * one of the required columns is empty,
          * the qualifier contains NOT,
          * its db is UniProtKB and the accession does not map to exactly
            one gene in id_map,
          * test mode is on and the gene is not an NCBIGene id listed in
            self.test_ids.

        :param file: path of the gzipped association file
        :param limit: row limit; it is not consulted in the code below
        :param id_map: optional dict of UniProtKB accession -> list of gene
            curies.  When it is None no UniProtKB row can be resolved,
            so all of them are skipped.
        :return:
        """

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        gu = GraphUtils(curie_map.get())
        geno = Genotype(g)
        logger.info("Processing Gene Associations from %s", file)
        line_counter = 0

        # source-specific helpers, only built when the matching taxon
        # number is among self.tax_ids
        zfin = wbase = None
        if 7955 in self.tax_ids:
            zfin = ZFIN()
        elif 6239 in self.tax_ids:
            wbase = WormBase()

        with gzip.open(file, 'rb') as csvfile:
            filereader = csv.reader(io.TextIOWrapper(csvfile, newline=""),
                                    delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                # comments start with exclamation
                if re.match(r'!', ''.join(row)):
                    continue
                (db, gene_num, gene_symbol, qualifier, go_id, ref, eco_symbol,
                 with_or_from, aspect, gene_name, gene_synonym, object_type,
                 taxon, date, assigned_by, annotation_extension,
                 gene_product_form_id) = row

                # test for required fields
                if (db == '' or gene_num == '' or gene_symbol == '' or
                        go_id == '' or ref == '' or eco_symbol == '' or
                        aspect == '' or object_type == '' or taxon == '' or
                        date == '' or assigned_by == ''):
                    # pass the row as an argument: concatenating it into
                    # the format string breaks logging as soon as the
                    # data contains a literal '%'
                    logger.error(
                        "Missing required part of annotation on row %d:\n%s",
                        line_counter, '\t'.join(row))
                    continue

                # deal with qualifier NOT, contributes_to, colocalizes_with
                if re.search(r'NOT', qualifier):
                    continue

                db = self.clean_db_prefix(db)
                uniprotid = None
                gene_id = None
                if db == 'UniProtKB':
                    # id_map defaults to None, so it has to be checked
                    # before it is queried
                    mapped_ids = None
                    if id_map is not None:
                        mapped_ids = id_map.get(gene_num)
                    # only an unambiguous mapping is usable; unmapped,
                    # empty and multi-gene mappings are all skipped
                    # logger.warning(
                    #   "Skipping gene id mapped for >1 gene %s -> %s",
                    #    gene_num, str(mapped_ids))
                    if mapped_ids is None or len(mapped_ids) != 1:
                        continue
                    gene_id = mapped_ids[0]
                    uniprotid = ':'.join((db, gene_num))
                    # keep only the local part of the mapped gene curie
                    gene_num = re.sub(r'\w+\:', '', gene_id)
                elif db == 'MGI':
                    # the number may or may not already carry the prefix
                    gene_num = re.sub(r'MGI:', '', gene_num)
                    gene_id = ':'.join((db, gene_num))
                    gene_id = re.sub(r'MGI\:MGI\:', 'MGI:', gene_id)
                else:
                    gene_id = ':'.join((db, gene_num))

                # in test mode keep NCBIGene test ids only
                if self.testMode \
                        and not(
                            re.match(r'NCBIGene', gene_id) and
                            int(gene_num) in self.test_ids):
                    continue

                gu.addClassToGraph(g, gene_id, gene_symbol)
                if gene_name != '':
                    gu.addDescription(g, gene_id, gene_name)
                if gene_synonym != '':
                    for s in re.split(r'\|', gene_synonym):
                        gu.addSynonym(g, gene_id, s.strip())
                if re.search(r'\|', taxon):
                    # TODO add annotations with >1 taxon
                    logger.info(">1 taxon (%s) on line %d.  skipping", taxon,
                                line_counter)
                else:
                    tax_id = re.sub(r'taxon:', 'NCBITaxon:', taxon)
                    geno.addTaxon(tax_id, gene_id)

                assoc = Assoc(self.name)

                assoc.set_subject(gene_id)
                assoc.set_object(go_id)

                eco_id = self.map_go_evidence_code_to_eco(eco_symbol)
                if eco_id is not None:
                    assoc.add_evidence(eco_id)

                refs = re.split(r'\|', ref)
#.........这里部分代码省略.........

开发者ID:JervenBolleman，项目名称:dipper，代码行数:103，代码来源:GeneOntology.py

示例14: _process_nlx_157874_1_view

# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import addSynonym [as 别名]
    def _process_nlx_157874_1_view(self, raw, limit=None):
        """
        This table contains the Elements of Morphology data that has been
        screen-scraped into DISCO.
        Note that foaf:depiction is inverse of foaf:depicts relationship.

        Since it is bad form to have two definitions,
        we concatenate the two into one string
        (objective first, then subjective, each closed with a period).

        Triples:
            <eom id> a owl:Class
                rdf:label Literal(eom label)
                OIO:hasExactSynonym Literal(each ';'-separated synonym)
                OIO:hasRelatedSynonym Literal(each ';'-separated replaced term)
                IAO:definition Literal(objective_def. subjective def)
                foaf:depiction Literal(small_image_url),
                               Literal(large_image_url)
                foaf:page Literal(page_url)
                rdfs:comment Literal(long commented text)


        :param raw: path of the tab-delimited file; its first line is a
            header and is discarded
        :param limit: row limit; it is not consulted in the code below
        :return:
        """

        gu = GraphUtils(curie_map.get())
        line_counter = 0
        with open(raw, 'r') as f1:
            f1.readline()  # read the header row; skip
            filereader = csv.reader(f1, delimiter='\t', quotechar='\"')
            for line in filereader:
                line_counter += 1
                (morphology_term_id, morphology_term_num,
                 morphology_term_label, morphology_term_url,
                 terminology_category_label, terminology_category_url,
                 subcategory, objective_definition, subjective_definition,
                 comments, synonyms, replaces, small_figure_url,
                 large_figure_url, e_uid, v_uid, v_uuid,
                 v_last_modified) = line

                # note:
                # e_uid v_uuid v_last_modified terminology_category_url
                # subcategory v_uid morphology_term_num
                # terminology_category_label hp_label notes
                # are currently unused.

                # Add morphology term to graph as a class
                # with label, type, and description.
                gu.addClassToGraph(self.graph, morphology_term_id,
                                   morphology_term_label)

                # Assemble the description text

                # Strip before looking for the final period: trailing
                # whitespace or embedded newlines would otherwise hide an
                # existing period and a second one would be appended, and
                # a whitespace-only value would turn into a lone '.'.
                subjective_definition = subjective_definition.strip()
                if subjective_definition != '' and \
                        not subjective_definition.endswith('.'):
                    # add a trailing period.
                    subjective_definition += '.'
                objective_definition = objective_definition.strip()
                if objective_definition != '' and \
                        not objective_definition.endswith('.'):
                    # add a trailing period.
                    objective_definition += '.'

                # the outer strip removes the separator when either
                # half is empty
                definition = \
                    '  '.join(
                        (objective_definition, subjective_definition)).strip()

                gu.addDefinition(self.graph, morphology_term_id, definition)

                # <term id> FOAF:depicted_by literal url
                # <url> type foaf:depiction

                # do we want both images?
                # morphology_term_id has depiction small_figure_url
                if small_figure_url != '':
                    gu.addDepiction(self.graph, morphology_term_id,
                                    small_figure_url)

                # morphology_term_id has depiction large_figure_url
                if large_figure_url != '':
                    gu.addDepiction(self.graph, morphology_term_id,
                                    large_figure_url)

                # morphology_term_id has comment comments
                if comments != '':
                    gu.addComment(self.graph, morphology_term_id,
                                  comments.strip())

                # morphology_term_id hasExactSynonym synonyms (; delimited)
                if synonyms != '':
                    for s in synonyms.split(';'):
                        gu.addSynonym(
                            self.graph, morphology_term_id, s.strip(),
                            gu.properties['hasExactSynonym'])

                # morphology_term_id hasRelatedSynonym replaces (; delimited)
                # skipped when the column merely repeats the synonyms column
                if replaces != '' and replaces != synonyms:
                    for s in replaces.split(';'):
                        gu.addSynonym(
                            self.graph, morphology_term_id, s.strip(),
                            gu.properties['hasRelatedSynonym'])

#.........这里部分代码省略.........

开发者ID:JervenBolleman，项目名称:dipper，代码行数:103，代码来源:EOM.py

示例15: _process_ortholog_classes

# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import addSynonym [as 别名]
    def _process_ortholog_classes(self, limit=None):
        """
        This method add the KEGG orthology classes to the graph.

        Every row of the tab-delimited ortholog class file holds an id and a
        name.  The name is split on ';' and ','.  The first piece becomes
        the class label, the remaining pieces become synonyms, and the last
        piece is additionally stored as the description.

        If there's an embedded enzyme commission number in the description,
        that is added as an xref.

        Triples created:
        <orthology_class_id> is a class
        <orthology_class_id> has label <orthology_symbols>
        <orthology_class_id> has description <orthology_description>
        :param limit: outside of test mode, stop once more than this many
            rows have been read; None reads the whole file

        :return:
        """

        logger.info("Processing ortholog classes")
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        line_counter = 0
        gu = GraphUtils(curie_map.get())
        raw = '/'.join((self.rawdir, self.files['ortholog_classes']['file']))
        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                (orthology_class_id, orthology_class_name) = row

                if self.testMode and \
                        orthology_class_id not in \
                        self.test_ids['orthology_classes']:
                    continue

                # The orthology class is essentially a KEGG gene ID
                # that is species agnostic.
                # Add the ID and label as a gene family class

                # strip every piece once, so that label, synonyms and
                # description are all free of the blanks that follow
                # the ';' and ',' separators
                other_labels = [
                    s.strip()
                    for s in re.split(r'[;,]', orthology_class_name)]
                # the first one is the label we'll use
                orthology_label = other_labels[0]

                orthology_class_id = 'KEGG-'+orthology_class_id.strip()

                orthology_type = OrthologyAssoc.terms['gene_family']
                gu.addClassToGraph(g, orthology_class_id, orthology_label,
                                   orthology_type)
                if len(other_labels) > 1:
                    # add the rest as synonyms; the first piece is already
                    # the label, and empty pieces (doubled or trailing
                    # separators) carry no information
                    for s in other_labels[1:]:
                        if s != '':
                            gu.addSynonym(g, orthology_class_id, s)

                    # add the last one as the description
                    d = other_labels[-1]
                    gu.addDescription(g, orthology_class_id, d)

                    # add the enzyme commission number (EC:1.2.99.5)as an xref
                    # sometimes there's two, like [EC:1.3.5.1 1.3.5.4]
                    # can also have a dash, like EC:1.10.3.-
                    # findall always returns a list (empty when nothing
                    # matches), so it can be iterated directly
                    for ecm in re.findall(r'((?:\d+|\.|-){5,7})', d):
                        gu.addXref(g, orthology_class_id, 'EC:'+ecm)

                if not self.testMode and \
                        limit is not None and line_counter > limit:
                    break

        logger.info("Done with ortholog classes")
        return

开发者ID:JervenBolleman，项目名称:dipper，代码行数:74，代码来源:KEGG.py

注：本文中的dipper.utils.GraphUtils.GraphUtils.addSynonym方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。