

Python Dataset.setFileAccessUrl Method Code Examples

This article collects typical usage examples of the Python method dipper.models.Dataset.Dataset.setFileAccessUrl. If you are wondering what Dataset.setFileAccessUrl does, how to call it, or what real-world usage looks like, the curated code examples below should help. For broader context, see the other usage examples of the containing class, dipper.models.Dataset.Dataset.


Four code examples of Dataset.setFileAccessUrl are presented below, sorted by popularity by default.
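
Before diving into the examples, here is a minimal hedged sketch of the call itself. The constructor arguments mirror those used in Example 1 below; the URL passed to setFileAccessUrl is a hypothetical placeholder.

from dipper.models.Dataset import Dataset

# Build a Dataset, then record where its underlying source data can be accessed.
# Constructor arguments (copied from Example 1): identifier, title, homepage,
# description, data-rights page, license URL.
dataset = Dataset(
    'eom', 'EOM', 'http://elementsofmorphology.nih.gov', None,
    'http://www.genome.gov/copyright.cfm',
    'https://creativecommons.org/publicdomain/mark/1.0/')

dataset.setFileAccessUrl('https://example.org/data/source.tsv')  # hypothetical URL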

Example 1: EOM

# Required import: from dipper.models.Dataset import Dataset [as alias]
# Or: from dipper.models.Dataset.Dataset import setFileAccessUrl [as alias]

# Additional imports needed to run this excerpt
# (module paths assumed from the dipper repository layout):
import os
import logging
from datetime import datetime
from stat import ST_CTIME

from dipper import config, curie_map
from dipper.models.Dataset import Dataset
from dipper.sources.PostgreSQLSource import PostgreSQLSource

logger = logging.getLogger(__name__)
class EOM(PostgreSQLSource):
    """
    Elements of Morphology is a resource from NHGRI that has definitions of
    morphological abnormalities, together with image depictions.
    We pull those relationships, as well as our local mapping of equivalences
    between EOM and HP terminologies.

    The website is crawled monthly by NIF's DISCO crawler system,
        which we utilize here.
    Be sure to have pg user/password connection details in your conf.json file,
    like:
      dbauth : {
        'disco' : {'user' : '<username>', 'password' : '<password>'}
      }

    Monarch-curated data for the HP to EOM mapping is stored at
        https://phenotype-ontologies.googlecode.com

    Since this resource is so small, the entirety of it is the "test" set.

    """

    # we are using the production view here; should we be using services?
    tables = [
        'dvp.pr_nlx_157874_1'
    ]

    files = {
        'map': {
            'file': 'hp-to-eom-mapping.tsv',
            'url': 'https://phenotype-ontologies.googlecode.com/svn/trunk/src/ontology/hp/mappings/hp-to-eom-mapping.tsv'
        }
    }

    def __init__(self):
        super().__init__('eom')
        self.namespaces.update(curie_map.get())

        # update the dataset object with details about this resource
        # TODO put this into a conf file?
        self.dataset = Dataset(
            'eom', 'EOM', 'http://elementsofmorphology.nih.gov', None,
            'http://www.genome.gov/copyright.cfm',
            'https://creativecommons.org/publicdomain/mark/1.0/')

        # check that PG credentials are configured; if not, log an error so the user knows
        if 'dbauth' not in config.get_config() or \
                'disco' not in config.get_config()['dbauth']:
            logger.error("not configured with PG user/password.")

        # source-specific warnings.  will be cleared when resolved.

        return

    def fetch(self, is_dl_forced=False):
        '''create the connection details for DISCO'''

        cxn = config.get_config()['dbauth']['disco']
        cxn.update(
            {'host': 'nif-db.crbs.ucsd.edu', 'database': 'disco_crawler',
             'port': 5432})

        self.dataset.setFileAccessUrl(
            ''.join(('jdbc:postgresql://', cxn['host'], ':', str(cxn['port']),
                    '/', cxn['database'])))

        # process the tables
        # self.fetch_from_pgdb(self.tables,cxn,100)  #for testing
        self.fetch_from_pgdb(self.tables, cxn)

        self.get_files(is_dl_forced)

        # FIXME: Everything needed for data provenance?
        st = os.stat('/'.join((self.rawdir, 'dvp.pr_nlx_157874_1')))
        filedate = datetime.utcfromtimestamp(st[ST_CTIME]).strftime("%Y-%m-%d")
        self.dataset.setVersion(filedate)

        return

    def parse(self, limit=None):
        '''
            Override Source.parse, inherited via PostgreSQLSource.
        '''

        if limit is not None:
            logger.info("Only parsing first %s rows of each file", limit)

        if self.testOnly:
            self.testMode = True

        logger.info("Parsing files...")

        self._process_nlx_157874_1_view('/'.join((self.rawdir,
                                                  'dvp.pr_nlx_157874_1')),
                                        limit)
        self._map_eom_terms('/'.join((self.rawdir, self.files['map']['file'])),
                            limit)

        logger.info("Finished parsing.")

#......... some code omitted here .........
Contributor: JervenBolleman; project: dipper; lines: 103; source: EOM.py
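
For reference, the JDBC access URL recorded by fetch() above expands, given the connection details hard-coded in the method, to the string shown below:

cxn = {'host': 'nif-db.crbs.ucsd.edu', 'database': 'disco_crawler', 'port': 5432}
jdbc_url = ''.join(('jdbc:postgresql://', cxn['host'], ':', str(cxn['port']),
                    '/', cxn['database']))
print(jdbc_url)  # -> jdbc:postgresql://nif-db.crbs.ucsd.edu:5432/disco_crawler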

Example 2: GeneReviews

# Required import: from dipper.models.Dataset import Dataset [as alias]
# Or: from dipper.models.Dataset.Dataset import setFileAccessUrl [as alias]

#......... some code omitted here .........
            # figure out if the book is there; if so, process, otherwise skip
            book_dir = '/'.join((self.rawdir, 'books'))
            book_files = os.listdir(book_dir)
            if ''.join((nbk, '.html')) not in book_files:
                # logger.warning("No book found locally for %s; skipping", nbk)
                books_not_found.add(nbk)
                continue
            logger.info("Processing %s", nbk)

            # 'url' is assigned in the omitted code above (presumably the path
            # to the locally cached book HTML verified to exist just before this)
            page = open(url)
            soup = BeautifulSoup(page.read())

            # sec0 == clinical description
            clin_summary = \
                soup.find(
                    'div', id=re.compile(".*Summary.sec0"))
            if clin_summary is not None:
                p = clin_summary.find('p')
                ptext = p.text
                ptext = re.sub(r'\s+', ' ', ptext)

                ul = clin_summary.find('ul')
                if ul is not None:
                    item_text = list()
                    for li in ul.find_all('li'):
                        item_text.append(re.sub(r'\s+', ' ', li.text))
                    ptext += ' '.join(item_text)

                # add in the copyright and citation info to description
                ptext = \
                    ' '.join(
                        (ptext,
                         '[GeneReviews:NBK1116, GeneReviews:NBK138602, ' +
                         nbk_id+']'))

                self.gu.addDefinition(self.graph, nbk_id, ptext.strip())

            # get the pubs
            pmid_set = set()
            pub_div = soup.find('div', id=re.compile(r".*Literature_Cited"))
            if pub_div is not None:
                ref_list = pub_div.find_all('div', attrs={'class': "bk_ref"})
                for r in ref_list:
                    for a in r.find_all(
                            'a', attrs={'href': re.compile(r"pubmed")}):
                        if re.match(r'PubMed:', a.text):
                            pmnum = re.sub(r'PubMed:\s*', '', a.text)
                        else:
                            pmnum = \
                                re.search(
                                    r'\/pubmed\/(\d+)$', a['href']).group(1)
                        if pmnum is not None:
                            pmid = 'PMID:'+str(pmnum)
                            self.gu.addTriple(
                                self.graph, pmid,
                                self.gu.object_properties['is_about'],
                                nbk_id)
                            pmid_set.add(pmnum)
                            # use a name other than 'r' to avoid shadowing
                            # the enclosing loop variable
                            ref = Reference(
                                pmid, Reference.ref_types['journal_article'])
                            ref.addRefToGraph(self.graph)

            # TODO add author history, copyright, license to dataset

            # TODO get PMID-NBKID equivalence (near foot of page),
            # and make it "is about" link
            # self.gu.addTriple(
            #   self.graph, pmid,
            #   self.gu.object_properties['is_about'], nbk_id)
            # for example: NBK1191 PMID:20301370

            # add the book to the dataset
            self.dataset.setFileAccessUrl(book_item['url'])

            if limit is not None and c > limit:
                break

            # finish looping through books

        n_missing = len(books_not_found)
        if n_missing > 0:
            if n_missing > 100:
                logger.warning("There were %d books not found.", n_missing)
            else:
                logger.warning(
                    "The following %d books were not found locally: %s",
                    n_missing, str(books_not_found))
        logger.info(
            "Finished processing %d books for clinical descriptions",
            c - n_missing)

        return

    def getTestSuite(self):
        import unittest
        from tests.test_genereviews import GeneReviewsTestCase

        test_suite = \
            unittest.TestLoader().loadTestsFromTestCase(GeneReviewsTestCase)

        return test_suite
Contributor: JervenBolleman; project: dipper; lines: 104; source: GeneReviews.py
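
The PubMed-citation scraping in this example can be exercised in isolation. Below is a self-contained sketch using a hypothetical HTML fragment shaped like the GeneReviews markup the code expects:

import re
from bs4 import BeautifulSoup

# hypothetical fragment mimicking a GeneReviews "Literature Cited" entry
html = '<div class="bk_ref"><a href="/pubmed/20301370">PubMed: 20301370</a></div>'
soup = BeautifulSoup(html, 'html.parser')
for a in soup.find_all('a', attrs={'href': re.compile(r'pubmed')}):
    if re.match(r'PubMed:', a.text):
        pmnum = re.sub(r'PubMed:\s*', '', a.text)
    else:
        pmnum = re.search(r'/pubmed/(\d+)$', a['href']).group(1)
    print('PMID:' + pmnum)  # prints PMID:20301370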

Example 3: Coriell

# Required import: from dipper.models.Dataset import Dataset [as alias]
# Or: from dipper.models.Dataset.Dataset import setFileAccessUrl [as alias]

#......... some code omitted here .........
                # we rename (for simplicity) the original file
                st = None
                if os.path.exists(target_name):
                    st = os.stat(target_name)
                    logger.info(
                        "Local file date: %s",
                        datetime.utcfromtimestamp(st[stat.ST_CTIME]))
                if st is None or remotef.st_mtime > st[stat.ST_CTIME]:
                    if st is None:
                        logger.info(
                            "File does not exist locally; downloading...")
                    else:
                        logger.info(
                            "There's a new version of %s catalog available; "
                            "downloading...", r)
                    sftp.get(remotef.filename, target_name)
                    logger.info(
                        "Fetched remote %s -> %s",
                        remotef.filename, target_name)
                    st = os.stat(target_name)
                    filedate = \
                        datetime.utcfromtimestamp(
                            remotef.st_mtime).strftime("%Y-%m-%d")
                    logger.info(
                        "New file date: %s",
                        datetime.utcfromtimestamp(st[stat.ST_CTIME]))

                else:
                    logger.info("File %s exists; using local copy", fname)
                    filedate = \
                        datetime.utcfromtimestamp(
                            st[stat.ST_CTIME]).strftime("%Y-%m-%d")

                self.dataset.setFileAccessUrl(remotef.filename)
                self.dataset.setVersion(filedate)
        return

    def parse(self, limit=None):
        if limit is not None:
            logger.info("Only parsing first %s rows of each file", limit)

        logger.info("Parsing files...")

        if self.testOnly:
            self.testMode = True

        for f in self.files:
            file = '/'.join((self.rawdir, self.files[f]['file']))
            self._process_collection(
                self.files[f]['id'],
                self.files[f]['label'],
                self.files[f]['page'])
            self._process_data(file, limit)

        logger.info("Finished parsing.")

        self.load_bindings()

        logger.info("Found %d nodes in graph", len(self.graph))
        logger.info("Found %d nodes in testgraph", len(self.testgraph))

        return

    def _process_data(self, raw, limit=None):
        """
        This function will process the data files from Coriell.
Contributor: JervenBolleman; project: dipper; lines: 70; source: Coriell.py
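
The freshness check driving this example reduces to comparing the remote file's modification time against the local copy's ctime. A condensed sketch of that logic, assuming remotef is a paramiko SFTPAttributes object as in the code above:

import os
import stat

def needs_download(remotef, target_name):
    """True when no local copy exists or the remote file is newer."""
    if not os.path.exists(target_name):
        return True
    st = os.stat(target_name)
    return remotef.st_mtime > st[stat.ST_CTIME]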

Example 4: fetch

# Required import: from dipper.models.Dataset import Dataset [as alias]
# Or: from dipper.models.Dataset.Dataset import setFileAccessUrl [as alias]

#......... some code omitted here .........
                    LOG.warning("New Remote File exists but it is SMALLER")
                    return True
                # filesize is a fairly imperfect metric here
                LOG.info("New Remote fFle has same filesize--will not download")
        elif fstat[ST_SIZE] != size:
            LOG.info(
                "Remote File is %i  \t Local File is %i", size, fstat[ST_SIZE])
            return True

        return False

    def get_files(self, is_dl_forced, files=None):
        """
        Given a set of files for this source, it will go fetch them, and
        set a default version by date.  If you need to set the version number
        by another method, then it can be set again.
        :param is_dl_forced: boolean
        :param files: dict -- override instance files dict
        :return: None
        """

        fstat = None
        if files is None:
            files = self.files
        for fname in files:
            headers = None
            filesource = files[fname]
            if 'headers' in filesource:
                headers = filesource['headers']
            LOG.info("Getting %s", fname)
            # if the key 'clean' exists in the sources `files` dict
            # expose that instead of the longer url
            if 'clean' in filesource and filesource['clean'] is not None:
                self.dataset.setFileAccessUrl(filesource['clean'])
            else:
                self.dataset.setFileAccessUrl(filesource['url'])
                LOG.info('Fetching %s', filesource['url'])

            self.fetch_from_url(
                filesource['url'], '/'.join((self.rawdir, filesource['file'])),
                is_dl_forced, headers)

            fstat = os.stat('/'.join((self.rawdir, filesource['file'])))

        # only keeping the date from the last file
        filedate = datetime.utcfromtimestamp(fstat[ST_CTIME]).strftime("%Y-%m-%d")

        # FIXME
        # change this so the date is attached only to each file, not the entire dataset
        self.dataset.set_date_issued(filedate)

    def fetch_from_url(
            self, remotefile, localfile=None, is_dl_forced=False, headers=None):
        """
        Given a remote url and a local filename, attempt to determine
        if the remote file is newer; if it is,
        fetch the remote file and save it to the specified localfile,
        reporting the basic file information once it is downloaded
        :param remotefile: URL of remote file to fetch
        :param localfile: pathname of file to save locally
        :return: None

        """

        response = None
        if ((is_dl_forced is True) or localfile is None or
#......... the remainder of this method is omitted .........
Contributor: monarch-initiative; project: dipper; lines: 70; source: Source.py
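
The 'clean'-URL precedence documented in get_files() above can be condensed to a few lines; the file entries here are hypothetical:

files = {
    'map': {
        'file': 'mapping.tsv',
        'url': 'https://example.org/data/mapping.tsv?session=abc123',  # hypothetical
        'clean': 'https://example.org/data/mapping.tsv',               # hypothetical
    },
}
filesource = files['map']
# expose the short 'clean' URL when present, otherwise the raw download URL
access_url = filesource.get('clean') or filesource['url']
print(access_url)  # -> https://example.org/data/mapping.tsv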


Note: the dipper.models.Dataset.Dataset.setFileAccessUrl examples in this article were compiled by 纯净天空 from open-source projects hosted on platforms such as GitHub and MSDocs; copyright in each snippet remains with its original authors. Consult the corresponding project's license before distributing or reusing the code; please do not repost without permission.