

Python Dataset.setFileAccessUrl Method Code Examples

This article collects typical usage examples of the Python method dipper.models.Dataset.Dataset.setFileAccessUrl. If you are wondering how exactly Dataset.setFileAccessUrl works, how to call it, or what real uses look like, the curated code examples below may help. You can also explore further usage of its containing class, dipper.models.Dataset.Dataset.


The following shows 4 code examples of the Dataset.setFileAccessUrl method, sorted by popularity by default.
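Before the examples, here is a minimal usage sketch. The source name, URLs, and constructor arguments are illustrative only, following the positional constructor call shown in Example 1; they do not describe a real source:

from dipper.models.Dataset import Dataset

# describe a hypothetical data source
dataset = Dataset(
    'mysource', 'My Source', 'http://example.org/mysource', None,
    'http://example.org/copyright',
    'https://creativecommons.org/publicdomain/mark/1.0/')

# record the URL (or connection string) each raw file was fetched from;
# sources with several files call this once per downloaded file
dataset.setFileAccessUrl('http://example.org/downloads/mysource-data.tsv')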

Example 1: EOM

# Required import: from dipper.models.Dataset import Dataset [as alias]
# Alternatively: from dipper.models.Dataset.Dataset import setFileAccessUrl [as alias]
class EOM(PostgreSQLSource):
    """
    Elements of Morphology is a resource from NHGRI that has definitions of
    morphological abnormalities, together with image depictions.
    We pull those relationships, as well as our local mapping of equivalences
    between EOM and HP terminologies.

    The website is crawled monthly by NIF's DISCO crawler system,
        which we utilize here.
    Be sure to have pg user/password connection details in your conf.json file,
    like:
      dbauth : {
        'disco' : {'user' : '<username>', 'password' : '<password>'}
      }

    Monarch-curated data for the HP to EOM mapping is stored at
        https://phenotype-ontologies.googlecode.com

    Since this resource is so small, the entirety of it is the "test" set.

    """

    # we are using the production view here; should we be using services?
    tables = [
        'dvp.pr_nlx_157874_1'
    ]

    files = {
        'map': {
            'file': 'hp-to-eom-mapping.tsv',
            'url': 'https://phenotype-ontologies.googlecode.com/svn/trunk/src/ontology/hp/mappings/hp-to-eom-mapping.tsv'
        }
    }

    def __init__(self):
        super().__init__('eom')
        self.namespaces.update(curie_map.get())

        # update the dataset object with details about this resource
        # TODO put this into a conf file?
        self.dataset = Dataset(
            'eom', 'EOM', 'http://elementsofmorphology.nih.gov', None,
            'http://www.genome.gov/copyright.cfm',
            'https://creativecommons.org/publicdomain/mark/1.0/')

        # check if config exists; if it doesn't, error out and let user know
        if 'dbauth' not in config.get_config() or \
                'disco' not in config.get_config()['dbauth']:
            logger.error("not configured with PG user/password.")

        # source-specific warnings.  will be cleared when resolved.

        return

    def fetch(self, is_dl_forced=False):
        '''create the connection details for DISCO'''

        cxn = config.get_config()['dbauth']['disco']
        cxn.update(
            {'host': 'nif-db.crbs.ucsd.edu', 'database': 'disco_crawler',
             'port': 5432})

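        # record the JDBC connection string as this dataset's file-access URL;
        # with the values above it resolves to
        # jdbc:postgresql://nif-db.crbs.ucsd.edu:5432/disco_crawler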
        self.dataset.setFileAccessUrl(
            ''.join(('jdbc:postgresql://', cxn['host'], ':', str(cxn['port']),
                    '/', cxn['database'])))

        # process the tables
        # self.fetch_from_pgdb(self.tables,cxn,100)  #for testing
        self.fetch_from_pgdb(self.tables, cxn)

        self.get_files(is_dl_forced)

        # FIXME: Everything needed for data provenance?
        st = os.stat('/'.join((self.rawdir, 'dvp.pr_nlx_157874_1')))
        filedate = datetime.utcfromtimestamp(st[ST_CTIME]).strftime("%Y-%m-%d")
        self.dataset.setVersion(filedate)

        return

    def parse(self, limit=None):
        '''
            Override Source.parse inherited via PostgreSQLSource
        '''

        if limit is not None:
            logger.info("Only parsing first %s rows of each file", limit)

        if self.testOnly:
            self.testMode = True

        logger.info("Parsing files...")

        self._process_nlx_157874_1_view('/'.join((self.rawdir,
                                                  'dvp.pr_nlx_157874_1')),
                                        limit)
        self._map_eom_terms('/'.join((self.rawdir, self.files['map']['file'])),
                            limit)

        logger.info("Finished parsing.")

#......... (some code omitted here) .........
Author: JervenBolleman, Project: dipper, Lines: 103, Source: EOM.py

Example 2: GeneReviews

# Required import: from dipper.models.Dataset import Dataset [as alias]
# Alternatively: from dipper.models.Dataset.Dataset import setFileAccessUrl [as alias]

#......... (some code omitted here) .........
            # figure out if the book is there; if so, process, otherwise skip
            book_dir = '/'.join((self.rawdir, 'books'))
            book_files = os.listdir(book_dir)
            if ''.join((nbk, '.html')) not in book_files:
                # logger.warning("No book found locally for %s; skipping", nbk)
                books_not_found.add(nbk)
                continue
            logger.info("Processing %s", nbk)

            page = open(url)
            soup = BeautifulSoup(page.read())

            # sec0 == clinical description
            clin_summary = \
                soup.find(
                    'div', id=re.compile(".*Summary.sec0"))
            if clin_summary is not None:
                p = clin_summary.find('p')
                ptext = p.text
                ptext = re.sub(r'\s+', ' ', ptext)

                ul = clin_summary.find('ul')
                if ul is not None:
                    item_text = list()
                    for li in ul.find_all('li'):
                        item_text.append(re.sub(r'\s+', ' ', li.text))
                    ptext += ' '.join(item_text)

                # add in the copyright and citation info to description
                ptext = \
                    ' '.join(
                        (ptext,
                         '[GeneReviews:NBK1116, GeneReviews:NBK138602, ' +
                         nbk_id+']'))

                self.gu.addDefinition(self.graph, nbk_id, ptext.strip())

            # get the pubs
            pmid_set = set()
            pub_div = soup.find('div', id=re.compile(r".*Literature_Cited"))
            if pub_div is not None:
                ref_list = pub_div.find_all('div', attrs={'class': "bk_ref"})
                for r in ref_list:
                    for a in r.find_all(
                            'a', attrs={'href': re.compile(r"pubmed")}):
                        if re.match(r'PubMed:', a.text):
                            pmnum = re.sub(r'PubMed:\s*', '', a.text)
                        else:
                            pmnum = \
                                re.search(
                                    r'\/pubmed\/(\d+)$', a['href']).group(1)
                        if pmnum is not None:
                            pmid = 'PMID:'+str(pmnum)
                            self.gu.addTriple(
                                self.graph, pmid,
                                self.gu.object_properties['is_about'],
                                nbk_id)
                            pmid_set.add(pmnum)
                            r = Reference(
                                pmid, Reference.ref_types['journal_article'])
                            r.addRefToGraph(self.graph)

            # TODO add author history, copyright, license to dataset

            # TODO get PMID-NBKID equivalence (near foot of page),
            # and make it "is about" link
            # self.gu.addTriple(
            #   self.graph, pmid,
            #   self.gu.object_properties['is_about'], nbk_id)
            # for example: NBK1191 PMID:20301370

            # add the book to the dataset
            self.dataset.setFileAccessUrl(book_item['url'])

            if limit is not None and c > limit:
                break

            # finish looping through books

        l = len(books_not_found)
        if l > 0:
            if l > 100:
                logger.warning("There were %d books not found.", l)
            else:
                logger.warning(
                    "The following %d books were not found locally: %s",
                    l, str(books_not_found))
        logger.info(
            "Finished processing %d books for clinical descriptions", c-l)

        return

    def getTestSuite(self):
        import unittest
        from tests.test_genereviews import GeneReviewsTestCase

        test_suite = \
            unittest.TestLoader().loadTestsFromTestCase(GeneReviewsTestCase)

        return test_suite
Author: JervenBolleman, Project: dipper, Lines: 104, Source: GeneReviews.py

Example 3: Coriell

# Required import: from dipper.models.Dataset import Dataset [as alias]
# Alternatively: from dipper.models.Dataset.Dataset import setFileAccessUrl [as alias]

#......... (some code omitted here) .........
                # we rename (for simplicity) the original file
                st = None
                if os.path.exists(target_name):
                    st = os.stat(target_name)
                    logger.info(
                        "Local file date: %s",
                        datetime.utcfromtimestamp(st[stat.ST_CTIME]))
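                # fetch when no local copy exists, or when the remote file's
                # mtime is newer than the local copy's ctime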
                if st is None or remotef.st_mtime > st[stat.ST_CTIME]:
                    if st is None:
                        logger.info(
                            "File does not exist locally; downloading...")
                    else:
                        logger.info(
                            "There's a new version of %s catalog available; "
                            "downloading...", r)
                    sftp.get(remotef.filename, target_name)
                    logger.info(
                        "Fetched remote %s -> %s",
                        remotef.filename, target_name)
                    st = os.stat(target_name)
                    filedate = \
                        datetime.utcfromtimestamp(
                            remotef.st_mtime).strftime("%Y-%m-%d")
                    logger.info(
                        "New file date: %s",
                        datetime.utcfromtimestamp(st[stat.ST_CTIME]))

                else:
                    logger.info("File %s exists; using local copy", fname)
                    filedate = \
                        datetime.utcfromtimestamp(
                            st[stat.ST_CTIME]).strftime("%Y-%m-%d")

                self.dataset.setFileAccessUrl(remotef.filename)
                self.dataset.setVersion(filedate)
        return

    def parse(self, limit=None):
        if limit is not None:
            logger.info("Only parsing first %s rows of each file", limit)

        logger.info("Parsing files...")

        if self.testOnly:
            self.testMode = True

        for f in self.files:
            file = '/'.join((self.rawdir, self.files[f]['file']))
            self._process_collection(
                self.files[f]['id'],
                self.files[f]['label'],
                self.files[f]['page'])
            self._process_data(file, limit)

        logger.info("Finished parsing.")

        self.load_bindings()

        logger.info("Found %d nodes in graph", len(self.graph))
        logger.info("Found %d nodes in testgraph", len(self.testgraph))

        return

    def _process_data(self, raw, limit=None):
        """
        This function will process the data files from Coriell.
Author: JervenBolleman, Project: dipper, Lines: 70, Source: Coriell.py

Example 4: fetch

# Required import: from dipper.models.Dataset import Dataset [as alias]
# Alternatively: from dipper.models.Dataset.Dataset import setFileAccessUrl [as alias]

#......... (some code omitted here) .........
                    LOG.warning("New Remote File exists but it is SMALLER")
                    return True
                # filesize is a fairly imperfect metric here
                LOG.info("New Remote File has same filesize--will not download")
        elif fstat[ST_SIZE] != size:
            LOG.info(
                "Remote File is %i  \t Local File is %i", size, fstat[ST_SIZE])
            return True

        return False

    def get_files(self, is_dl_forced, files=None):
        """
        Given a set of files for this source, it will go fetch them, and
        set a default version by date.  If you need to set the version number
        by another method, then it can be set again.
        :param is_dl_forced - boolean
        :param files dict - override instance files dict
        :return: None
        """

        fstat = None
        if files is None:
            files = self.files
        for fname in files:
            headers = None
            filesource = files[fname]
            if 'headers' in filesource:
                headers = filesource['headers']
            LOG.info("Getting %s", fname)
            # if the key 'clean' exists in the sources `files` dict
            # expose that instead of the longer url
            if 'clean' in filesource and filesource['clean'] is not None:
                self.dataset.setFileAccessUrl(filesource['clean'])
            else:
                self.dataset.setFileAccessUrl(filesource['url'])
                LOG.info('Fetching %s', filesource['url'])

            self.fetch_from_url(
                filesource['url'], '/'.join((self.rawdir, filesource['file'])),
                is_dl_forced, headers)

            fstat = os.stat('/'.join((self.rawdir, filesource['file'])))

        # only keeping the date from the last file
        filedate = datetime.utcfromtimestamp(fstat[ST_CTIME]).strftime("%Y-%m-%d")

        # FIXME
        # change this so the date is attached only to each file, not the entire dataset
        self.dataset.set_date_issued(filedate)

    def fetch_from_url(
            self, remotefile, localfile=None, is_dl_forced=False, headers=None):
        """
        Given a remote url and a local filename, attempt to determine
        if the remote file is newer; if it is,
        fetch the remote file and save it to the specified localfile,
        reporting the basic file information once it is downloaded
        :param remotefile: URL of remote file to fetch
        :param localfile: pathname of file to save locally
        :return: None

        """

        response = None
        if ((is_dl_forced is True) or localfile is None or
Author: monarch-initiative, Project: dipper, Lines: 70, Source: Source.py
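A side note on the 'clean' key handled in get_files above: when a source's real download URL is long or carries volatile query parameters, an entry like the following sketch (the source name and URLs are made up for illustration) lets get_files record a stable, human-readable URL via setFileAccessUrl while still downloading from the real one:

files = {
    'data': {
        'file': 'mysource-data.tsv',
        # the URL actually downloaded from
        'url': 'http://example.org/api/export?format=tsv&release=latest',
        # the shorter URL recorded on the dataset via setFileAccessUrl
        'clean': 'http://example.org/downloads/mysource-data.tsv',
    }
}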


Note: The dipper.models.Dataset.Dataset.setFileAccessUrl examples in this article were compiled by 純淨天空 from GitHub/MSDocs and other open-source code and documentation platforms. The snippets were selected from open-source projects contributed by the community; copyright of the source code remains with the original authors. Consult the corresponding project's License before redistributing or using the code; do not reproduce without permission.