

Python PyWebHdfsClient.create_file Method Code Examples

This article collects typical usage examples of the pywebhdfs.webhdfs.PyWebHdfsClient.create_file method in Python. If you are wondering what exactly PyWebHdfsClient.create_file does, how it is called, or what real-world usage looks like, the curated code examples below should help. You can also explore further usage examples of the containing class, pywebhdfs.webhdfs.PyWebHdfsClient.


The following presents 15 code examples of the PyWebHdfsClient.create_file method, sorted by popularity by default.
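
As a quick orientation before the examples, here is a minimal sketch of the basic call pattern. The host, port, user, and paths are placeholder values chosen for illustration only and do not come from the examples below:

# Minimal sketch (placeholder host/port/user/paths, assumed for illustration)
from pywebhdfs.webhdfs import PyWebHdfsClient

hdfs = PyWebHdfsClient(host='namenode.example.com', port='50070', user_name='hdfs')
hdfs.make_dir('user/hdfs/example')                                  # ensure the target directory exists
hdfs.create_file('user/hdfs/example/data.txt', 'hello webhdfs\n')   # create a new file from string data
hdfs.create_file('user/hdfs/example/data.txt', 'replaced\n', overwrite=True)  # overwrite an existing file

Several of the examples below use the same pattern with a file-like object instead of a string, and some fall back to create_file when append_file fails because the target file does not yet exist.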

Example 1: update_raw_stage

# Required import: from pywebhdfs.webhdfs import PyWebHdfsClient [as alias]
# Or: from pywebhdfs.webhdfs.PyWebHdfsClient import create_file [as alias]
def update_raw_stage(output, delivery_tag):

    #context = zmq.Context()

    #confirm = context.socket(zmq.PUSH)
    #confirm.connect(confirm_host)

    hdfs = PyWebHdfsClient(host=webhdfs_host, port=webhdfs_port, user_name=webhdfs_user)
    impala_conn = connect(host=impala_host, port=int(impala_port))
    cur = impala_conn.cursor()

    start_time = time.time()

    for k, v in output.iteritems():

        if (time.time() - start_time)/60 > sink_minutes:
            sink_logger.warning('ETL process running longer than sink timeout: {0} minutes'.format((time.time() - start_time)/60))
        try:
            file_name = 'user/impala/test_log_1/raw_log_{0}.txt'.format(k)
            hdfs.append_file(file_name, '\n'.join(v))
            cur.execute('refresh test_log_{0}'.format(k))

        except hdfs_err.PyWebHdfsException:
            file_name = 'user/impala/test_log_1/raw_log_{0}.txt'.format(k)
            hdfs.create_file(file_name, '')
            hdfs.append_file(file_name, '\n'.join(v))
            cur.execute('refresh test_log_{0}'.format(k))

    #confirm.send(delivery_tag)
    sink_logger.info('ETL process finished in {0} minutes'.format((time.time() - start_time)/60))
    sink_logger.info('ETL process finished with {0} delivery_tag'.format(delivery_tag))
Author: kkrainov, Project: event_trigger_json, Lines: 33, Source: sink_server.py

Example 2: setup_common_oozie_libs

# Required import: from pywebhdfs.webhdfs import PyWebHdfsClient [as alias]
# Or: from pywebhdfs.webhdfs.PyWebHdfsClient import create_file [as alias]
def setup_common_oozie_libs(name_node):
    webhdfs_port = '14000'
    webhdfs_user = 'hdfs'
    platform_dir = 'user/deployment/platform'
    lib_path_list = ['/usr/hdp/current/hbase-client/lib/hbase-client.jar',
                     '/usr/hdp/current/hbase-client/lib/hbase-common.jar',
                     '/usr/hdp/current/hbase-client/lib/hbase-protocol.jar',
                     '/usr/hdp/current/hbase-client/lib/hbase-server.jar',
                     '/usr/hdp/current/hbase-client/lib/htrace-core-3.1.0-incubating.jar',
                     '/usr/hdp/current/hbase-client/lib/hbase-hadoop-compat.jar',
                     '/usr/hdp/current/hbase-client/lib/hbase-it.jar',
                     '/usr/hdp/current/hbase-client/lib/hbase-prefix-tree.jar',
                     '/usr/hdp/current/hbase-client/lib/zookeeper.jar',
                     '/usr/hdp/current/pig-client/piggybank.jar',
                     '/usr/hdp/current/spark-client/lib/spark-examples.jar']

    # Setup a connection with hdfs using namenode.
    hdfs_client = PyWebHdfsClient(host=name_node, port=webhdfs_port, user_name=webhdfs_user, timeout=None)
    # Create directory on hadoop file system (HDFS).
    hdfs_client.make_dir(platform_dir)
    # Create a new file on HDFS and write contents from the local FS.
    for path in lib_path_list:
        platform_file = '%s/%s' % (platform_dir, os.path.basename(path))
        print 'Copying source file: %s to HDFS path %s' % (path, platform_file)
        with open(path) as file_data:
            try:
                hdfs_client.create_file(platform_file, file_data, overwrite=True)
            except PyWebHdfsException:
                print 'retrying HDFS copy command for %s' % platform_file
                time.sleep(5)
                hdfs_client.create_file(platform_file, file_data, overwrite=True)
Author: pndaproject, Project: platform-salt, Lines: 33, Source: oozie_libs.py

Example 3: setup_common_oozie_libs

# Required import: from pywebhdfs.webhdfs import PyWebHdfsClient [as alias]
# Or: from pywebhdfs.webhdfs.PyWebHdfsClient import create_file [as alias]
def setup_common_oozie_libs(name_node):
    webhdfs_port = '14000'
    webhdfs_user = 'hdfs'
    platform_dir = 'user/deployment/platform'
    lib_path_list = ['/opt/cloudera/parcels/CDH/lib/hbase/hbase-client.jar',
                     '/opt/cloudera/parcels/CDH/lib/hbase/hbase-common.jar',
                     '/opt/cloudera/parcels/CDH/lib/hbase/hbase-protocol.jar',
                     '/opt/cloudera/parcels/CDH/lib/hbase/hbase-server.jar',
                     '/opt/cloudera/parcels/CDH/lib/hbase/lib/htrace-core.jar',
                     '/opt/cloudera/parcels/CDH/lib/hbase/hbase-hadoop-compat.jar',
                     '/opt/cloudera/parcels/CDH/lib/hbase/hbase-it.jar',
                     '/opt/cloudera/parcels/CDH/lib/hbase/hbase-prefix-tree.jar',
                     '/opt/cloudera/parcels/CDH/lib/hbase/lib/zookeeper.jar',
                     '/opt/cloudera/parcels/CDH/lib/pig/piggybank.jar',
                     '/opt/cloudera/parcels/CDH/lib/spark/lib/spark-examples.jar']

    # Setup a connection with hdfs using namenode.
    hdfs_client = PyWebHdfsClient(host=name_node, port=webhdfs_port, user_name=webhdfs_user, timeout=None)
    # Create directory on hadoop file system (HDFS).
    hdfs_client.make_dir(platform_dir)
    # Create a new file on HDFS and write contents from the local FS.
    for path in lib_path_list:
        platform_file = '%s/%s' % (platform_dir, os.path.basename(path))
        logging.info('Copying source file: %s to HDFS path %s', path, platform_file)
        with open(path) as file_data:
            hdfs_client.create_file(platform_file, file_data, overwrite=True)
Author: pndaproject, Project: platform-salt, Lines: 28, Source: cm_setup.py

Example 4: upload_file

# Required import: from pywebhdfs.webhdfs import PyWebHdfsClient [as alias]
# Or: from pywebhdfs.webhdfs.PyWebHdfsClient import create_file [as alias]
def upload_file():
    """
    Upload file
    ---
    tags:
        - Files
    consumes: "multipart/form-data"
    parameters:
        -   name: file
            in: formData
            required: true
            paramType: body
            dataType: file
            type: file
    responses:
        200:
            description: Return a successful message
        401:
            description: Unauthorized
        400:
            description: Bad Request
        500:
            description: Server Internal error
    """
    # Hard-coded config information. You should improve it.
    hdfs = PyWebHdfsClient(host='webhdfs',port='50070', user_name='thanhson1085')
    if request.method == 'POST':
        file = request.files['file']
        if file and allowed_file(file.filename):
            filename = secure_filename(str(time.time()) + file.filename)
            my_file = 'tmp/thanhson1085/data/' + filename
            hdfs.create_file(my_file, file)
            return jsonify({'success':'true'})

    return jsonify({'success':'false'})
Author: thanhson1085, Project: flask-webhdfs, Lines: 37, Source: controllers.py

Example 5: submit

# Required import: from pywebhdfs.webhdfs import PyWebHdfsClient [as alias]
# Or: from pywebhdfs.webhdfs.PyWebHdfsClient import create_file [as alias]
    def submit(self, bund, files=[]):
        hdfs = PyWebHdfsClient(host=os.environ["WEBHDFS_HOST"], port='14000', user_name='oozie')

        for f in files:
            hdfs.create_file("{}/{}".format(bund.path, f.name), f.read())  

        doc, tag, text = Doc().tagtext()
        with tag("configuration"):
            with tag("property"):
                with tag("name"):
                    text("user.name")
                with tag("value"):
                    text("oozie")

            with tag("property"):
                with tag("name"):
                    text("oozie.bundle.application.path")
                with tag("value"):
                    text("/"+bund.path + "/" + bund.name)

        configuration = doc.getvalue()
        response = post("{0}/oozie/v1/jobs".format(self.url), data=configuration, headers={'Content-Type': 'application/xml'})

        if response.status_code > 399:
            print response.headers["oozie-error-message"]
        print response.status_code
        print response.content
Author: orenmazor, Project: oozie.py, Lines: 29, Source: oozie_server.py

Example 6: ship_udf

# Required import: from pywebhdfs.webhdfs import PyWebHdfsClient [as alias]
# Or: from pywebhdfs.webhdfs.PyWebHdfsClient import create_file [as alias]
    def ship_udf(ic, function, hdfs_path=None, udf_name=None, database=None,
            overwrite=False):
        # extract some information from the function
        if udf_name is None:
            udf_name = function.name
        symbol = function.llvm_func.name
        ir = function.llvm_module.to_bitcode()
        return_type = udf_to_impala_type[function.signature.return_type.name]
        arg_types = [udf_to_impala_type[arg.name]
                        for arg in function.signature.args[1:]]

        # ship the IR to the cluster
        hdfs_client = PyWebHdfsClient(host=ic._nn_host, port=ic._webhdfs_port,
                user_name=ic._hdfs_user)
        if hdfs_path is None:
            hdfs_path = os.path.join(ic._temp_dir, udf_name + '.ll')
        if not hdfs_path.endswith('.ll'):
            raise ValueError("The HDFS file name must end with .ll")
        hdfs_client.create_file(hdfs_path.lstrip('/'), ir, overwrite=overwrite)

        # register the function in Impala
        if database is None:
            database = ic._temp_db
        impala_name = '%s.%s(%s)' % (database, udf_name, ', '.join(arg_types))
        if overwrite:
            ic._cursor.execute("DROP FUNCTION IF EXISTS %s" % impala_name)
        register_query = "CREATE FUNCTION %s RETURNS %s LOCATION '%s' SYMBOL='%s'" % (impala_name,
                return_type, hdfs_path, symbol)
        ic._cursor.execute(register_query)
Author: carlotorniai, Project: impyla, Lines: 31, Source: __init__.py

Example 7: WhenTestingCreateOperation

# Required import: from pywebhdfs.webhdfs import PyWebHdfsClient [as alias]
# Or: from pywebhdfs.webhdfs.PyWebHdfsClient import create_file [as alias]
class WhenTestingCreateOperation(unittest.TestCase):

    def setUp(self):

        self.host = 'hostname'
        self.port = '00000'
        self.user_name = 'username'
        self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                       user_name=self.user_name)
        self.response = MagicMock()
        self.requests = MagicMock(return_value=self.response)
        self.location = 'redirect_uri'
        self.path = 'user/hdfs'
        self.file_data = '010101'
        self.init_response = MagicMock()
        self.init_response.headers = {'location': self.location}
        self.response = MagicMock()
        self.expected_headers = {'content-type': 'application/octet-stream'}

    def test_create_throws_exception_for_no_redirect(self):

        self.init_response.status_code = httplib.BAD_REQUEST
        self.response.status_code = httplib.CREATED
        self.requests.put.side_effect = [self.init_response, self.response]
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            with self.assertRaises(errors.PyWebHdfsException):
                self.webhdfs.create_file(self.path, self.file_data)

    def test_create_throws_exception_for_not_created(self):

        self.init_response.status_code = httplib.TEMPORARY_REDIRECT
        self.response.status_code = httplib.BAD_REQUEST
        self.requests.put.side_effect = [self.init_response, self.response]
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            with self.assertRaises(errors.PyWebHdfsException):
                self.webhdfs.create_file(self.path, self.file_data)

    def test_create_returns_file_location(self):

        self.init_response.status_code = httplib.TEMPORARY_REDIRECT
        self.response.status_code = httplib.CREATED
        self.put_method = MagicMock(
            side_effect=[self.init_response, self.response])
        self.requests.put = self.put_method
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            result = self.webhdfs.create_file(self.path, self.file_data)
        self.assertTrue(result)
        self.put_method.assert_called_with(
            self.location, headers=self.expected_headers, data=self.file_data)
Author: ProjectMeniscus, Project: pywebhdfs, Lines: 51, Source: test_webhdfs.py

Example 8: create_data_from_station_data

# Required import: from pywebhdfs.webhdfs import PyWebHdfsClient [as alias]
# Or: from pywebhdfs.webhdfs.PyWebHdfsClient import create_file [as alias]
def create_data_from_station_data(first, second):
    """this function creates the data analyzing the two stations in comparison"""
    global hdfs; #global hdfs object
    global hbase; #global hbase object
    
    if(hdfs is None): 
        from pywebhdfs.webhdfs import PyWebHdfsClient; 
        hdfs = PyWebHdfsClient(host='cshadoop.boisestate.edu',port='50070', user_name='uacharya'); 
   
    if(hbase is None):
        import happybase;
        hbase = happybase.ConnectionPool(size=1,host='cshadoop.boisestate.edu');
 
    date_for_comparision = first["Date"].strip();

   # creating directory for each date
    try:
        hdfs.get_file_dir_status('user/uacharya/simulation/'+date_for_comparision);
    except Exception:
        # directory to hold dataset in csv file for each node in wall display starting from 1 to 9
        for index in range(1, 10):
            content = 'Date,ID,Source,Destination,S_Lat,S_Lon,D_Lat,D_Lon,Wind_Lat,Wind_Lon,Wind_Velocity\n';
            try:
                hdfs.create_file('user/uacharya/simulation/'+date_for_comparision+'/node'+str(index)+'/output.csv',content,replication=1);
            except Exception:
                continue;
   
    
    dataset = {'node_1':[],'node_2':[],'node_3':[],'node_4':[],'node_5':[],'node_6':[],'node_7':[],'node_8':[],'node_9':[]};
   
    for data in broadcast_variable.value:
        compare_data_between(date_for_comparision, first, data,dataset);

#    for key in dataset:
#        if(len(dataset[key])!=0):
#            content = "\n".join(dataset[key]);
#            content +="\n";
#            while(True):
#                try:
#                    hdfs.append_file('user/uacharya/simulation/'+date+'/'+key+'/output.csv',content,buffersize=4096);
#                    break;
#                except Exception:
#                    time.sleep(0.2);
#                    continue;

    
    dataset.clear(); #clearing the dictionary
    # append over here after all the global variable has been made        
    return second;
Author: uacharya, Project: WebServer, Lines: 51, Source: TestingSpark.py

Example 9: from_pandas

# Required import: from pywebhdfs.webhdfs import PyWebHdfsClient [as alias]
# Or: from pywebhdfs.webhdfs.PyWebHdfsClient import create_file [as alias]
def from_pandas(ic, df, table=None, path=None, method='in_query',
        file_format='TEXTFILE', field_terminator='\t', line_terminator='\n',
        escape_char='\\',
        hdfs_host=None, webhdfs_port=50070, hdfs_user=None, overwrite=False):
    """Create a BDF by shipping an in-memory pandas `DataFrame` into Impala
    
    path is the dir, not the filename
    """
    # TODO: this is not atomic
    temp_table = _random_id('tmp_table_', 8)
    if table is None:
        table = "%s.%s" % (ic._temp_db, temp_table)
    if path is None:
        path = os.path.join(ic._temp_dir, temp_table)
    table_name = _to_TableName(table)
    if overwrite:
        ic._cursor.execute("DROP TABLE IF EXISTS %s" % table_name.to_sql())
    columns = list(df.columns)
    types = [_numpy_dtype_to_impala_PrimitiveType(ty) for ty in df.dtypes]
    schema = zip(columns, types)
    create_stmt = _create_table(table_name, schema, path=path,
            file_format=file_format, field_terminator=field_terminator,
            line_terminator=line_terminator, escape_char=escape_char)
    ic._cursor.execute(create_stmt)
    if method == 'in_query':
        query = "INSERT INTO %s VALUES " % table_name.to_sql()
        query += ', '.join(['(%s)' % ', '.join(map(_py_to_sql_string, row)) for row in df.values])
        ic._cursor.execute(query)
    elif method == 'webhdfs':
        if file_format != 'TEXTFILE':
            raise ValueError("only TEXTFILE format supported for webhdfs")
        if path is None:
            raise ValueError("must supply a path for EXTERNAL table for webhdfs")
        from pywebhdfs.webhdfs import PyWebHdfsClient
        hdfs_client = PyWebHdfsClient(host=hdfs_host, port=webhdfs_port,
                user_name=hdfs_user)
        raw_data = StringIO()
        df.to_csv(raw_data, sep=field_terminator,
                line_terminator=line_terminator, quoting=csv.QUOTE_NONE, escapechar=escape_char, header=False, index=False)
        hdfs_client.create_file(os.path.join(path, 'data.txt').lstrip('/'), raw_data.getvalue(), overwrite=overwrite)
        raw_data.close()
    else:
        raise ValueError("method must be 'in_query' or 'webhdfs'; got %s" % method)
    return from_sql_table(ic, table_name.to_sql())
Author: fkaufer, Project: impyla, Lines: 46, Source: bdf.py

Example 10: HDFS

# Required import: from pywebhdfs.webhdfs import PyWebHdfsClient [as alias]
# Or: from pywebhdfs.webhdfs.PyWebHdfsClient import create_file [as alias]
class HDFS(object):
    def __init__(self, host, port, user):
        self._hdfs = PyWebHdfsClient(
            host=host, port=port, user_name=user, timeout=None)
        logging.debug('webhdfs = %s@%s:%s', user, host, port)

    def recursive_copy(self, local_path, remote_path, exclude=None):

        if exclude is None:
            exclude = []

        c_path = canonicalize(remote_path)
        logging.debug('making %s', c_path)
        self._hdfs.make_dir(c_path)

        fs_g = os.walk(local_path)
        for dpath, dnames, fnames in fs_g:
            _, relative_path = dpath.split(local_path)
            for dname in dnames:
                if dname not in exclude:
                    c_path = canonicalize(
                        '%s/%s/%s' %
                        (remote_path, relative_path, dname))
                    logging.debug('making %s', c_path)
                    self._hdfs.make_dir(c_path)

            for fname in fnames:
                if fname not in exclude:
                    data = file(
                        canonicalize(
                            '%s/%s/%s' %
                            (local_path, relative_path, fname)), 'rb')
                    c_path = canonicalize(
                        '%s/%s/%s' %
                        (remote_path, relative_path, fname))
                    logging.debug('creating %s', c_path)
                    self._hdfs.create_file(c_path, data, overwrite=True)
                    data.close()

    def make_dir(self, path):

        logging.debug('make_dir: %s', path)

        self._hdfs.make_dir(canonicalize(path))

    def create_file(self, data, remote_file_path):

        logging.debug('create_file: %s', remote_file_path)

        sio = StringIO.StringIO(data)

        self._hdfs.create_file(
            canonicalize(remote_file_path),
            sio,
            overwrite=True)

    def append_file(self, data, remote_file_path):

        logging.debug('append to: %s', remote_file_path)

        self._hdfs.append_file(canonicalize(remote_file_path), data)


    def stream_file_to_disk(self, remote_file_path, local_file_path):
        chunk_size = 10*1024*1024
        offset = 0
        with open(local_file_path, 'wb') as dest_file:
            data = self._hdfs.read_file(canonicalize(remote_file_path), offset=offset, length=chunk_size)
            while True:
                dest_file.write(data)
                if len(data) < chunk_size:
                    break
                offset += chunk_size
                data = self._hdfs.read_file(canonicalize(remote_file_path), offset=offset, length=chunk_size)

    def read_file(self, remote_file_path):

        data = self._hdfs.read_file(canonicalize(remote_file_path))

        return data

    def remove(self, path, recursive=False):

        logging.debug('remove: %s', path)

        self._hdfs.delete_file_dir(canonicalize(path), recursive)
Author: pndaproject, Project: platform-deployment-manager, Lines: 88, Source: deployer_utils.py

Example 11: PyWebHdfsClient

# Required import: from pywebhdfs.webhdfs import PyWebHdfsClient [as alias]
# Or: from pywebhdfs.webhdfs.PyWebHdfsClient import create_file [as alias]
import ujson
from impala.dbapi import connect
import datetime
import re

hdfs_row = []
bad_str =  """?NaDDi?\\""" #

hdfs_row.append('blablabla')
hdfs_row.append(re.sub("""(\n|\t|\r)""", '?', bad_str))
hdfs_row.append('blablabla')

#try:
hdfs = PyWebHdfsClient(host='al1.zmeke.com', port=50070, user_name='k.kraynov')
#data = 'test,test,test'
hdfs.create_file('user/k.kraynov/test/test.txt', 'blabla')
#hdfs.delete_file_dir('user/k.kraynov/test.txt')
#hdfs.append_file('user/k.kraynov/test2.txt', data+'\n')
#hdfs.append_file('etl/500.txt', 'test')
#hdfs.make_dir('etl/stage/log_{0}')
#conn = connect(host='al1.zmeke.com', port=21050)
#cur = conn.cursor()
#cur.execute('show tables in stage;')
#cur.execute('REFRESH analytics.test')
#result = cur.fetchall()
#for i in result:
#print hdfs.list_dir('user/k.kraynov/')
#dir = hdfs.list_dir('etl/stage/log_102/2')
#for dir_file in dir['FileStatuses']['FileStatus']:
 #   print dir_file['pathSuffix']
#except
Author: kkrainov, Project: hadoop_etl, Lines: 33, Source: webhdfs.py

Example 12: len

# Required import: from pywebhdfs.webhdfs import PyWebHdfsClient [as alias]
# Or: from pywebhdfs.webhdfs.PyWebHdfsClient import create_file [as alias]
t=conn.table('anet')
while True:
  a_net=randint(1,255)
  ROW=t.row(str(a_net))
  if len(ROW) > 0:
    for key, value in ROW.items():
        if value != str(-1):
          START=randint(1,255)
          continue
  t.put(str(a_net),{'data:user':'thisnode'})
  print 'scanning the major '+str(a_net)+'.0.0.0/8 subnet'
  for bnet in range(0,256):
    if a_net==10:
       continue
    elif a_net==192 and bnet==168:
       continue
    elif a_net==172 and bnet==16:
       continue
    elif a_net==127:
       continue
    IPADDR=str(a_net)+'.'+str(bnet)+'.0.0/16'
    OFILE=str(a_net)+'-'+str(bnet)+'-p80.log'
    A=subprocess.Popen(['masscan','-p80','-oG',OFILE,IPADDR,'--rate=2000'])
    A.wait()
    time.sleep(2)
    HADOOP_FILE_NAME='user/root/scans/'+str(a_net)+'/'+OFILE
    with open(OFILE) as ofp:
      hdfs.create_file(HADOOP_FILE_NAME,ofp)
    subprocess.Popen(['rm',OFILE])
  t.put(str(a_net),{'data:user':'-1'})
Author: rob-berkes, Project: pycraw, Lines: 32, Source: scan-for-sites.py

Example 13: save

# Required import: from pywebhdfs.webhdfs import PyWebHdfsClient [as alias]
# Or: from pywebhdfs.webhdfs.PyWebHdfsClient import create_file [as alias]
 def save(self):
   hdfs = PyWebHdfsClient(host=os.environ["WEBHDFS_HOST"], port='14000', user_name='oozie')
   coordinator_path = "{0}/{1}/coordinator.xml".format(self.path, self.name)
   hdfs.make_dir(self.path)
   hdfs.create_file(coordinator_path, self.as_xml())
Author: orenmazor, Project: oozie.py, Lines: 7, Source: coordinators.py

Example 14: BeautifulSoup

# Required import: from pywebhdfs.webhdfs import PyWebHdfsClient [as alias]
# Or: from pywebhdfs.webhdfs.PyWebHdfsClient import create_file [as alias]
    HTMLFILE=str(line[1])+'.htm'
    TEXTFILE=str(line[1])+'.txt'
    HADOOP_HTMLFILE='user/root/crawls/'+str(ANET)+'/'+str(BNET)+'/'+HTMLFILE
    HADOOP_TEXTFILE='user/root/texts/'+str(ANET)+'/'+str(BNET)+'/'+TEXTFILE
    print "-======= site: "+str(url)+" =======-"
    try:
      soup = BeautifulSoup(html)
    except:
      print " soup exception"
      continue
    HFP=open(HTMLFILE,'w')
    HFP.write(soup.encode('utf-8'))
    HFP.close()
    with open(HTMLFILE) as hfp:
      try:
        client.create_file(HADOOP_HTMLFILE,hfp)
      except:
        client.delete_file_dir(HADOOP_HTMLFILE)
        client.create_file(HADOOP_HTMLFILE,hfp)

    TFP=open(TEXTFILE,'w')
    WRITEOUT=unicode(soup.get_text())
    WORDLIST=re.sub(r'[^a-zA-Z0-9 ]',r' ',WRITEOUT)
    WORDLIST=WORDLIST.strip().split()
    TFP.write(WRITEOUT.encode('utf-8'))
    TFP.close()
    PAGETITLE=''
    try:
      PAGETITLE=soup.title.string
    except:
      pass
Author: rob-berkes, Project: pycraw, Lines: 33, Source: crawl-and-index-sites.py

Example 15: save

# Required import: from pywebhdfs.webhdfs import PyWebHdfsClient [as alias]
# Or: from pywebhdfs.webhdfs.PyWebHdfsClient import create_file [as alias]
 def save(self, workflow_name="workflow.xml"):
     hdfs = PyWebHdfsClient(host=os.environ["WEBHDFS_HOST"], port='14000', user_name='oozie')
     workflow_path = "{0}/{1}/workflow.xml".format(self.path, self.name)
     hdfs.make_dir(self.path)
     hdfs.create_file(workflow_path, self.as_xml())
Author: orenmazor, Project: oozie.py, Lines: 7, Source: workflow.py


Note: The pywebhdfs.webhdfs.PyWebHdfsClient.create_file examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are selected from open-source projects contributed by their original authors, and copyright remains with those authors; consult each project's license before distributing or reusing the code. Do not republish without permission.