This article collects and summarizes typical usage examples of the Python method pywebhdfs.webhdfs.PyWebHdfsClient.create_file. If you are unsure what PyWebHdfsClient.create_file does or how to use it, the curated code samples below may help. You can also explore further usage of the containing class, pywebhdfs.webhdfs.PyWebHdfsClient.
The following 15 code examples of PyWebHdfsClient.create_file are shown, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
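Before the examples, here is a minimal sketch of the method itself. The host, port, user name and target path below are placeholders rather than values taken from any example on this page:

from pywebhdfs.webhdfs import PyWebHdfsClient

# Connect to the WebHDFS endpoint of a NameNode (placeholder host/port/user).
hdfs = PyWebHdfsClient(host='namenode.example.com', port='50070', user_name='hdfs')

# create_file performs the two-step WebHDFS CREATE: an initial PUT that is
# redirected to a DataNode, followed by a PUT that writes the data (the unit
# test in Example 7 exercises exactly this flow). The path is given relative
# to the HDFS root, without a leading slash.
hdfs.create_file('user/hdfs/example/hello.txt', 'hello world\n', overwrite=True)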
Example 1: update_raw_stage

# Required import: from pywebhdfs.webhdfs import PyWebHdfsClient [as alias]
# Or: from pywebhdfs.webhdfs.PyWebHdfsClient import create_file [as alias]
def update_raw_stage(output, delivery_tag):
    #context = zmq.Context()
    #confirm = context.socket(zmq.PUSH)
    #confirm.connect(confirm_host)
    hdfs = PyWebHdfsClient(host=webhdfs_host, port=webhdfs_port, user_name=webhdfs_user)
    impala_conn = connect(host=impala_host, port=int(impala_port))
    cur = impala_conn.cursor()
    start_time = time.time()
    for k, v in output.iteritems():
        if (time.time() - start_time) / 60 > sink_minutes:
            sink_logger.warning('ETL process running longer than sink timeout: {0} minutes'.format((time.time() - start_time) / 60))
        # Append to the raw log file; if it does not exist yet, create it first.
        try:
            file_name = 'user/impala/test_log_1/raw_log_{0}.txt'.format(k)
            hdfs.append_file(file_name, '\n'.join(v))
            cur.execute('refresh test_log_{0}'.format(k))
        except hdfs_err.PyWebHdfsException:
            file_name = 'user/impala/test_log_1/raw_log_{0}.txt'.format(k)
            hdfs.create_file(file_name, '')
            hdfs.append_file(file_name, '\n'.join(v))
            cur.execute('refresh test_log_{0}'.format(k))
    #confirm.send(delivery_tag)
    sink_logger.info('ETL process finished in {0} minutes'.format((time.time() - start_time) / 60))
    sink_logger.info('ETL process finished with {0} delivery_tag'.format(delivery_tag))
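The append-then-create-on-failure pattern used above can be distilled into a small helper. This is only a sketch of the same idea, assuming the pywebhdfs client and an arbitrary target path; the helper name is hypothetical:

from pywebhdfs.webhdfs import PyWebHdfsClient
from pywebhdfs import errors as hdfs_err

def append_or_create(hdfs, path, payload):
    # Try to append to an existing HDFS file; if the file does not exist yet,
    # create it empty and then append the payload.
    try:
        hdfs.append_file(path, payload)
    except hdfs_err.PyWebHdfsException:
        hdfs.create_file(path, '')
        hdfs.append_file(path, payload)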
Example 2: setup_common_oozie_libs

# Required import: from pywebhdfs.webhdfs import PyWebHdfsClient [as alias]
# Or: from pywebhdfs.webhdfs.PyWebHdfsClient import create_file [as alias]
def setup_common_oozie_libs(name_node):
    webhdfs_port = '14000'
    webhdfs_user = 'hdfs'
    platform_dir = 'user/deployment/platform'
    lib_path_list = ['/usr/hdp/current/hbase-client/lib/hbase-client.jar',
                     '/usr/hdp/current/hbase-client/lib/hbase-common.jar',
                     '/usr/hdp/current/hbase-client/lib/hbase-protocol.jar',
                     '/usr/hdp/current/hbase-client/lib/hbase-server.jar',
                     '/usr/hdp/current/hbase-client/lib/htrace-core-3.1.0-incubating.jar',
                     '/usr/hdp/current/hbase-client/lib/hbase-hadoop-compat.jar',
                     '/usr/hdp/current/hbase-client/lib/hbase-it.jar',
                     '/usr/hdp/current/hbase-client/lib/hbase-prefix-tree.jar',
                     '/usr/hdp/current/hbase-client/lib/zookeeper.jar',
                     '/usr/hdp/current/pig-client/piggybank.jar',
                     '/usr/hdp/current/spark-client/lib/spark-examples.jar']
    # Set up a connection with HDFS using the namenode.
    hdfs_client = PyWebHdfsClient(host=name_node, port=webhdfs_port, user_name=webhdfs_user, timeout=None)
    # Create the directory on the Hadoop file system (HDFS).
    hdfs_client.make_dir(platform_dir)
    # Create a new file on HDFS and write the contents from the local FS.
    for path in lib_path_list:
        platform_file = '%s/%s' % (platform_dir, os.path.basename(path))
        print 'Copying source file: %s to HDFS path %s' % (path, platform_file)
        with open(path) as file_data:
            try:
                hdfs_client.create_file(platform_file, file_data, overwrite=True)
            except PyWebHdfsException:
                print 'retrying HDFS copy command for %s' % platform_file
                time.sleep(5)
                hdfs_client.create_file(platform_file, file_data, overwrite=True)
Example 3: setup_common_oozie_libs

# Required import: from pywebhdfs.webhdfs import PyWebHdfsClient [as alias]
# Or: from pywebhdfs.webhdfs.PyWebHdfsClient import create_file [as alias]
def setup_common_oozie_libs(name_node):
    webhdfs_port = '14000'
    webhdfs_user = 'hdfs'
    platform_dir = 'user/deployment/platform'
    lib_path_list = ['/opt/cloudera/parcels/CDH/lib/hbase/hbase-client.jar',
                     '/opt/cloudera/parcels/CDH/lib/hbase/hbase-common.jar',
                     '/opt/cloudera/parcels/CDH/lib/hbase/hbase-protocol.jar',
                     '/opt/cloudera/parcels/CDH/lib/hbase/hbase-server.jar',
                     '/opt/cloudera/parcels/CDH/lib/hbase/lib/htrace-core.jar',
                     '/opt/cloudera/parcels/CDH/lib/hbase/hbase-hadoop-compat.jar',
                     '/opt/cloudera/parcels/CDH/lib/hbase/hbase-it.jar',
                     '/opt/cloudera/parcels/CDH/lib/hbase/hbase-prefix-tree.jar',
                     '/opt/cloudera/parcels/CDH/lib/hbase/lib/zookeeper.jar',
                     '/opt/cloudera/parcels/CDH/lib/pig/piggybank.jar',
                     '/opt/cloudera/parcels/CDH/lib/spark/lib/spark-examples.jar']
    # Set up a connection with HDFS using the namenode.
    hdfs_client = PyWebHdfsClient(host=name_node, port=webhdfs_port, user_name=webhdfs_user, timeout=None)
    # Create the directory on the Hadoop file system (HDFS).
    hdfs_client.make_dir(platform_dir)
    # Create a new file on HDFS and write the contents from the local FS.
    for path in lib_path_list:
        platform_file = '%s/%s' % (platform_dir, os.path.basename(path))
        logging.info('Copying source file: %s to HDFS path %s', path, platform_file)
        with open(path) as file_data:
            hdfs_client.create_file(platform_file, file_data, overwrite=True)
Example 4: upload_file

# Required import: from pywebhdfs.webhdfs import PyWebHdfsClient [as alias]
# Or: from pywebhdfs.webhdfs.PyWebHdfsClient import create_file [as alias]
def upload_file():
    """
    Upload file
    ---
    tags:
      - Files
    consumes: "multipart/form-data"
    parameters:
      - name: file
        in: formData
        required: true
        paramType: body
        dataType: file
        type: file
    responses:
      200:
        description: Return a successful message
      401:
        description: Unauthorized
      400:
        description: Bad Request
      500:
        description: Server Internal error
    """
    # Hard-coded config information. You should improve it.
    hdfs = PyWebHdfsClient(host='webhdfs', port='50070', user_name='thanhson1085')
    if request.method == 'POST':
        file = request.files['file']
        if file and allowed_file(file.filename):
            filename = secure_filename(str(time.time()) + file.filename)
            my_file = 'tmp/thanhson1085/data/' + filename
            hdfs.create_file(my_file, file)
            return jsonify({'success': 'true'})
    return jsonify({'success': 'false'})
Example 5: submit

# Required import: from pywebhdfs.webhdfs import PyWebHdfsClient [as alias]
# Or: from pywebhdfs.webhdfs.PyWebHdfsClient import create_file [as alias]
def submit(self, bund, files=[]):
    hdfs = PyWebHdfsClient(host=os.environ["WEBHDFS_HOST"], port='14000', user_name='oozie')
    # Upload each bundle file to HDFS before submitting the Oozie job.
    for f in files:
        hdfs.create_file("{}/{}".format(bund.path, f.name), f.read())
    doc, tag, text = Doc().tagtext()
    with tag("configuration"):
        with tag("property"):
            with tag("name"):
                text("user.name")
            with tag("value"):
                text("oozie")
        with tag("property"):
            with tag("name"):
                text("oozie.bundle.application.path")
            with tag("value"):
                text("/" + bund.path + "/" + bund.name)
    configuration = doc.getvalue()
    response = post("{0}/oozie/v1/jobs".format(self.url), data=configuration, headers={'Content-Type': 'application/xml'})
    if response.status_code > 399:
        print response.headers["oozie-error-message"]
        print response.status_code
        print response.content
Example 6: ship_udf

# Required import: from pywebhdfs.webhdfs import PyWebHdfsClient [as alias]
# Or: from pywebhdfs.webhdfs.PyWebHdfsClient import create_file [as alias]
def ship_udf(ic, function, hdfs_path=None, udf_name=None, database=None,
             overwrite=False):
    # Extract some information from the function.
    if udf_name is None:
        udf_name = function.name
    symbol = function.llvm_func.name
    ir = function.llvm_module.to_bitcode()
    return_type = udf_to_impala_type[function.signature.return_type.name]
    arg_types = [udf_to_impala_type[arg.name]
                 for arg in function.signature.args[1:]]
    # Ship the IR to the cluster.
    hdfs_client = PyWebHdfsClient(host=ic._nn_host, port=ic._webhdfs_port,
                                  user_name=ic._hdfs_user)
    if hdfs_path is None:
        hdfs_path = os.path.join(ic._temp_dir, udf_name + '.ll')
    if not hdfs_path.endswith('.ll'):
        raise ValueError("The HDFS file name must end with .ll")
    hdfs_client.create_file(hdfs_path.lstrip('/'), ir, overwrite=overwrite)
    # Register the function in Impala.
    if database is None:
        database = ic._temp_db
    impala_name = '%s.%s(%s)' % (database, udf_name, ', '.join(arg_types))
    if overwrite:
        ic._cursor.execute("DROP FUNCTION IF EXISTS %s" % impala_name)
    register_query = "CREATE FUNCTION %s RETURNS %s LOCATION '%s' SYMBOL='%s'" % (
        impala_name, return_type, hdfs_path, symbol)
    ic._cursor.execute(register_query)
Example 7: WhenTestingCreateOperation

# Required import: from pywebhdfs.webhdfs import PyWebHdfsClient [as alias]
# Or: from pywebhdfs.webhdfs.PyWebHdfsClient import create_file [as alias]
class WhenTestingCreateOperation(unittest.TestCase):

    def setUp(self):
        self.host = 'hostname'
        self.port = '00000'
        self.user_name = 'username'
        self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                       user_name=self.user_name)
        self.response = MagicMock()
        self.requests = MagicMock(return_value=self.response)
        self.location = 'redirect_uri'
        self.path = 'user/hdfs'
        self.file_data = '010101'
        self.init_response = MagicMock()
        self.init_response.headers = {'location': self.location}
        self.response = MagicMock()
        self.expected_headers = {'content-type': 'application/octet-stream'}

    def test_create_throws_exception_for_no_redirect(self):
        self.init_response.status_code = httplib.BAD_REQUEST
        self.response.status_code = httplib.CREATED
        self.requests.put.side_effect = [self.init_response, self.response]
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            with self.assertRaises(errors.PyWebHdfsException):
                self.webhdfs.create_file(self.path, self.file_data)

    def test_create_throws_exception_for_not_created(self):
        self.init_response.status_code = httplib.TEMPORARY_REDIRECT
        self.response.status_code = httplib.BAD_REQUEST
        self.requests.put.side_effect = [self.init_response, self.response]
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            with self.assertRaises(errors.PyWebHdfsException):
                self.webhdfs.create_file(self.path, self.file_data)

    def test_create_returns_file_location(self):
        self.init_response.status_code = httplib.TEMPORARY_REDIRECT
        self.response.status_code = httplib.CREATED
        self.put_method = MagicMock(
            side_effect=[self.init_response, self.response])
        self.requests.put = self.put_method
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            result = self.webhdfs.create_file(self.path, self.file_data)
        self.assertTrue(result)
        self.put_method.assert_called_with(
            self.location, headers=self.expected_headers, data=self.file_data)
Example 8: create_data_from_station_data

# Required import: from pywebhdfs.webhdfs import PyWebHdfsClient [as alias]
# Or: from pywebhdfs.webhdfs.PyWebHdfsClient import create_file [as alias]
def create_data_from_station_data(first, second):
    """This function creates the data by analyzing the two stations in comparison."""
    global hdfs   # global hdfs object
    global hbase  # global hbase object
    if hdfs is None:
        from pywebhdfs.webhdfs import PyWebHdfsClient
        hdfs = PyWebHdfsClient(host='cshadoop.boisestate.edu', port='50070', user_name='uacharya')
    if hbase is None:
        import happybase
        hbase = happybase.ConnectionPool(size=1, host='cshadoop.boisestate.edu')
    date_for_comparision = first["Date"].strip()
    # Creating a directory for each date.
    try:
        hdfs.get_file_dir_status('user/uacharya/simulation/' + date_for_comparision)
    except Exception:
        # Directory to hold a dataset in a CSV file for each node in the wall display, numbered 1 to 9.
        for index in range(1, 10):
            content = 'Date,ID,Source,Destination,S_Lat,S_Lon,D_Lat,D_Lon,Wind_Lat,Wind_Lon,Wind_Velocity\n'
            try:
                hdfs.create_file('user/uacharya/simulation/' + date_for_comparision + '/node' + str(index) + '/output.csv', content, replication=1)
            except Exception:
                continue
    dataset = {'node_1': [], 'node_2': [], 'node_3': [], 'node_4': [], 'node_5': [],
               'node_6': [], 'node_7': [], 'node_8': [], 'node_9': []}
    for data in broadcast_variable.value:
        compare_data_between(date_for_comparision, first, data, dataset)
    # for key in dataset:
    #     if len(dataset[key]) != 0:
    #         content = "\n".join(dataset[key])
    #         content += "\n"
    #         while True:
    #             try:
    #                 hdfs.append_file('user/uacharya/simulation/'+date+'/'+key+'/output.csv', content, buffersize=4096)
    #                 break
    #             except Exception:
    #                 time.sleep(0.2)
    #                 continue
    dataset.clear()  # clearing the dictionary
    # Append over here after all the global variables have been made.
    return second
Example 9: from_pandas

# Required import: from pywebhdfs.webhdfs import PyWebHdfsClient [as alias]
# Or: from pywebhdfs.webhdfs.PyWebHdfsClient import create_file [as alias]
def from_pandas(ic, df, table=None, path=None, method='in_query',
                file_format='TEXTFILE', field_terminator='\t', line_terminator='\n',
                escape_char='\\',
                hdfs_host=None, webhdfs_port=50070, hdfs_user=None, overwrite=False):
    """Create a BDF by shipping an in-memory pandas `DataFrame` into Impala.

    `path` is the directory, not the filename.
    """
    # TODO: this is not atomic
    temp_table = _random_id('tmp_table_', 8)
    if table is None:
        table = "%s.%s" % (ic._temp_db, temp_table)
    if path is None:
        path = os.path.join(ic._temp_dir, temp_table)
    table_name = _to_TableName(table)
    if overwrite:
        ic._cursor.execute("DROP TABLE IF EXISTS %s" % table_name.to_sql())
    columns = list(df.columns)
    types = [_numpy_dtype_to_impala_PrimitiveType(ty) for ty in df.dtypes]
    schema = zip(columns, types)
    create_stmt = _create_table(table_name, schema, path=path,
                                file_format=file_format, field_terminator=field_terminator,
                                line_terminator=line_terminator, escape_char=escape_char)
    ic._cursor.execute(create_stmt)
    if method == 'in_query':
        query = "INSERT INTO %s VALUES " % table_name.to_sql()
        query += ', '.join(['(%s)' % ', '.join(map(_py_to_sql_string, row)) for row in df.values])
        ic._cursor.execute(query)
    elif method == 'webhdfs':
        if file_format != 'TEXTFILE':
            raise ValueError("only TEXTFILE format supported for webhdfs")
        if path is None:
            raise ValueError("must supply a path for EXTERNAL table for webhdfs")
        from pywebhdfs.webhdfs import PyWebHdfsClient
        hdfs_client = PyWebHdfsClient(host=hdfs_host, port=webhdfs_port,
                                      user_name=hdfs_user)
        raw_data = StringIO()
        df.to_csv(raw_data, sep=field_terminator, line_terminator=line_terminator,
                  quoting=csv.QUOTE_NONE, escapechar=escape_char, header=False, index=False)
        hdfs_client.create_file(os.path.join(path, 'data.txt').lstrip('/'),
                                raw_data.getvalue(), overwrite=overwrite)
        raw_data.close()
    else:
        raise ValueError("method must be 'in_query' or 'webhdfs'; got %s" % method)
    return from_sql_table(ic, table_name.to_sql())
Example 10: HDFS

# Required import: from pywebhdfs.webhdfs import PyWebHdfsClient [as alias]
# Or: from pywebhdfs.webhdfs.PyWebHdfsClient import create_file [as alias]
class HDFS(object):

    def __init__(self, host, port, user):
        self._hdfs = PyWebHdfsClient(
            host=host, port=port, user_name=user, timeout=None)
        logging.debug('webhdfs = %s@%s:%s', user, host, port)

    def recursive_copy(self, local_path, remote_path, exclude=None):
        if exclude is None:
            exclude = []
        c_path = canonicalize(remote_path)
        logging.debug('making %s', c_path)
        self._hdfs.make_dir(c_path)
        fs_g = os.walk(local_path)
        for dpath, dnames, fnames in fs_g:
            _, relative_path = dpath.split(local_path)
            for dname in dnames:
                if dname not in exclude:
                    c_path = canonicalize(
                        '%s/%s/%s' %
                        (remote_path, relative_path, dname))
                    logging.debug('making %s', c_path)
                    self._hdfs.make_dir(c_path)
            for fname in fnames:
                if fname not in exclude:
                    data = file(
                        canonicalize(
                            '%s/%s/%s' %
                            (local_path, relative_path, fname)), 'rb')
                    c_path = canonicalize(
                        '%s/%s/%s' %
                        (remote_path, relative_path, fname))
                    logging.debug('creating %s', c_path)
                    self._hdfs.create_file(c_path, data, overwrite=True)
                    data.close()

    def make_dir(self, path):
        logging.debug('make_dir: %s', path)
        self._hdfs.make_dir(canonicalize(path))

    def create_file(self, data, remote_file_path):
        logging.debug('create_file: %s', remote_file_path)
        sio = StringIO.StringIO(data)
        self._hdfs.create_file(
            canonicalize(remote_file_path),
            sio,
            overwrite=True)

    def append_file(self, data, remote_file_path):
        logging.debug('append to: %s', remote_file_path)
        self._hdfs.append_file(canonicalize(remote_file_path), data)

    def stream_file_to_disk(self, remote_file_path, local_file_path):
        chunk_size = 10 * 1024 * 1024
        offset = 0
        with open(local_file_path, 'wb') as dest_file:
            data = self._hdfs.read_file(canonicalize(remote_file_path), offset=offset, length=chunk_size)
            while True:
                dest_file.write(data)
                if len(data) < chunk_size:
                    break
                offset += chunk_size
                data = self._hdfs.read_file(canonicalize(remote_file_path), offset=offset, length=chunk_size)

    def read_file(self, remote_file_path):
        data = self._hdfs.read_file(canonicalize(remote_file_path))
        return data

    def remove(self, path, recursive=False):
        logging.debug('remove: %s', path)
        self._hdfs.delete_file_dir(canonicalize(path), recursive)
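The wrapper above hides the pywebhdfs details behind a small facade. A hypothetical usage sketch follows; the host, port, user and paths are placeholders, and canonicalize is assumed to be the project's own path-normalization helper:

# Placeholder connection details; not taken from the original project.
fs = HDFS('namenode.example.com', '50070', 'hdfs')
fs.make_dir('user/hdfs/example')
# Note the wrapper's argument order: the data comes first, then the remote path.
fs.create_file('hello world\n', 'user/hdfs/example/hello.txt')
fs.append_file('a second line\n', 'user/hdfs/example/hello.txt')
print(fs.read_file('user/hdfs/example/hello.txt'))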
Example 11: PyWebHdfsClient

# Required import: from pywebhdfs.webhdfs import PyWebHdfsClient [as alias]
# Or: from pywebhdfs.webhdfs.PyWebHdfsClient import create_file [as alias]
import ujson
from impala.dbapi import connect
import datetime
import re

hdfs_row = []
bad_str = """?NaDDi?\\"""
hdfs_row.append('blablabla')
hdfs_row.append(re.sub("""(\n|\t|\r)""", '?', bad_str))
hdfs_row.append('blablabla')

#try:
hdfs = PyWebHdfsClient(host='al1.zmeke.com', port=50070, user_name='k.kraynov')
#data = 'test,test,test'
hdfs.create_file('user/k.kraynov/test/test.txt', 'blabla')
#hdfs.delete_file_dir('user/k.kraynov/test.txt')
#hdfs.append_file('user/k.kraynov/test2.txt', data+'\n')
#hdfs.append_file('etl/500.txt', 'test')
#hdfs.make_dir('etl/stage/log_{0}')
#conn = connect(host='al1.zmeke.com', port=21050)
#cur = conn.cursor()
#cur.execute('show tables in stage;')
#cur.execute('REFRESH analytics.test')
#result = cur.fetchall()
#for i in result:
#print hdfs.list_dir('user/k.kraynov/')
#dir = hdfs.list_dir('etl/stage/log_102/2')
#for dir_file in dir['FileStatuses']['FileStatus']:
#    print dir_file['pathSuffix']
#except
Example 12: len

# Required import: from pywebhdfs.webhdfs import PyWebHdfsClient [as alias]
# Or: from pywebhdfs.webhdfs.PyWebHdfsClient import create_file [as alias]
t = conn.table('anet')
while True:
    # Pick a random /8, skipping any a_net that another node has already claimed in HBase.
    a_net = randint(1, 255)
    ROW = t.row(str(a_net))
    if len(ROW) > 0:
        for key, value in ROW.items():
            if value != str(-1):
                START = randint(1, 255)
        continue
    t.put(str(a_net), {'data:user': 'thisnode'})
    print 'scanning the major ' + str(a_net) + '.0.0.0/8 subnet'
    # Scan each /16 in the chosen /8 with masscan and upload the log to HDFS.
    for bnet in range(0, 256):
        if a_net == 10:
            continue
        elif a_net == 192 and bnet == 168:
            continue
        elif a_net == 172 and bnet == 16:
            continue
        elif a_net == 127:
            continue
        IPADDR = str(a_net) + '.' + str(bnet) + '.0.0/16'
        OFILE = str(a_net) + '-' + str(bnet) + '-p80.log'
        A = subprocess.Popen(['masscan', '-p80', '-oG', OFILE, IPADDR, '--rate=2000'])
        A.wait()
        time.sleep(2)
        HADOOP_FILE_NAME = 'user/root/scans/' + str(a_net) + '/' + OFILE
        with open(OFILE) as ofp:
            hdfs.create_file(HADOOP_FILE_NAME, ofp)
        subprocess.Popen(['rm', OFILE])
    t.put(str(a_net), {'data:user': '-1'})
Example 13: save

# Required import: from pywebhdfs.webhdfs import PyWebHdfsClient [as alias]
# Or: from pywebhdfs.webhdfs.PyWebHdfsClient import create_file [as alias]
def save(self):
    hdfs = PyWebHdfsClient(host=os.environ["WEBHDFS_HOST"], port='14000', user_name='oozie')
    coordinator_path = "{0}/{1}/coordinator.xml".format(self.path, self.name)
    hdfs.make_dir(self.path)
    hdfs.create_file(coordinator_path, self.as_xml())
Example 14: BeautifulSoup

# Required import: from pywebhdfs.webhdfs import PyWebHdfsClient [as alias]
# Or: from pywebhdfs.webhdfs.PyWebHdfsClient import create_file [as alias]
HTMLFILE = str(line[1]) + '.htm'
TEXTFILE = str(line[1]) + '.txt'
HADOOP_HTMLFILE = 'user/root/crawls/' + str(ANET) + '/' + str(BNET) + '/' + HTMLFILE
HADOOP_TEXTFILE = 'user/root/texts/' + str(ANET) + '/' + str(BNET) + '/' + TEXTFILE
print "-======= site: " + str(url) + " =======-"
try:
    soup = BeautifulSoup(html)
except:
    print " soup exception"
    continue
HFP = open(HTMLFILE, 'w')
HFP.write(soup.encode('utf-8'))
HFP.close()
# Upload the crawled HTML; if the file already exists, delete it and recreate it.
with open(HTMLFILE) as hfp:
    try:
        client.create_file(HADOOP_HTMLFILE, hfp)
    except:
        client.delete_file_dir(HADOOP_HTMLFILE)
        client.create_file(HADOOP_HTMLFILE, hfp)
TFP = open(TEXTFILE, 'w')
WRITEOUT = unicode(soup.get_text())
WORDLIST = re.sub(r'[^a-zA-Z0-9 ]', r' ', WRITEOUT)
WORDLIST = WORDLIST.strip().split()
TFP.write(WRITEOUT.encode('utf-8'))
TFP.close()
PAGETITLE = ''
try:
    PAGETITLE = soup.title.string
except:
    pass
Example 15: save

# Required import: from pywebhdfs.webhdfs import PyWebHdfsClient [as alias]
# Or: from pywebhdfs.webhdfs.PyWebHdfsClient import create_file [as alias]
def save(self, workflow_name="workflow.xml"):
    hdfs = PyWebHdfsClient(host=os.environ["WEBHDFS_HOST"], port='14000', user_name='oozie')
    workflow_path = "{0}/{1}/workflow.xml".format(self.path, self.name)
    hdfs.make_dir(self.path)
    hdfs.create_file(workflow_path, self.as_xml())