This article collects typical usage examples of the Java class org.apache.hadoop.hive.serde2.ColumnProjectionUtils. If you are unsure what ColumnProjectionUtils is for or how to use it, the curated class examples below should help.
The ColumnProjectionUtils class belongs to the org.apache.hadoop.hive.serde2 package. Fifteen code examples of the class are shown below, sorted by popularity by default.
Example 1: initialize
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils; // import the required package/class
@Override
public void initialize( final Configuration conf, final Properties table , final Properties part ) throws SerDeException{
LOG.info( table.toString() );
if( part != null ){
LOG.info( part.toString() );
}
String columnNameProperty = table.getProperty(serdeConstants.LIST_COLUMNS);
String columnTypeProperty = table.getProperty(serdeConstants.LIST_COLUMN_TYPES);
String projectionColumnNames = conf.get( ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR , "" );
StructTypeInfo rootType;
if( projectionColumnNames.isEmpty() ){
rootType = getAllReadTypeInfo( columnNameProperty , columnTypeProperty );
}
else{
rootType = getColumnProjectionTypeInfo( columnNameProperty , columnTypeProperty , projectionColumnNames );
}
inspector = MDSObjectInspectorFactory.craeteObjectInspectorFromTypeInfo( rootType );
}
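The projection that this initialize method reads back is normally placed on the job Configuration by the query planner before the SerDe is created. Below is a minimal sketch of that producer side; the column ids and names are made up for illustration, and appendReadColumns is the same overload used in Example 6.
import java.util.Arrays;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;

public class ProjectionSetupSketch {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    // request only columns 0 and 2 by id and name (illustrative values)
    ColumnProjectionUtils.appendReadColumns(conf, Arrays.asList(0, 2), Arrays.asList("id", "price"));
    // the SerDe above now sees a non-empty projection instead of the "" default
    System.out.println(conf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR)); // id,price
  }
}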
Example 2: HiveVectorizedReaderSetting
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils; // import the required package/class
public HiveVectorizedReaderSetting( final FileSplit split , final JobConf job , final HiveReaderSetting hiveReaderConfig ) throws IOException{
this.hiveReaderConfig = hiveReaderConfig;
rbCtx = Utilities.getVectorizedRowBatchCtx( job );
partitionValues = new Object[rbCtx.getPartitionColumnCount()];
if( 0 < partitionValues.length ){
rbCtx.getPartitionValues( rbCtx, job, split, partitionValues );
}
TypeInfo[] typeInfos = rbCtx.getRowColumnTypeInfos();
columnNames = rbCtx.getRowColumnNames();
needColumnIds = createNeedColumnId( ColumnProjectionUtils.getReadColumnIDs( job ) );
projectionColumn = new boolean[columnNames.length];
assignors = new IColumnVectorAssignor[columnNames.length];
for( int id : needColumnIds ){
projectionColumn[id] = true;
assignors[id] = ColumnVectorAssignorFactory.create( typeInfos[id] );
}
}
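createNeedColumnId is not shown in this example. Assuming it simply normalizes the id list returned by ColumnProjectionUtils.getReadColumnIDs (dropping nulls, duplicates and negative placeholder ids), a hypothetical stand-in could look like this:
import java.util.List;
import java.util.Set;
import java.util.TreeSet;

final class NeedColumnIdSketch {
  // hypothetical stand-in for createNeedColumnId -- not the project's actual implementation
  static int[] createNeedColumnId(final List<Integer> ids) {
    Set<Integer> unique = new TreeSet<>();
    for (Integer id : ids) {
      if (id != null && id >= 0) { // ignore null or negative placeholder ids
        unique.add(id);
      }
    }
    int[] result = new int[unique.size()];
    int i = 0;
    for (int id : unique) {
      result[i++] = id;
    }
    return result;
  }
}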
Example 3: readAndAssertOnEmptyCols
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils; // import the required package/class
/**
* Read using record reader and assert that the columns not requested have 0 length.
* <p>
* @param conf the reader configuration -- must have the region name
* @param split the input-split containing the records to be read
* @param readColIds the comma-separated ids of the columns to retrieve
* @param predicates the predicates to filter out unwanted results
* @return total number of records read
*/
private long readAndAssertOnEmptyCols(final Configuration conf, final InputSplit split,
final String readColIds, final Filter[] predicates) throws IOException{
MonarchRecordReader mrr = new MonarchRecordReader(conf);
FilterList filterList = new FilterList(FilterList.Operator.MUST_PASS_ALL);
if (predicates != null) {
for (int i = 0; i < predicates.length; i++) {
filterList.addFilter(predicates[i]);
}
mrr.pushDownfilters = filterList;
}
// mrr.readColIds = readColIds;
/*List<Integer> readColIdList = readColIds == null ? Collections.emptyList() :
Arrays.stream(readColIds.split(",")).mapToInt(Integer::valueOf)
.collect(ArrayList::new, ArrayList::add, ArrayList::addAll);*/
List<Integer> readColIdList = ColumnProjectionUtils.getReadColumnIDs(conf);
long size = 0;
try {
mrr.initialize(split, conf);
Writable key = mrr.createKey();
Writable value = mrr.createValue();
while (mrr.next(key, value)) {
BytesRefArrayWritable braw = (BytesRefArrayWritable) value;
/** assert that skipped (not read) columns have 0 length **/
for (int i = 0; i < braw.size(); i++) {
if (!readColIdList.isEmpty() && !readColIdList.contains(i)) {
assertEquals(0, braw.get(i).getLength());
}
}
++size;
}
mrr.close();
} catch (IOException e) {
e.printStackTrace();
}
return size;
}
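Since the helper above takes its projection from the Configuration via getReadColumnIDs rather than from the readColIds string, a caller is expected to register the ids up front. An illustrative fragment from inside the same test class (the chosen ids are an assumption):
// register the projection before invoking the helper
Configuration conf = new Configuration();
ColumnProjectionUtils.appendReadColumns(conf, Arrays.asList(0, 3)); // ids are illustrative
// getReadColumnIDs(conf) inside readAndAssertOnEmptyCols now returns [0, 3], so every
// other column of each returned row is expected to come back with zero length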
Example 4: getIncludeColumns
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils; // import the required package/class
private void getIncludeColumns(Configuration conf, Segment segment) {
List<ColumnSchema> segColSchemas = segment.schema().getColumns();
String columnNamesStr = conf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR);
if (ColumnProjectionUtils.isReadAllColumns(conf) ||
columnNamesStr == null) {
projectCols = new ColumnSchema[segColSchemas.size()];
projectColIds = new int[segColSchemas.size()];
for (int i = 0; i < segColSchemas.size(); i++) {
projectCols[i] = segColSchemas.get(i);
projectColIds[i] = i;
}
} else {
String[] ss = Strings.isEmpty(columnNamesStr.trim()) ? new String[]{} : columnNamesStr.split(",");
projectCols = new ColumnSchema[ss.length];
projectColIds = new int[ss.length];
for (int i = 0; i < ss.length; i++) {
String col = ss[i];
int colId = Trick.indexFirst(segColSchemas, c -> c.getName().equalsIgnoreCase(col));
//Preconditions.checkState(colId >= 0, String.format("Column [%s] not found in segment [%s]", col, segment.name()));
if (colId < 0) {
projectCols[i] = null;
projectColIds[i] = -1;
} else {
projectCols[i] = segColSchemas.get(colId);
projectColIds[i] = colId;
}
}
}
}
Example 5: initProperties
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils; // import the required package/class
private static void initProperties(
Properties props,
Configuration conf,
List<HCatTableColumn> columns,
List<OutputColumnSpec> outputColumns) throws Exception {
String colNames = "";
String colTypes = "";
for (HCatTableColumn colInfo : columns) {
colNames += colInfo.getName() + ",";
colTypes += colInfo.getDataType() + ",";
}
if (colNames.length() > 0)
colNames = colNames.substring(0, colNames.length() - 1);
if (colTypes.length() > 0)
colTypes = colTypes.substring(0, colTypes.length() - 1);
props.put(serdeConstants.LIST_COLUMNS, colNames);
props.put(serdeConstants.LIST_COLUMN_TYPES, colTypes);
props.put(serdeConstants.SERIALIZATION_NULL_FORMAT, "NULL");
// Fix for Avro (NullPointerException if null)
if (props.getProperty("columns.comments") == null) {
props.put("columns.comments", "");
}
// Pushdown projection if we don't need all columns
Set<Integer> requiredColumns = new HashSet<>();
for (OutputColumnSpec spec : outputColumns) {
if (spec.getColumnPosition() < columns.size()) {
requiredColumns.add(spec.getColumnPosition());
}
}
if (requiredColumns.size() < columns.size()) {
ColumnProjectionUtils.appendReadColumns(conf, new ArrayList<>(requiredColumns));
}
}
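The appendReadColumns call above only writes a couple of entries into the job Configuration. A small sketch of what a stock Hive ColumnProjectionUtils leaves behind; the expected values match the behaviour exercised by the tests in Examples 7 to 9.
import java.util.Arrays;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;

public class AppendReadColumnsSketch {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    ColumnProjectionUtils.appendReadColumns(conf, Arrays.asList(1, 3));
    // projection switches the reader out of "read all columns" mode...
    System.out.println(conf.getBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, true)); // false
    // ...and records the requested ids as a comma-separated list
    System.out.println(conf.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR)); // 1,3
  }
}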
Example 6: setReadColumns
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils; // import the required package/class
/**
* Sets which fields are to be read from the ORC file
*/
static void setReadColumns(Configuration conf, StructTypeInfo actualStructTypeInfo) {
StructTypeInfo readStructTypeInfo = getTypeInfo(conf);
LOG.info("Read StructTypeInfo: {}", readStructTypeInfo);
List<Integer> ids = new ArrayList<>();
List<String> names = new ArrayList<>();
List<String> readNames = readStructTypeInfo.getAllStructFieldNames();
List<String> actualNames = actualStructTypeInfo.getAllStructFieldNames();
for (int i = 0; i < actualNames.size(); i++) {
String actualName = actualNames.get(i);
if (readNames.contains(actualName)) {
// make sure they are the same type
TypeInfo actualTypeInfo = actualStructTypeInfo.getStructFieldTypeInfo(actualName);
TypeInfo readTypeInfo = readStructTypeInfo.getStructFieldTypeInfo(actualName);
if (!actualTypeInfo.equals(readTypeInfo)) {
throw new IllegalStateException("readTypeInfo [" + readTypeInfo + "] does not match actualTypeInfo ["
+ actualTypeInfo + "]");
}
// mark the column as to-be-read
ids.add(i);
names.add(actualName);
}
}
if (ids.size() == 0) {
throw new IllegalStateException("None of the selected columns were found in the ORC file.");
}
LOG.info("Set column projection on columns: {} ({})", ids, names);
ColumnProjectionUtils.appendReadColumns(conf, ids, names);
}
Example 7: setInputReadColumnProjection
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils; // import the required package/class
@Test
public void setInputReadColumnProjection() {
StructTypeInfo typeInfo = new StructTypeInfoBuilder()
.add("a", TypeInfoFactory.stringTypeInfo)
.add("b", TypeInfoFactory.longTypeInfo)
.build();
conf.set(CorcInputFormat.INPUT_TYPE_INFO, "struct<a:string>");
CorcInputFormat.setReadColumns(conf, typeInfo);
assertThat(conf.getBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, true), is(false));
assertThat(conf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR), is("a"));
assertThat(conf.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR), is("0"));
}
Example 8: setInputReadColumnsAll
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils; // import the required package/class
@Test
public void setInputReadColumnsAll() {
StructTypeInfo typeInfo = new StructTypeInfoBuilder()
.add("a", TypeInfoFactory.stringTypeInfo)
.add("b", TypeInfoFactory.longTypeInfo)
.build();
conf.set(CorcInputFormat.INPUT_TYPE_INFO, "struct<a:string,b:bigint>");
CorcInputFormat.setReadColumns(conf, typeInfo);
assertThat(conf.getBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, true), is(false));
assertThat(conf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR), is("a,b"));
assertThat(conf.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR), is("0,1"));
}
Example 9: setInputReadColumnsMissing
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils; // import the required package/class
@Test
public void setInputReadColumnsMissing() {
StructTypeInfo typeInfo = new StructTypeInfoBuilder()
.add("a", TypeInfoFactory.stringTypeInfo)
.add("b", TypeInfoFactory.longTypeInfo)
.build();
conf.set(CorcInputFormat.INPUT_TYPE_INFO, "struct<a:string,b:bigint,c:string>");
CorcInputFormat.setReadColumns(conf, typeInfo);
assertThat(conf.getBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, true), is(false));
assertThat(conf.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR), is("0,1"));
}
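The three tests above only exercise the projected path. For the opposite case, a fresh Configuration defaults to reading every column; the sketch below assumes the Hive version in use also provides setReadAllColumns alongside the isReadAllColumns check used in Example 15.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;

public class ReadAllColumnsSketch {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    // nothing projected yet, so readers fall back to reading every column
    System.out.println(ColumnProjectionUtils.isReadAllColumns(conf)); // true
    // after a projection is appended the flag flips to false (see Examples 7-9);
    // setReadAllColumns (assumed available) restores the default explicitly
    ColumnProjectionUtils.setReadAllColumns(conf);
    System.out.println(ColumnProjectionUtils.isReadAllColumns(conf)); // true
  }
}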
Example 10: init
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils; // import the required package/class
/**
*
* It creates the readContext for Parquet side with the requested schema during the init phase.
*
* @param configuration needed to get the wanted columns
* @param keyValueMetaData unused
* @param fileSchema parquet file schema
* @return the parquet ReadContext
*/
@Override
public org.apache.parquet.hadoop.api.ReadSupport.ReadContext init(final Configuration configuration,
final Map<String, String> keyValueMetaData, final MessageType fileSchema) {
final String columns = configuration.get(IOConstants.COLUMNS);
final Map<String, String> contextMetadata = new HashMap<String, String>();
if (columns != null) {
final List<String> listColumns = getColumns(columns);
final List<Type> typeListTable = new ArrayList<Type>();
for (final String col : listColumns) {
// listColumns contains partition columns which are metadata only
if (fileSchema.containsField(col)) {
typeListTable.add(fileSchema.getType(col));
} else {
// below allows schema evolution
typeListTable.add(new PrimitiveType(Repetition.OPTIONAL, PrimitiveTypeName.BINARY, col));
}
}
MessageType tableSchema = new MessageType(TABLE_SCHEMA, typeListTable);
contextMetadata.put(HIVE_SCHEMA_KEY, tableSchema.toString());
MessageType requestedSchemaByUser = tableSchema;
final List<Integer> indexColumnsWanted = ColumnProjectionUtils.getReadColumnIDs(configuration);
final List<Type> typeListWanted = new ArrayList<Type>();
for (final Integer idx : indexColumnsWanted) {
typeListWanted.add(tableSchema.getType(listColumns.get(idx)));
}
requestedSchemaByUser = resolveSchemaAccess(new MessageType(fileSchema.getName(),
typeListWanted), fileSchema, configuration);
return new ReadContext(requestedSchemaByUser, contextMetadata);
} else {
contextMetadata.put(HIVE_SCHEMA_KEY, fileSchema.toString());
return new ReadContext(fileSchema, contextMetadata);
}
}
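getColumns is not shown in this snippet. In Hive it essentially turns the comma-separated IOConstants.COLUMNS value into a list of column names (it also strips virtual columns, which is omitted here); a simplified stand-in:
import java.util.ArrayList;
import java.util.List;

final class GetColumnsSketch {
  // simplified stand-in for getColumns(); the real helper also removes virtual columns
  static List<String> getColumns(final String columnsProperty) {
    List<String> names = new ArrayList<>();
    for (String name : columnsProperty.split(",")) {
      if (!name.isEmpty()) {
        names.add(name);
      }
    }
    return names;
  }
}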
Example 11: setLocation
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils; // import the required package/class
@Override
public void setLocation(String location, Job job) throws IOException {
Properties p = UDFContext.getUDFContext().getUDFProperties(this.getClass());
if (!UDFContext.getUDFContext().isFrontend()) {
typeInfo = (TypeInfo)ObjectSerializer.deserialize(p.getProperty(signature + SchemaSignatureSuffix));
} else if (typeInfo == null) {
typeInfo = getTypeInfo(location, job);
}
if (typeInfo != null && oi == null) {
oi = OrcStruct.createObjectInspector(typeInfo);
}
if (!UDFContext.getUDFContext().isFrontend()) {
if (p.getProperty(signature + RequiredColumnsSuffix) != null) {
mRequiredColumns = (boolean[]) ObjectSerializer.deserialize(p
.getProperty(signature + RequiredColumnsSuffix));
job.getConfiguration().setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false);
job.getConfiguration().set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR,
getReqiredColumnIdString(mRequiredColumns));
if (p.getProperty(signature + SearchArgsSuffix) != null) {
// Bug in setSearchArgument which always expects READ_COLUMN_NAMES_CONF_STR to be set
job.getConfiguration().set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR,
getReqiredColumnNamesString(getSchema(location, job), mRequiredColumns));
}
} else if (p.getProperty(signature + SearchArgsSuffix) != null) {
// Bug in setSearchArgument which always expects READ_COLUMN_NAMES_CONF_STR to be set
job.getConfiguration().set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR,
getReqiredColumnNamesString(getSchema(location, job)));
}
if (p.getProperty(signature + SearchArgsSuffix) != null) {
job.getConfiguration().set(SARG_PUSHDOWN, p.getProperty(signature + SearchArgsSuffix));
}
}
FileInputFormat.setInputPaths(job, location);
}
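getReqiredColumnIdString and getReqiredColumnNamesString are loader-specific helpers that are not shown here. Assuming they simply join the selected positions (or the matching field names) with commas, which is the format the ColumnProjectionUtils keys carry, a hypothetical sketch of the id variant:
// hypothetical sketch -- the loader's real helper also consults the Pig schema
private static String getReqiredColumnIdString(boolean[] requiredColumns) {
  StringBuilder sb = new StringBuilder();
  for (int i = 0; i < requiredColumns.length; i++) {
    if (requiredColumns[i]) {
      if (sb.length() > 0) {
        sb.append(',');
      }
      sb.append(i);
    }
  }
  return sb.toString(); // e.g. "0,2,5"
}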
Example 12: findIncludedColumns
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils; // import the required package/class
/**
* Take the configuration and figure out which columns we need to include.
* @param types the types of the file
* @param conf the configuration
* @return true for each column that should be included
*/
private static boolean[] findIncludedColumns(List<OrcProto.Type> types,
Configuration conf) {
String includedStr =
conf.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR);
if (includedStr == null || includedStr.trim().length() == 0) {
return null;
} else {
int numColumns = types.size();
boolean[] result = new boolean[numColumns];
result[0] = true;
OrcProto.Type root = types.get(0);
List<Integer> included = ColumnProjectionUtils.getReadColumnIDs(conf);
for(int i=0; i < root.getSubtypesCount(); ++i) {
if (included.contains(i)) {
includeColumnRecursive(types, result, root.getSubtypes(i));
}
}
// if we are filtering at least one column, return the boolean array
for(boolean include: result) {
if (!include) {
return result;
}
}
return null;
}
}
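includeColumnRecursive is not shown above; it marks a column and all of its nested subtypes as included so that complex types (structs, lists, maps) are read in full. A sketch following the OrcProto.Type structure used in this example:
// sketch of the recursive marking used by findIncludedColumns()
private static void includeColumnRecursive(List<OrcProto.Type> types, boolean[] result, int typeId) {
  result[typeId] = true;
  OrcProto.Type type = types.get(typeId);
  for (int i = 0; i < type.getSubtypesCount(); ++i) {
    includeColumnRecursive(types, result, type.getSubtypes(i));
  }
}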
Example 13: HiveReaderSetting
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils; // import the required package/class
public HiveReaderSetting( final FileSplit split, final JobConf job ){
config = new Configuration();
disableSkipBlock = job.getBoolean( "mds.disable.block.skip" , false );
disableFilterPushdown = job.getBoolean( "mds.disable.filter.pushdown" , false );
Set<String> pathNameSet= createPathSet( split.getPath() );
List<ExprNodeGenericFuncDesc> filterExprs = new ArrayList<ExprNodeGenericFuncDesc>();
String filterExprSerialized = job.get( TableScanDesc.FILTER_EXPR_CONF_STR );
if( filterExprSerialized != null ){
filterExprs.add( SerializationUtilities.deserializeExpression(filterExprSerialized) );
}
MapWork mapWork;
try{
mapWork = Utilities.getMapWork(job);
}catch( Exception e ){
mapWork = null;
}
if( mapWork == null ){
node = createExpressionNode( filterExprs );
isVectorModeFlag = false;
return;
}
node = createExpressionNode( filterExprs );
for( Map.Entry<String,PartitionDesc> pathsAndParts: mapWork.getPathToPartitionInfo().entrySet() ){
if( ! pathNameSet.contains( pathsAndParts.getKey() ) ){
continue;
}
Properties props = pathsAndParts.getValue().getTableDesc().getProperties();
if( props.containsKey( "mds.expand" ) ){
config.set( "spread.reader.expand.column" , props.getProperty( "mds.expand" ) );
}
if( props.containsKey( "mds.flatten" ) ){
config.set( "spread.reader.flatten.column" , props.getProperty( "mds.flatten" ) );
}
}
config.set( "spread.reader.read.column.names" , createReadColumnNames( job.get( ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR , null ) ) );
// Next Hive version:
// Utilities.getUseVectorizedInputFileFormat(job)
isVectorModeFlag = Utilities.isVectorMode( job );
}
Example 14: initialize
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils; // import the required package/class
public void initialize(final InputSplit split, final Configuration conf) throws IOException {
this.startTime = System.currentTimeMillis();
/** batch size for reading multiple records together **/
batchSize = NumberUtils.toInt(conf.get(MonarchUtils.MONARCH_BATCH_SIZE), MonarchUtils.MONARCH_BATCH_SIZE_DEFAULT);
final MonarchSplit ms = (MonarchSplit) split;
this.readColIds = ColumnProjectionUtils.getReadColumnIDs(conf);
final String expression = conf.get(TableScanDesc.FILTER_EXPR_CONF_STR);
if (expression != null && columns != null) {
this.pushDownfilters = getPushDownFilters(expression, columns);
if (this.pushDownfilters != null) {
for (Filter mFilter : pushDownfilters.getFilters()) {
logger.info("Pushing filter= {}", mFilter);
int id=-1;
for(int i=0; i< columns.length; i++){
if(columns[i].equalsIgnoreCase(((SingleColumnValueFilter) mFilter).getColumnNameString())) {
id =i;
break;
}
}
if (!readColIds.contains(id) && readColIds.size() > 0 && id != -1) {
readColIds.add(id);
}
}
}
}
/** create the scan with required parameters.. **/
Scan scan = new Scan();
scan.setBucketIds(ms.getBucketIds());
scan.setBatchSize(batchSize);
scan.setReturnKeysFlag(false);
final String str = conf.get(MonarchUtils.READ_FILTER_ON_LATEST_VERSION);
if (str != null) {
scan.setFilterOnLatestVersionOnly(Boolean.parseBoolean(str));
}
final boolean isOldestFirst = Boolean.parseBoolean(conf.get(MonarchUtils.READ_OLDEST_FIRST));
final int maxVersions = NumberUtils.toInt(conf.get(MonarchUtils.READ_MAX_VERSIONS), 1);
scan.setMaxVersions(maxVersions, isOldestFirst);
if (pushDownfilters != null) {
scan.setFilter(pushDownfilters);
}
scan.setBucketToServerMap(ms.getBucketToServerMap());
Collections.sort(this.readColIds);
if (! readColIds.isEmpty()) {
scan.setColumns(readColIds);
}
logger.info("Retrieving columns= {}", scan.getColumns());
this.mResultScanner = this.anyTable.getScanner(scan);
this.valueIterator = this.mResultScanner.iterator();
if (logger.isDebugEnabled()) {
logger.debug("{} - Initialize MonarchRecordReader: batchSize= {}, split= {}", new Date(), batchSize, ms);
}
}
Example 15: initialize
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils; // import the required package/class
@Override
@SuppressWarnings("unchecked")
public void initialize(Configuration conf, Properties tbl) throws SerDeException {
String[] cols = tbl.getProperty("columns").split(",");
String types = tbl.getProperty("columns.types");
if (types == null) {
types = Collections.nCopies(cols.length, "string").stream().collect(Collectors.joining(","));
}
this.columnList = Arrays.asList(cols);
this.typeInfoList = TypeInfoUtils.getTypeInfosFromTypeString(types);
/** initialize storage for fields **/
int size = columnList.size();
field = new BytesRefWritable[size];
for (int i = 0; i < size; i++) {
field[i] = new BytesRefWritable();
serializeCache.set(i, field[i]);
}
serializedSize = 0;
/** the columns to skip **/
List<Integer> notSkipIDs = new ArrayList<>();
if(conf != null && !ColumnProjectionUtils.isReadAllColumns(conf)) {
notSkipIDs = ColumnProjectionUtils.getReadColumnIDs(conf);
} else {
for(int i = 0; i < typeInfoList.size(); ++i) {
notSkipIDs.add(i);
}
}
/**
* create the object inspector for row.. use native Java object inspectors for
* the objects for which deserialization is done by us and not Hive.
* Cache Monarch object types as well.. for all rows (serialize)..
*/
List<ObjectInspector> oiList = new ArrayList<>(columnList.size());
this.objectTypeList = new ArrayList<>(columnList.size());
for (final TypeInfo ti : typeInfoList) {
DataType type = null;
try {
type = MonarchPredicateHandler.getMonarchFieldType(ti.getTypeName());
} catch (Exception e) {
//
}
if (type != null) {
oiList.add(TypeInfoUtils.getStandardJavaObjectInspectorFromTypeInfo(ti));
} else {
oiList.add(LazyBinaryUtils.getLazyBinaryObjectInspectorFromTypeInfo(ti));
}
this.objectTypeList.add(type);
}
this.rowOI = ObjectInspectorFactory.getColumnarStructObjectInspector(columnList, oiList);
/** Initialize the lazy structure for on-demand de-serialization **/
this.cachedLazyStruct = new MonarchColumnarStruct(rowOI, notSkipIDs);
}
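Taken together, most of the readers above follow the same two-step pattern: check isReadAllColumns first and only then consult getReadColumnIDs. A minimal consolidated sketch (the method name is ours, not part of any of the projects shown):
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;

public class ReaderProjectionSketch {
  // resolve which column ids a record reader or SerDe should materialize
  static List<Integer> resolveProjection(Configuration conf, int totalColumns) {
    if (conf == null || ColumnProjectionUtils.isReadAllColumns(conf)) {
      List<Integer> all = new ArrayList<>(totalColumns);
      for (int i = 0; i < totalColumns; i++) {
        all.add(i);
      }
      return all;
    }
    return ColumnProjectionUtils.getReadColumnIDs(conf);
  }
}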