public class StatsUtils extends Object
Constructor and Description |
---|
StatsUtils() |
Modifier and Type | Method and Description |
---|---|
static Statistics |
collectStatistics(HiveConf conf,
PrunedPartitionList partList,
Table table,
List<ColumnInfo> schema,
List<String> neededColumns,
List<String> referencedColumns,
boolean fetchColStats,
boolean fetchPartStats) |
static Statistics |
collectStatistics(HiveConf conf,
PrunedPartitionList partList,
Table table,
TableScanOperator tableScanOperator)
Collect table, partition and column level statistics
|
static int |
estimateRowSizeFromSchema(HiveConf conf,
List<ColumnInfo> schema,
List<String> neededColumns) |
static long |
getAvailableMemory(org.apache.hadoop.conf.Configuration conf) |
static long |
getAvgColLenOfFixedLengthTypes(String colType)
Get size of fixed length primitives
|
static long |
getAvgColLenOfVariableLengthTypes(HiveConf conf,
ObjectInspector oi,
String colType)
Get the raw data size of variable length data types
|
static List<Long> |
getBasicStatForPartitions(Table table,
List<Partition> parts,
String statType)
Get basic stats of partitions
|
static long |
getBasicStatForTable(Table table,
String statType)
Get basic stats of table
|
static ColStatistics |
getColStatistics(ColumnStatisticsObj cso,
String tabName,
String colName)
Convert ColumnStatisticsObj to ColStatistics
|
static ColStatistics |
getColStatisticsFromExpression(HiveConf conf,
Statistics parentStats,
ExprNodeDesc end)
Get column statistics from expression nodes
|
static List<ColStatistics> |
getColStatisticsFromExprMap(HiveConf conf,
Statistics parentStats,
Map<String,ExprNodeDesc> colExprMap,
RowSchema rowSchema)
Get column statistics from parent statistics.
|
static List<ColStatistics> |
getColStatisticsUpdatingTableAlias(Statistics parentStats,
RowSchema rowSchema)
Get column statistics from parent statistics given the
row schema of its child.
|
static long |
getDataSizeFromColumnStats(long numRows,
List<ColStatistics> colStats)
Compute raw data size from column statistics
|
static List<Long> |
getFileSizeForPartitions(HiveConf conf,
List<Partition> parts)
Find the bytes on disk occupied by a list of partitions
|
static long |
getFileSizeForTable(HiveConf conf,
Table table)
Find the bytes on disk occupied by a table
|
static String |
getFullyQualifiedTableName(String dbName,
String tabName) |
static long |
getMaxIfOverflow(long val)
Negative numbers of rows or data sizes are invalid.
|
static int |
getNDVPartitionColumn(Set<Partition> partitions,
String partColName) |
static long |
getNumRows(Table table)
Get number of rows of a given table
|
static List<String> |
getQualifedReducerKeyNames(List<String> keyExprs)
Get qualified column name from output key column names
|
static long |
getRawDataSize(Table table)
Get raw data size of a given table
|
static float |
getScaledSelectivity(ColStatistics csPK,
ColStatistics csFK)
Scale selectivity based on key range ratio.
|
static long |
getSizeOfComplexTypes(HiveConf conf,
ObjectInspector oi)
Get the size of complex data types
|
static long |
getSizeOfMap(StandardConstantMapObjectInspector scmoi)
Estimate the size of a map object
|
static long |
getSizeOfPrimitiveTypeArraysFromType(String colType,
int length)
Get the size of arrays of primitive types
|
static long |
getSumIgnoreNegatives(List<Long> vals)
Get sum of all values in the list that are >0
|
static List<ColStatistics> |
getTableColumnStats(Table table,
List<ColumnInfo> schema,
List<String> neededColumns)
Get table level column statistics from metastore for needed columns
|
static long |
getTotalSize(Table table)
Get total size of a given table
|
static long |
getWritableSize(ObjectInspector oi,
Object value)
Get size of primitive data types based on their respective writable object inspector
|
static void |
inferAndSetPrimaryKey(long numRows,
List<ColStatistics> colStats)
Based on the provided column statistics and number of rows, this method infers if the column
can be a primary key.
|
static boolean |
inferForeignKey(ColStatistics csPK,
ColStatistics csFK)
Infer foreign key relationship from given column statistics.
|
static long |
safeAdd(long a,
long b)
Bounded addition - overflows become MAX_VALUE
|
static long |
safeMult(long a,
double b)
Bounded multiplication - overflows become MAX_VALUE
|
static long |
safeMult(long a,
long b)
Bounded multiplication - overflows become MAX_VALUE
|
public static Statistics collectStatistics(HiveConf conf, PrunedPartitionList partList, Table table, TableScanOperator tableScanOperator) throws HiveException
conf - hive configuration
partList - partition list
table - table
tableScanOperator - table scan operator
HiveException
public static Statistics collectStatistics(HiveConf conf, PrunedPartitionList partList, Table table, List<ColumnInfo> schema, List<String> neededColumns, List<String> referencedColumns, boolean fetchColStats, boolean fetchPartStats) throws HiveException
HiveException
public static void inferAndSetPrimaryKey(long numRows, List<ColStatistics> colStats)
numRows - number of rows
colStats - column statistics
public static boolean inferForeignKey(ColStatistics csPK, ColStatistics csFK)
csPK - column statistics of primary key
csFK - column statistics of potential foreign key
public static float getScaledSelectivity(ColStatistics csPK, ColStatistics csFK)
csPK - column statistics of primary key
csFK - column statistics of potential foreign key
public static int getNDVPartitionColumn(Set<Partition> partitions, String partColName)
public static int estimateRowSizeFromSchema(HiveConf conf, List<ColumnInfo> schema, List<String> neededColumns)
public static long getFileSizeForTable(HiveConf conf, Table table)
conf - hive conf
table - table
public static List<Long> getFileSizeForPartitions(HiveConf conf, List<Partition> parts)
conf - hive conf
parts - partition list
public static long getSumIgnoreNegatives(List<Long> vals)
vals - list of values
public static ColStatistics getColStatistics(ColumnStatisticsObj cso, String tabName, String colName)
cso - ColumnStatisticsObj
tabName - table name
colName - column name
public static List<ColStatistics> getTableColumnStats(Table table, List<ColumnInfo> schema, List<String> neededColumns)
table - table
schema - output schema
neededColumns - list of needed columns
public static long getAvgColLenOfVariableLengthTypes(HiveConf conf, ObjectInspector oi, String colType)
conf - hive conf
oi - object inspector
colType - column type
public static long getSizeOfComplexTypes(HiveConf conf, ObjectInspector oi)
conf - hive conf
oi - object inspector
public static long getAvgColLenOfFixedLengthTypes(String colType)
colType - column type
public static long getSizeOfPrimitiveTypeArraysFromType(String colType, int length)
colType - column type
length - array length
public static long getSizeOfMap(StandardConstantMapObjectInspector scmoi)
scmoi - object inspector
public static long getWritableSize(ObjectInspector oi, Object value)
oi - object inspector
value - value
public static List<ColStatistics> getColStatisticsFromExprMap(HiveConf conf, Statistics parentStats, Map<String,ExprNodeDesc> colExprMap, RowSchema rowSchema)
conf - hive conf
parentStats - parent statistics
colExprMap - column expression map
rowSchema - row schema
public static List<ColStatistics> getColStatisticsUpdatingTableAlias(Statistics parentStats, RowSchema rowSchema)
parentStats - parent statistics
rowSchema - row schema
public static ColStatistics getColStatisticsFromExpression(HiveConf conf, Statistics parentStats, ExprNodeDesc end)
conf - hive conf
parentStats - parent statistics
end - expression nodes
public static long getNumRows(Table table)
public static long getRawDataSize(Table table)
public static long getTotalSize(Table table)
public static long getBasicStatForTable(Table table, String statType)
table - table
statType - type of stats
public static List<Long> getBasicStatForPartitions(Table table, List<Partition> parts, String statType)
table - table
parts - partitions
statType - type of stats
public static long getDataSizeFromColumnStats(long numRows, List<ColStatistics> colStats)
numRows - number of rows
colStats - column statistics
public static String getFullyQualifiedTableName(String dbName, String tabName)
public static List<String> getQualifedReducerKeyNames(List<String> keyExprs)
keyExprs - output key names
public static long getAvailableMemory(org.apache.hadoop.conf.Configuration conf)
public static long getMaxIfOverflow(long val)
val - input value
public static long safeMult(long a, double b)
public static long safeAdd(long a, long b)
public static long safeMult(long a, long b)
Copyright © 2017 The Apache Software Foundation. All rights reserved.