The CREATE TABLE AS SELECT statement fails to update the hive statistics information.
Hive metastore version is CDH-5.14.2.bm2-1.cdh5.14.2.p0.3.
Presto version is 306 with hive.collect-column-statistics-on-write=true setting.
The statement succeeded after disabling column statistics collection on write for the session: set session melon.collect_column_statistics_on_write = false;
SQL
create table example
as
SELECT * FROM
(VALUES
(030162130062, TIMESTAMP '2017-04-01 08:38:37.000', 300019, 20000, TIMESTAMP '2017-04-02 00:00:00.000', TIMESTAMP '2017-05-01 23:59:59.000', TIMESTAMP '2017-03-01 00:00:00.000', 1202, 1, DATE '2017-04-02'),
(032315230763, TIMESTAMP '2019-03-23 09:12:40.000', 400003, 20000, TIMESTAMP '2019-03-23 00:00:00.000', TIMESTAMP '2019-04-23 23:59:59.000', TIMESTAMP '2019-03-23 00:00:00.000', 1100, 0, DATE '2019-03-23'),
(052075703374, TIMESTAMP '2017-12-20 16:04:58.000', 300001, 20000, TIMESTAMP '2017-12-21 00:00:00.000', TIMESTAMP '2018-01-20 23:59:59.000', TIMESTAMP '2017-05-20 00:00:00.000', 1202, 7, DATE '2017-12-21'),
(041812029424, TIMESTAMP '2017-08-18 17:06:17.000', 300009, 20000, TIMESTAMP '2017-08-19 00:00:00.000', TIMESTAMP '2017-09-18 23:59:59.000', TIMESTAMP '2016-04-18 00:00:00.000', 1202, 16, DATE '2017-08-19'),
(041812029424, TIMESTAMP '2018-05-18 16:06:29.000', 300009, 20000, TIMESTAMP '2018-05-19 00:00:00.000', TIMESTAMP '2018-06-18 23:59:59.000', TIMESTAMP '2016-04-18 00:00:00.000', 1202, 25, DATE '2018-05-19')
) as T(rst_buy_no, buy_date, prod_id, prod_prt_cd, vld_strt_date, vld_end_date, first_buy_vld_strt_date, prod_join_prt_cd, at_buy_cnt, dt)
ERROR
io.prestosql.spi.PrestoException: Required field 'colName' is unset! Struct:ColumnStatisticsObj(colName:null, colType:null, statsData:null)
at io.prestosql.plugin.hive.metastore.thrift.ThriftHiveMetastore.setTableColumnStatistics(ThriftHiveMetastore.java:440)
at io.prestosql.plugin.hive.metastore.thrift.ThriftHiveMetastore.updateTableStatistics(ThriftHiveMetastore.java:417)
at io.prestosql.plugin.hive.metastore.thrift.BridgingHiveMetastore.updateTableStatistics(BridgingHiveMetastore.java:110)
at io.prestosql.plugin.hive.metastore.CachingHiveMetastore.updateTableStatistics(CachingHiveMetastore.java:337)
at io.prestosql.plugin.hive.metastore.CachingHiveMetastore.updateTableStatistics(CachingHiveMetastore.java:337)
at io.prestosql.plugin.hive.metastore.SemiTransactionalHiveMetastore$UpdateStatisticsOperation.run(SemiTransactionalHiveMetastore.java:2462)
at io.prestosql.plugin.hive.metastore.SemiTransactionalHiveMetastore$Committer.executeUpdateStatisticsOperations(SemiTransactionalHiveMetastore.java:1381)
at io.prestosql.plugin.hive.metastore.SemiTransactionalHiveMetastore$Committer.access$1300(SemiTransactionalHiveMetastore.java:1032)
at io.prestosql.plugin.hive.metastore.SemiTransactionalHiveMetastore.commitShared(SemiTransactionalHiveMetastore.java:980)
at io.prestosql.plugin.hive.metastore.SemiTransactionalHiveMetastore.commit(SemiTransactionalHiveMetastore.java:882)
at io.prestosql.plugin.hive.HiveMetadata.commit(HiveMetadata.java:2186)
at io.prestosql.plugin.hive.HiveConnector.commit(HiveConnector.java:198)
at io.prestosql.transaction.InMemoryTransactionManager$TransactionMetadata$ConnectorTransactionMetadata.commit(InMemoryTransactionManager.java:601)
at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
at com.google.common.util.concurrent.TrustedListenableFutureTask$TrustedFutureInterruptibleTask.runInterruptibly(TrustedListenableFutureTask.java:125)
at com.google.common.util.concurrent.InterruptibleTask.run(InterruptibleTask.java:57)
at com.google.common.util.concurrent.TrustedListenableFutureTask.run(TrustedListenableFutureTask.java:78)
at io.airlift.concurrent.BoundedExecutor.drainQueue(BoundedExecutor.java:78)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.thrift.TApplicationException: Required field 'colName' is unset! Struct:ColumnStatisticsObj(colName:null, colType:null, statsData:null)
at org.apache.thrift.TApplicationException.read(TApplicationException.java:111)
at org.apache.thrift.TServiceClient.receiveBase(TServiceClient.java:79)
at org.apache.hadoop.hive.metastore.api.ThriftHiveMetastore$Client.recv_update_table_column_statistics(ThriftHiveMetastore.java:3839)
at org.apache.hadoop.hive.metastore.api.ThriftHiveMetastore$Client.update_table_column_statistics(ThriftHiveMetastore.java:3826)
at io.prestosql.plugin.hive.metastore.thrift.ThriftHiveMetastoreClient.setTableColumnStatistics(ThriftHiveMetastoreClient.java:165)
at io.prestosql.plugin.hive.metastore.thrift.ThriftHiveMetastore.lambda$setTableColumnStatistics$20(ThriftHiveMetastore.java:431)
at io.prestosql.plugin.hive.metastore.thrift.HiveMetastoreApiStats.lambda$wrap$0(HiveMetastoreApiStats.java:42)
at io.prestosql.plugin.hive.util.RetryDriver.run(RetryDriver.java:130)
at io.prestosql.plugin.hive.metastore.thrift.ThriftHiveMetastore.setTableColumnStatistics(ThriftHiveMetastore.java:429)
... 20 more
Suppressed: org.apache.thrift.TApplicationException: Required field 'colName' is unset! Struct:ColumnStatisticsObj(colName:null, colType:null, statsData:null)
... 29 more
Suppressed: org.apache.thrift.TApplicationException: Required field 'colName' is unset! Struct:ColumnStatisticsObj(colName:null, colType:null, statsData:null)
... 29 more
Suppressed: org.apache.thrift.TApplicationException: Required field 'colName' is unset! Struct:ColumnStatisticsObj(colName:null, colType:null, statsData:null)
... 29 more
Suppressed: org.apache.thrift.TApplicationException: Required field 'colName' is unset! Struct:ColumnStatisticsObj(colName:null, colType:null, statsData:null)
... 29 more
Suppressed: org.apache.thrift.TApplicationException: Required field 'colName' is unset! Struct:ColumnStatisticsObj(colName:null, colType:null, statsData:null)
... 29 more
Suppressed: org.apache.thrift.TApplicationException: Required field 'colName' is unset! Struct:ColumnStatisticsObj(colName:null, colType:null, statsData:null)
... 29 more
Suppressed: org.apache.thrift.TApplicationException: Required field 'colName' is unset! Struct:ColumnStatisticsObj(colName:null, colType:null, statsData:null)
... 29 more
Suppressed: org.apache.thrift.TApplicationException: Required field 'colName' is unset! Struct:ColumnStatisticsObj(colName:null, colType:null, statsData:null)
... 29 more
Suppressed: org.apache.thrift.TApplicationException: Required field 'colName' is unset! Struct:ColumnStatisticsObj(colName:null, colType:null, statsData:null)
... 29 more
I encountered this on Presto 316 with hive-1.1.0+cdh5.9.2 running CREATE TABLE "hive"."tpch_sf300_orc"."lineitem" WITH (format = 'ORC') AS SELECT * FROM tpch.sf300."lineitem"
io.prestosql.spi.PrestoException: Required field 'colName' is unset! Struct:ColumnStatisticsObj(colName:null, colType:null, statsData:null)
at io.prestosql.plugin.hive.metastore.thrift.ThriftHiveMetastore.setTableColumnStatistics(ThriftHiveMetastore.java:443)
at io.prestosql.plugin.hive.metastore.thrift.ThriftHiveMetastore.updateTableStatistics(ThriftHiveMetastore.java:420)
at io.prestosql.plugin.hive.metastore.thrift.BridgingHiveMetastore.updateTableStatistics(BridgingHiveMetastore.java:113)
at io.prestosql.plugin.hive.metastore.CachingHiveMetastore.updateTableStatistics(CachingHiveMetastore.java:337)
at io.prestosql.plugin.hive.metastore.CachingHiveMetastore.updateTableStatistics(CachingHiveMetastore.java:337)
at io.prestosql.plugin.hive.metastore.SemiTransactionalHiveMetastore$UpdateStatisticsOperation.run(SemiTransactionalHiveMetastore.java:2575)
at io.prestosql.plugin.hive.metastore.SemiTransactionalHiveMetastore$Committer.executeUpdateStatisticsOperations(SemiTransactionalHiveMetastore.java:1452)
at io.prestosql.plugin.hive.metastore.SemiTransactionalHiveMetastore$Committer.access$1400(SemiTransactionalHiveMetastore.java:1043)
at io.prestosql.plugin.hive.metastore.SemiTransactionalHiveMetastore.commitShared(SemiTransactionalHiveMetastore.java:990)
at io.prestosql.plugin.hive.metastore.SemiTransactionalHiveMetastore.commit(SemiTransactionalHiveMetastore.java:891)
at io.prestosql.plugin.hive.HiveMetadata.commit(HiveMetadata.java:2211)
at io.prestosql.plugin.hive.HiveConnector.commit(HiveConnector.java:198)
at io.prestosql.transaction.InMemoryTransactionManager$TransactionMetadata$ConnectorTransactionMetadata.commit(InMemoryTransactionManager.java:595)
at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
at com.google.common.util.concurrent.TrustedListenableFutureTask$TrustedFutureInterruptibleTask.runInterruptibly(TrustedListenableFutureTask.java:125)
at com.google.common.util.concurrent.InterruptibleTask.run(InterruptibleTask.java:57)
at com.google.common.util.concurrent.TrustedListenableFutureTask.run(TrustedListenableFutureTask.java:78)
at io.airlift.concurrent.BoundedExecutor.drainQueue(BoundedExecutor.java:78)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.thrift.TApplicationException: Required field 'colName' is unset! Struct:ColumnStatisticsObj(colName:null, colType:null, statsData:null)
at org.apache.thrift.TApplicationException.read(TApplicationException.java:111)
at org.apache.thrift.TServiceClient.receiveBase(TServiceClient.java:79)
at org.apache.hadoop.hive.metastore.api.ThriftHiveMetastore$Client.recv_update_table_column_statistics(ThriftHiveMetastore.java:3839)
at org.apache.hadoop.hive.metastore.api.ThriftHiveMetastore$Client.update_table_column_statistics(ThriftHiveMetastore.java:3826)
at io.prestosql.plugin.hive.metastore.thrift.ThriftHiveMetastoreClient.setTableColumnStatistics(ThriftHiveMetastoreClient.java:180)
at io.prestosql.plugin.hive.metastore.thrift.ThriftHiveMetastore.lambda$setTableColumnStatistics$20(ThriftHiveMetastore.java:434)
at io.prestosql.plugin.hive.metastore.thrift.ThriftMetastoreApiStats.lambda$wrap$0(ThriftMetastoreApiStats.java:42)
at io.prestosql.plugin.hive.util.RetryDriver.run(RetryDriver.java:130)
at io.prestosql.plugin.hive.metastore.thrift.ThriftHiveMetastore.setTableColumnStatistics(ThriftHiveMetastore.java:432)
... 20 more
Suppressed: org.apache.thrift.TApplicationException: Required field 'colName' is unset! Struct:ColumnStatisticsObj(colName:null, colType:null, statsData:null)
... 29 more
Suppressed: org.apache.thrift.TApplicationException: Required field 'colName' is unset! Struct:ColumnStatisticsObj(colName:null, colType:null, statsData:null)
... 29 more
Suppressed: org.apache.thrift.TApplicationException: Required field 'colName' is unset! Struct:ColumnStatisticsObj(colName:null, colType:null, statsData:null)
... 29 more
Suppressed: org.apache.thrift.TApplicationException: Required field 'colName' is unset! Struct:ColumnStatisticsObj(colName:null, colType:null, statsData:null)
... 29 more
Suppressed: org.apache.thrift.TApplicationException: Required field 'colName' is unset! Struct:ColumnStatisticsObj(colName:null, colType:null, statsData:null)
... 29 more
Suppressed: org.apache.thrift.TApplicationException: Required field 'colName' is unset! Struct:ColumnStatisticsObj(colName:null, colType:null, statsData:null)
... 29 more
Suppressed: org.apache.thrift.TApplicationException: Required field 'colName' is unset! Struct:ColumnStatisticsObj(colName:null, colType:null, statsData:null)
... 29 more
Suppressed: org.apache.thrift.TApplicationException: Required field 'colName' is unset! Struct:ColumnStatisticsObj(colName:null, colType:null, statsData:null)
... 29 more
Suppressed: org.apache.thrift.TApplicationException: Required field 'colName' is unset! Struct:ColumnStatisticsObj(colName:null, colType:null, statsData:null)
... 29 more
If anyone wants to work on this, I have some WIP to provide CI test coverage for CDH @ https://github.com/prestosql/presto/pull/973
@combineads @willshen in your case, is something like the following logged in the _metastore_'s log?
19/11/03 04:34:40 [pool-4-thread-32]: ERROR server.TThreadPoolServer: Thrift error occurred during processing of message.
org.apache.thrift.protocol.TProtocolException: Missing version in readMessageBegin, old client?
at org.apache.thrift.protocol.TBinaryProtocol.readMessageBegin(TBinaryProtocol.java:228)
at org.apache.hadoop.hive.metastore.TUGIBasedProcessor.process(TUGIBasedProcessor.java:75)
at org.apache.thrift.server.TThreadPoolServer$WorkerProcess.run(TThreadPoolServer.java:286)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
I think this fails for DATE columns but doesn't for e.g. TIMESTAMP columns (when testing with 1-2 column tables).
This could be thrift version incompatibility.
CDH 5.16 uses libthrift 0.9.3, but even with this exact version the problem persists.
@electrum thoughts?
CDH 5.15 has this in thrift:
union ColumnStatisticsData {
1: BooleanColumnStatsData booleanStats,
2: LongColumnStatsData longStats,
3: DoubleColumnStatsData doubleStats,
4: StringColumnStatsData stringStats,
5: BinaryColumnStatsData binaryStats,
6: DecimalColumnStatsData decimalStats
}
Hive 1.2+ has this (since https://issues.apache.org/jira/browse/HIVE-10226)
union ColumnStatisticsData {
1: BooleanColumnStatsData booleanStats,
2: LongColumnStatsData longStats,
3: DoubleColumnStatsData doubleStats,
4: StringColumnStatsData stringStats,
5: BinaryColumnStatsData binaryStats,
6: DecimalColumnStatsData decimalStats,
7: DateColumnStatsData dateStats // a new option
}
We send DateColumnStatsData for a date column and the metastore chokes on this (it ignores the unrecognized data, fails validation, and interprets the rest as a new message, failing again with "Missing version in readMessageBegin, old client?").
Moreover CDH 5.15's Hive cannot analyze a table with a date column:
presto:default> set session hive.collect_column_statistics_on_write = false;
presto:default> create table td(a bigint, b date, c bigint);
hive> analyze table td compute statistics for columns ;
...
FAILED: UDFArgumentTypeException Only integer/long/timestamp/float/double/string/binary/boolean/decimal type argument is accepted but date is passed.
19/11/03 22:16:01 [main]: ERROR ql.Driver: FAILED: UDFArgumentTypeException Only integer/long/timestamp/float/double/string/binary/boolean/decimal type argument is accepted but date is passed.
org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException: Only integer/long/timestamp/float/double/string/binary/boolean/decimal type argument is accepted but date is passed.
at org.apache.hadoop.hive.ql.udf.generic.GenericUDAFComputeStats.getEvaluator(GenericUDAFComputeStats.java:90)
at org.apache.hadoop.hive.ql.udf.generic.AbstractGenericUDAFResolver.getEvaluator(AbstractGenericUDAFResolver.java:48)
at org.apache.hadoop.hive.ql.exec.FunctionRegistry.getGenericUDAFEvaluator(FunctionRegistry.java:913)
at org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.getGenericUDAFEvaluator(SemanticAnalyzer.java:4098)
at org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genGroupByPlanMapGroupByOperator(SemanticAnalyzer.java:4762)
at org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genGroupByPlanMapAggrNoSkew(SemanticAnalyzer.java:5779)
at org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genBodyPlan(SemanticAnalyzer.java:8970)
at org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genPlan(SemanticAnalyzer.java:9852)
at org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genPlan(SemanticAnalyzer.java:9745)
at org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genOPTree(SemanticAnalyzer.java:10218)
at org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.analyzeInternal(SemanticAnalyzer.java:10229)
at org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.analyzeInternal(SemanticAnalyzer.java:10109)
at org.apache.hadoop.hive.ql.parse.ColumnStatsSemanticAnalyzer.analyze(ColumnStatsSemanticAnalyzer.java:463)
at org.apache.hadoop.hive.ql.Driver.compile(Driver.java:560)
at org.apache.hadoop.hive.ql.Driver.compileInternal(Driver.java:1358)
at org.apache.hadoop.hive.ql.Driver.runInternal(Driver.java:1475)
at org.apache.hadoop.hive.ql.Driver.run(Driver.java:1287)
at org.apache.hadoop.hive.ql.Driver.run(Driver.java:1277)
at org.apache.hadoop.hive.cli.CliDriver.processLocalCmd(CliDriver.java:226)
at org.apache.hadoop.hive.cli.CliDriver.processCmd(CliDriver.java:175)
at org.apache.hadoop.hive.cli.CliDriver.processLine(CliDriver.java:389)
at org.apache.hadoop.hive.cli.CliDriver.executeDriver(CliDriver.java:781)
at org.apache.hadoop.hive.cli.CliDriver.run(CliDriver.java:699)
at org.apache.hadoop.hive.cli.CliDriver.main(CliDriver.java:634)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.hadoop.util.RunJar.run(RunJar.java:226)
at org.apache.hadoop.util.RunJar.main(RunJar.java:141)
It seems we need to filter out date stats when talking to a CDH (<6) metastore.
@findepi Unfortunately, this is not a problem I can reproduce now.
@findepi Yes, this is exactly what I see in the metastore log:
[pool-4-thread-155]: Thrift error occurred during processing of message.
org.apache.thrift.protocol.TProtocolException: Missing version in readMessageBegin, old client?
at org.apache.thrift.protocol.TBinaryProtocol.readMessageBegin(TBinaryProtocol.java:228)
at org.apache.hadoop.hive.metastore.TUGIBasedProcessor.process(TUGIBasedProcessor.java:75)
at org.apache.thrift.server.TThreadPoolServer$WorkerProcess.run(TThreadPoolServer.java:286)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:748)