select count(vdid) from vddata;
# Total MapReduce CPU Time Spent: 4 seconds 590 msec
# OK
# Time taken: 27.214 seconds, Fetched: 1 row(s)

select date_month, count(vdid) from vddata group by date_month;
# Total MapReduce CPU Time Spent: 4 seconds 330 msec
# Time taken: 23.09 seconds, Fetched: 12 row(s)

select date_month, avg(speed), avg(laneoccupy), avg(volume) from vddata group by date_month;
# Total MapReduce CPU Time Spent: 4 seconds 100 msec
# Time taken: 19.655 seconds, Fetched: 12 row(s)

select date_month, date_day, avg(speed), avg(laneoccupy), avg(volume) from vddata group by date_month, date_day;
# Total MapReduce CPU Time Spent: 5 seconds 230 msec
# Time taken: 19.033 seconds, Fetched: 364 row(s)
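For reference, the Hive queries above assume a table roughly like the sketch below; the column types and storage format are guesses inferred from the columns referenced, not the original DDL.

-- Hypothetical sketch of the vddata table assumed by the queries above;
-- types and storage format are assumptions, not the original definition.
CREATE TABLE IF NOT EXISTS vddata (
  vdid        STRING,
  speed       DOUBLE,
  laneoccupy  DOUBLE,
  volume      DOUBLE,
  date_month  INT,
  date_day    INT
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
STORED AS TEXTFILE;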
import java.util.Calendar.getInstance
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .appName("spark on hive")
  .config("spark.sql.warehouse.dir", "hdfs://hc1/spark")
  .enableHiveSupport()
  .getOrCreate()

// time each query with wall-clock timestamps
var st = getInstance().getTime()
val sql_res_1 = spark.sql("select count(vdid) from vddata").collect()
println(getInstance().getTime().getTime() - st.getTime())  // 7724 ms

st = getInstance().getTime()
val sql_res_2 = spark.sql("select date_month, count(vdid) from vddata group by date_month").collect()
println(getInstance().getTime().getTime() - st.getTime())  // 3729 ms

st = getInstance().getTime()
val sql_res_3 = spark.sql("select date_month, avg(speed), avg(laneoccupy), avg(volume) from vddata group by date_month").collect()
println(getInstance().getTime().getTime() - st.getTime())  // 4520 ms

st = getInstance().getTime()
val sql_res_4 = spark.sql("select date_month, date_day, avg(speed), avg(laneoccupy), avg(volume) from vddata group by date_month, date_day").collect()
println(getInstance().getTime().getTime() - st.getTime())  // 4353 ms
# on HBase
select count(vdid) from hbase.vddata;
# 1.319s
select vddata.datetime.`month`, count(vddata.vdid) from hbase.vddata group by vddata.datetime.`month`;
# 11.125s
select vddata.datetime.`month`, avg(vddata.vd_info.volume) from hbase.vddata group by vddata.datetime.`month`;
# datatype error
select datetime.month, date_day, avg(vd_info.speed), avg(vd_info.laneoccupy), avg(vd_info.volume) from hbase.vddata group by datetime.month, datetime.day;
# datatype error
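The two datatype errors above are expected when Drill reads HBase cells, since it sees them as raw bytes rather than numbers. Assuming the values were stored as UTF-8 strings, decoding with CONVERT_FROM and casting before aggregating should work; the query below is a sketch of that fix, not one of the original benchmark queries.

select CONVERT_FROM(vddata.datetime.`month`, 'UTF8') as date_month,
       avg(CAST(CONVERT_FROM(vddata.vd_info.volume, 'UTF8') AS DOUBLE)) as avg_volume
from hbase.vddata
group by CONVERT_FROM(vddata.datetime.`month`, 'UTF8');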
# on Hive
select count(vdid) from hive_cassSpark1.vddata;
# in time
select date_month, count(vdid) from hive_cassSpark1.vddata group by date_month;
# 2.203s
select date_month, avg(speed), avg(laneoccupy), avg(volume) from hive_cassSpark1.vddata group by date_month;
# 2.632s
select date_month, date_day, avg(speed), avg(laneoccupy), avg(volume) from hive_cassSpark1.vddata group by date_month, date_day;
# 3.167s
# on csv in hdfs
select count(columns[0]) from dfs.`/drill/tw5_df.csv`;
# 1.078s
select columns[5], count(columns[0]) from dfs.`/drill/tw5_df.csv` group by columns[5];
# 2.049s
select columns[5], avg(columns[1]), avg(columns[2]), avg(columns[3]) from dfs.`/drill/tw5_df.csv` group by columns[5];
# datatype error
select columns[5], columns[6], avg(columns[1]), avg(columns[2]), avg(columns[3]) from dfs.`/drill/tw5_df.csv` group by columns[5], columns[6];
# datatype error
# on json in hdfs
select count(vdid) from dfs.`/drill/tw5_df.json`;
# 2.834s
select date_month, count(vdid) from dfs.`/drill/tw5_df.json` group by date_month;
# 5.778s
select date_month, avg(speed), avg(laneoccupy), avg(volume) from dfs.`/drill/tw5_df.json` group by date_month;
# datatype error
select date_month, date_day, avg(speed), avg(laneoccupy), avg(volume) from dfs.`/drill/tw5_df.json` group by date_month, date_day;
# datatype error
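The CSV and JSON datatype errors have the same cause: Drill exposes CSV fields as VARCHAR (and JSON fields come in as text when stored or read as strings), so avg() needs an explicit CAST. The query below is a sketch of the CSV case with casts added; the same pattern applies to the JSON columns.

select columns[5],
       avg(CAST(columns[1] AS DOUBLE)),
       avg(CAST(columns[2] AS DOUBLE)),
       avg(CAST(columns[3] AS DOUBLE))
from dfs.`/drill/tw5_df.csv`
group by columns[5];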
import java.util.Calendar.getInstance
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .appName("spark on hive")
  .config("spark.sql.warehouse.dir", "hdfs://hc1/spark")
  .enableHiveSupport()
  .getOrCreate()

// time the SUSY query
var st = getInstance().getTime()
val susy_out_3 = spark.sql("select count(lepton1phi) from susy_df group by lepton1pt").collect()
println(getInstance().getTime().getTime() - st.getTime())

// time the HIGGS query
st = getInstance().getTime()
val tmp = spark.sql("select count(lepton_phi) from higgs_df group by lepton_pt").collect()
println(getInstance().getTime().getTime() - st.getTime())
Running the SQL script with Drill
select count(lepton1phi) from hive_cassSpark1.susy_df group by lepton1pt;
select count(lepton_phi) from hive_cassSpark1.higgs_df group by lepton_pt;