-- Collapse the video_name values into a single comma-separated string:
-- collect_set() gathers the distinct values into an array and CONCAT_WS()
-- joins that array using "," as the separator.
-- NOTE(review): the FROM / GROUP BY part of this statement is not shown in
-- this snippet; it must be completed before it can run.
SELECT CONCAT_WS(",",collect_set(video_name)) AS ConcatenatedString
split
split(str, regex) - Splits str around occurrences that match regex
split('a,b,c,d',',') 得到的结果:["a","b","c","d"]
split('a,b,c,d',',')[0] 得到的结果:a
数组转行
-- Array-to-rows: emit one output row per element of the `items` array,
-- repeating the other columns of the source row for every element.
-- `Table` is only a placeholder name; TABLE is a reserved keyword in Hive,
-- so it has to be backtick-quoted to parse.
-- Rows whose `items` array is empty or NULL produce no output; use
-- LATERAL VIEW OUTER if those rows must be kept.
SELECT
    t.ID,
    itemTable.itemsName,
    t.name,
    t.loc
FROM `Table` AS t
LATERAL VIEW explode(t.items) itemTable AS itemsName;
ID | items | name | loc
_________________________________________________________________
id1 | ["item1","item2","item3","item4","item5"] | Mike | CT
id2 | ["item3","item7","item4","item9","item8"] | Chris| MN
ID | itemsName | name | loc
______________________________________________________
id1 | item1 | Mike | CT
id1 | item2 | Mike | CT
id1 | item3 | Mike | CT
id1 | item4 | Mike | CT
id1 | item5 | Mike | CT
id2 | item3 | Chris | MN
id2 | item7 | Chris | MN
id2 | item4 | Chris | MN
id2 | item9 | Chris | MN
id2 | item8 | Chris | MN
各种问题
内存溢出
-- Request a 4096 MB YARN container for the MapReduce ApplicationMaster.
SET yarn.app.mapreduce.am.resource.mb=4096;
-- Keep the JVM heap at roughly 80% of the container size. The container limit
-- also has to cover non-heap memory (metaspace, thread stacks, direct
-- buffers); a heap of 4000 MB in a 4096 MB container leaves almost no
-- headroom and risks YARN killing the container for exceeding its limit.
SET yarn.app.mapreduce.am.command-opts=-Xmx3276m;
动态分区
-- Switch dynamic partitioning to nonstrict mode for this session.
-- Per the Hive configuration docs, nonstrict allows every partition column
-- of an INSERT to be resolved dynamically, whereas strict mode requires at
-- least one static partition value.
SET hive.exec.dynamic.partition.mode=nonstrict;
数据压缩
-- Compress the data Hive passes between the stages of a multi-stage query.
SET hive.exec.compress.intermediate=true;
-- Compress map output before it is shuffled to the reducers.
SET mapreduce.map.output.compress=true;
-- Codec for map output. Only one codec can be in effect: assigning this
-- property twice means the later value silently replaces the earlier one.
-- Snappy is used here, matching the final-output codec below. To use LZO
-- instead, set the value to com.hadoop.compression.lzo.LzoCodec (this needs
-- the separately installed hadoop-lzo library on every node).
SET mapreduce.map.output.compress.codec=org.apache.hadoop.io.compress.SnappyCodec;
-- Compress the final query output as well, also with Snappy.
SET hive.exec.compress.output=true;
SET mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.SnappyCodec;
晚起reduce
-- Delay reducer start-up: per the Hadoop mapred-default documentation, this
-- is the fraction of map tasks that must be complete before reducers are
-- scheduled. 0.9 means reducers wait until 90% of the maps have finished.
SET mapreduce.job.reduce.slowstart.completedmaps=0.9;
增加reduce
-- Force a fixed number of reduce tasks (1000) for subsequent jobs.
-- Uses the current property name; mapred.reduce.tasks is its deprecated
-- alias. A fixed value takes the place of Hive's automatic reducer
-- estimate, so size it to the data volume of the query.
SET mapreduce.job.reduces=1000;
不限制数据分块数
-- Remove the size cap on the job's split meta-info file (-1 = no limit) so
-- that jobs with a very large number of input splits are not rejected.
-- Uses the current property name; mapreduce.jobtracker.split.metainfo.maxsize
-- is its deprecated alias.
SET mapreduce.job.split.metainfo.maxsize=-1;