From 2be1f051c03567e49f745d9ce1da4d08f5c879aa Mon Sep 17 00:00:00 2001 From: David Kaufmann Date: Wed, 15 May 2019 04:48:45 +0200 Subject: [PATCH] update hive --- ex2/hive/create.sql | 2 +- ex2/hive/query.sql | 62 +++++++++++++++++++++++++++++++-------------- 2 files changed, 44 insertions(+), 20 deletions(-) diff --git a/ex2/hive/create.sql b/ex2/hive/create.sql index 68a5b07..8e3d721 100644 --- a/ex2/hive/create.sql +++ b/ex2/hive/create.sql @@ -23,7 +23,7 @@ CREATE TABLE IF NOT EXISTS comments (id BIGINT, creationdate TIMESTAMP, postid B CREATE TABLE IF NOT EXISTS postlinks (id BIGINT,creationdate TIMESTAMP,linktypeid BIGINT,postid BIGINT,relatedpostid BIGINT) row format delimited fields terminated by ','; CREATE TABLE IF NOT EXISTS posts (id BIGINT,acceptedanswerid BIGINT,answercount INT,body VARCHAR(1000),closeddate TIMESTAMP,commentcount INT,communityowneddate TIMESTAMP,creationdate TIMESTAMP,favoritecount INT,lastactivitydate TIMESTAMP,lasteditdate TIMESTAMP,lasteditordisplayname VARCHAR(100),lasteditoruserid BIGINT,ownerdisplayname VARCHAR(100),owneruserid BIGINT,parentid BIGINT,posttypeid TINYINT,score INT,tags VARCHAR(200),title VARCHAR(200),viewcount INT) row format delimited fields terminated by ','; CREATE TABLE IF NOT EXISTS users (id BIGINT,aboutme VARCHAR(3000),accountid BIGINT,creationdate INT,displayname VARCHAR(100),downvotes INT,lastaccessdate TIMESTAMP,location VARCHAR(100),profileimageurl VARCHAR(500),reputation INT,upvotes INT,views INT,websiteurl VARCHAR(500)) row format delimited fields terminated by ','; -CREATE TABLE IF NOT EXISTS votes(id BIGINT,bountyamount INT ,creationdate TIMESTAMP,postid BIGINT,userid BIGINT,votetypeid BIGINT) row format delimited fields terminated by ','; +CREATE TABLE IF NOT EXISTS votes(id BIGINT,bountyamount INT,creationdate TIMESTAMP,postid BIGINT,userid BIGINT,votetypeid BIGINT) row format delimited fields terminated by ','; INSERT OVERWRITE TABLE badges SELECT id,class,date_format(regexp_replace(`date`, 'T', ' '),'yyyy-MM-dd HH:mm:ss.SSS'),name,tagbased,userid FROM raw_badges; diff --git a/ex2/hive/query.sql b/ex2/hive/query.sql index 0c2daf3..9a9626e 100644 --- a/ex2/hive/query.sql +++ b/ex2/hive/query.sql @@ -6,8 +6,8 @@ SET hive.exec.dynamic.partition = TRUE; SET hive.exec.dynamic.partition.mode = nonstrict; --SET hive.exec.max.dynamic.partitions = 1980; --SET hive.exec.max.dynamic.partitions.pernode = 110; -SET hive.exec.max.dynamic.partitions = 2000; -SET hive.exec.max.dynamic.partitions.pernode = 400; +SET hive.exec.max.dynamic.partitions = 1000; +SET hive.exec.max.dynamic.partitions.pernode = 100; -- 18 nodes total -- users.reputation has 1519 different values @@ -18,28 +18,52 @@ SET hive.exec.max.dynamic.partitions.pernode = 400; --AND NOT EXISTS (SELECT 1 FROM postlinks l WHERE l.relatedpostid = p.id) -- find join column diff values -SELECT "COUNT(DISTINCT x)"; -SELECT COUNT(DISTINCT postid) FROM comments; -SELECT COUNT(DISTINCT userid) FROM comments; -SELECT COUNT(DISTINCT owneruserid) FROM posts; -SELECT COUNT(DISTINCT relatedpostid) FROM postlinks; +--SELECT "COUNT(DISTINCT x)"; +--SELECT COUNT(DISTINCT postid) FROM comments; -- 117818 (total: 312277) +--SELECT COUNT(DISTINCT userid) FROM comments; -- 40417 (total: 312277) +--SELECT COUNT(DISTINCT owneruserid) FROM posts; -- 78561 (total: 244189) +--SELECT COUNT(DISTINCT relatedpostid) FROM postlinks; -- 12461 (total: 25373) +--SELECT COUNT(DISTINCT postid) FROM votes; -- 274081 (total: 1040881) +--SELECT COUNT(DISTINCT userid) FROM votes; -- 21704 (total: 1040881) --- users_partitioned ---CREATE TABLE IF NOT EXISTS users_partitioned (id INT, aboutme STRING, accountid INT, creationdate TIMESTAMP, displayname STRING, downvotes INT, lastaccessdate TIMESTAMP, location STRING, profileimageurl STRING, upvotes INT, views INT, websiteurl STRING) PARTITIONED BY (reputation INT); ---INSERT OVERWRITE TABLE users_partitioned PARTITION (reputation) SELECT * FROM users; +DROP TABLE IF EXISTS comments_partitioned; +DROP TABLE IF EXISTS postlinks_partitioned; +DROP TABLE IF EXISTS posts_partitioned; +DROP TABLE IF EXISTS votes_partitioned; -SELECT "\n\n\n"; +-- partitioned tables +SET hive.exec.max.dynamic.partitions = 40000; +SET hive.exec.max.dynamic.partitions.pernode = 40000; +CREATE TABLE IF NOT EXISTS comments_partitioned (id BIGINT, creationdate TIMESTAMP, postid BIGINT, score INT, text VARCHAR(40000), userdisplayname VARCHAR(100)) PARTITIONED BY (userid BIGINT); +INSERT OVERWRITE TABLE comments_partitioned PARTITION (userid) SELECT id,creationdate,postid,score,text,userdisplayname,userid FROM comments; + +--SET hive.exec.max.dynamic.partitions = 200; +--SET hive.exec.max.dynamic.partitions.pernode = 200; +CREATE TABLE IF NOT EXISTS postlinks_partitioned (id BIGINT,creationdate TIMESTAMP,linktypeid BIGINT,postid BIGINT) PARTITIONED BY (relatedpostid BIGINT); +INSERT OVERWRITE TABLE postlinks_partitioned PARTITION (relatedpostid) SELECT id,creationdate,linktype,postid,relatedpostid FROM postlinks; + +--SET hive.exec.max.dynamic.partitions = 800; +--SET hive.exec.max.dynamic.partitions.pernode = 800; +CREATE TABLE IF NOT EXISTS posts_partitioned (id BIGINT,acceptedanswerid BIGINT,answercount INT,body VARCHAR(1000),closeddate TIMESTAMP,commentcount INT,communityowneddate TIMESTAMP,creationdate TIMESTAMP,favoritecount INT,lastactivitydate TIMESTAMP,lasteditdate TIMESTAMP,lasteditordisplayname VARCHAR(100),lasteditoruserid BIGINT,ownerdisplayname VARCHAR(100),parentid BIGINT,posttypeid TINYINT,score INT,tags VARCHAR(200),title VARCHAR(200),viewcount INT) PARTITIONED BY (owneruserid BIGINT); +INSERT OVERWRITE TABLE posts_partitioned PARTITION (owneruserid) SELECT id,acceptedanswerid,answercount,body,closeddate,commentcount,communityowneddate,creationdate,favoritecount,lastactivitydate,lasteditdate,lasteditordisplayname,lasteditoruserid,ownerdisplayname,parentid,posttypeid,score,tags,title,viewcount,owneruserid FROM posts; + +--SET hive.exec.max.dynamic.partitions = 300; +--SET hive.exec.max.dynamic.partitions.pernode = 300; +CREATE TABLE IF NOT EXISTS votes_partitioned(id BIGINT,bountyamount INT,creationdate TIMESTAMP,postid BIGINT,votetypeid BIGINT) PARTITIONED BY (userid BIGINT); +INSERT OVERWRITE TABLE votes_partitioned PARTITION (userid) SELECT id,bountyamount,creationdate,postid,votetypeid,userid FROM votes; + +--SELECT "\n\n\n"; SELECT "\n##### QUERY 1 UNOPTIMIZED #####\n"; -- query unoptimized ---EXPLAIN SELECT p.id FROM posts p, comments c, users u, votes v ---WHERE c.postid=p.id AND c.userid=p.owneruserid AND u.id=p.owneruserid ---AND u.reputation > 100 AND v.postid = p.id AND v.userid = p.owneruserid ---AND NOT EXISTS (SELECT 1 FROM postlinks l WHERE l.relatedpostid = p.id); +EXPLAIN SELECT p.id FROM posts p, comments c, users u, votes v +WHERE c.postid=p.id AND c.userid=p.owneruserid AND u.id=p.owneruserid +AND u.reputation > 100 AND v.postid = p.id AND v.userid = p.owneruserid +AND NOT EXISTS (SELECT 1 FROM postlinks l WHERE l.relatedpostid = p.id); SELECT "\n##### QUERY 1 OPTIMIZED #####\n"; -- query optimized ---EXPLAIN SELECT p.id FROM posts p, comments c, users_partitioned u, votes v ---WHERE c.postid=p.id AND c.userid=p.owneruserid AND u.id=p.owneruserid ---AND u.reputation > 100 AND v.postid = p.id AND v.userid = p.owneruserid ---AND NOT EXISTS (SELECT 1 FROM postlinks l WHERE l.relatedpostid = p.id); +EXPLAIN SELECT p.id FROM posts_partitioned p, comments_partitioned c, users u, votes_partitioned v +WHERE c.postid=p.id AND c.userid=p.owneruserid AND u.id=p.owneruserid +AND u.reputation > 100 AND v.postid = p.id AND v.userid = p.owneruserid +AND NOT EXISTS (SELECT 1 FROM postlinks_partitioned l WHERE l.relatedpostid = p.id); -- 2.43.0