From c468006746ec61f9116a5eaf9668cea81e65e662 Mon Sep 17 00:00:00 2001 From: David Kaufmann Date: Tue, 14 May 2019 22:07:53 +0200 Subject: [PATCH] split sql --- ex2/hive/{hive.sql => create.sql} | 31 --------------------- ex2/hive/query.sql | 45 +++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 31 deletions(-) rename ex2/hive/{hive.sql => create.sql} (78%) create mode 100644 ex2/hive/query.sql diff --git a/ex2/hive/hive.sql b/ex2/hive/create.sql similarity index 78% rename from ex2/hive/hive.sql rename to ex2/hive/create.sql index f487237..68a5b07 100644 --- a/ex2/hive/hive.sql +++ b/ex2/hive/create.sql @@ -2,17 +2,6 @@ DROP DATABASE IF EXISTS e700719f CASCADE; CREATE DATABASE e700719f; USE e700719f; --- configure bucketing -SET hive.enforce.bucketing = TRUE; -SET hive.exec.dynamic.partition = TRUE; -SET hive.exec.dynamic.partition.mode = nonstrict; ---SET hive.exec.max.dynamic.partitions = 1980; ---SET hive.exec.max.dynamic.partitions.pernode = 110; -SET hive.exec.max.dynamic.partitions = 2000; -SET hive.exec.max.dynamic.partitions.pernode = 400; --- 18 nodes total --- users.reputation has 1519 different values - -- raw tables CREATE TABLE IF NOT EXISTS raw_badges (id BIGINT, class INT, `date` STRING, name VARCHAR(100), tagbased BOOLEAN, userid BIGINT) row format delimited fields terminated by ',' tblproperties ("skip.header.line.count"="1"); CREATE TABLE IF NOT EXISTS raw_comments (id BIGINT, creationdate STRING, postid BIGINT, score INT, text VARCHAR(40000), userdisplayname VARCHAR(100), userid BIGINT) row format delimited fields terminated by ',' tblproperties ("skip.header.line.count"="1"); @@ -56,23 +45,3 @@ INSERT OVERWRITE TABLE users location,profileimageurl,reputation,upvotes,views,websiteurl FROM raw_users; INSERT OVERWRITE TABLE votes SELECT id,bountyamount,date_format(regexp_replace(creationdate, 'T', ' '),'yyyy-MM-dd HH:mm:ss.SSS'),postid,userid,votetypeid FROM raw_votes; - --- users_partitioned ---CREATE TABLE IF NOT EXISTS users_partitioned (id INT, aboutme STRING, accountid INT, creationdate TIMESTAMP, displayname STRING, downvotes INT, lastaccessdate TIMESTAMP, location STRING, profileimageurl STRING, upvotes INT, views INT, websiteurl STRING) PARTITIONED BY (reputation INT); ---INSERT OVERWRITE TABLE users_partitioned PARTITION (reputation) SELECT * FROM users; - -SELECT "\n\n\n"; - -SELECT "\n##### QUERY 1 UNOPTIMIZED #####\n"; --- query unoptimized ---EXPLAIN SELECT p.id FROM posts p, comments c, users u, votes v ---WHERE c.postid=p.id AND c.userid=p.owneruserid AND u.id=p.owneruserid ---AND u.reputation > 100 AND v.postid = p.id AND v.userid = p.owneruserid ---AND NOT EXISTS (SELECT 1 FROM postlinks l WHERE l.relatedpostid = p.id); - -SELECT "\n##### QUERY 1 OPTIMIZED #####\n"; --- query optimized ---EXPLAIN SELECT p.id FROM posts p, comments c, users_partitioned u, votes v ---WHERE c.postid=p.id AND c.userid=p.owneruserid AND u.id=p.owneruserid ---AND u.reputation > 100 AND v.postid = p.id AND v.userid = p.owneruserid ---AND NOT EXISTS (SELECT 1 FROM postlinks l WHERE l.relatedpostid = p.id); diff --git a/ex2/hive/query.sql b/ex2/hive/query.sql new file mode 100644 index 0000000..0c2daf3 --- /dev/null +++ b/ex2/hive/query.sql @@ -0,0 +1,45 @@ +USE e700719f; + +-- configure bucketing +SET hive.enforce.bucketing = TRUE; +SET hive.exec.dynamic.partition = TRUE; +SET hive.exec.dynamic.partition.mode = nonstrict; +--SET hive.exec.max.dynamic.partitions = 1980; +--SET hive.exec.max.dynamic.partitions.pernode = 110; +SET hive.exec.max.dynamic.partitions = 2000; +SET hive.exec.max.dynamic.partitions.pernode = 400; +-- 18 nodes total +-- users.reputation has 1519 different values + +-- query to optimize +--SELECT p.id FROM posts p, comments c, users u, votes v +--WHERE c.postid=p.id AND c.userid=p.owneruserid AND u.id=p.owneruserid +--AND u.reputation > 100 AND v.postid = p.id AND v.userid = p.owneruserid +--AND NOT EXISTS (SELECT 1 FROM postlinks l WHERE l.relatedpostid = p.id) + +-- find join column diff values +SELECT "COUNT(DISTINCT x)"; +SELECT COUNT(DISTINCT postid) FROM comments; +SELECT COUNT(DISTINCT userid) FROM comments; +SELECT COUNT(DISTINCT owneruserid) FROM posts; +SELECT COUNT(DISTINCT relatedpostid) FROM postlinks; + +-- users_partitioned +--CREATE TABLE IF NOT EXISTS users_partitioned (id INT, aboutme STRING, accountid INT, creationdate TIMESTAMP, displayname STRING, downvotes INT, lastaccessdate TIMESTAMP, location STRING, profileimageurl STRING, upvotes INT, views INT, websiteurl STRING) PARTITIONED BY (reputation INT); +--INSERT OVERWRITE TABLE users_partitioned PARTITION (reputation) SELECT * FROM users; + +SELECT "\n\n\n"; + +SELECT "\n##### QUERY 1 UNOPTIMIZED #####\n"; +-- query unoptimized +--EXPLAIN SELECT p.id FROM posts p, comments c, users u, votes v +--WHERE c.postid=p.id AND c.userid=p.owneruserid AND u.id=p.owneruserid +--AND u.reputation > 100 AND v.postid = p.id AND v.userid = p.owneruserid +--AND NOT EXISTS (SELECT 1 FROM postlinks l WHERE l.relatedpostid = p.id); + +SELECT "\n##### QUERY 1 OPTIMIZED #####\n"; +-- query optimized +--EXPLAIN SELECT p.id FROM posts p, comments c, users_partitioned u, votes v +--WHERE c.postid=p.id AND c.userid=p.owneruserid AND u.id=p.owneruserid +--AND u.reputation > 100 AND v.postid = p.id AND v.userid = p.owneruserid +--AND NOT EXISTS (SELECT 1 FROM postlinks l WHERE l.relatedpostid = p.id); -- 2.43.0