From c468006746ec61f9116a5eaf9668cea81e65e662 Mon Sep 17 00:00:00 2001
From: David Kaufmann <astra@ionic.at>
Date: Tue, 14 May 2019 22:07:53 +0200
Subject: [PATCH] split sql

---
 ex2/hive/{hive.sql => create.sql} | 31 ---------------------
 ex2/hive/query.sql                | 45 +++++++++++++++++++++++++++++++
 2 files changed, 45 insertions(+), 31 deletions(-)
 rename ex2/hive/{hive.sql => create.sql} (78%)
 create mode 100644 ex2/hive/query.sql

diff --git a/ex2/hive/hive.sql b/ex2/hive/create.sql
similarity index 78%
rename from ex2/hive/hive.sql
rename to ex2/hive/create.sql
index f487237..68a5b07 100644
--- a/ex2/hive/hive.sql
+++ b/ex2/hive/create.sql
@@ -2,17 +2,6 @@ DROP DATABASE IF EXISTS e700719f CASCADE;
 CREATE DATABASE e700719f;
 USE e700719f;
 
--- configure bucketing
-SET hive.enforce.bucketing = TRUE;
-SET hive.exec.dynamic.partition = TRUE;
-SET hive.exec.dynamic.partition.mode = nonstrict;
---SET hive.exec.max.dynamic.partitions = 1980;
---SET hive.exec.max.dynamic.partitions.pernode = 110;
-SET hive.exec.max.dynamic.partitions = 2000;
-SET hive.exec.max.dynamic.partitions.pernode = 400;
--- 18 nodes total
--- users.reputation has 1519 different values
-
 -- raw tables
 CREATE TABLE IF NOT EXISTS raw_badges (id BIGINT, class INT, `date` STRING, name VARCHAR(100), tagbased BOOLEAN, userid BIGINT) row format delimited fields terminated by ',' tblproperties ("skip.header.line.count"="1");
 CREATE TABLE IF NOT EXISTS raw_comments (id BIGINT, creationdate STRING, postid BIGINT, score INT, text VARCHAR(40000), userdisplayname VARCHAR(100), userid BIGINT) row format delimited fields terminated by ',' tblproperties ("skip.header.line.count"="1");
@@ -56,23 +45,3 @@ INSERT OVERWRITE TABLE users
 		location,profileimageurl,reputation,upvotes,views,websiteurl FROM raw_users;
 INSERT OVERWRITE TABLE votes 
 	SELECT id,bountyamount,date_format(regexp_replace(creationdate, 'T', ' '),'yyyy-MM-dd HH:mm:ss.SSS'),postid,userid,votetypeid FROM raw_votes;
-
--- users_partitioned
---CREATE TABLE IF NOT EXISTS users_partitioned (id INT, aboutme STRING, accountid INT, creationdate TIMESTAMP, displayname STRING, downvotes INT, lastaccessdate TIMESTAMP, location STRING, profileimageurl STRING, upvotes INT, views INT, websiteurl STRING) PARTITIONED BY (reputation INT);
---INSERT OVERWRITE TABLE users_partitioned PARTITION (reputation) SELECT * FROM users;
-
-SELECT "\n\n\n";
-
-SELECT "\n##### QUERY 1 UNOPTIMIZED #####\n";
--- query unoptimized
---EXPLAIN SELECT p.id FROM posts p, comments c, users u, votes v
---WHERE c.postid=p.id AND c.userid=p.owneruserid AND u.id=p.owneruserid
---AND u.reputation > 100 AND v.postid = p.id AND v.userid = p.owneruserid
---AND NOT EXISTS (SELECT 1 FROM postlinks l WHERE l.relatedpostid = p.id);
-
-SELECT "\n##### QUERY 1 OPTIMIZED #####\n";
--- query optimized
---EXPLAIN SELECT p.id FROM posts p, comments c, users_partitioned u, votes v
---WHERE c.postid=p.id AND c.userid=p.owneruserid AND u.id=p.owneruserid
---AND u.reputation > 100 AND v.postid = p.id AND v.userid = p.owneruserid
---AND NOT EXISTS (SELECT 1 FROM postlinks l WHERE l.relatedpostid = p.id);
diff --git a/ex2/hive/query.sql b/ex2/hive/query.sql
new file mode 100644
index 0000000..0c2daf3
--- /dev/null
+++ b/ex2/hive/query.sql
@@ -0,0 +1,45 @@
+USE e700719f;
+
+-- configure bucketing
+SET hive.enforce.bucketing = TRUE;
+SET hive.exec.dynamic.partition = TRUE;
+SET hive.exec.dynamic.partition.mode = nonstrict;
+--SET hive.exec.max.dynamic.partitions = 1980;
+--SET hive.exec.max.dynamic.partitions.pernode = 110;
+SET hive.exec.max.dynamic.partitions = 2000;
+SET hive.exec.max.dynamic.partitions.pernode = 400;
+-- 18 nodes total
+-- users.reputation has 1519 different values
+
+-- query to optimize
+--SELECT p.id FROM posts p, comments c, users u, votes v
+--WHERE c.postid=p.id AND c.userid=p.owneruserid AND u.id=p.owneruserid
+--AND u.reputation > 100 AND v.postid = p.id AND v.userid = p.owneruserid
+--AND NOT EXISTS (SELECT 1 FROM postlinks l WHERE l.relatedpostid = p.id)
+
+-- find join column diff values
+SELECT "COUNT(DISTINCT x)";
+SELECT COUNT(DISTINCT postid) FROM comments;
+SELECT COUNT(DISTINCT userid) FROM comments;
+SELECT COUNT(DISTINCT owneruserid) FROM posts;
+SELECT COUNT(DISTINCT relatedpostid) FROM postlinks;
+
+-- users_partitioned
+--CREATE TABLE IF NOT EXISTS users_partitioned (id INT, aboutme STRING, accountid INT, creationdate TIMESTAMP, displayname STRING, downvotes INT, lastaccessdate TIMESTAMP, location STRING, profileimageurl STRING, upvotes INT, views INT, websiteurl STRING) PARTITIONED BY (reputation INT);
+--INSERT OVERWRITE TABLE users_partitioned PARTITION (reputation) SELECT * FROM users;
+
+SELECT "\n\n\n";
+
+SELECT "\n##### QUERY 1 UNOPTIMIZED #####\n";
+-- query unoptimized
+--EXPLAIN SELECT p.id FROM posts p, comments c, users u, votes v
+--WHERE c.postid=p.id AND c.userid=p.owneruserid AND u.id=p.owneruserid
+--AND u.reputation > 100 AND v.postid = p.id AND v.userid = p.owneruserid
+--AND NOT EXISTS (SELECT 1 FROM postlinks l WHERE l.relatedpostid = p.id);
+
+SELECT "\n##### QUERY 1 OPTIMIZED #####\n";
+-- query optimized
+--EXPLAIN SELECT p.id FROM posts p, comments c, users_partitioned u, votes v
+--WHERE c.postid=p.id AND c.userid=p.owneruserid AND u.id=p.owneruserid
+--AND u.reputation > 100 AND v.postid = p.id AND v.userid = p.owneruserid
+--AND NOT EXISTS (SELECT 1 FROM postlinks l WHERE l.relatedpostid = p.id);
-- 
2.43.0