From 942a639e78445105f8e8df6596626bdace87825a Mon Sep 17 00:00:00 2001 From: David Kaufmann Date: Tue, 14 May 2019 21:51:46 +0200 Subject: [PATCH] import tables done --- ex2/hive/hive.sql | 87 +++++++++++++++++++++++++++++------------------ 1 file changed, 54 insertions(+), 33 deletions(-) diff --git a/ex2/hive/hive.sql b/ex2/hive/hive.sql index d747952..f487237 100644 --- a/ex2/hive/hive.sql +++ b/ex2/hive/hive.sql @@ -6,52 +6,73 @@ USE e700719f; SET hive.enforce.bucketing = TRUE; SET hive.exec.dynamic.partition = TRUE; SET hive.exec.dynamic.partition.mode = nonstrict; -SET hive.exec.max.dynamic.partitions = 1980; -SET hive.exec.max.dynamic.partitions.pernode = 110; +--SET hive.exec.max.dynamic.partitions = 1980; +--SET hive.exec.max.dynamic.partitions.pernode = 110; +SET hive.exec.max.dynamic.partitions = 2000; +SET hive.exec.max.dynamic.partitions.pernode = 400; -- 18 nodes total -- users.reputation has 1519 different values --- badges -CREATE TABLE IF NOT EXISTS badges (id INT, class INT, `date` DATE, name STRING, tagbased BOOLEAN, userid INT); -LOAD DATA LOCAL INPATH '/home/adbs/2019S/shared/hive/badges.csv' OVERWRITE INTO TABLE badges; +-- raw tables +CREATE TABLE IF NOT EXISTS raw_badges (id BIGINT, class INT, `date` STRING, name VARCHAR(100), tagbased BOOLEAN, userid BIGINT) row format delimited fields terminated by ',' tblproperties ("skip.header.line.count"="1"); +CREATE TABLE IF NOT EXISTS raw_comments (id BIGINT, creationdate STRING, postid BIGINT, score INT, text VARCHAR(40000), userdisplayname VARCHAR(100), userid BIGINT) row format delimited fields terminated by ',' tblproperties ("skip.header.line.count"="1"); +CREATE TABLE IF NOT EXISTS raw_posts (id BIGINT,acceptedanswerid BIGINT,answercount INT,body VARCHAR(1000),closeddate STRING,commentcount INT,communityowneddate STRING,creationdate STRING,favoritecount INT,lastactivitydate STRING,lasteditdate STRING,lasteditordisplayname VARCHAR(100),lasteditoruserid BIGINT,ownerdisplayname VARCHAR(100),owneruserid BIGINT,parentid BIGINT,posttypeid TINYINT,score INT,tags VARCHAR(200),title VARCHAR(200),viewcount INT) row format delimited fields terminated by ',' tblproperties ("skip.header.line.count"="1"); +CREATE TABLE IF NOT EXISTS raw_postlinks (id BIGINT,creationdate STRING,linktypeid BIGINT,postid BIGINT,relatedpostid BIGINT) row format delimited fields terminated by ',' tblproperties ("skip.header.line.count"="1"); +CREATE TABLE IF NOT EXISTS raw_users (id BIGINT,aboutme VARCHAR(3000),accountid BIGINT,creationdate INT,displayname VARCHAR(100),downvotes INT,lastaccessdate STRING,location VARCHAR(100),profileimageurl VARCHAR(500),reputation INT,upvotes INT,views INT,websiteurl VARCHAR(500)) row format delimited fields terminated by ',' tblproperties ("skip.header.line.count"="1"); +CREATE TABLE IF NOT EXISTS raw_votes(id BIGINT,bountyamount INT ,creationdate STRING,postid BIGINT,userid BIGINT,votetypeid BIGINT) row format delimited fields terminated by ',' tblproperties ("skip.header.line.count"="1"); --- comments -CREATE TABLE IF NOT EXISTS comments (id INT, creationdate DATE, postid INT, score INT, text STRING, userdisplayname STRING, userid INT); -LOAD DATA LOCAL INPATH '/home/adbs/2019S/shared/hive/comments.csv' OVERWRITE INTO TABLE comments; +LOAD DATA LOCAL INPATH '/home/adbs/2019S/shared/hive/badges.csv' OVERWRITE INTO TABLE raw_badges; +LOAD DATA LOCAL INPATH '/home/adbs/2019S/shared/hive/comments.csv' OVERWRITE INTO TABLE raw_comments; +LOAD DATA LOCAL INPATH '/home/adbs/2019S/shared/hive/postlinks.csv' OVERWRITE INTO TABLE raw_postlinks; +LOAD DATA LOCAL INPATH '/home/adbs/2019S/shared/hive/posts.csv' OVERWRITE INTO TABLE raw_posts; +LOAD DATA LOCAL INPATH '/home/adbs/2019S/shared/hive/users.csv' OVERWRITE INTO TABLE raw_users; +LOAD DATA LOCAL INPATH '/home/adbs/2019S/shared/hive/votes.csv' OVERWRITE INTO TABLE raw_votes; --- postlinks -CREATE TABLE IF NOT EXISTS postlinks (id INT, creationdate DATE, linktypeid INT, postid INT, relatedpostid INT); -LOAD DATA LOCAL INPATH '/home/adbs/2019S/shared/hive/postlinks.csv' OVERWRITE INTO TABLE postlinks; +-- real tables +CREATE TABLE IF NOT EXISTS badges (id BIGINT, class INT, `date` TIMESTAMP, name VARCHAR(100), tagbased BOOLEAN, userid BIGINT) row format delimited fields terminated by ','; +CREATE TABLE IF NOT EXISTS comments (id BIGINT, creationdate TIMESTAMP, postid BIGINT, score INT, text VARCHAR(40000), userdisplayname VARCHAR(100), userid BIGINT) row format delimited fields terminated by ','; +CREATE TABLE IF NOT EXISTS postlinks (id BIGINT,creationdate TIMESTAMP,linktypeid BIGINT,postid BIGINT,relatedpostid BIGINT) row format delimited fields terminated by ','; +CREATE TABLE IF NOT EXISTS posts (id BIGINT,acceptedanswerid BIGINT,answercount INT,body VARCHAR(1000),closeddate TIMESTAMP,commentcount INT,communityowneddate TIMESTAMP,creationdate TIMESTAMP,favoritecount INT,lastactivitydate TIMESTAMP,lasteditdate TIMESTAMP,lasteditordisplayname VARCHAR(100),lasteditoruserid BIGINT,ownerdisplayname VARCHAR(100),owneruserid BIGINT,parentid BIGINT,posttypeid TINYINT,score INT,tags VARCHAR(200),title VARCHAR(200),viewcount INT) row format delimited fields terminated by ','; +CREATE TABLE IF NOT EXISTS users (id BIGINT,aboutme VARCHAR(3000),accountid BIGINT,creationdate INT,displayname VARCHAR(100),downvotes INT,lastaccessdate TIMESTAMP,location VARCHAR(100),profileimageurl VARCHAR(500),reputation INT,upvotes INT,views INT,websiteurl VARCHAR(500)) row format delimited fields terminated by ','; +CREATE TABLE IF NOT EXISTS votes(id BIGINT,bountyamount INT ,creationdate TIMESTAMP,postid BIGINT,userid BIGINT,votetypeid BIGINT) row format delimited fields terminated by ','; --- posts -CREATE TABLE IF NOT EXISTS posts (id INT, acceptedanswerid INT, answercount INT, body STRING, closeddate DATE, commentcount INT, communityowneddate DATE, creationdate DATE, favoritecount INT, lastactivitydate DATE, lasteditdate DATE, lasteditordisplayname STRING, lasteditoruserid INT, ownerdisplayname STRING, owneruserid INT, parentid INT, posttypeid INT, score INT, tags STRING, title STRING, viewcount INT); -LOAD DATA LOCAL INPATH '/home/adbs/2019S/shared/hive/posts.csv' OVERWRITE INTO TABLE posts; - --- users -CREATE TABLE IF NOT EXISTS users (id INT, aboutme STRING, accountid INT, creationdate DATE, displayname STRING, downvotes INT, lastaccessdate DATE, location STRING, profileimageurl STRING, reputation INT, upvotes INT, views INT, websiteurl STRING); -LOAD DATA LOCAL INPATH '/home/adbs/2019S/shared/hive/users.csv' OVERWRITE INTO TABLE users; - --- votes -CREATE TABLE IF NOT EXISTS votes (id INT, bountyamount INT, creationdate DATE, postid INT, userid INT, votetypeid INT); -LOAD DATA LOCAL INPATH '/home/adbs/2019S/shared/hive/votes.csv' OVERWRITE INTO TABLE votes; +INSERT OVERWRITE TABLE badges + SELECT id,class,date_format(regexp_replace(`date`, 'T', ' '),'yyyy-MM-dd HH:mm:ss.SSS'),name,tagbased,userid FROM raw_badges; +INSERT OVERWRITE TABLE comments + SELECT id,date_format(regexp_replace(creationdate, 'T', ' '),'yyyy-MM-dd HH:mm:ss.SSS'),postid,score,text,userdisplayname,userid FROM raw_comments; +INSERT OVERWRITE TABLE postlinks + SELECT id,date_format(regexp_replace(creationdate, 'T', ' '),'yyyy-MM-dd HH:mm:ss.SSS'),linktypeid,postid,relatedpostid FROM raw_postlinks; +INSERT OVERWRITE TABLE posts + SELECT id,acceptedanswerid,answercount,body, + date_format(regexp_replace(closeddate, 'T', ' '),'yyyy-MM-dd HH:mm:ss.SSS'),commentcount, + date_format(regexp_replace(communityowneddate, 'T', ' '),'yyyy-MM-dd HH:mm:ss.SSS'), + date_format(regexp_replace(creationdate, 'T', ' '),'yyyy-MM-dd HH:mm:ss.SSS'),favoritecount, + date_format(regexp_replace(lastactivitydate, 'T', ' '),'yyyy-MM-dd HH:mm:ss.SSS'), + date_format(regexp_replace(lasteditdate, 'T', ' '),'yyyy-MM-dd HH:mm:ss.SSS'), + lasteditordisplayname,lasteditoruserid,ownerdisplayname,owneruserid,parentid,posttypeid,score,tags,title,viewcount FROM raw_posts; +INSERT OVERWRITE TABLE users + SELECT id,aboutme,accountid,date_format(regexp_replace(creationdate, 'T', ' '),'yyyy-MM-dd HH:mm:ss.SSS'), + displayname,downvotes,date_format(regexp_replace(lastaccessdate, 'T', ' '),'yyyy-MM-dd HH:mm:ss.SSS'), + location,profileimageurl,reputation,upvotes,views,websiteurl FROM raw_users; +INSERT OVERWRITE TABLE votes + SELECT id,bountyamount,date_format(regexp_replace(creationdate, 'T', ' '),'yyyy-MM-dd HH:mm:ss.SSS'),postid,userid,votetypeid FROM raw_votes; -- users_partitioned -CREATE TABLE IF NOT EXISTS users_partitioned (id INT, aboutme STRING, accountid INT, creationdate DATE, displayname STRING, downvotes INT, lastaccessdate DATE, location STRING, profileimageurl STRING, upvotes INT, views INT, websiteurl STRING) PARTITIONED BY (reputation INT); -INSERT OVERWRITE TABLE users_partitioned PARTITION (reputation) SELECT * FROM users; +--CREATE TABLE IF NOT EXISTS users_partitioned (id INT, aboutme STRING, accountid INT, creationdate TIMESTAMP, displayname STRING, downvotes INT, lastaccessdate TIMESTAMP, location STRING, profileimageurl STRING, upvotes INT, views INT, websiteurl STRING) PARTITIONED BY (reputation INT); +--INSERT OVERWRITE TABLE users_partitioned PARTITION (reputation) SELECT * FROM users; SELECT "\n\n\n"; SELECT "\n##### QUERY 1 UNOPTIMIZED #####\n"; -- query unoptimized -EXPLAIN SELECT p.id FROM posts p, comments c, users u, votes v -WHERE c.postid=p.id AND c.userid=p.owneruserid AND u.id=p.owneruserid -AND u.reputation > 100 AND v.postid = p.id AND v.userid = p.owneruserid -AND NOT EXISTS (SELECT 1 FROM postlinks l WHERE l.relatedpostid = p.id); +--EXPLAIN SELECT p.id FROM posts p, comments c, users u, votes v +--WHERE c.postid=p.id AND c.userid=p.owneruserid AND u.id=p.owneruserid +--AND u.reputation > 100 AND v.postid = p.id AND v.userid = p.owneruserid +--AND NOT EXISTS (SELECT 1 FROM postlinks l WHERE l.relatedpostid = p.id); SELECT "\n##### QUERY 1 OPTIMIZED #####\n"; -- query optimized -EXPLAIN SELECT p.id FROM posts p, comments c, users_partitioned u, votes v -WHERE c.postid=p.id AND c.userid=p.owneruserid AND u.id=p.owneruserid -AND u.reputation > 100 AND v.postid = p.id AND v.userid = p.owneruserid -AND NOT EXISTS (SELECT 1 FROM postlinks l WHERE l.relatedpostid = p.id); - +--EXPLAIN SELECT p.id FROM posts p, comments c, users_partitioned u, votes v +--WHERE c.postid=p.id AND c.userid=p.owneruserid AND u.id=p.owneruserid +--AND u.reputation > 100 AND v.postid = p.id AND v.userid = p.owneruserid +--AND NOT EXISTS (SELECT 1 FROM postlinks l WHERE l.relatedpostid = p.id); -- 2.43.0