From a3a7c1f6ffdc2e5e1420aeb12443bbb85af99420 Mon Sep 17 00:00:00 2001
From: David Kaufmann
Date: Sat, 11 May 2019 23:05:39 +0200
Subject: [PATCH] add first code

---
 ex2/spark/Exercise5_SparkInScala.ipynb | 174 ++++++++++++++++++++++---
 1 file changed, 157 insertions(+), 17 deletions(-)

diff --git a/ex2/spark/Exercise5_SparkInScala.ipynb b/ex2/spark/Exercise5_SparkInScala.ipynb
index 7a070d2..7e3b4e5 100644
--- a/ex2/spark/Exercise5_SparkInScala.ipynb
+++ b/ex2/spark/Exercise5_SparkInScala.ipynb
@@ -31,7 +31,17 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Intitializing Scala interpreter ..."
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
    "source": [
     "val fibs20 = sc.parallelize(List( 0, 1, 1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584, 4181)) "
    ]
@@ -47,8 +57,20 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
-   "outputs": [],
-   "source": []
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Intitializing Scala interpreter ..."
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "val evenFibs20 = fibs20.filter(x => (x % 2 == 0))"
+   ]
   },
   {
    "cell_type": "markdown",
@@ -61,8 +83,22 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
-   "outputs": [],
-   "source": []
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Intitializing Scala interpreter ..."
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "val avg1map = fibs20.map(x => (x, 1))\n",
+    "val avg1fold = avg1map.fold((0, 0))((x, y) => (x._1 + y._1, x._2 + y._2))\n",
+    "val avg1 = avg1fold._1 / avg1fold._2.toFloat"
+   ]
   },
   {
    "cell_type": "markdown",
@@ -75,8 +111,20 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
-   "outputs": [],
-   "source": []
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Intitializing Scala interpreter ..."
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "val avgDiff = fibs20.map(x => (x - avg1).abs)"
+   ]
   },
   {
    "cell_type": "markdown",
@@ -89,7 +137,17 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Intitializing Scala interpreter ..."
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
    "source": [
     "val words = sc.parallelize(List(\"automaton\", \"language\", \"logic\",\"closure\"))"
    ]
@@ -105,7 +163,17 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Intitializing Scala interpreter ..."
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
    "source": [
     "def permutate (word:String) = word.permutations.toList"
    ]
@@ -121,8 +189,20 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
-   "outputs": [],
-   "source": []
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Intitializing Scala interpreter ..."
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "words.flatMap(word => permutate(word))"
+   ]
   },
   {
    "cell_type": "markdown",
@@ -137,7 +217,17 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Intitializing Scala interpreter ..."
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
    "source": [
     "val dataPath = \"/home/adbs/2019S/shared/diamonds.csv\"\n",
     "val diamonds = spark.read.format(\"csv\")\n",
@@ -166,7 +256,17 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Intitializing Scala interpreter ..."
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
    "source": [
     "val query1 = spark.sql(\"SELECT COUNT(*) FROM articles WHERE sectionName='Politics'\")\n",
     "query1.show()\n",
@@ -191,7 +291,17 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Intitializing Scala interpreter ..."
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
    "source": [
     "val query2 = articlesDF.groupBy(\"sectionName\").count()\n",
     "query2.show(false)\n",
@@ -216,7 +326,17 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Intitializing Scala interpreter ..."
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
    "source": [
     "val query3 = spark.sql(\n",
     " \"SELECT a.headline, COUNT(c.commentID) AS numComments FROM articles a, comments c WHERE a.articleID = c.articleID GROUP BY a.headline\" )\n",
@@ -242,7 +362,17 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Intitializing Scala interpreter ..."
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
    "source": [
     "val query4 = spark.sql(\" SELECT headline, byline, pubDate FROM articles WHERE headline RLIKE \\\"2016\\\" \")\n",
     "query4.show(false)\n",
@@ -267,7 +397,17 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Intitializing Scala interpreter ..."
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
    "source": [
     "val query5 = articlesDF\n",
     "    .join(commentsDF, articlesDF(\"articleID\") === commentsDF(\"articleID\"))\n",
-- 
2.43.0
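
For reference, the two non-trivial cells added above can be sanity-checked in a
plain spark-shell session. The permutations cell is completed here with flatMap
(rather than map) so the result is a flat RDD of strings instead of an RDD of
lists. A minimal sketch, assuming sc is bound to the SparkContext, as it is in
the notebook:

    // Average via fold: pair every element with a count of 1, then fold the
    // (sum, count) pairs component-wise and divide.
    val fibs20 = sc.parallelize(List(0, 1, 1, 2, 3, 5, 8, 13, 21, 34,
      55, 89, 144, 233, 377, 610, 987, 1597, 2584, 4181))
    val (sum, count) = fibs20
      .map(x => (x, 1))
      .fold((0, 0))((a, b) => (a._1 + b._1, a._2 + b._2))
    val avg = sum / count.toFloat  // 10945 / 20 = 547.25

    // Permutations via flatMap: each word expands to the list of its distinct
    // permutations, flattened into a single RDD[String].
    val words = sc.parallelize(List("automaton", "language", "logic", "closure"))
    def permutate(word: String) = word.permutations.toList
    words.flatMap(permutate).count()  // 45360 + 10080 + 120 + 5040 = 60600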