"cell_type": "code",
"execution_count": null,
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Intitializing Scala interpreter ..."
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
+ "outputs": [],
"source": [
"val fibs20 = sc.parallelize(List( 0, 1, 1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584, 4181)) "
]
"cell_type": "code",
"execution_count": null,
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Intitializing Scala interpreter ..."
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
+ "outputs": [],
"source": [
"val evenFibs20 = fibs20.filter(x => (x % 2 == 0)).collect()"
]
"cell_type": "code",
"execution_count": null,
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Intitializing Scala interpreter ..."
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
+ "outputs": [],
"source": [
"val avg1map = fibs20.map(x => (x, 1))\n",
"val avg1fold = avg1map.fold (0,0) ((x,y) => (x._1 + y._1, x._2 + y._2))\n",
"metadata": {},
"outputs": [],
"source": [
- "val wordlist = words.map(word => permutate(word)).collect().flatMap(x => x)"
+ "val wordlist = words.map(word => permutate(word)).flatMap(x => x).collect()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Intitializing Scala interpreter ..."
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
+ "outputs": [],
"source": [
"val query2 = articlesDF.groupBy(\"sectionName\").count()\n",
"query2.show(false)\n",
"cell_type": "code",
"execution_count": null,
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Intitializing Scala interpreter ..."
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
+ "outputs": [],
"source": [
"val query3 = spark.sql(\n",
" \"SELECT a.headline, COUNT(c.commentID) AS numComments FROM articles a, comments c WHERE a.articleID = c.articleID GROUP BY a.headline\" )\n",
"cell_type": "code",
"execution_count": null,
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Intitializing Scala interpreter ..."
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
+ "outputs": [],
"source": [
"val query4 = spark.sql(\" SELECT headline, byline, pubDate FROM articles WHERE headline RLIKE \\\"2016\\\" \")\n",
"query4.show(false)\n",
"cell_type": "code",
"execution_count": null,
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Intitializing Scala interpreter ..."
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
+ "outputs": [],
"source": [
"val query5 = articlesDF\n",
" .join(commentsDF, articlesDF(\"articleID\") === commentsDF(\"articleID\"))\n",
"\n",
"#### Use the Spark Internal Web UI to analyse the dependencies and stages of the queries, and try to determine which commands on which Dataframes are executed as wide dependencies and which as narrow dependencies. \n"
]
+ },
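+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "A rough sketch of how to cross-check the dependency types without leaving the notebook (assuming `fibs20` and `query2` from the cells above are still in scope): `toDebugString` prints an RDD's lineage, where each new indentation level marks a shuffle (stage) boundary, and `explain()` prints a DataFrame's physical plan, where `Exchange` nodes mark wide dependencies."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "// Narrow dependencies: map and filter are applied per partition, no shuffle needed.\n",
+ "val narrow = fibs20.map(x => x * 2).filter(x => x > 10)\n",
+ "// Wide dependency: reduceByKey must shuffle all values with the same key together.\n",
+ "val wide = fibs20.map(x => (x % 3, x)).reduceByKey((a, b) => a + b)\n",
+ "// Each extra indentation level in the lineage marks a stage (shuffle) boundary.\n",
+ "println(narrow.toDebugString)\n",
+ "println(wide.toDebugString)\n",
+ "// For DataFrames, Exchange nodes in the physical plan indicate shuffles.\n",
+ "query2.explain()"
+ ]
+ },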
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
}
],
"metadata": {