ex1/main_3.tex

   1 %ex1.3
   2
   3 \begin{enumerate}[label=(\alph*)]
   4 % (a)
   5 \item
   6 $ numGroups = 8 $ \\
   7 $ numRecords = 4000 $ \\
   8 $ bucketHeight = numRecords / numGroups = 500 $ \\
   9
  10 \begin{enumerate}[label=\roman*)]
  11 \item
  12 $ numBuckets = 3 + \frac{4}{7} = \frac{25}{7} $ \\
  13 $ selectivity = \frac{numBuckets * bucketHeight}{numRecords} = \frac{\frac{25}{7} * 500}{4000} \approx 0.446 \approx 44.6\% $ \\
  14
  15 \item
  16 $ numBuckets = \frac{175-140}{175-133} = \frac{35}{42} $ \\
  17 $ selectivity = \frac{numBuckets * bucketHeight}{numRecords} = \frac{\frac{35}{42} * 500}{4000} \approx 0.104 \approx 10.4\% $ \\
  18 \end{enumerate}
  19
  20 % (b)
  21 \item
  22 All values are most likely unique (stored value count is 180, while maximum
  23 value is 175).
  24
  25 \begin{enumerate}[label=\roman*)]
  26 \item
  27 Because of the uniqueness assumption, the result set has size 1.\\
  28
  29 $ selectivity = \frac{1}{180} \approx 0.006 \approx 0.6\% $\\
  30 \item
  31 $ selectivity = (1 - \frac{1}{180}) \approx 0.994 \approx 99.4\% $\\
  32 \end{enumerate}
  33
  34 Tradeoffs: we have to update the buckets distribution regularly, so we either
  35 work with outdated data or we update the buckets regularly.\\
  36
  37 % (c)
  38 \item
  39 Known Distribution Table:
  40
  41 \begin{tabular}{ c c c }
  42  Value & Frequency & Occurrences \\
  43  5 & 11\% & 440 \\
  44  9 & 8\% & 320 \\
  45  15 & 8\% & 320 \\
  46  26 & 3\% & 120
  47 \end{tabular}
  48
  49 \begin{enumerate}[label=\roman*)]
  50 \item
  51 Known Excluded by $votes$ $!=$ $9$: $320$ \\
  52 Known Included by $votes$ $<$ $30$: $440 + 320 + 120 = 880$ \\
  53 Assumed uniform distributed values: $ numRecords - knownExcluded - knownIncluded = 4000 - 880 - 320 = 2800$\\
  54
  55 Number of values below 30: $30$\\
  56 Number of known values below 30: $4$\\
  57 Average rows per unique votes value:\\
  58 $avgRowsPerVotes = \frac{2800}{175-4} \approx 16.37$\\
  59 Assumed uniform values below 30:\\
  60 $avgRowsPerVotes * (30-4) = 16.37 * 26 \approx 425.73$\\
  61
  62 Total values below 30: $knownIncluded + assumedUniformValues = 880 + 425.73 = 1305.73$\\
  63 $selectivity = totalValues / numRecords = 1305.73 / 4000 \approx 0.326 \approx 32.6\%$\\
  64
  65 \item
  66 Known rows from first filter (votes = 15): 320\\
  67 Assumed rows $> 50$: $avgRowsPerVote * (175-50) = 16.37 * 125 \approx 2046.78$\\
  68 Assume no overlaps, so just sum up both selectivities:\\
  69 $assumedRows = knownIsFifteen + assumedOverFifty = 320 + 2046.78 = 2366.78$\\
  70 $selectivity = assumedRows / numRecords = 2366.78 / 4000 = 0.5916 \approx 59.2\%$\\
  71 \end{enumerate}
  72
  73 \item
  74 \begin{enumerate}[label=\roman*)]
  75 \item Selector over course:\\
  76 Filter: $coursename$ $=$ $\textquotedbl$ADBS$\textquotedbl$ $OR$ $ects > 6$\\
  77
  78 Again assume no overlap (as ADBS has 6 ects), so OR just gets summed up.
  79
  80 We'd assume the same even without knowledge of how many ECTS ADBS has.
  81
  82 Assume uniform distribution over $course.coursename$:\\
  83 $numADBSCourses = (1 / distinctCourses) * numRows = (1/490) * 540 = 1.102$\\
  84 $bucketSize = (numRows / numBuckets) = 540 / 5 = 108$\\
  85 $bucketsOverSix = 1 + \frac{2}{3} = \frac{5}{3}$\\
  86 $coursesOverSix = bucketsOverSix * bucketSize = \frac{5}{3} * 108 = 180$\\
  87 $selectivity = (1.102 + 180) / 540 = 181.102 / 540 = 0.335 \approx 33.5\%$
  88
  89 \item Selector over room:\\
  90 Filter: $capacity < 300$ $AND$ $building$ $!=$ \textquotedbl$Freihaus$\textquotedbl\\
  91
  92 $bucketSize = (numRows / numBuckets) = 1700 / 5 = 340$\\
  93 $bucketsBelow300 = 4.5$\\
  94 $roomsBelow300 = bucketsBelow300 * bucketSize = 4.5 * 340 = 1530$\\
  95 $remainingBuildings = 10$\\
  96 $roomsNotInFreihaus = remainingBuildings / numBuildings * numRooms = 1545.45$\\
  97 $selectivity = math.min(roomsBelow300, roomsNotInFreihaus) / numRooms$\\
  98 $selectivity = math.min(1530, 1545.45) / 1700 = 1530 / 1700 = 0.9 = 90\%$
  99 \end{enumerate}
 100
 101 Join Order:
 102
 103 Everything should be joined to $reservation$, as there are no common
 104 columns for $room$ and $course$.
 105
 106 The table $reservation$ is the largest table without any WHERE clauses,
 107 $course$ has the lowest selectivity.
 108
 109 Therefor we'd join first $reservation$ and $course$, and then join to $room$.
 110
 111 % (d)
 112 \item
 113 Selected Join Strategies:
 114
 115 Hash Join for join $reservation$ to $course$, this should
 116 reduce the table space quite far.
 117
 118 Both selections have to be done, because they are connected via OR.
 119
 120 Hash Join for joining the result to $room$, as we don't know
 121 if any of the results are sorted.
 122
 123 First the restriction to room capacity is applied (excludes a
 124 little bit more than the other restriction), then the restriction
 125 to buildings other than ``Freihaus''.
 126
 127 \begin{verbatim}
 128 Hash Join
 129     Hash Cond: (reservation.room = room.name)
 130     ->  Hash Join
 131         Hash Cond: (reservation.course = course.courseid)
 132         ->  Seq Scan on reservation
 133         ->  Hash
 134             -> Seq Scan on course
 135                Filter: ((coursename)::text = 'ADBS'::text OR (ects)::integer > 6)
 136     -> Hash
 137        ->  Seq Scan on room
 138            Filter: ((capacity)::integer < 300::integer AND
 139                     (building)::text != 'Freihaus'::text)
 140 \end{verbatim}
 141 \end{enumerate}