\documentclass[11pt]{amsart}
\usepackage{amsmath,amsthm,amsfonts,amssymb}
\usepackage[usenames]{xcolor}
\usepackage{enumerate}
\usepackage{fullpage}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
\newtheorem{proposition}[theorem]{Proposition}
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\theoremstyle{plain}
\newcommand{\replacethistext}[1]{{\color{red} #1}}
\newcommand{\E}{\mathrm{E}}
\newcommand{\median}{\mathrm{median}}
\newcommand{\rank}{\mathrm{rank}}
\newcommand{\Var}{\mathrm{Var}}
\title{
CS 466/666: Algorithm Design and Analysis \\
Problem Set 2}
\author{Due on: October 4, 2017}
\begin{document}
\begin{flushright}
\textbf{
\replacethistext{Your name} \\
\replacethistext{Your student number}
}
\end{flushright}
\medskip
\maketitle
\section*{Acknowledgements.}
\replacethistext{Acknowledge {\bf all collaborators} with whom you discussed any of the problems in this problem set and {\bf all external sources} that you may have used in the completion of the assignment. See the website for the full homework policy.}
%%% PROBLEM 1 %%%
\section{Problem 1: Middle of the road}
For this problem, we will examine a question that comes up in the study of quicksort and a number of other applications as well.
For some integers $x_1,\ldots,x_n \in \mathbb{Z}$, we define the \emph{rank} of an integer $y \in \mathbb{Z}$ to be the number of elements in the list $x_1,\ldots,x_n$ that are no larger than $y$. Formally, $\rank(y) = |\{i \in [n] : x_i \le y\}|$. Let $X$ be a random variable obtained by drawing $i \in [n]$ uniformly at random and setting $X = x_i$.
\begin{enumerate}[(a)]
\item Establish the expected value of the rank of $X$.
\begin{lemma}
The rank of $X$ has expected value $\E[ \rank(X) ] = \replacethistext{\mbox{(enter the correct value here)}}$.
\end{lemma}
\begin{proof}
\replacethistext{Enter your answer here.}
\end{proof}
\item Establish the variance of the rank of $X$.
\begin{lemma}
The rank of $X$ has variance $\Var[ \rank(X) ] = \replacethistext{\mbox{(enter the correct value here)}}$.
\end{lemma}
\begin{proof}
\replacethistext{Enter your answer here.}
\end{proof}
\item Show that the rank of $X$ is neither very small nor very large except with small probability.
\begin{lemma}
\label{lem:concentration}
Let $E$ denote the event that the rank of $X$ satisfies $\rank(X) \le \frac{n}{10}$ or $\rank(X) \ge \frac{9n}{10}$. Then $\Pr[ E ] \le \replacethistext{\mbox{(enter the correct value here)}}$.
\end{lemma}
\begin{proof}
\replacethistext{Enter your answer here.}
\end{proof}
\item Let $X_1,\ldots,X_t$ be $t$ copies of $X$ drawn independently at random for some odd $t \ge 1$. Show that the probability that the median of these values has very small or very large rank is exponentially small in $t$.
\begin{lemma}
Let $p$ be any value satisfying $p \ge \Pr[E]$, where $E$ is the event defined in Lemma~\ref{lem:concentration}. Then for any odd $t \ge 1$, the rank of the variable $Y = \median(X_1,\ldots,X_t)$ satisfies
\[
\Pr\Big[ \rank(Y) \le \tfrac{n}{10} \mbox{ or } \rank(Y) \ge \tfrac{9n}{10}\Big] \le
\binom{t}{\lceil t/2 \rceil} p^{\lceil t/2 \rceil} \le 2^t p^{t/2} = (4p)^{t/2}.
\]
\end{lemma}
\begin{proof}
\replacethistext{Enter your answer here.}
\end{proof}
\bigskip
\noindent \emph{Hint.}
An extremely useful tool for this and many other problems is the \emph{union bound}, which states that for any events $E_1, E_2,\ldots,E_m$ we have $\Pr[ E_1 \vee E_2 \vee \cdots \vee E_m] \le \sum_{i \le m} \Pr[ E_i ]$.
\end{enumerate}
%%% PROBLEM 2 %%%
\newpage
\section{Problem 2: Counting with accuracy}
\begin{enumerate}[(a)]
\item Prove the following fundamental fact about averages of samples.
\begin{lemma}
Let $X_1,\ldots,X_t$ be independent random variables each with expected value $\E[X_i] = \mu$ and variance $\Var[X_i] = \sigma^2$. Then the variable $X = \frac{X_1 + X_2 + \cdots + X_t}{t}$ satisfies
\[
\E[X] = \mu
\qquad \mbox{and} \qquad
\Var[X] = \frac{\sigma^2}{t}.
\]
\end{lemma}
\begin{proof}
\replacethistext{Enter your answer here.}
\end{proof}
\item Determine the variance of the Morris counter seen in class.
\begin{lemma}
The random variable $Z = 2^{C_k} - 1$ corresponding to the value returned by the Morris counter when the stream contains $k$ ones has variance $\Var[Z] = \binom{k}{2}$.
\end{lemma}
\begin{proof}
\replacethistext{Enter your answer here.}
\end{proof}
\bigskip
\noindent \emph{Hint.}
You may want to use a proof by induction to show that $\E[ (2^{C_k})^2 ] = \frac32 k^2 + \frac32 k + 1$.
\bigskip
\item Use the above two results to show that we can design approximate counters with arbitrary accuracy by running some number $t \ge 1$ of copies of the Morris counter in parallel and averaging their estimates.
\begin{theorem}
For any $\epsilon > 0$ and $n \ge 1$, there is a value $t \ge 1$ for which the algorithm $\mathcal{A}$ that runs $t$ copies of the Morris counter is an $\epsilon$-approximate counter for inputs $x_1,\ldots,x_n \in \{0,1\}$ and has memory complexity $O(t \log \log n)$.
\end{theorem}
\begin{proof}
\replacethistext{Enter your answer here.}
\end{proof}
\bigskip
\noindent \emph{Remark.}
Your proof should specify exactly what the value of $t$ should be (as a function of $n$ and/or $\epsilon$). As an aside, is the algorithm $\mathcal{A}$ you obtain this way a very efficient approximate counter?
\end{enumerate}
%%% PROBLEM 3 %%%
\newpage
\section{Problem 3: Counting with confidence}
\begin{enumerate}[(a)]
\item Show that when $E_1,\ldots,E_t$ are independent events that each occur with small probability, the probability that more than half of the events occur is exponentially small in $t$.
\begin{lemma}
Let $E_1, E_2, \ldots, E_t$ be independent events that each occur with probability $\Pr[E_i] = \mu \le \frac13$. Then
\[
\Pr\Big[ \left|\big\{i \le t : E_i \mbox{ occurs} \big\}\right| > \tfrac t2 \Big] \le \binom{t}{\lceil t/2 \rceil} \mu^{t/2} \le (4\mu)^{t/2}.
\]
\end{lemma}
\begin{proof}
\replacethistext{Enter your answer here.}
\end{proof}
\item Using the lemma above, show that the probability of error of approximate counters can be reduced to any $\delta > 0$ with only a small multiplicative factor increase in the memory cost.
\begin{theorem}
Let $\mathcal{A}$ be an $\epsilon$-approximate counter with memory complexity $m$. Then for any $\delta > 0$, there is a randomized streaming algorithm $\mathcal{A}'$ with memory complexity $O(m \log \frac1\delta)$ which outputs an approximate count $\tilde{c}$ that satisfies $(1-\epsilon)k \le \tilde{c} \le (1+\epsilon)k$ with probability at least $1-\delta$ when $k = \sum_i x_i$ is the number of ones in the stream.
\end{theorem}
\begin{proof}
\replacethistext{Enter your answer here.}
\end{proof}
\end{enumerate}
%%% PROBLEM 4 %%%
\newpage
\section{Problem 4: Impossibility}
I have argued in class that the only way to beat the na\"ive counter's memory requirements in streaming algorithms is to consider randomized algorithms that are allowed to err with some probability. In this question, you will prove this assertion.
\begin{enumerate}[(a)]
\item Prove that every deterministic (exact) counter has memory complexity $\Omega(n)$ for streams of length $n$.
\begin{theorem}
Let $\mathcal{A}$ be a deterministic streaming algorithm that outputs the number of 1s observed in the stream $x_1,\ldots,x_n \in \{0,1\}$. Then $\mathcal{A}$ must have at least $\lceil \log n \rceil$ bits of memory.
\end{theorem}
\begin{proof}
\replacethistext{Enter your answer here.}
\end{proof}
\bigskip
\noindent \emph{Hint.}
The \emph{pigeonhole principle}, which says that if you put $m$ pigeons into $\ell < m$ boxes then you must end up with a box that contains at least 2 pigeons, may be useful in your proof.
\bigskip
\item Define the \emph{worst-case memory complexity} of a randomized algorithm to be the maximum number of bits of memory that the algorithm requires over all inputs \emph{and} all possible values of the random bits that it uses. Use the result from part (a) to show that randomized algorithms which output the correct count all the time also have large memory complexity.
\begin{theorem}
Let $\mathcal{A}$ be a randomized streaming algorithm that always correctly outputs the number of 1s observed in the stream $x_1,\ldots,x_n \in \{0,1\}$. Then $\mathcal{A}$ has worst-case memory complexity at least $\lceil \log n \rceil$.
\end{theorem}
\begin{proof}
\replacethistext{Enter your answer here.}
\end{proof}
\bigskip
\noindent \emph{Hint.}
Can you show that if $\mathcal{A}$ has memory complexity $m$, then there is also a deterministic streaming algorithm for counting that also requires only $m$ bits of memory?
\bigskip
\item {\bf Bonus.} Extend the results above to show that for any $\epsilon < 1$, every $\epsilon$-approximate counter which \emph{always} outputs an estimate in the range $(1-\epsilon)k \le \tilde{c} \le (1+\epsilon)k$ also has memory complexity $\Omega(\log n)$.
\end{enumerate}
\end{document}