% Zuo, Zhang & Joe-Wong, AAAI 2020 — MAB with paid pre-observations (OBP-UCB, C/D-MP-OBP).
% AAAI proceedings are published journal-style via OJS, hence @article with volume/number.
@article{Zuo_Zhang_Joe-Wong_2020,
  author   = {Zuo, Jinhang and Zhang, Xiaoxi and Joe-Wong, Carlee},
  title    = {Observe Before Play: Multi-Armed Bandit with Pre-Observations},
  journal  = {Proceedings of the {AAAI} Conference on Artificial Intelligence},
  volume   = {34},
  number   = {04},
  pages    = {7023--7030},
  year     = {2020},
  month    = apr,
  doi      = {10.1609/aaai.v34i04.6187},
  url      = {https://ojs.aaai.org/index.php/AAAI/article/view/6187},
  abstract = {We consider the stochastic multi-armed bandit (MAB) problem in a setting
              where a player can pay to pre-observe arm rewards before playing an arm in
              each round. Apart from the usual trade-off between exploring new arms to
              find the best one and exploiting the arm believed to offer the highest
              reward, we encounter an additional dilemma: pre-observing more arms gives a
              higher chance to play the best one, but incurs a larger cost. For the
              single-player setting, we design an Observe-Before-Play Upper Confidence
              Bound (OBP-UCB) algorithm for $K$ arms with Bernoulli rewards, and prove a
              $T$-round regret upper bound $O(K^2 \log T)$. In the multi-player setting,
              collisions will occur when players select the same arm to play in the same
              round. We design a centralized algorithm, C-MP-OBP, and prove its $T$-round
              regret relative to an offline greedy strategy is upper bounded in
              $O(K^4/M^2 \log T)$ for $K$ arms and $M$ players. We also propose
              distributed versions of the C-MP-OBP policy, called D-MP-OBP and
              D-MP-Adapt-OBP, achieving logarithmic regret with respect to collision-free
              target policies. Experiments on synthetic data and wireless channel traces
              show that C-MP-OBP and D-MP-OBP outperform random heuristics and offline
              optimal policies that do not allow pre-observations.},
}