% Patel, Caragea, Caragea, Giles (2021): "Author Homepage Discovery in CiteSeerX",
% Proceedings of the AAAI Conference on Artificial Intelligence, vol. 35, no. 17.
% NOTE(review): abstractNote is not a standard BibTeX field and is presumably
% ignored by standard styles; the field name is kept unchanged on purpose.
@article{Patel_Caragea_Caragea_Giles_2021,
  author       = {Patel, Krutarth and Caragea, Cornelia and Caragea, Doina and Giles, C. Lee},
  title        = {Author Homepage Discovery in {CiteSeerX}},
  journal      = {Proceedings of the {AAAI} Conference on Artificial Intelligence},
  volume       = {35},
  number       = {17},
  pages        = {15146--15155},
  year         = {2021},
  month        = may,
  doi          = {10.1609/aaai.v35i17.17778},
  url          = {https://ojs.aaai.org/index.php/AAAI/article/view/17778},
  abstractNote = {Scholarly digital libraries provide access to scientific publications and comprise useful resources for researchers. CiteSeerX is one such digital library search engine that provides access to more than 10 million academic documents. We propose a novel search-driven approach to build and maintain a large collection of homepages that can be used as seed URLs in any digital library including CiteSeerX to crawl scientific documents. Precisely, we integrate Web search and classification in a unified approach to discover new homepages: first, we use publicly-available author names and research paper titles as queries to a Web search engine to find relevant content, and then we identify the correct homepages from the search results using a powerful deep learning classifier based on Convolutional Neural Networks. Moreover, we use Self-Training in order to reduce the labeling effort and to utilize the unlabeled data to train the efficient researcher homepage classifier. Our experiments on a large scale dataset highlight the effectiveness of our approach, and position Web search as an effective method for acquiring authors' homepages. We show the development and deployment of the proposed approach in CiteSeerX and the maintenance requirements.},
}