% Stanford InfoLab technical report 2002-9: Cho and Garcia-Molina, "Parallel Crawlers" (February 2002).
@techreport{ilprints733,
  author      = {Cho, Junghoo and Garcia-Molina, Hector},
  title       = {Parallel Crawlers},
  type        = {Technical Report},
  number      = {2002-9},
  institution = {Stanford InfoLab},
  publisher   = {Stanford},
  month       = feb,
  year        = {2002},
  keywords    = {Web crawler, parallelism, distributed crawler},
  url         = {http://ilpubs.stanford.edu:8090/733/},
  abstract    = {In this paper we study how we can design an effective parallel crawler. As the size of the Web grows, it becomes imperative to parallelize a crawling process, in order to finish downloading pages in a reasonable amount of time. We first propose multiple architectures for a parallel crawler and identify fundamental issues related to parallel crawling. Based on this understanding, we then propose metrics to evaluate a parallel crawler, and compare the proposed architectures using 40 million pages collected from the Web. Our results clarify the relative merits of each architecture and provide a good guideline on when to adopt which architecture.},
}