% Stanford InfoLab Technical Report 1999-22 (Cho and Garcia-Molina, 1999).
% The word Web is brace-protected in the title so that sentence-casing styles
% keep its capital letter. Author names use the "Last, First" form so that
% name parsing does not depend on capitalization heuristics.
% NOTE(review): "publisher" is not among the standard fields for this entry
% type and is presumably ignored by standard styles; it is kept so no data is
% lost -- confirm whether any downstream tool reads it.
@techreport{ilprints376,
  author      = {Cho, J. and Garcia-Molina, H.},
  title       = {The Evolution of the {Web} and Implications for an Incremental Crawler},
  institution = {Stanford InfoLab},
  type        = {Technical Report},
  number      = {1999-22},
  year        = {1999},
  publisher   = {Stanford},
  keywords    = {web evolution, incremental crawler, web change model},
  url         = {http://ilpubs.stanford.edu:8090/376/},
  abstract    = {In this paper we study how to build an effective incremental crawler. The crawler selectively and incrementally updates its index and/or local collection of web pages, instead of periodically refreshing the collection in batch mode. The incremental crawler can improve the ``freshness'' of the collection significantly and bring in new pages in a more timely manner. We first present results from an experiment conducted on more than half million web pages over 4 months, to estimate how web pages evolve over time. Based on these experimental results, we compare various design choices for an incremental crawler and discuss their trade-offs. We propose an architecture for the incremental crawler, which combines the best design choices.},
}