Pagination and saving patterns for scraping¶
The most common pagination pattern for scraping is "click until this stops working."
Click until this stops working¶
# Keep doing this until you fail to click 'next'
while True:
    # Scrape the contents of the page

    try:
        # Try to find and click the next button...
        # (wait up to five seconds for the button to appear)
        await page.locator(".next-btn").click(timeout=5000)
    except Exception as e:
        # If it fails, exit the loop
        print("Failed with", e)
        break
If you're using requests and BeautifulSoup instead of Playwright, the same pattern looks like this:

# Get the first page of results
response = requests.get(url)
doc = BeautifulSoup(response.text)

# Keep doing this until you fail to find a 'next' link
while True:
    # Scrape the contents of the page

    try:
        # Go to the page the next button points at
        # (if it doesn't exist, it will throw an error)
        next_url = doc.find("a", class_="next-btn")['href']
        response = requests.get(next_url)
        # Extra bonus: raise an error if not 200 (404, etc)
        response.raise_for_status()
        doc = BeautifulSoup(response.text)
    except Exception as e:
        # If it fails, exit the loop
        print("Failed with", e)
        break
Click until the nth page¶
Sometimes that doesn't work, and you need to click the next button a specific number of times. In this case, we are going to find the element that says how many pages of results there are, and then click the next button that many times.
# Get the first page of results
response = requests.get(url)
doc = BeautifulSoup(response.text)

# Find out how many pages of results there are
page_count = int(doc.find("span", class_="page-count").text)

for page_num in range(page_count):
    # Scrape the contents of the page

    # Go to the next page (skip this on the last page,
    # which may not have a 'next' link)
    if page_num < page_count - 1:
        next_url = doc.find("a", class_="next-btn")['href']
        response = requests.get(next_url)
        doc = BeautifulSoup(response.text)
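If you're using Playwright, a rough equivalent is to read the page count and then click the next button in a loop. This is just a sketch that reuses the .page-count and .next-btn selectors from the examples above – swap in whatever your page actually uses.

# Find out how many pages of results there are
page_count = int(await page.locator(".page-count").text_content())

for page_num in range(page_count):
    # Scrape the contents of the page

    # Click the next button on every page except the last one
    if page_num < page_count - 1:
        await page.locator(".next-btn").click(timeout=5000)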
Just change the URL¶
Instead of clicking, oftentimes it's the URL that changes from page to page. In that case you can just change the URL yourself and scrape each new page!
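For example, if the site numbers its pages with something like ?page=2 in the URL (yours will look different), you can loop through the page numbers yourself:

# Scrape pages 1 through 29 by building each URL directly
# (the ?page= parameter here is just an example – check your own site's URLs)
for page_num in range(1, 30):
    url = f"https://example.com/results?page={page_num}"
    response = requests.get(url)
    doc = BeautifulSoup(response.text)
    # Scrape the contents of the page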
Note that range(1, 30) means "count from 1 to 29." The range function does NOT include the last number! If you wanted to go from pages 1 to 30 you'd need to do range(1, 31).
Scrape pages of tables¶
This is not a pagination pattern, but rather a pattern of what to do with the content. If each page of results is a table, you can scrape the table and then click the next button. At the end you can combine all of the tables into one big pandas dataframe.
# Start with an empty list of dataframes
dataframes = []

while True:
    # Grab all the tables from the page
    tables = pd.read_html(await page.content())

    # In this case, we want the first one
    df = tables[0]

    # Add it to the list of dataframes
    dataframes.append(df)

    # Click the next button
    try:
        await page.locator(".next-btn").click(timeout=5000)
    except Exception as e:
        # If it fails, exit the loop
        print("Failed with", e)
        break

# Combine all the dataframes into one big dataframe
df = pd.concat(dataframes, ignore_index=True)
The same idea with requests and BeautifulSoup instead of Playwright:

# Get the first page of results
response = requests.get(url)
doc = BeautifulSoup(response.text)

# Start with an empty list of dataframes
dataframes = []

for page_num in range(1, 30):
    # Grab all the tables from the page
    tables = pd.read_html(response.text)

    # In this case, we want the first one
    df = tables[0]

    # Add it to the list of dataframes
    dataframes.append(df)

    # Get the next page
    try:
        next_url = doc.find("a", class_="next-btn")['href']
        response = requests.get(next_url)
        response.raise_for_status()
        doc = BeautifulSoup(response.text)
    except:
        break

# Combine all the dataframes into one big dataframe
df = pd.concat(dataframes, ignore_index=True)
We use ignore_index=True so the index (the numbers on the left-hand side) counts straight up from zero instead of repeating for each page's table.
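If you're curious about the difference, here's a tiny sketch with made-up dataframes:

import pandas as pd

df1 = pd.DataFrame({"value": [1, 2]})
df2 = pd.DataFrame({"value": [3, 4]})

# Without ignore_index the row labels repeat: 0, 1, 0, 1
print(pd.concat([df1, df2]).index.tolist())

# With ignore_index they run straight through: 0, 1, 2, 3
print(pd.concat([df1, df2], ignore_index=True).index.tolist())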
Scrape pages of elements¶
Scraping pages of individual elements is slightly different from scraping pages of tables.
In this case, you have an inner loop where you scrape each element on the page, and an outer loop where you click the next button and go to the next page.
all_data = []

while True:
    # Grab all the stories from the page
    stories = page.locator(".story")
    story_count = await stories.count()

    # Loop through each story
    for i in range(story_count):
        story = stories.nth(i)

        # Scrape the story
        story_data = {}
        story_data["title"] = await story.locator(".title").text_content()
        story_data["author"] = await story.locator(".author").text_content()
        story_data["date"] = await story.locator(".date").text_content()
        all_data.append(story_data)

    # Click the next button
    try:
        await page.locator(".next-btn").click(timeout=5000)
    except:
        break

df = pd.DataFrame(all_data)
And the requests and BeautifulSoup version:

# Get the first page of results
response = requests.get(url)
doc = BeautifulSoup(response.text)

all_data = []

while True:
    # Grab all the stories from the page
    stories = doc.find_all("div", class_="story")

    # Loop through each story
    for story in stories:
        # Scrape the story
        story_data = {}
        story_data["title"] = story.find("h2", class_="title").text
        story_data["author"] = story.find("p", class_="author").text
        story_data["date"] = story.find("p", class_="date").text
        all_data.append(story_data)

    # Go to the next page
    try:
        next_url = doc.find("a", class_="next-btn")['href']
        response = requests.get(next_url)
        doc = BeautifulSoup(response.text)
    except:
        break

df = pd.DataFrame(all_data)
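Once everything is in a dataframe, you'll usually want to save it. A common way is writing it to a CSV file (the filename here is just an example):

# Save the combined results to a CSV file
# index=False keeps pandas from writing the row numbers as an extra column
df.to_csv("scraped-results.csv", index=False)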