Pagination and saving patterns for scraping¶
The most common pagination pattern for scraping is "click until this stops working."
Click until this stops working¶
# Keep doing this until you fail to click 'next'
while True:
    # Scrape the contents of the page

    try:
        # Try to find and click the next button...
        # (wait up to five seconds for the button to appear)
        await page.locator(".next-btn").click(timeout=5000)
    except Exception as e:
        # If it fails, exit the loop
        print("Failed with", e)
        break
If you're using requests and BeautifulSoup instead of Playwright, the same pattern looks like this:

# Get the first page of results
response = requests.get(url)
doc = BeautifulSoup(response.text)

# Keep doing this until you fail to find a 'next' link
while True:
    # Scrape the contents of the page

    try:
        # Go to the page the next button points at
        # (if it doesn't exist, it will throw an error)
        next_url = doc.find("a", class_="next-btn")['href']
        response = requests.get(next_url)
        # Extra bonus: raise an error if not 200 (404, etc)
        response.raise_for_status()
        doc = BeautifulSoup(response.text)
    except Exception as e:
        # If it fails, exit the loop
        print("Failed with", e)
        break
Click until the nth page¶
Sometimes that doesn't work, and you need to click the next button a specific number of times. In this case, we are going to find the element that says how many pages of results there are, and then click the next button that many times.
# Get the first page of results
response = requests.get(url)
doc = BeautifulSoup(response.text)

# Find out how many pages of results there are
page_count = int(doc.find("span", class_="page-count").text)

for page_num in range(page_count):
    # Scrape the contents of the page

    # Go to the next page (skip this on the last page,
    # which may not have a 'next' link)
    if page_num < page_count - 1:
        next_url = doc.find("a", class_="next-btn")['href']
        response = requests.get(next_url)
        doc = BeautifulSoup(response.text)
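If you're using Playwright, a rough equivalent is to read the page count and then click the next button in a loop. This is just a sketch that reuses the .page-count and .next-btn selectors from the examples above – swap in whatever your page actually uses.

# Find out how many pages of results there are
page_count = int(await page.locator(".page-count").text_content())

for page_num in range(page_count):
    # Scrape the contents of the page

    # Click the next button on every page except the last one
    if page_num < page_count - 1:
        await page.locator(".next-btn").click(timeout=5000)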
Just change the URL¶
Instead of clicking, oftentimes it's the URL that changes from page to page. In that case you can just change the URL yourself and scrape each new page!
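For example, if the site numbers its pages with something like ?page=2 in the URL (yours will look different), you can loop through the page numbers yourself:

# Scrape pages 1 through 29 by building each URL directly
# (the ?page= parameter here is just an example – check your own site's URLs)
for page_num in range(1, 30):
    url = f"https://example.com/results?page={page_num}"
    response = requests.get(url)
    doc = BeautifulSoup(response.text)
    # Scrape the contents of the page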
Note that range(1, 30) means "count from 1 to 29." The range function does NOT include the last number! If you wanted to go from pages 1 to 30 you'd need to do range(1, 31).
Scrape pages of tables¶
This is not a pagination pattern, but rather a pattern of what to do with the content. If each page of results is a table, you can scrape the table and then click the next button. At the end you can combine all of the tables into one big pandas dataframe.
# Start with an empty list of dataframes
dataframes = []

while True:
    # Grab all the tables from the page
    tables = pd.read_html(await page.content())

    # In this case, we want the first one
    df = tables[0]

    # Add it to the list of dataframes
    dataframes.append(df)

    # Click the next button
    try:
        await page.locator(".next-btn").click(timeout=5000)
    except Exception as e:
        # If it fails, exit the loop
        print("Failed with", e)
        break

# Combine all the dataframes into one big dataframe
df = pd.concat(dataframes, ignore_index=True)
The same idea with requests and BeautifulSoup instead of Playwright:

# Get the first page of results
response = requests.get(url)
doc = BeautifulSoup(response.text)

# Start with an empty list of dataframes
dataframes = []

for page_num in range(1, 30):
    # Grab all the tables from the page
    tables = pd.read_html(response.text)

    # In this case, we want the first one
    df = tables[0]

    # Add it to the list of dataframes
    dataframes.append(df)

    # Get the next page
    try:
        next_url = doc.find("a", class_="next-btn")['href']
        response = requests.get(next_url)
        response.raise_for_status()
        doc = BeautifulSoup(response.text)
    except:
        break

# Combine all the dataframes into one big dataframe
df = pd.concat(dataframes, ignore_index=True)
We use ignore_index=True so the index (the numbers on the left-hand side) counts straight up from zero instead of repeating for each page's table.
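If you're curious about the difference, here's a tiny sketch with made-up dataframes:

import pandas as pd

df1 = pd.DataFrame({"value": [1, 2]})
df2 = pd.DataFrame({"value": [3, 4]})

# Without ignore_index the row labels repeat: 0, 1, 0, 1
print(pd.concat([df1, df2]).index.tolist())

# With ignore_index they run straight through: 0, 1, 2, 3
print(pd.concat([df1, df2], ignore_index=True).index.tolist())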
Scrape pages of elements¶
Scraping pages of individual elements is slightly different from scraping pages of tables.
In this case, you have an inner loop where you scrape each element on the page, and an outer loop where you click the next button and go to the next page.
all_data = []

while True:
    # Grab all the stories from the page
    stories = page.locator(".story")
    story_count = await stories.count()

    # Loop through each story
    for i in range(story_count):
        story = stories.nth(i)

        # Scrape the story
        story_data = {}
        story_data["title"] = await story.locator(".title").text_content()
        story_data["author"] = await story.locator(".author").text_content()
        story_data["date"] = await story.locator(".date").text_content()
        all_data.append(story_data)

    # Click the next button
    try:
        await page.locator(".next-btn").click(timeout=5000)
    except:
        break

df = pd.DataFrame(all_data)
And the requests and BeautifulSoup version:

# Get the first page of results
response = requests.get(url)
doc = BeautifulSoup(response.text)

all_data = []

while True:
    # Grab all the stories from the page
    stories = doc.find_all("div", class_="story")

    # Loop through each story
    for story in stories:
        # Scrape the story
        story_data = {}
        story_data["title"] = story.find("h2", class_="title").text
        story_data["author"] = story.find("p", class_="author").text
        story_data["date"] = story.find("p", class_="date").text
        all_data.append(story_data)

    # Go to the next page
    try:
        next_url = doc.find("a", class_="next-btn")['href']
        response = requests.get(next_url)
        doc = BeautifulSoup(response.text)
    except:
        break

df = pd.DataFrame(all_data)
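Once everything is in a dataframe, you'll usually want to save it. A common way is writing it to a CSV file (the filename here is just an example):

# Save the combined results to a CSV file
# index=False keeps pandas from writing the row numbers as an extra column
df.to_csv("scraped-results.csv", index=False)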