Scraping in parallel with Playwright
In [16]:
Copied!
from playwright.async_api import async_playwright
import asyncio
from playwright.async_api import async_playwright
import asyncio
In [20]:
Copied!
async def load_page(semaphore: asyncio.Semaphore, browser, url: str, timeout_ms: float = 60_000):
    """Open ``url`` in a fresh page and return its title.

    Args:
        semaphore: Limits how many pages are being loaded at the same time;
            the page is only opened once a slot has been acquired.
        browser: Object whose ``new_page()`` coroutine yields a page usable as
            an async context manager (here, the launched Playwright browser).
        url: Address to navigate to.
        timeout_ms: Maximum time in milliseconds to wait for the navigation.
            Defaults to 60000 (60 seconds), the value previously hard-coded.

    Returns:
        A ``(url, title)`` tuple, so the caller can tell which URL a result
        belongs to even when results arrive out of order.

    Raises:
        Any exception raised by ``page.goto`` or ``page.title`` propagates to
        the caller (for example a navigation timeout).
    """
    async with semaphore:
        # The page is used as a context manager so that leaving the block
        # (normally or via an exception) releases it.
        async with await browser.new_page() as page:
            await page.goto(url, timeout=timeout_ms)
            title = await page.title()
            return url, title
async def load_page(semaphore: asyncio.Semaphore, browser, url: str, timeout_ms: float = 60_000):
    """Open ``url`` in a fresh page and return its title.

    Args:
        semaphore: Limits how many pages are being loaded at the same time;
            the page is only opened once a slot has been acquired.
        browser: Object whose ``new_page()`` coroutine yields a page usable as
            an async context manager (here, the launched Playwright browser).
        url: Address to navigate to.
        timeout_ms: Maximum time in milliseconds to wait for the navigation.
            Defaults to 60000 (60 seconds), the value previously hard-coded.

    Returns:
        A ``(url, title)`` tuple, so the caller can tell which URL a result
        belongs to even when results arrive out of order.

    Raises:
        Any exception raised by ``page.goto`` or ``page.title`` propagates to
        the caller (for example a navigation timeout).
    """
    async with semaphore:
        # The page is used as a context manager so that leaving the block
        # (normally or via an exception) releases it.
        async with await browser.new_page() as page:
            await page.goto(url, timeout=timeout_ms)
            title = await page.title()
            return url, title
In [21]:
Copied!
# Startup
playwright = await async_playwright().start()
browser = await playwright.chromium.launch(headless=False)
# Startup
playwright = await async_playwright().start()
browser = await playwright.chromium.launch(headless=False)
In [23]:
Copied!
URLS = ['http://www.foxnews.com/',
'http://www.cnn.com/',
'http://www.bbc.co.uk/',
'http://nytimes.com/']
# Five at a time
semaphore = asyncio.Semaphore(5)
tasks = [load_page(semaphore, browser, url) for url in URLS]
# We can do five tihngs at a time
for task in asyncio.as_completed(tasks):
try:
url, headline = await task
print(f"{url} title is {headline}")
except Exception as exc:
print(f"{url} generated an exception: {exc}")
URLS = ['http://www.foxnews.com/',
'http://www.cnn.com/',
'http://www.bbc.co.uk/',
'http://nytimes.com/']
# Five at a time
semaphore = asyncio.Semaphore(5)
tasks = [load_page(semaphore, browser, url) for url in URLS]
# We can do five tihngs at a time
for task in asyncio.as_completed(tasks):
try:
url, headline = await task
print(f"{url} title is {headline}")
except Exception as exc:
print(f"{url} generated an exception: {exc}")
http://www.bbc.co.uk/ title is BBC - Home http://nytimes.com/ title is The New York Times - Breaking News, US News, World News and Videos http://nytimes.com/ generated an exception: Page.goto: Timeout 60000ms exceeded. Call log: navigating to "http://www.foxnews.com/", waiting until "load" http://nytimes.com/ generated an exception: Page.goto: Timeout 60000ms exceeded. Call log: navigating to "http://www.cnn.com/", waiting until "load"
In [25]:
Copied!
# Cleanup
await browser.close()
await playwright.stop()
# Cleanup
await browser.close()
await playwright.stop()
In [ ]:
Copied!