Scraping in parallel with Playwright

In [16]:

            
                Copied!
                
from playwright.async_api import async_playwright
import asyncio
from playwright.async_api import async_playwright
import asyncio

In [20]:

            
                Copied!
                
                    
                    
                
                

        
async def load_page(semaphore, browser, url, timeout=60000):
    """Open ``url`` in a fresh page and return its title.

    Args:
        semaphore: An ``asyncio.Semaphore`` (or any async context manager)
            held for the whole visit; it caps how many pages are open at once.
        browser: A Playwright browser object providing an awaitable
            ``new_page()``.
        url: The address to navigate to.
        timeout: Maximum navigation time in milliseconds, passed to
            ``page.goto``. Defaults to 60000 (60 seconds).

    Returns:
        A ``(url, title)`` tuple, so the caller can tell which page a result
        belongs to when results arrive out of order.

    Raises:
        Whatever ``page.goto`` / ``page.title`` raise (for example a
        Playwright timeout error); the exception is not caught here.
    """
    async with semaphore:
        # The page is used as an async context manager so it is closed
        # when the block exits, even if navigation fails.
        async with await browser.new_page() as page:
            await page.goto(url, timeout=timeout)

            title = await page.title()

            # Send back both the url we're visiting AND the title of the page
            return url, title

In [21]:

            
                Copied!
                
# Startup
playwright = await async_playwright().start()
browser = await playwright.chromium.launch(headless=False)
# Startup
playwright = await async_playwright().start()
browser = await playwright.chromium.launch(headless=False)

In [23]:

            
                Copied!
                
                    
                    
                
                

        
URLS = ['http://www.foxnews.com/',
        'http://www.cnn.com/',
        'http://www.bbc.co.uk/',
        'http://nytimes.com/']

# Five at a time
semaphore = asyncio.Semaphore(5)

tasks = [load_page(semaphore, browser, url) for url in URLS]

# We can do five tihngs at a time
for task in asyncio.as_completed(tasks):
    try:
        url, headline = await task
        print(f"{url} title is {headline}")
    except Exception as exc:
        print(f"{url} generated an exception: {exc}")
URLS = ['http://www.foxnews.com/',
        'http://www.cnn.com/',
        'http://www.bbc.co.uk/',
        'http://nytimes.com/']

# Five at a time
semaphore = asyncio.Semaphore(5)

tasks = [load_page(semaphore, browser, url) for url in URLS]

# We can do five tihngs at a time
for task in asyncio.as_completed(tasks):
    try:
        url, headline = await task
        print(f"{url} title is {headline}")
    except Exception as exc:
        print(f"{url} generated an exception: {exc}")

http://www.bbc.co.uk/ title is BBC - Home
http://nytimes.com/ title is The New York Times - Breaking News, US News, World News and Videos
http://nytimes.com/ generated an exception: Page.goto: Timeout 60000ms exceeded.
Call log:
navigating to "http://www.foxnews.com/", waiting until "load"

http://nytimes.com/ generated an exception: Page.goto: Timeout 60000ms exceeded.
Call log:
navigating to "http://www.cnn.com/", waiting until "load"

In [25]:

            
                Copied!
                
# Cleanup
await browser.close()
await playwright.stop()
# Cleanup
await browser.close()
await playwright.stop()

In [ ]: