Skip to content

Downloading Files Examples

Real-world examples of downloading files during script execution.

Example 1: Download PDF Reports

Download multiple PDF reports from a website:

async def main(page):
    """Download every PDF report linked on the reports page.

    Navigates to the reports listing, finds all anchors ending in
    ``.pdf``, and downloads each one via ``download_file``, logging
    progress and capturing a screenshot after every download.
    """
    # Navigate to reports page
    debug_log("Loading reports page...")
    await page.goto('https://example.com/reports')
    await page.wait_for_load_state('networkidle')

    # Collect every anchor whose href ends with ".pdf"
    pdf_links = await page.query_selector_all('a[href$=".pdf"]')
    total = len(pdf_links)
    debug_log(f"Found {total} PDF files")

    # Download the PDFs one at a time
    for index, anchor in enumerate(pdf_links, start=1):
        href = await anchor.get_attribute('href')
        title = await anchor.text_content()

        debug_log(f"Downloading {index}/{total}: {title}")

        # Saved as report_1.pdf, report_2.pdf, ...
        outcome = await download_file(
            href,
            description=f'Report: {title.strip()}',
            filename=f'report_{index}.pdf'
        )

        if outcome['success']:
            debug_log(f"  ✓ {outcome['filename']} ({outcome['file_size']} bytes)")
        else:
            debug_log(f"  ✗ Failed: {outcome['error']}")

        # Take screenshot after each download
        await capture_screenshot(f"After downloading {title}")

    debug_log("All downloads complete!")

Example 2: Download Data Exports

Download CSV exports after performing actions:

async def main(page):
    """Log in, configure a 30-day CSV export, and download it.

    On failure the error is logged and a screenshot of the page state
    is captured for debugging.
    """
    # Authenticate first
    debug_log("Logging in...")
    await page.goto('https://data.example.com/login')
    await page.fill('#username', 'user@example.com')
    await page.fill('#password', 'password123')
    await page.click('button[type="submit"]')
    await page.wait_for_load_state('networkidle')

    # Move to the export page
    debug_log("Opening data export page...")
    await page.goto('https://data.example.com/export')

    # Choose CSV format over the last 30 days
    await page.select_option('#format', 'csv')
    await page.select_option('#date_range', 'last_30_days')

    # Record the configured settings visually
    await capture_screenshot("Export settings configured")

    # Trigger the export via its button selector
    debug_log("Generating export...")
    export_result = await download_file(
        'button#export',
        description='Last 30 days data export',
        filename='data_export_30d.csv'
    )

    if not export_result['success']:
        debug_log(f"✗ Export failed: {export_result['error']}")
        await capture_screenshot("Export failure")
    else:
        debug_log(f"✓ Export downloaded: {export_result['filename']}")
        debug_log(f"  File size: {export_result['file_size']} bytes")
        debug_log(f"  MIME type: {export_result['mime_type']}")

Example 3: Download with Dynamic URLs

Download files from dynamically generated URLs:

import asyncio

async def main(page):
    """Trigger report generation, then download it from its dynamic URL.

    Waits up to 30 seconds for the ``.download-ready`` marker before
    reading the generated link's href and downloading the file.
    """
    # Open the dashboard
    await page.goto('https://dashboard.example.com')

    # Kick off server-side report generation
    debug_log("Requesting report generation...")
    await page.click('button.generate-report')

    # Block until the ready marker appears (30s ceiling)
    debug_log("Waiting for report to be ready...")
    await page.wait_for_selector('.download-ready', timeout=30000)

    # The href is generated per-request, so read it at runtime
    download_url = await page.locator('a.download-link').get_attribute('href')
    debug_log(f"Download URL: {download_url}")

    # Fetch the file from the freshly minted URL
    outcome = await download_file(
        download_url,
        description='Generated report',
        filename='generated_report.pdf'
    )

    if not outcome['success']:
        debug_log(f"✗ Failed: {outcome['error']}")
    else:
        debug_log(f"✓ Downloaded: {outcome['filename']}")

Example 4: Conditional Download with Validation

Check conditions before downloading:

async def main(page):
    """Download a PDF only after confirming the link exists.

    Also compares the advertised on-page size (when given in MB)
    against the byte count actually downloaded.
    """
    await page.goto('https://example.com/files')

    # Bail out early if the download link is absent
    download_button = page.locator('a.download-pdf')
    if await download_button.count() == 0:
        debug_log("No download available")
        await capture_screenshot("No download found")
        return

    # Read the advertised size before downloading
    size_text = await page.locator('.file-size').text_content()
    debug_log(f"File size: {size_text}")

    # Link exists, go ahead
    outcome = await download_file(
        'a.download-pdf',
        description='Validated file download'
    )

    if not outcome['success']:
        debug_log(f"✗ Download failed: {outcome['error']}")
        return

    debug_log(f"✓ Downloaded: {outcome['filename']}")
    debug_log(f"  Size: {outcome['file_size']} bytes")
    debug_log(f"  Type: {outcome['mime_type']}")

    # Compare advertised size against actual bytes when given in MB
    if 'MB' in size_text:
        expected_mb = float(size_text.split('MB')[0].strip())
        actual_mb = outcome['file_size'] / (1024 * 1024)
        debug_log(f"  Expected: ~{expected_mb}MB, Actual: {actual_mb:.2f}MB")

Example 5: Download Multiple File Types

Download different file types from a page:

async def main(page):
    """Download every PDF, CSV, JSON, and Excel file on the page.

    Iterates a table of (selector, label) pairs, downloading each
    matching link and logging per-file success or failure.
    """
    await page.goto('https://example.com/downloads')
    await page.wait_for_load_state('networkidle')

    # Map href-suffix selectors to human-readable labels
    file_types = [
        {'selector': 'a[href$=".pdf"]', 'type': 'PDF'},
        {'selector': 'a[href$=".csv"]', 'type': 'CSV'},
        {'selector': 'a[href$=".json"]', 'type': 'JSON'},
        {'selector': 'a[href$=".xlsx"]', 'type': 'Excel'}
    ]

    # Process each category in turn
    for file_type in file_types:
        matches = await page.query_selector_all(file_type['selector'])
        debug_log(f"Found {len(matches)} {file_type['type']} file(s)")

        for anchor in matches:
            href = await anchor.get_attribute('href')
            label = await anchor.text_content()

            outcome = await download_file(
                href,
                description=f"{file_type['type']}: {label.strip()}"
            )

            if outcome['success']:
                debug_log(f"  ✓ {outcome['filename']}")
            else:
                debug_log(f"  ✗ Failed: {outcome['error']}")

    await capture_screenshot("All downloads complete")

Example 6: Download with Rate Limiting

Add delays between downloads to avoid rate limiting:

import asyncio

async def main(page):
    """Download every ``a.download`` link with a 2-second gap between files.

    The delay between downloads avoids tripping server-side rate limits;
    no delay is added after the final file.
    """
    await page.goto('https://api.example.com/files')

    # Get all download links
    links = await page.query_selector_all('a.download')
    debug_log(f"Found {len(links)} files to download")

    # Download with delay between each
    for i, link in enumerate(links):
        href = await link.get_attribute('href')
        # Use the last path segment as a display name
        filename = href.split('/')[-1]

        # BUG FIX: the log line previously printed the literal text
        # "(unknown)" instead of interpolating the computed filename.
        debug_log(f"Downloading {i+1}/{len(links)}: {filename}")

        result = await download_file(
            href,
            description=f'File {i+1} of {len(links)}'
        )

        if result['success']:
            debug_log(f"  ✓ Success")
        else:
            debug_log(f"  ✗ Failed: {result['error']}")

        # Wait 2 seconds between downloads
        if i < len(links) - 1:  # Don't wait after last download
            debug_log("  Waiting 2 seconds...")
            await asyncio.sleep(2)

    debug_log("All downloads complete!")

Example 7: Download with Error Recovery

Retry failed downloads with exponential backoff:

import asyncio

async def main(page):
    """Download a fixed list of files, retrying failures with backoff.

    Each file gets up to 3 attempts; retries wait 2**attempt seconds
    (exponential backoff). A screenshot is captured if all retries fail.
    """
    await page.goto('https://example.com/files')

    files_to_download = [
        'https://example.com/file1.pdf',
        'https://example.com/file2.csv',
        'https://example.com/file3.json'
    ]

    for file_url in files_to_download:
        # Display name derived from the URL's last path segment
        filename = file_url.split('/')[-1]
        max_retries = 3
        retry_count = 0
        success = False

        while retry_count < max_retries and not success:
            if retry_count > 0:
                wait_time = 2 ** retry_count  # Exponential backoff
                debug_log(f"Retry {retry_count}/{max_retries} after {wait_time}s...")
                await asyncio.sleep(wait_time)

            # BUG FIX: these messages previously printed the literal
            # text "(unknown)" instead of interpolating `filename`.
            debug_log(f"Downloading {filename}...")
            result = await download_file(
                file_url,
                description=f'{filename} (attempt {retry_count + 1})'
            )

            if result['success']:
                debug_log(f"  ✓ Success on attempt {retry_count + 1}")
                success = True
            else:
                debug_log(f"  ✗ Failed: {result['error']}")
                retry_count += 1

        if not success:
            debug_log(f"  ✗✗ All retries failed for {filename}")
            await capture_screenshot(f"Failed to download {filename}")
Example 8: Scrape Data and Download Files

Combine data scraping with file downloads:

async def main(page):
    """Scrape product cards and download each product's spec sheet.

    For every ``.product-card`` element, extracts the name and price,
    downloads the linked spec PDF when present, and records the results
    via ``scrape_data``.
    """
    await page.goto('https://example.com/products')

    # Get all product cards
    products = await page.query_selector_all('.product-card')
    debug_log(f"Found {len(products)} products")

    scraped_products = []

    for i, product in enumerate(products):
        # BUG FIX: query_selector_all returns ElementHandle objects,
        # which have no .locator() method in Playwright Python —
        # scope sub-queries with query_selector on the handle instead.
        name_el = await product.query_selector('.product-name')
        price_el = await product.query_selector('.product-price')
        spec_el = await product.query_selector('a.download-spec')

        name = await name_el.text_content()
        price = await price_el.text_content()
        # Guard: a card may have no spec link at all
        pdf_link = await spec_el.get_attribute('href') if spec_el else None

        debug_log(f"Product {i+1}: {name} - {price}")

        # Download product specification
        if pdf_link:
            result = await download_file(
                pdf_link,
                description=f'Spec sheet: {name}',
                filename=f'spec_{i+1}_{name.replace(" ", "_")}.pdf'
            )

            # Store data with download info
            scraped_products.append({
                'name': name,
                'price': price,
                'spec_downloaded': result['success'],
                'spec_filename': result.get('filename', 'N/A')
            })
        else:
            scraped_products.append({
                'name': name,
                'price': price,
                'spec_downloaded': False,
                'spec_filename': 'No spec available'
            })

    # Save scraped data
    scrape_data({
        'products': scraped_products,
        'total_products': len(products)
    })

    debug_log(f"Scraped {len(scraped_products)} products")
    await capture_screenshot("Scraping complete")

Best Practices Demonstrated

These examples show:

  1. Error Handling: Check result['success'] before proceeding
  2. Logging: Use debug_log() to track progress
  3. Screenshots: Capture screenshots at key moments
  4. Rate Limiting: Add delays between downloads
  5. Retry Logic: Implement exponential backoff for failures
  6. Validation: Check conditions before downloading
  7. Organization: Use descriptive filenames and descriptions
  8. Integration: Combine downloads with data scraping

See Also