Source code for pyinaturalist.pagination

from functools import wraps
from logging import getLogger
from math import ceil
from time import sleep
from typing import Callable

from pyinaturalist.constants import (
    EXPORT_URL,
    LARGE_REQUEST_WARNING,
    PER_PAGE_RESULTS,
    REQUESTS_PER_MINUTE,
    THROTTLING_DELAY,
    JsonResponse,
)

# Module-level logger, named after this module; used by estimate_request_size()
logger = getLogger(__name__)


def add_paginate_all(method: str = 'page'):
    """Decorator that adds auto-pagination support, invoked by passing ``page='all'`` to the
    wrapped API function.

    Args:
        method: Pagination method, forwarded as-is to :py:func:`paginate_all`

    Returns:
        A decorator. The decorated function behaves exactly like the original unless it is called
        with the keyword argument ``page='all'``, in which case the call is delegated to
        :py:func:`paginate_all` with the same positional and keyword arguments.
    """

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **params):
            # Any other value of 'page' (or no 'page' at all) is a plain single request
            if params.get('page') != 'all':
                return func(*args, **params)
            return paginate_all(func, *args, method=method, **params)

        return wrapper

    return decorator


def paginate_all(api_func: Callable, *args, method: str = 'page', **params) -> JsonResponse:
    """Get all pages of a multi-page request. Explicit pagination parameters will be overridden.

    Args:
        api_func: API endpoint function to paginate
        args: Positional arguments, passed through unchanged to every ``api_func`` call
        method: Pagination method; either 'page', 'id', or 'autocomplete' (see below)
        params: Original request parameters

    Note on pagination by ID, from the iNaturalist documentation:
    _'The large size of the observations index prevents us from supporting the page parameter when
    retrieving records from large result sets. If you need to retrieve large numbers of records,
    use the ``per_page`` and ``id_above`` or ``id_below`` parameters instead.'_

    Returns:
        Response dict containing combined results, in the same format as ``api_func``.
        ``total_results`` is set to the number of results actually collected.
    """
    # Drop the caller's 'page' value (e.g. 'all'); pagination is driven from here on
    params.pop('page', None)
    if method == 'autocomplete':
        return paginate_autocomplete(api_func, *args, **params)
    if method == 'id':
        # Ascending ID order is what makes 'id_above' usable as a cursor below
        params['order_by'] = 'id'
        params['order'] = 'asc'
    else:
        params['page'] = 1
    params['per_page'] = PER_PAGE_RESULTS

    # Run an initial request to get request size
    response = api_func(*args, **params)
    page_results = response['results']
    # Accumulate into a copy so the first response's own list is not extended in place
    results = list(page_results)
    total_results = response.get('total_results')
    estimate_request_size(total_results)

    # Some endpoints (like get_observation_fields) don't return total_results for some reason
    # Also check page size, in case total_results is off (race condition, outdated index, etc.)
    def check_results():
        more_results = total_results is None or len(results) < total_results
        return more_results and len(page_results) > 0

    # Loop until we get all pages
    while check_results():
        if method == 'id':
            # Continue after the last (highest) ID seen on the previous page
            params['id_above'] = page_results[-1]['id']
        else:
            params['page'] += 1

        # Positional arguments must be forwarded on every request, not just the keyword params
        page_results = api_func(*args, **params).get('results', [])
        results += page_results
        sleep(THROTTLING_DELAY)

    return {
        'results': results,
        'total_results': len(results),
    }


def paginate_autocomplete(api_func: Callable, *args, **params) -> JsonResponse:
    """Attempt to get as many results as possible from the places autocomplete endpoint.
    This is necessary for some problematic places for which there are many matches but not ranked
    with the desired match(es) first.

    This works based on different rankings being returned for order_by=area. No other fields can be
    sorted on, and direction can't be specified, but this can at least provide a few additional
    results beyond the limit of 20.

    Args:
        api_func: API endpoint function to call
        args: Positional arguments, passed through unchanged to ``api_func``
        params: Request parameters; ``per_page`` is overridden and any ``order_by`` is discarded

    Returns:
        Response dict with the de-duplicated (by ``id``) results of up to two requests, and the
        ``total_results`` value reported by the first request
    """
    params['per_page'] = 20
    params.pop('order_by', None)

    # Search with default ordering, then ordering by area (if there are more than 20 results)
    page_1 = api_func(*args, **params)
    pages = [page_1]
    if page_1['total_results'] > 20:
        pages.append(api_func(*args, **params, order_by='area'))

    # De-duplicate results by ID; a later duplicate replaces the earlier record but keeps its
    # position
    unique_results = {}
    for page in pages:
        for result in page['results']:
            unique_results[result['id']] = result

    return {
        'results': list(unique_results.values()),
        'total_results': page_1['total_results'],
    }


def estimate_request_size(total_results):
    """Log the estimated total number of requests and rate-limiting delay, and show a warning if
    the request is too large

    Args:
        total_results: Total number of results reported for the query. If this is ``None`` or
            ``0``, nothing is logged.
    """
    # Nothing to estimate when the endpoint did not report a total (or reported zero)
    if not total_results:
        return

    total_requests = ceil(total_results / PER_PAGE_RESULTS)
    # Requests needed, divided by the per-minute request limit, converted to whole seconds
    est_delay = ceil((total_requests / REQUESTS_PER_MINUTE) * 60)
    logger.info(
        f'This query will fetch {total_results} results in {total_requests} requests. '
        f'Estimated total rate-limiting delay: {est_delay} seconds'
    )

    if total_results > LARGE_REQUEST_WARNING:
        logger.warning(
            'This request is larger than recommended for API usage. For bulk requests, consider '
            f'using the iNat export tool instead: {EXPORT_URL}'
        )