scrapeWithPagination<T> method

Future<PaginationResult<T>> scrapeWithPagination<T>({
  required String url,
  required PaginationConfig config,
  required Future<T> Function(String html, String pageUrl) extractor,
  Map<String, String>? headers,
  int? timeout,
  int? retries,
})

Scrapes multiple pages, following pagination links from a starting URL.

url is the starting URL.
config is the pagination configuration.
extractor is a function that extracts data from each page's HTML.
headers are additional HTTP headers to send with each request.
timeout is the timeout for each request in milliseconds.
retries is the number of retry attempts per request.
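For illustration, a minimal usage sketch follows. The ScrapingService class name and the PaginationConfig constructor shape are assumptions (the config fields mirror those referenced in the implementation below); HTML parsing uses the html package from pub.dev.

import 'package:html/parser.dart' as html_parser;

Future<void> main() async {
  // Hypothetical service exposing scrapeWithPagination.
  final scraper = ScrapingService();

  final result = await scraper.scrapeWithPagination<List<String>>(
    url: 'https://example.com/articles',
    config: PaginationConfig(
      maxPages: 5,
      followPagination: true,
      validateLinks: true,
    ),
    // Extract every <h2> heading from each page.
    extractor: (html, pageUrl) async {
      final document = html_parser.parse(html);
      return document
          .querySelectorAll('h2')
          .map((element) => element.text.trim())
          .toList();
    },
    timeout: 10000, // 10 seconds per request
    retries: 2,
  );

  print('Scraped ${result.pageCount} pages');
  for (final titles in result.results) {
    titles.forEach(print);
  }
}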

Implementation

Future<PaginationResult<T>> scrapeWithPagination<T>({
  required String url,
  required PaginationConfig config,
  required Future<T> Function(String html, String pageUrl) extractor,
  Map<String, String>? headers,
  int? timeout,
  int? retries,
}) async {
  final results = <T>[];
  final pageUrls = <String>[];
  String? nextPageUrl = url;
  bool hasMorePages = false;
  int pageCount = 0;

  try {
    // Scrape pages until we reach the maximum or there are no more pages
    while (nextPageUrl != null &&
        pageCount < config.maxPages &&
        pageUrls.length <= config.maxDepth) {
      // Avoid duplicate pages
      if (pageUrls.contains(nextPageUrl)) {
        logger?.warning('Duplicate page URL detected: $nextPageUrl');
        break;
      }

      // Fetch the page
      logger?.info('Fetching page ${pageCount + 1}: $nextPageUrl');
      final html = await _webScraper.fetchHtml(
        url: nextPageUrl,
        headers: headers,
        timeout: timeout,
        retries: retries,
      );

      // Add the page URL to the list
      pageUrls.add(nextPageUrl);
      pageCount++;

      // Extract data from the page
      final result = await extractor(html, nextPageUrl);
      results.add(result);

      // If we're not following pagination, stop here
      if (!config.followPagination) {
        break;
      }

      // Detect pagination
      final paginationDetector = PaginationDetector(
        baseUrl: nextPageUrl,
        logger: logger,
      );
      final paginationResult = paginationDetector.detectPagination(html);

      // Update the next page URL
      nextPageUrl = paginationResult.nextPageUrl;
      hasMorePages = !paginationResult.isLastPage;

      // If there's no next page URL, we're done
      if (nextPageUrl == null) {
        logger?.info('No more pages to scrape');
        break;
      }

      // Validate the next page URL if needed
      if (config.validateLinks) {
        if (!_isValidUrl(nextPageUrl)) {
          logger?.warning('Invalid next page URL: $nextPageUrl');
          nextPageUrl = null;
          break;
        }
      }

      // Log the next page URL
      logger?.info('Next page URL: $nextPageUrl');
    }

    // Return the results
    return PaginationResult(
      results: results,
      pageUrls: pageUrls,
      pageCount: pageCount,
      hasMorePages: hasMorePages,
      nextPageUrl: nextPageUrl,
    );
  } catch (e) {
    logger?.error('Error scraping with pagination: $e');
    throw ScrapingException.pagination(
      'Error scraping with pagination',
      originalException: e,
      isRetryable: true,
    );
  }
}
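Since a run can stop at maxPages while the detector still reports further pages, a caller can resume from where the previous run left off. A sketch, reusing the hypothetical scraper, config, and extractor from the example above, and assuming PaginationResult exposes the fields passed to its constructor:

try {
  var result = await scraper.scrapeWithPagination<List<String>>(
    url: 'https://example.com/articles',
    config: config,
    extractor: extractor,
  );

  // Resume from the next unfetched page, if pagination continued.
  if (result.hasMorePages && result.nextPageUrl != null) {
    result = await scraper.scrapeWithPagination<List<String>>(
      url: result.nextPageUrl!,
      config: config,
      extractor: extractor,
    );
  }
} on ScrapingException catch (e) {
  // Failures are wrapped in a retryable ScrapingException.pagination.
  print('Pagination scrape failed: $e');
}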