scrapeInParallel<T> method

Future<List<T>> scrapeInParallel<T>({
  required List<String> urls,
  required Future<T> Function(String html, String url) extractor,
  required TaskScheduler scheduler,
  Map<String, String>? headers,
  int? timeout,
  int? retries,
  TaskPriority priority = TaskPriority.normal,
  int maxRetries = 3,
})

Scrapes multiple URLs in parallel by enqueuing one ScrapingTask per URL on the given TaskScheduler. Tasks that fail are logged and skipped, so the returned list contains only the successfully extracted results and may be shorter than urls.

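Example (a minimal usage sketch: the scraper instance and the TaskScheduler() constructor call are illustrative assumptions, not part of the documented signature):

// Inside an async function.
final scheduler = TaskScheduler(); // assumed default constructor

final titles = await scraper.scrapeInParallel<String>(
  urls: [
    'https://example.com/a',
    'https://example.com/b',
  ],
  extractor: (html, url) async {
    // Pull the <title> text out of the fetched page. If this throws,
    // the task fails and is logged and skipped by scrapeInParallel.
    final match = RegExp(r'<title>(.*?)</title>').firstMatch(html);
    return match?.group(1) ?? '';
  },
  scheduler: scheduler,
  priority: TaskPriority.high,
);

Because tasks are awaited in enqueue order, successful results come back in the same relative order as the input URLs.
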
Implementation

Future<List<T>> scrapeInParallel<T>({
  required List<String> urls,
  required Future<T> Function(String html, String url) extractor,
  required TaskScheduler scheduler,
  Map<String, String>? headers,
  int? timeout,
  int? retries,
  TaskPriority priority = TaskPriority.normal,
  int maxRetries = 3,
}) async {
  // Create a task for each URL
  final tasks = <ScrapingTask<T>>[];
  final results = <T>[];

  for (int i = 0; i < urls.length; i++) {
    final url = urls[i];
    final domain = _extractDomain(url);

    // Create a task for this URL
    final task = ScrapingTask<T>(
      id: Uuid().v4(),
      domain: domain,
      url: url,
      execute: () async {
        // Fetch the HTML
        final html = await fetchHtml(
          url: url,
          headers: headers,
          timeout: timeout,
          retries: retries,
        );

        // Extract the data
        return await extractor(html, url);
      },
      priority: priority,
      maxRetries: maxRetries,
      logger: Logger('ScrapingTask-$i'),
    );

    // Add the task to the list
    tasks.add(task);

    // Enqueue the task
    scheduler.enqueue(task);
  }

  // Wait for all tasks to complete
  for (final task in tasks) {
    try {
      final result = await task.future;
      results.add(result);
    } catch (e) {
      logger.error('Error scraping ${task.url}: $e');
    }
  }

  return results;
}