extractStructuredDataBatch method

Future<Map<String, List<Map<String, String>>>> extractStructuredDataBatch({
  required List<String> urls,
  required Map<String, String> selectors,
  Map<String, String?>? attributes,
  Map<String, String>? headers,
  int? timeout,
  int? retries,
  bool ignoreRobotsTxt = false,
  void Function(int completed, int total, String url)? onProgress,
})

Extracts structured data from multiple URLs concurrently

urls is the list of URLs to fetch. selectors is a map of field names to CSS selectors. attributes is an optional map of field names to the attributes to extract. headers are additional headers to send with each request. timeout is the timeout for each request in milliseconds. retries is the number of retry attempts. ignoreRobotsTxt controls whether robots.txt rules are ignored (default: false). onProgress is a callback for progress updates, invoked after each URL finishes.

Implementation

/// Extracts structured data from multiple [urls] concurrently.
///
/// [selectors] maps field names to CSS selectors; [attributes] optionally
/// maps field names to the attribute to extract for that field. [headers],
/// [timeout] (milliseconds), [retries], and [ignoreRobotsTxt] are forwarded
/// to each fetch. [onProgress] is invoked after every URL finishes —
/// successfully or not — with the number of completed URLs, the total, and
/// the URL just processed.
///
/// Returns a map from URL to its extracted rows. URLs that fail are omitted
/// from the result; the batch still completes with the successful entries,
/// and a warning listing the failed URLs is logged.
Future<Map<String, List<Map<String, String>>>> extractStructuredDataBatch({
  required List<String> urls,
  required Map<String, String> selectors,
  Map<String, String?>? attributes,
  Map<String, String>? headers,
  int? timeout,
  int? retries,
  bool ignoreRobotsTxt = false,
  void Function(int completed, int total, String url)? onProgress,
}) async {
  _logger.info('Extracting structured data batch: ${urls.length} URLs');

  // BUG FIX: with an empty URL list no task callback ever fires, so the
  // completer below would never complete and the returned future would
  // hang forever. Short-circuit with an empty result instead.
  if (urls.isEmpty) {
    _logger.success('Batch completed successfully');
    return <String, List<Map<String, String>>>{};
  }

  final results = <String, List<Map<String, String>>>{};
  final errors = <String, dynamic>{};
  // Completer adapts the task queue's callback API to a single Future.
  final completer = Completer<Map<String, List<Map<String, String>>>>();

  int completed = 0;

  // Completes the returned future once every URL has been processed.
  // The isCompleted guard makes completion idempotent, so a stray extra
  // callback cannot raise a StateError from double-completion.
  void checkCompletion() {
    if (completed != urls.length || completer.isCompleted) return;

    if (errors.isNotEmpty) {
      _logger.warning(
        'Batch completed with ${errors.length} errors: ${errors.keys.join(', ')}',
      );
    } else {
      _logger.success('Batch completed successfully');
    }

    completer.complete(results);
  }

  // Enqueue one task per URL; results and errors are gathered via the
  // queue's callbacks rather than awaited here.
  for (final url in urls) {
    _taskQueue.addTask<List<Map<String, String>>>(
      task: () async {
        // Fetch the page, then extract the requested fields from its HTML.
        final html = await _webScraper.fetchHtml(
          url: url,
          headers: headers,
          timeout: timeout,
          retries: retries,
          ignoreRobotsTxt: ignoreRobotsTxt,
        );

        return _webScraper.extractStructuredData(
          html: html,
          selectors: selectors,
          attributes: attributes,
        );
      },
      priority: 0,
      taskName: 'ExtractStructuredData-$url',
      onStart: () {
        _logger.info('Starting structured extraction for URL: $url');
      },
      onComplete: (result) {
        _logger.success('Structured extraction completed for URL: $url');
        results[url] = result;
        completed++;
        onProgress?.call(completed, urls.length, url);
        checkCompletion();
      },
      onError: (error, stackTrace) {
        // Failures are recorded but do not abort the batch; the URL is
        // simply absent from the returned map.
        _logger.error('Structured extraction failed for URL: $url - $error');
        errors[url] = error;
        completed++;
        onProgress?.call(completed, urls.length, url);
        checkCompletion();
      },
    );
  }

  return completer.future;
}