/// Extracts structured data from multiple URLs concurrently.
///
/// Each URL in [urls] is fetched and parsed as an independent task on the
/// internal task queue; results are gathered into a single map keyed by URL.
///
/// - [urls] is the list of URLs to fetch.
/// - [selectors] is a map of field names to CSS selectors.
/// - [attributes] is a map of field names to attributes to extract (optional).
/// - [headers] are additional headers to send with each request.
/// - [timeout] is the timeout for each request in milliseconds.
/// - [retries] is the number of retry attempts per request.
/// - [ignoreRobotsTxt] is whether to ignore robots.txt rules (default: false).
/// - [onProgress] is a callback invoked after each URL finishes (successfully
///   or not) with the number of completed URLs, the total, and the URL.
///
/// Returns a map from URL to the extracted records. URLs whose extraction
/// failed are omitted from the result; failures are logged and reported via
/// [onProgress] but do not fail the batch as a whole.
Future<Map<String, List<Map<String, String>>>> extractStructuredDataBatch({
  required List<String> urls,
  required Map<String, String> selectors,
  Map<String, String?>? attributes,
  Map<String, String>? headers,
  int? timeout,
  int? retries,
  bool ignoreRobotsTxt = false,
  void Function(int completed, int total, String url)? onProgress,
}) async {
  _logger.info('Extracting structured data batch: ${urls.length} URLs');
  final results = <String, List<Map<String, String>>>{};
  final errors = <String, dynamic>{};

  // Guard: an empty batch would otherwise hang forever, because
  // checkCompletion is only ever invoked from task callbacks and no tasks
  // are enqueued for zero URLs.
  if (urls.isEmpty) {
    _logger.success('Batch completed successfully');
    return results;
  }

  // Completer bridges the task queue's callback API to a Future.
  final completer = Completer<Map<String, List<Map<String, String>>>>();
  var completed = 0;

  // Completes the returned future once every URL has either succeeded or
  // failed. The isCompleted guard makes a stray double invocation harmless
  // instead of throwing a StateError.
  void checkCompletion() {
    if (completed == urls.length && !completer.isCompleted) {
      if (errors.isNotEmpty) {
        _logger.warning(
          'Batch completed with ${errors.length} errors: ${errors.keys.join(', ')}',
        );
      } else {
        _logger.success('Batch completed successfully');
      }
      completer.complete(results);
    }
  }

  // Enqueue one extraction task per URL; the queue controls concurrency.
  for (final url in urls) {
    _taskQueue.addTask<List<Map<String, String>>>(
      task: () async {
        // First fetch the HTML.
        final html = await _webScraper.fetchHtml(
          url: url,
          headers: headers,
          timeout: timeout,
          retries: retries,
          ignoreRobotsTxt: ignoreRobotsTxt,
        );
        // Then extract the structured data from the HTML.
        return _webScraper.extractStructuredData(
          html: html,
          selectors: selectors,
          attributes: attributes,
        );
      },
      priority: 0,
      taskName: 'ExtractStructuredData-$url',
      onStart: () {
        _logger.info('Starting structured extraction for URL: $url');
      },
      onComplete: (result) {
        _logger.success('Structured extraction completed for URL: $url');
        results[url] = result;
        completed++;
        onProgress?.call(completed, urls.length, url);
        checkCompletion();
      },
      onError: (error, stackTrace) {
        _logger.error('Structured extraction failed for URL: $url - $error');
        errors[url] = error;
        completed++;
        onProgress?.call(completed, urls.length, url);
        checkCompletion();
      },
    );
  }
  return completer.future;
}