extractStructuredData method
Parses HTML content and extracts structured data using CSS selectors
- html: the HTML content to parse
- selectors: a map of field names to CSS selectors
- attributes: a map of field names to attributes to extract (optional)
- validateContent: whether to validate and clean the extracted content
- validateSelectors: whether to validate and repair the selectors
- requiredFields: fields that must be present and non-empty
Implementation
/// Parses [html] and extracts a list of structured records via CSS selectors.
///
/// Each entry in [selectors] maps a field name to a CSS selector. The i-th
/// element matched by each selector contributes to the i-th record, so the
/// number of candidate records equals the largest match count of any
/// selector. Fields with fewer matches are filled with an empty string, and
/// records in which every field is empty are dropped.
///
/// * [attributes] optionally maps a field name to the attribute whose value
///   is read instead of the element's trimmed text.
/// * [validateSelectors] runs the selectors through the selector validator
///   and uses the repaired selector whenever one is offered for an invalid
///   selector.
/// * [validateContent] passes the records through the structured-data
///   validator's `cleanStructuredDataList` before they are returned.
/// * [requiredFields] lists fields that must be present and non-empty;
///   records missing any of them are discarded.
///
/// Returns an empty list when no selector matches any element.
///
/// Throws a non-retryable parsing [ScrapingException] if extraction fails.
List<Map<String, String>> extractStructuredData({
  required String html,
  required Map<String, String> selectors,
  Map<String, String?>? attributes,
  bool validateContent = true,
  bool validateSelectors = true,
  List<String> requiredFields = const [],
}) {
  try {
    // Log the inputs for debugging.
    _logger.info('Extracting structured data with selectors: $selectors');
    if (attributes != null) {
      _logger.info('Using attributes: $attributes');
    }

    final document = html_parser.parse(html);
    final effectiveSelectors = <String, String>{};

    if (validateSelectors) {
      final validationResults = _selectorValidator
          .validateSelectorsWithDocument(selectors, document);
      for (final entry in validationResults.entries) {
        final field = entry.key;
        final result = entry.value;
        final repaired = result.repairedSelector;
        if (!result.isValid && repaired != null) {
          _logger.warning(
            'Invalid selector for field "$field": ${result.originalSelector}. '
            'Using repaired selector: $repaired',
          );
          effectiveSelectors[field] = repaired;
          continue;
        }
        if (!result.isValid) {
          _logger.error(
            'Invalid selector for field "$field": ${result.originalSelector}. '
            '${result.errorMessage}',
          );
        }
        // Valid selectors are used as-is; unrepairable ones are still tried
        // because they might work partially.
        effectiveSelectors[field] = result.originalSelector;
      }
    } else {
      effectiveSelectors.addAll(selectors);
    }

    // Query every selector exactly once and keep the extracted value of each
    // match, so assembling the records below never re-scans the document.
    final columns = <String, List<String>>{};
    var maxItems = 0;
    for (final entry in effectiveSelectors.entries) {
      final field = entry.key;
      final selector = entry.value;
      try {
        final elements = document.querySelectorAll(selector);
        _logger.info(
          'Found ${elements.length} elements for field "$field" with selector "$selector"',
        );
        final attribute = attributes?[field];
        columns[field] = [
          for (final element in elements)
            if (attribute != null)
              element.attributes[attribute] ?? ''
            else
              element.text.trim(),
        ];
        if (elements.length > maxItems) {
          maxItems = elements.length;
        }
      } catch (e) {
        _logger.warning('Error querying selector for field "$field": $e');
        // Treat a failed query as "no matches" so the field is still present
        // (as an empty string) in every record.
        columns[field] = const [];
      }
    }
    _logger.info('Maximum items found: $maxItems');

    if (maxItems == 0) {
      _logger.warning('No elements found for any selector');
      return [];
    }

    // Zip the per-field columns into one record per index.
    final extracted = <Map<String, String>>[];
    for (var i = 0; i < maxItems; i++) {
      final item = <String, String>{};
      for (final entry in columns.entries) {
        final field = entry.key;
        final values = entry.value;
        if (i >= values.length) {
          _logger.warning('No element found for field "$field" in item $i');
          item[field] = '';
          continue;
        }
        final value = values[i];
        if (value.isEmpty) {
          final attribute = attributes?[field];
          if (attribute != null) {
            _logger.warning(
              'Attribute "$attribute" not found or empty for field "$field" in item $i',
            );
          } else {
            _logger.warning(
              'Text content is empty for field "$field" in item $i',
            );
          }
        }
        item[field] = value;
      }
      // Only keep the record if it has at least one non-empty field.
      if (item.values.any((value) => value.isNotEmpty)) {
        extracted.add(item);
      }
    }
    _logger.info('Extracted ${extracted.length} structured data items');

    var items = extracted;
    if (validateContent) {
      items = _structuredDataValidator.cleanStructuredDataList(extracted);
    }
    if (requiredFields.isEmpty) {
      return items;
    }

    // Enforce required fields on the final (possibly cleaned) records, since
    // cleaning may itself blank out a value.
    final complete = items
        .where(
          (item) => requiredFields.every(
            (field) => item[field]?.isNotEmpty ?? false,
          ),
        )
        .toList();
    if (complete.length != items.length) {
      _logger.info(
        'Dropped ${items.length - complete.length} items missing required fields: $requiredFields',
      );
    }
    return complete;
  } catch (e) {
    _logger.error('Failed to extract structured data: $e');
    throw ScrapingException.parsing(
      'Failed to extract structured data',
      originalException: e,
      isRetryable: false,
    );
  }
}