extractData method
Parses HTML content and extracts data using CSS selectors
html
is the HTML content to parse
selector
is the CSS selector to use
attribute
is the attribute to extract (optional)
asText
whether to extract the text content (default: true)
validateContent
whether to validate and clean the extracted content
validateSelector
whether to validate and repair the selector
Implementation
/// Extracts values from [html] for every element matching [selector].
///
/// For each matched element the extracted value is, in order of precedence:
/// the value of [attribute] when [attribute] is non-null (an empty string if
/// the element does not carry it), the trimmed text content when [asText] is
/// true, or the element's outer HTML otherwise.
///
/// When [validateSelector] is true the selector is first checked against the
/// parsed document. An invalid selector is replaced by its repaired form when
/// one is available; otherwise an empty list is returned. If the effective
/// selector matches nothing, the first suggested alternative selector is
/// tried before giving up.
///
/// When [validateContent] is true the extracted values are passed through
/// the content validator's `cleanContentList` before being returned.
///
/// Returns an empty list when no element matches. Throws a
/// [ScrapingException] (parsing, non-retryable) wrapping anything thrown
/// while parsing, querying or extracting.
List<String> extractData({
  required String html,
  required String selector,
  String? attribute,
  bool asText = true,
  bool validateContent = true,
  bool validateSelector = true,
}) {
  try {
    // Log the request up front so failures can be tied to a selector.
    _logger.info('Extracting data with selector: $selector');
    if (attribute != null) {
      _logger.info('Using attribute: $attribute');
    }

    final document = html_parser.parse(html);
    var effectiveSelector = selector;

    // Validate the selector and fall back to its repaired form if needed.
    if (validateSelector) {
      final validationResult = _selectorValidator
          .validateSelectorWithDocument(selector, document);
      if (!validationResult.isValid) {
        final repairedSelector = validationResult.repairedSelector;
        if (repairedSelector == null) {
          // Invalid and not repairable: nothing sensible to query.
          _logger.error(
            'Invalid selector: $selector. ${validationResult.errorMessage}',
          );
          return [];
        }
        _logger.warning(
          'Invalid selector: $selector. Using repaired selector: $repairedSelector',
        );
        effectiveSelector = repairedSelector;
      }
    }

    var elements = document.querySelectorAll(effectiveSelector);
    _logger.info('Found ${elements.length} elements matching selector');

    // Nothing matched: optionally retry with a suggested alternative. The
    // matched elements then flow through the same extraction path below, so
    // both paths share one implementation and the same diagnostics.
    if (elements.isEmpty) {
      _logger.warning(
        'No elements found matching selector: $effectiveSelector',
      );
      if (!validateSelector) {
        return [];
      }

      final alternatives = _selectorValidator
          .suggestAlternativesWithDocument(effectiveSelector, document);
      if (alternatives.isEmpty) {
        return [];
      }
      _logger.info(
        'Suggested alternative selectors: ${alternatives.join(', ')}',
      );

      // Only the first suggestion is tried; the rest are logged above.
      final alternativeSelector = alternatives.first;
      final alternativeElements = document.querySelectorAll(
        alternativeSelector,
      );
      if (alternativeElements.isEmpty) {
        return [];
      }
      _logger.info(
        'Found ${alternativeElements.length} elements with alternative selector: $alternativeSelector',
      );
      elements = alternativeElements;
    }

    // Extract one value per element; `attribute` wins over `asText`.
    final results = elements.map((element) {
      if (attribute != null) {
        // A missing attribute is reported as an empty string, not dropped.
        final value = element.attributes[attribute] ?? '';
        if (value.isEmpty) {
          _logger.warning(
            'Attribute "$attribute" not found or empty in element',
          );
        }
        return value;
      }
      if (asText) {
        final text = element.text.trim();
        if (text.isEmpty) {
          _logger.warning('Text content is empty in element');
        }
        return text;
      }
      return element.outerHtml;
    }).toList();
    _logger.info('Extracted ${results.length} items');

    return validateContent
        ? _contentValidator.cleanContentList(results)
        : results;
  } catch (e) {
    // NOTE(review): this catches everything thrown above, including Errors.
    // Narrowing it would change which failures surface as ScrapingException,
    // so the broad catch is kept as-is.
    _logger.error('Failed to extract data: $e');
    throw ScrapingException.parsing(
      'Failed to extract data',
      originalException: e,
      isRetryable: false,
    );
  }
}