extractStructuredData method

List<Map<String, String>> extractStructuredData({
  required String html,
  required Map<String, String> selectors,
  Map<String, String?>? attributes,
  bool validateContent = true,
  bool validateSelectors = true,
  List<String> requiredFields = const [],
})

Parses HTML content and extracts structured data using CSS selectors.

- `html` is the HTML content to parse.
- `selectors` is a map of field names to CSS selectors.
- `attributes` is a map of field names to attributes to extract (optional).
- `validateContent`: whether to validate and clean the extracted content.
- `validateSelectors`: whether to validate and repair the selectors.
- `requiredFields`: fields that must be present and non-empty.

Implementation

/// Parses [html] and extracts a list of records using CSS [selectors].
///
/// Each key of [selectors] is a field name and each value is the CSS selector
/// used to locate that field's elements. The i-th match of every selector is
/// combined into the i-th record, so the number of candidate records equals
/// the largest match count of any selector. Fields whose selector has fewer
/// matches (or could not be queried) are filled with an empty string.
///
/// * [attributes] optionally maps a field name to the attribute to read from
///   the matched element. Fields without an entry (or with a `null` entry)
///   use the element's trimmed text content instead.
/// * [validateSelectors] runs the selectors through the selector validator
///   first and substitutes the repaired selector when one is offered.
/// * [validateContent] passes the extracted records through the structured
///   data validator's cleaning step before they are returned.
/// * [requiredFields] lists fields that must be present and non-blank; any
///   record missing one of them is dropped from the result.
///
/// Records in which every field is empty are discarded. Returns an empty list
/// when no selector matches anything.
///
/// Throws a [ScrapingException] (parsing, non-retryable) if anything fails
/// during extraction.
List<Map<String, String>> extractStructuredData({
  required String html,
  required Map<String, String> selectors,
  Map<String, String?>? attributes,
  bool validateContent = true,
  bool validateSelectors = true,
  List<String> requiredFields = const [],
}) {
  try {
    // Log the inputs for debugging.
    _logger.info('Extracting structured data with selectors: $selectors');
    if (attributes != null) {
      _logger.info('Using attributes: $attributes');
    }

    final document = html_parser.parse(html);
    final effectiveSelectors = <String, String>{};

    // Validate and, where possible, repair the selectors.
    if (validateSelectors) {
      final validationResults = _selectorValidator
          .validateSelectorsWithDocument(selectors, document);

      for (final entry in validationResults.entries) {
        final field = entry.key;
        final result = entry.value;
        final repaired = result.repairedSelector;

        if (result.isValid) {
          effectiveSelectors[field] = result.originalSelector;
        } else if (repaired != null) {
          _logger.warning(
            'Invalid selector for field "$field": ${result.originalSelector}. '
            'Using repaired selector: $repaired',
          );
          effectiveSelectors[field] = repaired;
        } else {
          _logger.error(
            'Invalid selector for field "$field": ${result.originalSelector}. '
            '${result.errorMessage}',
          );
          // Use the original selector anyway, it might still work partially.
          effectiveSelectors[field] = result.originalSelector;
        }
      }
    } else {
      effectiveSelectors.addAll(selectors);
    }

    // Run every selector exactly once. The document does not change during
    // extraction, so the matches can be reused for every item instead of
    // re-querying the whole document per field per item. A `null` value marks
    // a selector whose query threw; the error is kept for per-item reporting.
    final queryErrors = <String, Object>{};
    final matchesByField = effectiveSelectors.map((field, selector) {
      try {
        final elements = document.querySelectorAll(selector);
        _logger.info(
          'Found ${elements.length} elements for field "$field" with selector "$selector"',
        );
        return MapEntry(field, elements);
      } catch (e) {
        _logger.warning('Error querying selector for field "$field": $e');
        queryErrors[field] = e;
        return MapEntry(field, null);
      }
    });

    // The selector with the most matches determines how many items exist.
    var maxItems = 0;
    for (final elements in matchesByField.values) {
      if (elements != null && elements.length > maxItems) {
        maxItems = elements.length;
      }
    }

    _logger.info('Maximum items found: $maxItems');

    if (maxItems == 0) {
      _logger.warning('No elements found for any selector');
      return [];
    }

    final result = <Map<String, String>>[];

    // Build one record per index, pairing up the i-th match of each field.
    for (var i = 0; i < maxItems; i++) {
      final item = <String, String>{};

      for (final entry in matchesByField.entries) {
        final field = entry.key;
        final elements = entry.value;

        if (elements == null) {
          // The selector itself could not be queried.
          _logger.warning(
            'Error extracting field "$field" in item $i: ${queryErrors[field]}',
          );
          item[field] = '';
          continue;
        }

        if (i >= elements.length) {
          _logger.warning('No element found for field "$field" in item $i');
          item[field] = '';
          continue;
        }

        try {
          final element = elements[i];
          final attribute = attributes?[field];

          if (attribute != null) {
            final value = element.attributes[attribute] ?? '';
            if (value.isEmpty) {
              _logger.warning(
                'Attribute "$attribute" not found or empty for field "$field" in item $i',
              );
            }
            item[field] = value;
          } else {
            final text = element.text.trim();
            if (text.isEmpty) {
              _logger.warning(
                'Text content is empty for field "$field" in item $i',
              );
            }
            item[field] = text;
          }
        } catch (e) {
          // Best effort: a failure on one field must not lose the whole item.
          _logger.warning('Error extracting field "$field" in item $i: $e');
          item[field] = '';
        }
      }

      // Only keep the item if it has at least one non-empty field.
      if (item.values.any((value) => value.isNotEmpty)) {
        result.add(item);
      }
    }

    _logger.info('Extracted ${result.length} structured data items');

    // Validate and clean the content if requested.
    final List<Map<String, String>> extracted = validateContent
        ? _structuredDataValidator.cleanStructuredDataList(result)
        : result;

    if (requiredFields.isEmpty) {
      return extracted;
    }

    // Enforce required fields on the final values, so that a field blanked
    // out by cleaning counts as missing as well.
    final complete = extracted
        .where(
          (item) => requiredFields.every(
            (field) => (item[field] ?? '').trim().isNotEmpty,
          ),
        )
        .toList();

    final dropped = extracted.length - complete.length;
    if (dropped > 0) {
      _logger.warning(
        'Dropped $dropped items missing required fields: $requiredFields',
      );
    }

    return complete;
  } catch (e) {
    _logger.error('Failed to extract structured data: $e');
    throw ScrapingException.parsing(
      'Failed to extract structured data',
      originalException: e,
      isRetryable: false,
    );
  }
}