extractData method

List<String> extractData({
  required String html,
  required String selector,
  String? attribute,
  bool asText = true,
  bool validateContent = true,
  bool validateSelector = true,
})

Parses HTML content and extracts data using CSS selectors.

- html is the HTML content to parse.
- selector is the CSS selector to use.
- attribute is the attribute to extract (optional).
- asText is whether to extract the text content (default: true).
- validateContent is whether to validate and clean the extracted content (default: true).
- validateSelector is whether to validate and repair the selector (default: true).

Implementation

/// Parses [html] and returns one string per element matched by [selector].
///
/// For each matched element the extracted value is, in order of precedence:
/// the value of [attribute] when it is non-null (an empty string when the
/// element lacks that attribute), the trimmed text content when [asText] is
/// true, and the element's outer HTML otherwise.
///
/// When [validateSelector] is true, the selector is first checked with
/// `_selectorValidator`: a repaired selector is used if one is offered, and
/// an empty list is returned if the selector is invalid and has no repair.
/// If the effective selector matches nothing, the first suggested alternative
/// selector is tried instead.
///
/// When [validateContent] is true, the extracted values are passed through
/// `_contentValidator.cleanContentList` before being returned.
///
/// Returns an empty list when no elements match.
///
/// Throws a non-retryable parsing [ScrapingException] wrapping anything
/// thrown while parsing or extracting.
List<String> extractData({
  required String html,
  required String selector,
  String? attribute,
  bool asText = true,
  bool validateContent = true,
  bool validateSelector = true,
}) {
  try {
    // Log the request parameters for debugging.
    _logger.info('Extracting data with selector: $selector');
    if (attribute != null) {
      _logger.info('Using attribute: $attribute');
    }

    final document = html_parser.parse(html);
    var effectiveSelector = selector;

    // Validate the selector and fall back to its repaired form if offered.
    if (validateSelector) {
      final validationResult = _selectorValidator
          .validateSelectorWithDocument(selector, document);

      if (!validationResult.isValid) {
        // Copy to a local so the nullable value promotes without `!`.
        final repairedSelector = validationResult.repairedSelector;
        if (repairedSelector == null) {
          _logger.error(
            'Invalid selector: $selector. ${validationResult.errorMessage}',
          );
          return [];
        }
        _logger.warning(
          'Invalid selector: $selector. Using repaired selector: $repairedSelector',
        );
        effectiveSelector = repairedSelector;
      }
    }

    var elements = document.querySelectorAll(effectiveSelector);
    _logger.info('Found ${elements.length} elements matching selector');

    // Nothing matched: optionally retry once with the first suggested
    // alternative. On success the matches flow through the same extraction
    // code below, so both paths share one implementation and one set of
    // diagnostics.
    if (elements.isEmpty) {
      _logger.warning(
        'No elements found matching selector: $effectiveSelector',
      );
      if (!validateSelector) return [];

      final alternatives = _selectorValidator
          .suggestAlternativesWithDocument(effectiveSelector, document);
      if (alternatives.isEmpty) return [];

      _logger.info(
        'Suggested alternative selectors: ${alternatives.join(', ')}',
      );

      // Only the first suggestion is tried.
      final fallbackSelector = alternatives.first;
      elements = document.querySelectorAll(fallbackSelector);
      if (elements.isEmpty) return [];

      _logger.info(
        'Found ${elements.length} elements with alternative selector: $fallbackSelector',
      );
    }

    // Extract one value per element; precedence is attribute, then text,
    // then outer HTML.
    final results = elements.map((element) {
      if (attribute != null) {
        final value = element.attributes[attribute] ?? '';
        if (value.isEmpty) {
          _logger.warning(
            'Attribute "$attribute" not found or empty in element',
          );
        }
        return value;
      }
      if (asText) {
        final text = element.text.trim();
        if (text.isEmpty) {
          _logger.warning('Text content is empty in element');
        }
        return text;
      }
      return element.outerHtml;
    }).toList();

    _logger.info('Extracted ${results.length} items');

    // Validate and clean the content if requested.
    return validateContent
        ? _contentValidator.cleanContentList(results)
        : results;
  } catch (e) {
    // Any failure is reported to callers as a non-retryable parsing error.
    _logger.error('Failed to extract data: $e');
    throw ScrapingException.parsing(
      'Failed to extract data',
      originalException: e,
      isRetryable: false,
    );
  }
}