scrapeUrl method

Future<HeadlessBrowserResult> scrapeUrl(
  String url, {
  Map<String, String>? headers,
  Map<String, String>? selectors,
  Map<String, String>? attributes,
  bool takeScreenshot = false,
  int? timeoutMillis,
  bool useProxy = true,
  Proxy? specificProxy,
})

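Scrapes a URL using the headless browser, retrying failed attempts, and returns a HeadlessBrowserResult containing the page HTML plus any extracted data and screenshot that were requested.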

Implementation

/// Scrapes [url] with the headless browser, retrying failed attempts.
///
/// Each attempt optionally configures a proxy, navigates to [url], reads the
/// page HTML, and then performs the optional extraction and screenshot steps.
/// Any error raised during an attempt is caught, logged, recorded with the
/// reputation tracker, and followed by a retry after a one-second pause, for
/// at most `_maxRetries` retries (`_maxRetries + 1` attempts in total).
///
/// * [headers] are sent with the navigation request. When they contain no
///   `User-Agent` key, one is obtained from the user agent manager for [url].
/// * [selectors] and [attributes] are forwarded to the browser's
///   `extractData`; extraction is skipped when [selectors] is null or empty.
/// * [takeScreenshot] requests a screenshot of the loaded page.
/// * [timeoutMillis] is forwarded to the browser's `navigateTo`.
/// * [useProxy] must be true, together with the instance-level `_useProxies`
///   flag, for any proxy to be configured.
/// * [specificProxy], when given, is used on every attempt instead of a proxy
///   taken from the proxy manager.
///
/// Returns a successful [HeadlessBrowserResult] carrying the HTML, extracted
/// data, screenshot, and elapsed time, or a failure result describing the
/// last error once all attempts are exhausted.
Future<HeadlessBrowserResult> scrapeUrl(
  String url, {
  Map<String, String>? headers,
  Map<String, String>? selectors,
  Map<String, String>? attributes,
  bool takeScreenshot = false,
  int? timeoutMillis,
  bool useProxy = true,
  Proxy? specificProxy,
}) async {
  // A Stopwatch is monotonic, so the reported duration cannot be skewed by
  // wall-clock adjustments that happen while the scrape is running.
  final stopwatch = Stopwatch()..start();
  // One initial attempt plus up to _maxRetries retries.
  final totalAttempts = _maxRetries + 1;
  int retryCount = 0;
  Proxy? currentProxy;

  while (retryCount <= _maxRetries) {
    try {
      // Set up proxy if needed. Every failure in this section is logged and
      // swallowed on purpose: the scrape proceeds without a proxy instead.
      if (_useProxies && useProxy) {
        try {
          if (specificProxy != null) {
            currentProxy = specificProxy;
          } else if (_rotateProxies || currentProxy == null) {
            // Fallback chain: validated proxy -> fetch fresh validated
            // proxies -> unvalidated proxy -> no proxy.
            try {
              currentProxy = _proxyManager?.getNextProxy(validated: true);
            } catch (e) {
              _logger.error('Error getting validated proxy: $e');

              // Try to fetch and validate new proxies
              _logger.info('Attempting to fetch and validate new proxies...');
              try {
                await _proxyManager?.fetchValidatedProxies(
                  options: ProxyFilterOptions(count: 10, onlyHttps: true),
                );
                currentProxy = _proxyManager?.getNextProxy(validated: true);
              } catch (e) {
                _logger.error('Failed to fetch validated proxies: $e');

                // Try with unvalidated proxies as a fallback
                _logger.info('Trying with unvalidated proxies...');
                try {
                  currentProxy = _proxyManager?.getNextProxy(
                    validated: false,
                  );
                  _logger.warning('Using unvalidated proxy as fallback');
                } catch (e) {
                  _logger.error('No proxies available at all: $e');
                  // Continue without proxy
                }
              }
            }
          }

          if (currentProxy != null) {
            await _browser.setProxy(currentProxy);
            _logger.info(
              'Using proxy: ${currentProxy.host}:${currentProxy.port}',
            );
          } else {
            _logger.warning('No proxy available, proceeding without proxy');
          }
        } catch (e) {
          _logger.error('Error setting up proxy: $e');
          // Continue without proxy
        }
      }

      // Prepare headers; caller-supplied values take precedence and a
      // User-Agent is only generated when the caller did not provide one.
      final combinedHeaders = <String, String>{...?headers};

      if (!combinedHeaders.containsKey('User-Agent')) {
        final userAgent = _userAgentManager.getRandomUserAgentForSite(url);
        combinedHeaders['User-Agent'] = userAgent;
      }

      // Navigate to URL
      final success = await _browser.navigateTo(
        url,
        headers: combinedHeaders,
        timeoutMillis: timeoutMillis,
      );

      if (!success) {
        // Thrown so that the shared failure handling below (logging,
        // reputation update, retry) also covers an unsuccessful navigation.
        throw ScrapingException.network(
          'Failed to load page',
          url: url,
          isRetryable: true,
        );
      }

      // Get HTML content
      final html = await _browser.getHtml();

      // Extract data if selectors provided
      Map<String, dynamic>? extractedData;
      if (selectors != null && selectors.isNotEmpty) {
        extractedData = await _browser.extractData(
          selectors,
          attributes: attributes,
        );
      }

      // Take screenshot if requested
      Uint8List? screenshot;
      if (takeScreenshot) {
        screenshot = await _browser.takeScreenshot();
      }

      // Elapsed time covers all attempts, including retry delays.
      final elapsedMillis = stopwatch.elapsedMilliseconds;

      // Update site reputation
      _reputationTracker.recordSuccess(url);

      // Return successful result
      return HeadlessBrowserResult.success(
        html: html,
        data: extractedData,
        screenshot: screenshot,
        elapsedMillis: elapsedMillis,
      );
    } catch (e, stackTrace) {
      _logger.error(
        'Error scraping $url (attempt ${retryCount + 1}/$totalAttempts): $e',
      );
      if (kDebugMode) {
        _logger.error(stackTrace.toString());
      }

      // Update site reputation
      _reputationTracker.recordFailure(url, e.toString());

      // Re-check the proxy that was in use for the failed attempt.
      // Copied to a local so the null check promotes the type.
      final proxyManager = _proxyManager;
      if (currentProxy != null && proxyManager != null) {
        // NOTE(review): this re-validates the proxy with updateScore: true;
        // it does not pass an explicit "invalid" result. Confirm that this
        // is how the proxy manager is expected to penalise a failing proxy.
        try {
          await proxyManager.validateSpecificProxy(
            currentProxy,
            updateScore: true,
          );
        } catch (validationError) {
          // A failure while re-validating must not escape the retry loop,
          // otherwise the caller would get an exception instead of a result.
          _logger.error(
            'Error validating proxy after failure: $validationError',
          );
        }
        // Drop the proxy so the next attempt selects a new one.
        currentProxy = null;
      }

      retryCount++;

      if (retryCount <= _maxRetries) {
        _logger.info('Retrying in 1 second...');
        await Future.delayed(const Duration(seconds: 1));
      } else {
        return HeadlessBrowserResult.failure(
          errorMessage: 'Failed after $retryCount attempts: $e',
          elapsedMillis: stopwatch.elapsedMilliseconds,
        );
      }
    }
  }

  // Only reachable when _maxRetries is negative, in which case the loop body
  // never ran and no attempt was made.
  return HeadlessBrowserResult.failure(
    errorMessage: 'Unknown error occurred',
  );
}