fetchHtmlStream method

Future<Stream<List<int>>> fetchHtmlStream({
  required String url,
  Map<String, String>? headers,
  int? timeout,
  int? retries,
  bool ignoreRobotsTxt = false,
})

Fetches HTML content as a stream from the given URL.

`url` is the URL to fetch. `headers` are additional headers to send with the request. `timeout` is the timeout for the request in milliseconds. `retries` is the number of retry attempts. `ignoreRobotsTxt` is whether to ignore robots.txt rules (default: false).

Implementation

/// Fetches the content at [url] and returns the response body as a stream of
/// byte chunks.
///
/// A URL that starts with neither `http://` nor `https://` is prefixed with
/// `https://` before anything else is done with it, so the robots.txt check,
/// the strategy lookup and the success/failure bookkeeping all use the same
/// URL.
///
/// Request headers are built from the default `User-Agent`, the default
/// headers, the caller's [headers] and finally the headers of the adaptive
/// strategy chosen for the URL; later sources override earlier ones. The
/// effective timeout (in milliseconds) is the larger of [timeout] (or the
/// default) and the strategy's timeout; the effective retry count is derived
/// from [retries] the same way.
///
/// NOTE(review): the effective retry count is only logged by this method; no
/// retry loop is performed here.
///
/// Unless [ignoreRobotsTxt] is `true` or robots.txt handling is disabled, the
/// URL is checked against robots.txt first.
///
/// If a specialized handler is registered for the URL, it is asked for the
/// whole document and the result is returned as a single-chunk UTF-8 stream.
/// If that handler throws, the standard proxy-based request is used instead.
///
/// Throws a [ScrapingException] when robots.txt disallows the URL, when the
/// response status is outside 200-299, or when sending the request fails
/// (including a timeout while waiting for `send` to complete).
Future<Stream<List<int>>> fetchHtmlStream({
  required String url,
  Map<String, String>? headers,
  int? timeout,
  int? retries,
  bool ignoreRobotsTxt = false,
}) async {
  // Normalize the URL up front. Doing this after the robots.txt check and the
  // strategy lookup would make those see a different URL than the one that
  // is actually requested and recorded.
  if (!url.startsWith('http://') && !url.startsWith('https://')) {
    url = 'https://$url';
    _logger.info('URL scheme added: $url');
  }

  // Caller-supplied headers override the defaults.
  final requestedHeaders = {
    'User-Agent': _defaultUserAgent,
    ..._defaultHeaders,
    ...?headers,
  };

  final requestedTimeout = timeout ?? _defaultTimeout;
  final requestedRetries = retries ?? _maxRetries;

  // Check robots.txt if enabled and not explicitly ignored.
  if (_respectRobotsTxt && !ignoreRobotsTxt) {
    final userAgent = requestedHeaders['User-Agent'] ?? _defaultUserAgent;
    final isAllowed = await _robotsTxtHandler.isAllowed(url, userAgent);

    if (!isAllowed) {
      _logger.warning('URL not allowed by robots.txt: $url');
      throw ScrapingException.robotsTxt(
        'URL not allowed by robots.txt',
        url: url,
        isRetryable: false,
      );
    }
  }

  // Get the strategy for this URL and combine it with the requested values:
  // the larger retry count / timeout wins, and strategy headers override.
  final strategy = _adaptiveStrategy.getStrategyForUrl(url);

  final effectiveRetries =
      strategy.retries > requestedRetries ? strategy.retries : requestedRetries;
  final effectiveTimeout =
      strategy.timeout > requestedTimeout ? strategy.timeout : requestedTimeout;
  final effectiveHeaders = Map<String, String>.from(requestedHeaders)
    ..addAll(strategy.headers);

  _logger.info(
    'Using strategy for stream: retries=$effectiveRetries, timeout=${effectiveTimeout}ms',
  );

  // Prefer a specialized handler when one is registered for this URL.
  if (_specializedHandlers.hasHandlerForUrl(url)) {
    _logger.info('Using specialized handler for URL stream: $url');
    try {
      final handler = _specializedHandlers.getHandlerForUrl(url)!;
      final html = await handler.fetchHtml(
        url: url,
        headers: effectiveHeaders,
        timeout: effectiveTimeout,
        logger: _logger,
      );

      // The handler returns the whole document as a string; expose it as a
      // single-chunk byte stream.
      return Stream.value(utf8.encode(html));
    } catch (e) {
      // Deliberate best-effort: any handler failure falls through to the
      // standard mechanism below.
      _logger.error('Specialized handler failed for stream: $e');
      _logger.info('Falling back to standard fetching mechanism for stream');
    }
  }

  // Route the request through the next proxy.
  final proxy = proxyManager.getNextProxy(
    validated: strategy.validateProxies,
  );
  _httpClient.setProxy(proxy);
  _logger.proxy('Using proxy for stream: ${proxy.ip}:${proxy.port}');

  final request = http.Request('GET', Uri.parse(url));
  request.headers.addAll(effectiveHeaders);
  _logger.request('Sending stream request to $url');

  // Set once an HTTP-status failure has been recorded and logged, so the
  // catch block below does not record and log the same failure again.
  var failureRecorded = false;

  try {
    // The timeout bounds the wait for `send` to complete; the returned
    // stream itself is handed back to the caller as-is.
    final response = await _httpClient
        .send(request)
        .timeout(Duration(milliseconds: effectiveTimeout));

    final statusCode = response.statusCode;
    if (statusCode >= 200 && statusCode < 300) {
      _adaptiveStrategy.recordSuccess(url);
      _logger.success('Stream request successful');
      return response.stream;
    }

    // Non-2xx: record the failure once, then raise the matching exception.
    final errorMessage = 'HTTP error: $statusCode';
    _adaptiveStrategy.recordFailure(url, errorMessage);
    _logger.error(errorMessage);
    failureRecorded = true;

    if (statusCode == 429) {
      throw ScrapingException.rateLimit(
        'Rate limit exceeded',
        url: url,
        statusCode: statusCode,
        isRetryable: true,
      );
    } else if (statusCode == 403) {
      throw ScrapingException.permission(
        'Access forbidden',
        url: url,
        statusCode: statusCode,
        isRetryable: false,
      );
    } else if (statusCode == 401) {
      throw ScrapingException.authentication(
        'Authentication required',
        url: url,
        statusCode: statusCode,
        isRetryable: false,
      );
    } else if (statusCode >= 500) {
      throw ScrapingException.http(
        'Server error',
        url: url,
        statusCode: statusCode,
        isRetryable: true,
      );
    } else {
      // 429 and 5xx are handled above, so every status reaching this branch
      // is non-retryable.
      throw ScrapingException.http(
        errorMessage,
        url: url,
        statusCode: statusCode,
        isRetryable: false,
      );
    }
  } catch (e) {
    // Failures that were not already recorded above (send errors, timeouts,
    // and anything else thrown inside the try) are recorded and logged here.
    if (!failureRecorded) {
      _adaptiveStrategy.recordFailure(url, e.toString());
      _logger.error('Stream error: $e');
    }

    if (e is ScrapingException) {
      rethrow;
    }
    throw ScrapingException.network(
      'Failed to fetch URL stream',
      originalException: e,
      url: url,
      isRetryable: true,
    );
  }
}