/// Fetches HTML content as a stream from the given [url].
///
/// [headers] are additional headers to send with the request.
/// [timeout] is the timeout for the request in milliseconds.
/// [retries] is the number of retry attempts.
/// [ignoreRobotsTxt] controls whether robots.txt rules are ignored
/// (default: false).
///
/// Throws a [ScrapingException] when the URL is disallowed by robots.txt,
/// the server answers with a non-2xx status, or all retry attempts fail.
Future<Stream<List<int>>> fetchHtmlStream({
  required String url,
  Map<String, String>? headers,
  int? timeout,
  int? retries,
  bool ignoreRobotsTxt = false,
}) async {
  // Normalize the URL scheme up front so the robots.txt check, the adaptive
  // strategy lookup, and the specialized-handler lookup all see the same
  // canonical URL. (Previously the scheme was added only AFTER the robots.txt
  // check, so scheme-less URLs were keyed inconsistently.)
  if (!url.startsWith('http://') && !url.startsWith('https://')) {
    url = 'https://$url';
    _logger.info('URL scheme added: $url');
  }

  final effectiveHeaders = {
    'User-Agent': _defaultUserAgent,
    ..._defaultHeaders,
    ...?headers,
  };
  final baseTimeout = timeout ?? _defaultTimeout;
  final baseRetries = retries ?? _maxRetries;

  // Check robots.txt if enabled and not explicitly ignored.
  if (_respectRobotsTxt && !ignoreRobotsTxt) {
    final userAgent = effectiveHeaders['User-Agent'] ?? _defaultUserAgent;
    final isAllowed = await _robotsTxtHandler.isAllowed(url, userAgent);
    if (!isAllowed) {
      _logger.warning('URL not allowed by robots.txt: $url');
      throw ScrapingException.robotsTxt(
        'URL not allowed by robots.txt',
        url: url,
        isRetryable: false,
      );
    }
  }

  // Combine the caller-supplied parameters with the adaptive strategy for
  // this URL, taking the larger retry count / timeout of the two.
  final strategy = _adaptiveStrategy.getStrategyForUrl(url);
  final maxRetries =
      strategy.retries > baseRetries ? strategy.retries : baseRetries;
  final effectiveTimeout =
      strategy.timeout > baseTimeout ? strategy.timeout : baseTimeout;
  final mergedHeaders = Map<String, String>.from(effectiveHeaders)
    ..addAll(strategy.headers);
  _logger.info(
    'Using strategy for stream: retries=$maxRetries, timeout=${effectiveTimeout}ms',
  );

  // Prefer a specialized handler when one is registered for this URL; fall
  // back to the standard mechanism if it fails.
  if (_specializedHandlers.hasHandlerForUrl(url)) {
    _logger.info('Using specialized handler for URL stream: $url');
    try {
      final handler = _specializedHandlers.getHandlerForUrl(url)!;
      final html = await handler.fetchHtml(
        url: url,
        headers: mergedHeaders,
        timeout: effectiveTimeout,
        logger: _logger,
      );
      // Convert the HTML string to a single-chunk byte stream.
      return Stream.value(utf8.encode(html));
    } catch (e) {
      _logger.error('Specialized handler failed for stream: $e');
      _logger.info('Falling back to standard fetching mechanism for stream');
      // Fall through to the standard mechanism.
    }
  }

  // Standard mechanism with retries. NOTE(fix): the retry budget used to be
  // computed and logged but never applied — the request was only ever
  // attempted once. Each attempt gets a fresh proxy and a fresh Request
  // (a Request cannot be reused after send()).
  ScrapingException? lastError;
  for (var attempt = 0; attempt <= maxRetries; attempt++) {
    if (attempt > 0) {
      _logger.info('Retrying stream request ($attempt/$maxRetries): $url');
    }

    // Get a fresh proxy and install it in the HTTP client.
    final proxy = proxyManager.getNextProxy(
      validated: strategy.validateProxies,
    );
    _httpClient.setProxy(proxy);
    _logger.proxy('Using proxy for stream: ${proxy.ip}:${proxy.port}');

    final request = http.Request('GET', Uri.parse(url));
    request.headers.addAll(mergedHeaders);
    _logger.request('Sending stream request to $url');

    try {
      // Send the request, bounding the wait with the effective timeout.
      final response = await _httpClient
          .send(request)
          .timeout(Duration(milliseconds: effectiveTimeout));

      if (response.statusCode >= 200 && response.statusCode < 300) {
        // Record success for this URL and hand the body stream to the caller.
        _adaptiveStrategy.recordSuccess(url);
        _logger.success('Stream request successful');
        return response.stream;
      }

      // Non-2xx: map the status code to the appropriate exception type.
      throw _streamStatusException(response.statusCode, url);
    } catch (e) {
      // Record the failure for the adaptive strategy exactly once per
      // attempt (HTTP errors were previously recorded twice: once with the
      // status message and again with the thrown exception's toString()).
      _adaptiveStrategy.recordFailure(url, e.toString());
      _logger.error('Stream error: ${e.toString()}');

      final wrapped = e is ScrapingException
          ? e
          : ScrapingException.network(
              'Failed to fetch URL stream',
              originalException: e,
              url: url,
              isRetryable: true,
            );
      // Non-retryable failures (401, 403, robots.txt, ...) and an exhausted
      // retry budget propagate immediately.
      // NOTE(review): assumes ScrapingException exposes an `isRetryable`
      // getter matching the constructor argument — confirm in its definition.
      if (!wrapped.isRetryable || attempt == maxRetries) {
        throw wrapped;
      }
      lastError = wrapped;
    }
  }

  // Unreachable: the loop always returns or rethrows on its final attempt.
  throw lastError ??
      ScrapingException.network(
        'Failed to fetch URL stream',
        url: url,
        isRetryable: false,
      );
}

/// Maps a non-2xx HTTP [statusCode] for [url] to the matching
/// [ScrapingException] variant, preserving the original retryability rules
/// (429 and 5xx are retryable; 401/403 and other 4xx are not).
ScrapingException _streamStatusException(int statusCode, String url) {
  if (statusCode == 429) {
    return ScrapingException.rateLimit(
      'Rate limit exceeded',
      url: url,
      statusCode: statusCode,
      isRetryable: true,
    );
  }
  if (statusCode == 403) {
    return ScrapingException.permission(
      'Access forbidden',
      url: url,
      statusCode: statusCode,
      isRetryable: false,
    );
  }
  if (statusCode == 401) {
    return ScrapingException.authentication(
      'Authentication required',
      url: url,
      statusCode: statusCode,
      isRetryable: false,
    );
  }
  if (statusCode >= 500) {
    return ScrapingException.http(
      'Server error',
      url: url,
      statusCode: statusCode,
      isRetryable: true,
    );
  }
  // Remaining codes are < 500 and not 429, so the original expression
  // `statusCode >= 500 || statusCode == 429` was always false here.
  return ScrapingException.http(
    'HTTP error: $statusCode',
    url: url,
    statusCode: statusCode,
    isRetryable: false,
  );
}