// scrapeUrl method
// Scrapes a URL using the headless browser
// Implementation
/// Scrapes [url] with the headless browser and returns the result.
///
/// Optional [headers] are merged into the request; a random `User-Agent`
/// is added when the caller did not supply one. If [selectors] are given,
/// matching elements are extracted (optionally reading [attributes]
/// instead of text). Set [takeScreenshot] to capture the rendered page.
///
/// When proxy support is enabled and [useProxy] is true, a proxy is chosen
/// per attempt: [specificProxy] if provided, otherwise the next (preferably
/// validated) proxy from the proxy manager. Proxy failures never abort the
/// scrape — the request falls back to a direct connection.
///
/// Retries up to `_maxRetries` additional times (so `_maxRetries + 1`
/// attempts in total) with a fixed 1-second delay between attempts, and
/// always returns a [HeadlessBrowserResult] — it never throws.
Future<HeadlessBrowserResult> scrapeUrl(
  String url, {
  Map<String, String>? headers,
  Map<String, String>? selectors,
  Map<String, String>? attributes,
  bool takeScreenshot = false,
  int? timeoutMillis,
  bool useProxy = true,
  Proxy? specificProxy,
}) async {
  final startTime = DateTime.now();
  int retryCount = 0;
  Proxy? currentProxy;

  while (retryCount <= _maxRetries) {
    try {
      // Set up proxy if needed. All proxy handling is best-effort: any
      // failure here logs and falls through to a direct connection.
      if (_useProxies && useProxy) {
        try {
          if (specificProxy != null) {
            currentProxy = specificProxy;
          } else if (_rotateProxies || currentProxy == null) {
            try {
              currentProxy = _proxyManager?.getNextProxy(validated: true);
            } catch (e) {
              _logger.error('Error getting validated proxy: $e');
              // Try to fetch and validate new proxies.
              _logger.info('Attempting to fetch and validate new proxies...');
              try {
                await _proxyManager?.fetchValidatedProxies(
                  options: ProxyFilterOptions(count: 10, onlyHttps: true),
                );
                currentProxy = _proxyManager?.getNextProxy(validated: true);
              } catch (fetchError) {
                // Renamed from `e` to avoid shadowing the outer catch vars.
                _logger.error(
                  'Failed to fetch validated proxies: $fetchError',
                );
                // Try with unvalidated proxies as a fallback.
                _logger.info('Trying with unvalidated proxies...');
                try {
                  currentProxy = _proxyManager?.getNextProxy(
                    validated: false,
                  );
                  _logger.warning('Using unvalidated proxy as fallback');
                } catch (fallbackError) {
                  _logger.error(
                    'No proxies available at all: $fallbackError',
                  );
                  // Continue without proxy.
                }
              }
            }
          }
          if (currentProxy != null) {
            await _browser.setProxy(currentProxy);
            _logger.info(
              'Using proxy: ${currentProxy.host}:${currentProxy.port}',
            );
          } else {
            _logger.warning('No proxy available, proceeding without proxy');
          }
        } catch (e) {
          _logger.error('Error setting up proxy: $e');
          // Continue without proxy.
        }
      }

      // Prepare headers, adding a randomized User-Agent when the caller
      // did not provide one (helps avoid trivial bot detection).
      final combinedHeaders = <String, String>{};
      if (headers != null) {
        combinedHeaders.addAll(headers);
      }
      if (!combinedHeaders.containsKey('User-Agent')) {
        final userAgent = _userAgentManager.getRandomUserAgentForSite(url);
        combinedHeaders['User-Agent'] = userAgent;
      }

      // Navigate to the URL; a false result is treated as a retryable
      // network failure so it flows through the retry machinery below.
      final success = await _browser.navigateTo(
        url,
        headers: combinedHeaders,
        timeoutMillis: timeoutMillis,
      );
      if (!success) {
        throw ScrapingException.network(
          'Failed to load page',
          url: url,
          isRetryable: true,
        );
      }

      // Get the rendered HTML content.
      final html = await _browser.getHtml();

      // Extract structured data if selectors were provided.
      Map<String, dynamic>? extractedData;
      if (selectors != null && selectors.isNotEmpty) {
        extractedData = await _browser.extractData(
          selectors,
          attributes: attributes,
        );
      }

      // Take a screenshot if requested.
      Uint8List? screenshot;
      if (takeScreenshot) {
        screenshot = await _browser.takeScreenshot();
      }

      // Calculate elapsed time.
      final elapsedMillis =
          DateTime.now().difference(startTime).inMilliseconds;

      // Record the success so the site's reputation improves.
      _reputationTracker.recordSuccess(url);

      // Return successful result.
      return HeadlessBrowserResult.success(
        html: html,
        data: extractedData,
        screenshot: screenshot,
        elapsedMillis: elapsedMillis,
      );
    } catch (e, stackTrace) {
      // FIX: the loop performs `_maxRetries + 1` attempts in total, so the
      // denominator must be `_maxRetries + 1` — previously the last attempt
      // logged e.g. "attempt 4/3".
      _logger.error(
        'Error scraping $url '
        '(attempt ${retryCount + 1}/${_maxRetries + 1}): $e',
      );
      if (kDebugMode) {
        _logger.error(stackTrace.toString());
      }

      // Update site reputation.
      _reputationTracker.recordFailure(url, e.toString());

      // Re-validate the failing proxy so its score is updated, then drop
      // it so the next attempt selects a fresh one.
      if (currentProxy != null && _proxyManager != null) {
        // Use validateSpecificProxy with false result to mark as invalid.
        await _proxyManager.validateSpecificProxy(
          currentProxy,
          updateScore: true,
        );
        currentProxy = null;
      }

      retryCount++;
      if (retryCount <= _maxRetries) {
        _logger.info('Retrying in 1 second...');
        await Future.delayed(const Duration(seconds: 1));
      } else {
        final elapsedMillis =
            DateTime.now().difference(startTime).inMilliseconds;
        return HeadlessBrowserResult.failure(
          errorMessage: 'Failed after $retryCount attempts: $e',
          elapsedMillis: elapsedMillis,
        );
      }
    }
  }

  // Unreachable in practice (the catch branch above always returns on the
  // final attempt), kept as a defensive fallback. FIX: report elapsed time
  // like every other return path instead of omitting it.
  return HeadlessBrowserResult.failure(
    errorMessage: 'Unknown error occurred',
    elapsedMillis: DateTime.now().difference(startTime).inMilliseconds,
  );
}