fetchHtml method
Future<String> fetchHtml({
  required String url,
  required Map<String, String> headers,
  required int timeout,
  required ScrapingLogger logger,
})
override
Fetches HTML content from the given URL.
Implementation
/// Fetches the HTML at [url] using a fallback strategy specific to
/// onlinekhabar.com.
///
/// For every user agent returned by
/// `_userAgentManager.getUserAgentSequenceForProblematicSite(url)`, up to
/// three attempts are made in order; the body of the first attempt that
/// succeeds is returned:
///
/// 1. `_fetchWithHttpClient` with the enhanced headers.
/// 2. `http.get` on [url], accepting any 2xx status.
/// 3. `http.get` on [url] with an explicit `:443` port removed from its
///    authority (skipped when the URL carries no such port).
///
/// [headers] are merged over the browser-like defaults built here and win on
/// conflicts, except for `User-Agent` (matched case-insensitively): that
/// header is always the user agent currently being tried, otherwise every
/// iteration of the rotation would send the same value.
///
/// [timeout] is forwarded to `_fetchWithHttpClient` and is used as a
/// per-request limit in milliseconds for the `http.get` attempts. Progress
/// and failures are reported through [logger].
///
/// Throws an [Exception] when every attempt with every user agent has failed.
@override
Future<String> fetchHtml({
  required String url,
  required Map<String, String> headers,
  required int timeout,
  required ScrapingLogger logger,
}) async {
  logger.info('Using specialized handler for onlinekhabar.com');

  // Get a sequence of user agents to try for this site.
  final userAgents = _userAgentManager.getUserAgentSequenceForProblematicSite(
    url,
  );
  logger.info('Prepared ${userAgents.length} user agents to try');

  // Loop-invariant work, hoisted out of the user-agent loop.
  //
  // Caller headers keep overriding the defaults below, but a caller-supplied
  // User-Agent (in any letter case) is dropped so the rotating one is used.
  final callerHeaders = {
    for (final entry in headers.entries)
      if (entry.key.toLowerCase() != 'user-agent') entry.key: entry.value,
  };
  // `null` when the URL has no explicit `:443` port in its authority.
  final alternativeUrl = _withoutExplicitPort443(url);

  for (final userAgent in userAgents) {
    logger.info('Trying with user agent: ${_truncateUserAgent(userAgent)}');

    // Enhanced headers specifically for onlinekhabar.com.
    //
    // NOTE(review): `Accept-Encoding` advertises `deflate` and `br`. As far as
    // I know dart:io only auto-decompresses gzip responses, so a body encoded
    // with one of the other schemes may be returned still compressed —
    // confirm before relying on this header.
    final enhancedHeaders = {
      'User-Agent': userAgent,
      'Accept':
          'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
      'Accept-Language': 'en-US,en;q=0.5',
      'Accept-Encoding': 'gzip, deflate, br',
      'Connection': 'keep-alive',
      'Upgrade-Insecure-Requests': '1',
      'Cache-Control': 'max-age=0',
      'Sec-Fetch-Dest': 'document',
      'Sec-Fetch-Mode': 'navigate',
      'Sec-Fetch-Site': 'none',
      'Sec-Fetch-User': '?1',
      'Pragma': 'no-cache',
      ...callerHeaders,
    };

    // Try different approaches with this user agent.
    try {
      // First try: HttpClient approach.
      logger.info('Attempting direct HttpClient approach');
      try {
        final result = await _fetchWithHttpClient(
          url,
          enhancedHeaders,
          timeout,
          logger,
        );
        logger.success('Successfully fetched with HttpClient');
        return result;
      } catch (e) {
        // Best effort: log and fall through to the next approach.
        logger.error('HttpClient approach failed: $e');
      }

      // Second try: http package.
      logger.info('Attempting with http package');
      try {
        final response = await http
            .get(Uri.parse(url), headers: enhancedHeaders)
            .timeout(Duration(milliseconds: timeout));
        if (response.statusCode >= 200 && response.statusCode < 300) {
          logger.success('Successfully fetched with http package');
          return response.body;
        } else {
          logger.error('HTTP error: ${response.statusCode}');
        }
      } catch (e) {
        logger.error('http package approach failed: $e');
      }

      // Third try: alternative URL format (without port).
      //
      // NOTE(review): for `https` URLs `Uri.parse` already omits the default
      // port 443, so this retry presumably repeats the previous request —
      // confirm whether it is still needed.
      if (alternativeUrl != null) {
        logger.info('Attempting with alternative URL format (without port)');
        try {
          final response = await http
              .get(Uri.parse(alternativeUrl), headers: enhancedHeaders)
              .timeout(Duration(milliseconds: timeout));
          if (response.statusCode >= 200 && response.statusCode < 300) {
            logger.success('Successfully fetched with alternative URL');
            return response.body;
          } else {
            logger.error(
              'HTTP error with alternative URL: ${response.statusCode}',
            );
          }
        } catch (e) {
          logger.error('Alternative URL approach failed: $e');
        }
      }
    } catch (e) {
      // Safety net: the inner handlers cover the network calls, so this is
      // only reached if something outside them (e.g. logging) throws.
      logger.error(
        'Unexpected error with user agent ${_truncateUserAgent(userAgent)}: $e',
      );
    }
    // All approaches with this user agent failed; move on to the next one.
  }

  // All user agents and approaches failed.
  throw Exception(
    'All approaches failed for onlinekhabar.com after trying ${userAgents.length} user agents',
  );
}

/// Returns [url] with an explicit `:443` port removed from its authority, or
/// `null` when the authority carries no such port.
///
/// Only a `:443` that terminates the authority (followed by `/`, `?`, `#` or
/// the end of the string) is removed. Ports that merely start with `443`
/// (such as `:4430`) and any `:443` text inside the userinfo, path, query or
/// fragment are left untouched.
static String? _withoutExplicitPort443(String url) {
  const portSuffix = ':443';
  final explicitPort = RegExp(
    r'^[A-Za-z][A-Za-z0-9+.\-]*://[^/?#]*?:443(?=[/?#]|$)',
  );
  final match = explicitPort.firstMatch(url);
  if (match == null) return null;

  // The lookahead is zero-width, so the match ends right after `:443`.
  final beforePort = url.substring(0, match.end - portSuffix.length);
  final afterPort = url.substring(match.end);
  return '$beforePort$afterPort';
}