fetchHtml method

@override
Future<String> fetchHtml({
  required String url,
  required Map<String, String> headers,
  required int timeout,
  required ScrapingLogger logger,
})
override

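Fetches HTML content from the given URL, trying each user agent prepared for onlinekhabar.com with several request strategies until one of them succeeds.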

Implementation

/// Fetches the HTML body of [url] using the onlinekhabar.com-specific handler.
///
/// For every user agent returned by
/// `_userAgentManager.getUserAgentSequenceForProblematicSite`, these
/// approaches are attempted in order and the first successful body is
/// returned:
///
/// 1. `_fetchWithHttpClient`.
/// 2. A GET through the `http` package.
/// 3. A GET through the `http` package with an explicit `:443` port removed
///    from the URL authority (only when [url] carries one).
///
/// [headers] are merged over the built-in browser-like defaults. Any
/// `User-Agent` entry in [headers] (matched case-insensitively) is ignored so
/// that the rotated user agent is the one actually sent. [timeout] is applied
/// in milliseconds to each `http` package request and is also forwarded to
/// `_fetchWithHttpClient`. Progress and failures are reported via [logger].
///
/// Throws an [Exception] when every approach fails for every user agent.
@override
Future<String> fetchHtml({
  required String url,
  required Map<String, String> headers,
  required int timeout,
  required ScrapingLogger logger,
}) async {
  logger.info('Using specialized handler for onlinekhabar.com');

  // Get a sequence of user agents to try for this site.
  final userAgents = _userAgentManager.getUserAgentSequenceForProblematicSite(
    url,
  );
  logger.info('Prepared ${userAgents.length} user agents to try');

  // Caller headers without any User-Agent entry. Spreading the raw map would
  // let a caller-supplied User-Agent replace the rotated one, making every
  // iteration of the loop below send the same request.
  final callerHeaders = {
    for (final entry in headers.entries)
      if (entry.key.toLowerCase() != 'user-agent') entry.key: entry.value,
  };

  // Only an explicit ":443" port at the end of the authority is stripped.
  // A plain string replace would also rewrite ":443" inside the path or
  // query, and would turn ":4430" into "0".
  final explicitPort = RegExp(
    r'^[a-zA-Z][a-zA-Z0-9+.-]*://[^/?#]*:443(?=[/?#]|$)',
  );
  final portMatch = explicitPort.firstMatch(url);
  final alternativeUrl = portMatch == null
      ? null
      : url.replaceRange(portMatch.end - ':443'.length, portMatch.end, '');

  // Issues one GET through the http package. Returns the body on a 2xx
  // status, or null after logging why the attempt failed.
  Future<String?> tryGet(
    String target,
    Map<String, String> requestHeaders, {
    required String successMessage,
    required String statusErrorLabel,
    required String failureLabel,
  }) async {
    try {
      final response = await http
          .get(Uri.parse(target), headers: requestHeaders)
          .timeout(Duration(milliseconds: timeout));

      if (response.statusCode >= 200 && response.statusCode < 300) {
        logger.success(successMessage);
        return response.body;
      }
      logger.error('$statusErrorLabel: ${response.statusCode}');
    } catch (e) {
      // Best effort: a failed attempt is logged and the next one is tried.
      logger.error('$failureLabel: $e');
    }
    return null;
  }

  // Try with each user agent.
  for (final userAgent in userAgents) {
    logger.info('Trying with user agent: ${_truncateUserAgent(userAgent)}');

    // Enhanced headers specifically for onlinekhabar.com.
    final enhancedHeaders = {
      'User-Agent': userAgent,
      'Accept':
          'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
      'Accept-Language': 'en-US,en;q=0.5',
      // NOTE(review): dart:io appears to transparently decompress gzip only;
      // advertising deflate/br may yield a body that is not decoded. Confirm
      // against the server's behaviour before relying on `response.body`.
      'Accept-Encoding': 'gzip, deflate, br',
      'Connection': 'keep-alive',
      'Upgrade-Insecure-Requests': '1',
      'Cache-Control': 'max-age=0',
      'Sec-Fetch-Dest': 'document',
      'Sec-Fetch-Mode': 'navigate',
      'Sec-Fetch-Site': 'none',
      'Sec-Fetch-User': '?1',
      'Pragma': 'no-cache',
      ...callerHeaders,
    };

    try {
      // First try: HttpClient approach.
      logger.info('Attempting direct HttpClient approach');
      try {
        final result = await _fetchWithHttpClient(
          url,
          enhancedHeaders,
          timeout,
          logger,
        );
        logger.success('Successfully fetched with HttpClient');
        return result;
      } catch (e) {
        logger.error('HttpClient approach failed: $e');
      }

      // Second try: http package.
      logger.info('Attempting with http package');
      final body = await tryGet(
        url,
        enhancedHeaders,
        successMessage: 'Successfully fetched with http package',
        statusErrorLabel: 'HTTP error',
        failureLabel: 'http package approach failed',
      );
      if (body != null) return body;

      // Third try: alternative URL format (without the explicit port).
      if (alternativeUrl != null) {
        logger.info('Attempting with alternative URL format (without port)');
        final alternativeBody = await tryGet(
          alternativeUrl,
          enhancedHeaders,
          successMessage: 'Successfully fetched with alternative URL',
          statusErrorLabel: 'HTTP error with alternative URL',
          failureLabel: 'Alternative URL approach failed',
        );
        if (alternativeBody != null) return alternativeBody;
      }
    } catch (e) {
      // Safety net: the individual attempts handle their own failures, so
      // this is only reached if something outside them (e.g. logging) throws.
      logger.error(
        'Unexpected error with user agent ${_truncateUserAgent(userAgent)}: $e',
      );
    }

    // All approaches with this user agent failed; move on to the next one.
  }

  // Every user agent and approach failed.
  throw Exception(
    'All approaches failed for onlinekhabar.com after trying ${userAgents.length} user agents',
  );
}