toReadableContent static method

String toReadableContent(
  1. String html
)

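Extracts the main readable content from an HTML string and returns it as plain text (a lightweight, regex-based approach similar in spirit to Readability.js).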

Implementation

/// Extracts the main readable content of [html] and returns it after passing
/// it through `toPlainText`.
///
/// The extraction is a regex-based heuristic, not a real HTML parse:
///
/// 1. Non-content elements (`script`, `style`, `nav`, `header`, `footer`,
///    `aside`, `sidebar`, `menu`, `form`) are removed.
/// 2. Elements whose `class` attribute looks like an ad, banner, social/share
///    widget, comment area, sidebar or navigation are removed.
/// 3. `<article>`, `<main>` and content-like `<div>` elements are scanned and
///    the candidate with the longest plain text wins.
/// 4. If no candidate was found, or the best one has fewer than 200
///    characters of plain text, paragraphs whose raw inner HTML is longer
///    than 50 characters are joined and used instead.
/// 5. If nothing above matched, the whole cleaned document is used.
///
/// Because matching is non-greedy and regex-based, nested elements of the
/// same kind (for example a `<div>` inside a content `<div>`) cut a match
/// short at the first closing tag.
static String toReadableContent(String html) {
  // Remove script, style, nav, header, footer, sidebar elements.
  // `\b` after the tag name keeps `<nav` from matching unrelated tags that
  // merely start with the same letters (e.g. `<navigation-x>`, `<menuitem>`).
  String content = html.replaceAll(
      RegExp(
          r'<(script|style|nav|header|footer|aside|sidebar|menu|form)\b[^>]*>.*?</\1>',
          caseSensitive: false,
          dotAll: true),
      '');

  // Remove elements commonly used for ads and navigation.
  // The short keyword `ad`/`ads` must stand alone as a class-name token part
  // (no letter or digit directly before or after it); otherwise ordinary
  // content classes such as `lead`, `heading`, `thread` or `loading` would be
  // stripped. The longer keywords keep matching as substrings.
  content = content.replaceAll(
      RegExp(
          r'<[^>]*class=[\"\x27]'
          r'(?:'
          r'(?:[^\"\x27]*[^\"\x27a-z0-9])?ads?(?![a-z0-9])'
          r'|'
          r'[^\"\x27]*(?:advertisement|banner|social|share|comment|sidebar|navigation|nav)'
          r')'
          r'[^\"\x27]*[\"\x27][^>]*>.*?</[^>]*>',
          caseSensitive: false,
          dotAll: true),
      '');

  // Focus on content-rich elements
  final contentSelectors = [
    r'<article[^>]*>(.*?)</article>',
    r'<main[^>]*>(.*?)</main>',
    r'<div[^>]*class=[\"\x27][^\"\x27]*(?:content|article|post|entry)[^\"\x27]*[\"\x27][^>]*>(.*?)</div>',
  ];

  String? bestContent;
  int maxLength = 0;

  // Keep the candidate whose plain-text form is longest.
  for (final selector in contentSelectors) {
    final matches = RegExp(selector, caseSensitive: false, dotAll: true)
        .allMatches(content);
    for (final match in matches) {
      final extracted = match.group(1) ?? '';
      final textLength = toPlainText(extracted).length;
      if (textLength > maxLength) {
        maxLength = textLength;
        bestContent = extracted;
      }
    }
  }

  // If no specific content area found (or it is very short), try to extract
  // paragraphs instead. The 50-character filter is applied to the raw inner
  // HTML of each paragraph, not to its plain text.
  if (bestContent == null || maxLength < 200) {
    final paragraphs =
        RegExp(r'<p[^>]*>(.*?)</p>', caseSensitive: false, dotAll: true)
            .allMatches(content)
            .map((match) => match.group(1) ?? '')
            .where((p) => p.trim().length > 50)
            .join('\n\n');

    if (paragraphs.isNotEmpty) {
      bestContent = paragraphs;
    }
  }

  // Fall back to the whole cleaned document when nothing was selected.
  return bestContent != null
      ? toPlainText(bestContent)
      : toPlainText(content);
}