toCleanHtml static method

String toCleanHtml(
  1. String html
)

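Cleans an HTML string while preserving its basic structure: removes script, style, nav, header, footer, aside and menu elements (including their content) and HTML comments, then strips attributes from the remaining opening tags, keeping only href on links and src/alt on images.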

Implementation

/// Cleans [html] while preserving its basic structure.
///
/// The cleanup runs in three passes:
///  1. `script`, `style`, `nav`, `header`, `footer`, `aside` and `menu`
///     elements are removed together with their content.
///  2. HTML comments are removed.
///  3. Every remaining opening tag is rewritten without attributes, except
///     that `<a>` keeps `href` and `<img>` keeps `src` and `alt` (each one
///     independently, in any order).
///
/// Closing tags and text content are left untouched. Only quoted attribute
/// values are recognised; unquoted values are dropped.
///
/// Returns the cleaned HTML string.
static String toCleanHtml(String html) {
  // Looks up the quoted value of attribute [name] inside the raw text of an
  // opening tag. Returns null when the attribute is absent. The attribute
  // name must follow whitespace or a closing quote so that e.g. `data-href`
  // is not mistaken for `href`. Double- and single-quoted values are matched
  // separately so a value may contain the other quote character.
  String? quotedAttribute(String tagText, String name) {
    final found = RegExp(
            r'[\s"\x27]' + name + r'\s*=\s*(?:"([^"]*)"|\x27([^\x27]*)\x27)',
            caseSensitive: false)
        .firstMatch(tagText);
    if (found == null) return null;
    final value = found.group(1) ?? found.group(2) ?? '';
    // The value is always re-emitted inside double quotes, so any literal
    // double quote (possible in a single-quoted source value) is escaped.
    return value.replaceAll('"', '&quot;');
  }

  // Remove unwanted elements. `\b` keeps the alternation from matching a
  // longer tag name that merely starts with one of the listed names.
  String cleaned = html.replaceAll(
      RegExp(r'<(script|style|nav|header|footer|aside|menu)\b[^>]*>.*?</\1>',
          caseSensitive: false, dotAll: true),
      '');

  // Remove comments
  cleaned = cleaned.replaceAll(RegExp(r'<!--.*?-->', dotAll: true), '');

  // Clean up attributes but keep essential ones. The tag name may contain
  // digits and hyphens (h1..h6, custom elements); capturing letters only
  // would rewrite `<h1>` to `<h>` and leave `</h1>` unmatched.
  cleaned = cleaned.replaceAllMapped(
      RegExp(r'<([a-z][a-z0-9-]*)[^>]*>', caseSensitive: false), (match) {
    final tag = match.group(1)!.toLowerCase();
    final tagText = match.group(0)!;
    switch (tag) {
      case 'a':
        // Keep href attribute for links
        final href = quotedAttribute(tagText, 'href');
        return href != null ? '<a href="$href">' : '<a>';
      case 'img':
        // Keep src and alt for images; each is optional and order-independent.
        final src = quotedAttribute(tagText, 'src');
        final alt = quotedAttribute(tagText, 'alt');
        final img = StringBuffer('<img');
        if (src != null) img.write(' src="$src"');
        if (alt != null) img.write(' alt="$alt"');
        img.write('>');
        return img.toString();
      default:
        return '<$tag>';
    }
  });

  return cleaned;
}