toCleanHtml static method
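Cleans HTML while preserving its basic structure: removes script, style, nav, header, footer, aside, and menu elements and HTML comments, and strips all attributes except href on links and src/alt on images.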
Implementation
/// Cleans [html] while preserving its basic structure.
///
/// Three regex passes are applied in order:
///  1. `script`, `style`, `nav`, `header`, `footer`, `aside` and `menu`
///     elements are removed together with their content.
///  2. HTML comments are removed.
///  3. Every opening tag is rewritten without its attributes, except that
///     `<a>` keeps `href` and `<img>` keeps `src` and/or `alt` (whichever are
///     present, in that output order).
///
/// Closing tags and text are left untouched. Returns the cleaned markup.
///
/// This is a regex-based cleaner, not an HTML parser: nested elements of the
/// same removed type and attribute values containing a literal `>` are not
/// handled.
static String toCleanHtml(String html) {
  // `\b` keeps e.g. `<navigation>` or `<menuitem>` from being treated as
  // `<nav>` / `<menu>` and paired with an unrelated closing tag.
  final unwantedElement = RegExp(
      r'<(script|style|nav|header|footer|aside|menu)\b[^>]*>.*?</\1\s*>',
      caseSensitive: false,
      dotAll: true);
  final comment = RegExp(r'<!--.*?-->', dotAll: true);

  // Tag names may contain digits (h1..h6) and hyphens (custom elements).
  // Group 1 is the tag name, group 2 the raw attribute text.
  final openingTag =
      RegExp(r'<([a-z][a-z0-9-]*)([^>]*)>', caseSensitive: false);

  // One attribute: a name, optionally followed by a double-quoted (group 2),
  // single-quoted (group 3) or unquoted (group 4) value. `\x27` is the
  // single quote, which cannot appear literally inside this raw string.
  final attribute = RegExp(
      r'([^\s"\x27<>/=]+)(?:\s*=\s*(?:"([^"]*)"|\x27([^\x27]*)\x27|([^\s"\x27=<>`]+)))?');

  // Tokenizes the attributes sequentially so that text inside a quoted value
  // (or a name such as `data-href`) is never mistaken for another attribute.
  // Names are lower-cased; the first occurrence of a duplicated name wins.
  Map<String, String> parseAttributes(String text) {
    final attributes = <String, String>{};
    for (final m in attribute.allMatches(text)) {
      final name = m.group(1)!.toLowerCase();
      final value = m.group(2) ?? m.group(3) ?? m.group(4) ?? '';
      attributes.putIfAbsent(name, () => value);
    }
    return attributes;
  }

  // Values are always re-emitted inside double quotes, so a double quote
  // taken from a single-quoted source value must be escaped.
  String escapeQuotes(String value) => value.replaceAll('"', '&quot;');

  // Remove unwanted elements
  String cleaned = html.replaceAll(unwantedElement, '');

  // Remove comments
  cleaned = cleaned.replaceAll(comment, '');

  // Clean up attributes but keep essential ones
  cleaned = cleaned.replaceAllMapped(openingTag, (match) {
    final tag = match.group(1)!.toLowerCase();
    switch (tag) {
      case 'a':
        // Keep href attribute for links
        final href = parseAttributes(match.group(2) ?? '')['href'];
        return href != null ? '<a href="${escapeQuotes(href)}">' : '<a>';
      case 'img':
        // Keep src and alt for images, independently of order and of
        // whether the other one is present.
        final attributes = parseAttributes(match.group(2) ?? '');
        final src = attributes['src'];
        final alt = attributes['alt'];
        final rebuilt = StringBuffer('<img');
        if (src != null) {
          rebuilt.write(' src="${escapeQuotes(src)}"');
        }
        if (alt != null) {
          rebuilt.write(' alt="${escapeQuotes(alt)}"');
        }
        rebuilt.write('>');
        return rebuilt.toString();
      default:
        return '<$tag>';
    }
  });

  return cleaned;
}