toReadableContent static method
Extracts the readable content of an HTML document (similar to Readability.js).
Implementation
/// Extracts the main readable text from [html], in the spirit of
/// Readability.js.
///
/// The extraction is regex-based and runs in four passes:
///
/// 1. Removes whole non-content elements (`script`, `style`, `nav`,
///    `header`, `footer`, `aside`, `sidebar`, `menu`, `form`).
/// 2. Removes elements whose `class` attribute suggests ads, navigation,
///    sharing widgets or comments.
/// 3. Among all `<article>`, `<main>` and content-like `<div>` matches, keeps
///    the one whose plain text (per [toPlainText]) is longest.
/// 4. If no container matched, or the best one has fewer than 200 characters
///    of plain text, falls back to every `<p>` body longer than 50 trimmed
///    characters, joined by blank lines.
///
/// Returns [toPlainText] of the selected fragment, or of the whole cleaned
/// document when nothing was selected.
///
/// This is a heuristic, not an HTML parser: every pattern is non-greedy, so
/// an element is cut at the first matching closing tag and nested elements
/// of the same kind are not balanced.
static String toReadableContent(String html) {
  // Pass 1: drop elements that never carry article text. The lookahead after
  // the tag name prevents `<menu` from also matching `<menuitem>`, etc.
  var content = html.replaceAll(
    RegExp(
      r'<(script|style|nav|header|footer|aside|sidebar|menu|form)(?=[\s/>])[^>]*>.*?</\1>',
      caseSensitive: false,
      dotAll: true,
    ),
    '',
  );

  // Pass 2: drop elements whose class hints at ads or navigation.
  //
  // Long keywords are matched as substrings of the class value. The short
  // ones (`ad`, `ads`, `nav`) must be a whole token delimited by whitespace,
  // `-` or `_`; matching them as bare substrings would also delete elements
  // with classes such as `lead`, `thread`, `shadow` or `canvas`.
  content = content.replaceAll(
    RegExp(
      r'<[^>]*class=[\"\x27]'
      r'(?:'
      r'[^\"\x27]*(?:advert|banner|social|share|comment|sidebar|navigation|navbar)[^\"\x27]*'
      r'|'
      r'(?:[^\"\x27]*[\s_-])?(?:ads?|nav)(?:[\s_-][^\"\x27]*)?'
      r')'
      r'[\"\x27][^>]*>.*?</[^>]*>',
      caseSensitive: false,
      dotAll: true,
    ),
    '',
  );

  // Pass 3: candidate containers; group 1 is the inner HTML.
  const contentSelectors = [
    r'<article(?=[\s/>])[^>]*>(.*?)</article>',
    r'<main(?=[\s/>])[^>]*>(.*?)</main>',
    r'<div[^>]*class=[\"\x27][^\"\x27]*(?:content|article|post|entry)[^\"\x27]*[\"\x27][^>]*>(.*?)</div>',
  ];

  String? bestContent;
  var maxLength = 0;
  for (final selector in contentSelectors) {
    final matches = RegExp(selector, caseSensitive: false, dotAll: true)
        .allMatches(content);
    for (final match in matches) {
      final extracted = match.group(1) ?? '';
      // Rank by visible text, not by markup size.
      final textLength = toPlainText(extracted).length;
      if (textLength > maxLength) {
        maxLength = textLength;
        bestContent = extracted;
      }
    }
  }

  // Pass 4: no container, or one too short to be the article body. Fall back
  // to substantial paragraphs. The lookahead keeps `<p` from matching `<pre>`,
  // `<picture>` and other tags that merely start with "p".
  if (bestContent == null || maxLength < 200) {
    final paragraphs = RegExp(
      r'<p(?=[\s/>])[^>]*>(.*?)</p>',
      caseSensitive: false,
      dotAll: true,
    )
        .allMatches(content)
        .map((match) => match.group(1) ?? '')
        .where((p) => p.trim().length > 50)
        .join('\n\n');
    if (paragraphs.isNotEmpty) {
      bestContent = paragraphs;
    }
  }

  return toPlainText(bestContent ?? content);
}