extractReadableText static method
Extracts readable text with smart paragraph detection.
Implementation
/// Extracts the most content-rich readable text from [html].
///
/// The markup is first passed through [removeClutter]. Every `<article>`,
/// `<main>` and `<section>` element, and every element whose `class`
/// attribute contains `content`, `article` or `post`, is then converted with
/// [toPlainText] and rated with [_calculateContentScore]. The candidate with
/// the highest score above zero is returned; on a tie the earliest candidate
/// (in the priority order listed above) is kept.
///
/// Returns the plain text of the whole decluttered document when no candidate
/// scores above zero.
///
/// Matching is regex-based rather than a real HTML parse: a candidate ends at
/// the first closing tag with the same name as its opening tag, so nested
/// elements of the same name truncate the capture.
static String extractReadableText(String html) {
  // First remove clutter.
  final content = removeClutter(html);

  // Semantic containers, highest priority first.
  const contentTags = ['article', 'main', 'section'];

  // Substrings of a `class` attribute that hint at main content. These are
  // the regex equivalent of the CSS selectors `[class*="content"]`,
  // `[class*="article"]` and `[class*="post"]`.
  const classHints = ['content', 'article', 'post'];

  // Every pattern captures the tag name in group 1 (back-referenced by the
  // closing tag) and the element's inner markup in group 2.
  final patterns = [
    for (final tag in contentTags)
      // The lookahead stops `<main` from matching e.g. `<mainframe>`.
      '<($tag)' r'(?=[\s/>])[^>]*>(.*?)</\1\s*>',
    for (final hint in classHints.map(RegExp.escape))
      // Any tag that has a class attribute (double-quoted, single-quoted or
      // unquoted) whose value contains the hint.
      r'<([a-z][a-z0-9-]*)\s(?:[^>]*?\s)?class\s*=\s*'
          '(?:"[^"]*$hint[^"]*"'
          "|'[^']*$hint[^']*'"
          '|[^\\s"\'>]*$hint[^\\s>]*)'
          r'[^>]*>(.*?)</\1\s*>',
  ];

  String bestContent = '';
  int maxScore = 0;
  for (final pattern in patterns) {
    final matches = RegExp(pattern, caseSensitive: false, dotAll: true)
        .allMatches(content);
    for (final match in matches) {
      final text = toPlainText(match.group(2) ?? '');
      final score = _calculateContentScore(text);
      // Strict comparison: earlier (higher-priority) candidates win ties.
      if (score > maxScore) {
        maxScore = score;
        bestContent = text;
      }
    }
  }

  // Fall back to the whole document when nothing scored above zero.
  return bestContent.isNotEmpty ? bestContent : toPlainText(content);
}