clean method
Applies the full text-cleaning pipeline.
Implementation
/// Applies the full text-cleaning pipeline to [raw] and returns the result.
///
/// Returns an empty string when [raw] is empty. Otherwise the text is passed
/// through the numbered `_step…` helpers in order. Fenced code blocks are
/// swapped for `__CODE_BLOCK_n__` markers before any processing and put back
/// after step 5. Inline code spans are swapped for `__INLINE_CODE_n__`
/// markers around step 4 and, separately, around step 5.
///
/// NOTE(review): input that already contains marker-like text (for example a
/// literal `__CODE_BLOCK_0__`) is indistinguishable from a real marker and
/// will be replaced during restoration.
String clean(String raw) {
  if (raw.isEmpty) return '';

  // Replaces every match of [pattern] in [input] with `<prefix><index>__`
  // and appends the matched text to [store] at that index.
  String protect(
    String input,
    RegExp pattern,
    String prefix,
    List<String> store,
  ) {
    return input.replaceAllMapped(pattern, (match) {
      store.add(match.group(0)!);
      return '$prefix${store.length - 1}__';
    });
  }

  // Puts the saved snippets from [store] back in place of their markers.
  //
  // The whole index is read in a single pass, so marker 1 can never match
  // the beginning of marker 10, and text that has already been restored is
  // not scanned again. The trailing `__` is optional so that a marker which
  // lost it during processing is still restored. Markers whose index is not
  // present in [store] are left untouched.
  String restore(String input, String prefix, List<String> store) {
    if (store.isEmpty) return input;
    final marker = RegExp('${RegExp.escape(prefix)}(\\d+)(?:__)?');
    return input.replaceAllMapped(marker, (match) {
      final index = int.tryParse(match.group(1)!);
      if (index == null || index >= store.length) return match.group(0)!;
      return store[index];
    });
  }

  const codeBlockPrefix = '__CODE_BLOCK_';
  const inlineCodePrefix = '__INLINE_CODE_';
  final inlineCodePattern = RegExp(r'`[^`\n]+`');

  var text = raw;

  // Step 0.5: Protect fenced code blocks BEFORE all processing so their
  // content is not touched by any later step.
  final codeBlocks = <String>[];
  text = protect(text, RegExp(r'```[\s\S]*?```'), codeBlockPrefix, codeBlocks);

  // Step 1: Normalization and basic preparation.
  text = _step1Normalize(text);

  // Step 1.25: Replace URL/EMAIL/PATH/ID BEFORE removing HTML tags, so that
  // URLs and e-mails written as <https://...> and <email@...> are preserved.
  text = _step1_25ReplaceTechnicalTokens(text);

  // Step 1.5: Remove HTML tags and comments.
  text = _step1_5RemoveHtml(text);

  // Step 2: Remove garbage blocks.
  text = _step2RemoveGarbage(text);

  // Step 3: Remove emojis and decorative symbols.
  text = _step3RemoveEmojisAndDecorations(text);

  // Step 4: Normalize punctuation and tails with inline code protected.
  final inlineCode = <String>[];
  text = protect(text, inlineCodePattern, inlineCodePrefix, inlineCode);
  text = _step4NormalizePunctuation(text);

  // Step 4.5: Restore inline code before token processing. Restoring
  // tolerantly here keeps a stale first-round marker from surviving into
  // step 5.5, where its index would refer to a different list.
  text = restore(text, inlineCodePrefix, inlineCode);

  // Step 4.6: Protect inline code again for step 5. Code blocks are still
  // represented by the markers inserted in step 0.5.
  final inlineCode2 = <String>[];
  text = protect(text, inlineCodePattern, inlineCodePrefix, inlineCode2);

  // Step 5: Process links, e-mail and technical tokens (code protected).
  text = _step5ReplaceTechnicalTokens(text);

  // Step 5.5: Restore inline code.
  text = restore(text, inlineCodePrefix, inlineCode2);

  // Step 5.6: Restore code blocks.
  text = restore(text, codeBlockPrefix, codeBlocks);

  // Step 6: Text structure: headers, lists, code. In technical mode the
  // saved code blocks are passed along as well.
  text = _step6ProcessStructure(
    text,
    codeBlocks2: mode == TextProcessingMode.technical ? codeBlocks : null,
  );

  // Step 7: Whitespace and line breaks.
  text = _step7NormalizeWhitespace(text);

  // Step 8: Filter meaningless lines.
  text = _step8FilterMeaninglessLines(text);

  return text;
}