clean method

String clean(
String raw
)
Applies the full text-cleaning pipeline to the raw input and returns the cleaned text.
Implementation

/// Applies the full text cleaning pipeline to [raw] and returns the result.
///
/// Returns an empty string when [raw] is empty. Otherwise the text is passed
/// through the numbered steps below in order. Fenced code blocks (```` ``` ````)
/// are swapped for `__CODE_BLOCK_<n>__` placeholders before any step runs and
/// restored before step 6. Inline code (`` `...` ``) is swapped for
/// `__INLINE_CODE_<n>__` placeholders only around steps 4 and 5.
String clean(String raw) {
  if (raw.isEmpty) return '';

  // Restores `__<prefix>_<n>__` placeholders in [input] from [originals].
  //
  // The whole index is captured with `\d+` and the text is rewritten in a
  // single pass, so marker 1 can never be mistaken for a prefix of marker 10
  // and restored code is never re-scanned for marker-like text. With
  // [tolerant] set, a marker that lost its trailing `__` is still recognised.
  // Markers whose index is unknown are left untouched.
  String restoreMarkers(
    String input,
    String prefix,
    List<String> originals, {
    bool tolerant = false,
  }) {
    final pattern = RegExp(
      tolerant ? '__${prefix}_(\\d+)(?:__)?' : '__${prefix}_(\\d+)__',
    );
    return input.replaceAllMapped(pattern, (match) {
      final index = int.tryParse(match.group(1)!);
      if (index == null || index >= originals.length) return match.group(0)!;
      return originals[index];
    });
  }

  final inlineCodePattern = RegExp(r'`[^`\n]+`');

  var text = raw;

  // Step 0.5: Protect fenced code blocks BEFORE all processing.
  // Each block is replaced with a marker so later steps do not process it.
  // NOTE(review): inline code is NOT protected here; it is only protected
  // after step 3, so steps 1-3 still see inline code. Confirm this is
  // intended.
  final codeBlocks = <String>[];
  text = text.replaceAllMapped(RegExp(r'```[\s\S]*?```', multiLine: true), (
    match,
  ) {
    codeBlocks.add(match.group(0)!);
    return '__CODE_BLOCK_${codeBlocks.length - 1}__';
  });

  // Step 1: Normalization and basic preparation
  text = _step1Normalize(text);

  // Step 1.25: Replace URL/EMAIL/PATH/ID BEFORE removing HTML tags
  // This ensures that URLs and emails in format <https://...> and <email@...>
  // are preserved
  text = _step1_25ReplaceTechnicalTokens(text);

  // Step 1.5: Remove HTML tags and comments
  text = _step1_5RemoveHtml(text);

  // Step 2: Remove garbage blocks
  text = _step2RemoveGarbage(text);

  // Step 3: Remove emojis and decorative symbols
  text = _step3RemoveEmojisAndDecorations(text);

  // Step 3.5: Protect inline code for step 4
  final inlineCode = <String>[];
  text = text.replaceAllMapped(inlineCodePattern, (match) {
    inlineCode.add(match.group(0)!);
    return '__INLINE_CODE_${inlineCode.length - 1}__';
  });

  // Step 4: Normalize punctuation and tails (code protected)
  text = _step4NormalizePunctuation(text);

  // Step 4.5: Restore inline code before token processing (exact markers only)
  text = restoreMarkers(text, 'INLINE_CODE', inlineCode);

  // Step 4.6: Re-protect inline code for step 5 (token processing).
  // Code blocks are still behind their step 0.5 markers, so only inline code
  // needs to be collected again.
  final inlineCode2 = <String>[];
  text = text.replaceAllMapped(inlineCodePattern, (match) {
    inlineCode2.add(match.group(0)!);
    return '__INLINE_CODE_${inlineCode2.length - 1}__';
  });

  // Step 5: Process links, e-mail and technical tokens (code protected)
  text = _step5ReplaceTechnicalTokens(text);

  // Step 5.5: Restore inline code.
  // Markers are accepted with or without the trailing double underscore.
  text = restoreMarkers(text, 'INLINE_CODE', inlineCode2, tolerant: true);

  // Step 5.6: Restore code blocks.
  // Markers are accepted with or without the trailing double underscore.
  text = restoreMarkers(text, 'CODE_BLOCK', codeBlocks, tolerant: true);

  // Step 6: Text structure: headers, lists, code
  // In technical mode, pass saved code blocks for restoration
  text = _step6ProcessStructure(
    text,
    codeBlocks2: mode == TextProcessingMode.technical ? codeBlocks : null,
  );

  // Step 7: Whitespace and line breaks
  text = _step7NormalizeWhitespace(text);

  // Step 8: Filter meaningless lines
  text = _step8FilterMeaninglessLines(text);

  return text;
}
clean method

Implementation

TextPreprocessor class