extractText method

TextExtractionResult extractText(
  String html, {
  TextExtractionOptions options = const TextExtractionOptions(),
})

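Extracts text and basic metadata (title, author, publish/modified dates, estimated reading time) from an HTML string, optionally restricted to the detected main content and formatted according to `options`.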

Implementation

/// Extracts text and basic metadata from the given [html] string.
///
/// When `options.extractMainContentOnly` is set and the content detector
/// reports non-empty main content, the result is built from that content:
/// formatted text if any of the `preserve*` options is enabled and a main
/// content element is available, the detector's plain text otherwise. The
/// metadata is taken from the detector's result.
///
/// Otherwise the whole `<body>` is used. Metadata is then read from the
/// document's `<meta>` and `<title>` elements, and the reading time is
/// estimated from the extracted text at 225 words per minute, with a minimum
/// of one minute.
///
/// Returns [TextExtractionResult.empty], after logging an error, if the
/// document has no body or if anything throws during extraction.
TextExtractionResult extractText(
  String html, {
  TextExtractionOptions options = const TextExtractionOptions(),
}) {
  try {
    final document = html_parser.parse(html);

    // Any single "preserve" flag switches extraction to the formatted path.
    final wantsFormatting = options.preserveFormatting ||
        options.preserveHeadings ||
        options.preserveLists ||
        options.preserveParagraphs ||
        options.preserveTables ||
        options.preserveLineBreaks;

    if (options.extractMainContentOnly) {
      final contentResult = _contentDetector.detectContent(html);
      if (contentResult.mainContentText.isNotEmpty) {
        // Only look at the element when formatted output was requested;
        // fall back to the detector's plain text if it is unavailable.
        final mainContentElement =
            wantsFormatting ? contentResult.mainContentElement : null;
        final mainText = mainContentElement != null
            ? _extractFormattedText(mainContentElement, options)
            : contentResult.mainContentText;

        return TextExtractionResult(
          text: mainText,
          title: contentResult.title,
          author: contentResult.author,
          publishDate: contentResult.publishDate,
          modifiedDate: contentResult.modifiedDate,
          readingTimeMinutes: contentResult.readingTimeMinutes,
        );
      }
    }

    // Main-content extraction was not requested or yielded nothing: fall back
    // to the entire document body.
    final body = document.body;
    if (body == null) {
      logger?.error('Could not extract text');
      return TextExtractionResult.empty();
    }

    final text = wantsFormatting
        ? _extractFormattedText(body, options)
        : _extractPlainText(body);

    // Returns the `content` attribute of the first element matching
    // [selector], or null if there is no such element or attribute.
    String? metaContent(String selector) =>
        document.querySelector(selector)?.attributes['content'];

    // Parses the `content` attribute of the element matching [selector] as a
    // date; unparseable or missing values yield null (best effort).
    DateTime? metaDate(String selector) {
      final raw = metaContent(selector);
      return raw == null ? null : DateTime.tryParse(raw);
    }

    // Open Graph values take precedence over the generic fallbacks.
    final title = metaContent('meta[property="og:title"]') ??
        document.querySelector('title')?.text.trim();
    final author = metaContent('meta[property="og:author"]') ??
        metaContent('meta[name="author"]');
    final publishDate = metaDate('meta[property="article:published_time"]');
    final modifiedDate = metaDate('meta[property="article:modified_time"]');

    // Count runs of non-whitespace so that leading/trailing whitespace and
    // empty text do not contribute phantom words.
    const wordsPerMinute = 225;
    final wordCount = RegExp(r'\S+').allMatches(text).length;
    final estimatedMinutes = (wordCount / wordsPerMinute).ceil();
    final readingTimeMinutes = estimatedMinutes < 1 ? 1 : estimatedMinutes;

    return TextExtractionResult(
      text: text,
      title: title,
      author: author,
      publishDate: publishDate,
      modifiedDate: modifiedDate,
      readingTimeMinutes: readingTimeMinutes,
    );
  } catch (e) {
    // Deliberately best-effort: any failure is logged and reported to the
    // caller as an empty result rather than propagated.
    logger?.error('Error extracting text: $e');
    return TextExtractionResult.empty();
  }
}