extractText method
TextExtractionResult
extractText(
- String html, {
- TextExtractionOptions options = const TextExtractionOptions(),
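Extracts text from an HTML string, together with the title, author, publish/modified dates and estimated reading time when they can be determined.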
Implementation
/// Extracts text and basic metadata from the given [html] string.
///
/// When `options.extractMainContentOnly` is set and the content detector
/// reports non-empty main-content text, the result is built from the
/// detector's output (text, title, author, dates and reading time).
/// Otherwise the whole `<body>` is used and the metadata is read from the
/// document's `<meta>` and `<title>` elements.
///
/// If any of the `preserve*` flags in [options] is enabled, the text is
/// produced by `_extractFormattedText`; otherwise plain text is used.
///
/// Exceptions raised during extraction are caught and logged through
/// `logger`, and `TextExtractionResult.empty()` is returned instead. The same
/// empty result is returned when the parsed document has no body.
TextExtractionResult extractText(
  String html, {
  TextExtractionOptions options = const TextExtractionOptions(),
}) {
  try {
    // Any single formatting-related flag switches to the formatted extractor.
    final wantsFormatting = options.preserveFormatting ||
        options.preserveHeadings ||
        options.preserveLists ||
        options.preserveParagraphs ||
        options.preserveTables ||
        options.preserveLineBreaks;

    // If extracting only the main content, try the content detector first.
    if (options.extractMainContentOnly) {
      final contentResult = _contentDetector.detectContent(html);
      if (contentResult.mainContentText.isNotEmpty) {
        // The element is only consulted when formatted output is requested;
        // without it we fall back to the detector's plain text.
        final mainContentElement =
            wantsFormatting ? contentResult.mainContentElement : null;
        final mainText = mainContentElement != null
            ? _extractFormattedText(mainContentElement, options)
            : contentResult.mainContentText;
        return TextExtractionResult(
          text: mainText,
          title: contentResult.title,
          author: contentResult.author,
          publishDate: contentResult.publishDate,
          modifiedDate: contentResult.modifiedDate,
          readingTimeMinutes: contentResult.readingTimeMinutes,
        );
      }
    }

    // Not restricted to main content, or main content detection came back
    // empty: work on the entire document. Parsing is deferred to this point
    // so the early-return path above does not pay for an unused parse.
    final document = html_parser.parse(html);
    final body = document.body;
    if (body == null) {
      // If all else fails, return an empty result.
      logger?.error('Could not extract text');
      return TextExtractionResult.empty();
    }

    final text = wantsFormatting
        ? _extractFormattedText(body, options)
        : _extractPlainText(body);

    // Returns the `content` attribute of the first element matching
    // [selector], or null when the element or the attribute is missing.
    String? metaContent(String selector) =>
        document.querySelector(selector)?.attributes['content'];

    // Parses the `content` attribute of the matching element as a date;
    // a missing or unparsable value yields null rather than an exception.
    DateTime? metaDate(String selector) {
      final raw = metaContent(selector);
      return raw == null ? null : DateTime.tryParse(raw);
    }

    // Open Graph title wins; otherwise use the trimmed <title> text.
    final title = metaContent('meta[property="og:title"]') ??
        document.querySelector('title')?.text.trim();

    // Open Graph author wins; otherwise use the standard author meta tag.
    final author = metaContent('meta[property="og:author"]') ??
        metaContent('meta[name="author"]');

    final publishDate = metaDate('meta[property="article:published_time"]');
    final modifiedDate = metaDate('meta[property="article:modified_time"]');

    // Estimate reading time. Empty tokens (produced by leading/trailing
    // whitespace or an empty string) are not words and must not be counted.
    const wordsPerMinute = 225;
    final wordCount =
        text.split(RegExp(r'\s+')).where((word) => word.isNotEmpty).length;
    final estimatedMinutes = (wordCount / wordsPerMinute).ceil();
    // Always report at least one minute.
    final readingTimeMinutes = estimatedMinutes < 1 ? 1 : estimatedMinutes;

    return TextExtractionResult(
      text: text,
      title: title,
      author: author,
      publishDate: publishDate,
      modifiedDate: modifiedDate,
      readingTimeMinutes: readingTimeMinutes,
    );
  } catch (e) {
    // Deliberately best-effort: callers get an empty result, not an exception.
    logger?.error('Error extracting text: $e');
    return TextExtractionResult.empty();
  }
}