scrapeWithPagination<T> method
Scrapes multiple pages with pagination, following next-page links and returning the aggregated results.
url is the starting URL.
config is the pagination configuration.
extractor is a function that extracts data from each page.
headers are additional headers to send with the request.
timeout is the timeout for the request in milliseconds.
retries is the number of retry attempts.
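Example
A minimal usage sketch, not taken from the package itself: it assumes the method is called on a scraper instance that exposes scrapeWithPagination as documented here, that PaginationConfig accepts the fields read in the implementation below (maxPages, maxDepth, followPagination, validateLinks) as named constructor parameters, and that package:html is used for the extraction step. The call belongs inside an async function.

import 'package:html/parser.dart' as html_parser;

// Collect article titles from up to five listing pages.
final result = await scraper.scrapeWithPagination<List<String>>(
  url: 'https://example.com/articles?page=1',
  config: PaginationConfig(
    maxPages: 5,            // assumed named parameter, mirrors config.maxPages
    maxDepth: 10,           // assumed named parameter, mirrors config.maxDepth
    followPagination: true,
    validateLinks: true,
  ),
  extractor: (html, pageUrl) async {
    // Hypothetical extraction: parse the raw HTML and pull out headings.
    final document = html_parser.parse(html);
    return document
        .querySelectorAll('h2.article-title')
        .map((element) => element.text.trim())
        .toList();
  },
  timeout: 10000, // milliseconds, per the parameter description above
  retries: 3,
);

print('Scraped ${result.pageCount} pages, '
    '${result.results.expand((page) => page).length} titles in total');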
Implementation
Future<PaginationResult<T>> scrapeWithPagination<T>({
  required String url,
  required PaginationConfig config,
  required Future<T> Function(String html, String pageUrl) extractor,
  Map<String, String>? headers,
  int? timeout,
  int? retries,
}) async {
  final results = <T>[];
  final pageUrls = <String>[];
  String? nextPageUrl = url;
  bool hasMorePages = false;
  int pageCount = 0;

  try {
    // Scrape pages until we reach the maximum or there are no more pages
    while (nextPageUrl != null &&
        pageCount < config.maxPages &&
        pageUrls.length <= config.maxDepth) {
      // Avoid duplicate pages
      if (pageUrls.contains(nextPageUrl)) {
        logger?.warning('Duplicate page URL detected: $nextPageUrl');
        break;
      }

      // Fetch the page
      logger?.info('Fetching page ${pageCount + 1}: $nextPageUrl');
      final html = await _webScraper.fetchHtml(
        url: nextPageUrl,
        headers: headers,
        timeout: timeout,
        retries: retries,
      );

      // Add the page URL to the list
      pageUrls.add(nextPageUrl);
      pageCount++;

      // Extract data from the page
      final result = await extractor(html, nextPageUrl);
      results.add(result);

      // If we're not following pagination, stop here
      if (!config.followPagination) {
        break;
      }

      // Detect pagination
      final paginationDetector = PaginationDetector(
        baseUrl: nextPageUrl,
        logger: logger,
      );
      final paginationResult = paginationDetector.detectPagination(html);

      // Update the next page URL
      nextPageUrl = paginationResult.nextPageUrl;
      hasMorePages = !paginationResult.isLastPage;

      // If there's no next page URL, we're done
      if (nextPageUrl == null) {
        logger?.info('No more pages to scrape');
        break;
      }

      // Validate the next page URL if needed
      if (config.validateLinks) {
        if (!_isValidUrl(nextPageUrl)) {
          logger?.warning('Invalid next page URL: $nextPageUrl');
          nextPageUrl = null;
          break;
        }
      }

      // Log the next page URL
      logger?.info('Next page URL: $nextPageUrl');
    }

    // Return the results
    return PaginationResult(
      results: results,
      pageUrls: pageUrls,
      pageCount: pageCount,
      hasMorePages: hasMorePages,
      nextPageUrl: nextPageUrl,
    );
  } catch (e) {
    logger?.error('Error scraping with pagination: $e');
    throw ScrapingException.pagination(
      'Error scraping with pagination',
      originalException: e,
      isRetryable: true,
    );
  }
}
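Because the returned PaginationResult carries hasMorePages and nextPageUrl, a caller can resume a bounded run from where it stopped. A sketch under the same assumptions as the example above (the scraper instance and the PaginationConfig constructor parameters are illustrative, not confirmed by the package):

// First pass: stop after three pages even if more are available.
final firstRun = await scraper.scrapeWithPagination<int>(
  url: 'https://example.com/list',
  config: PaginationConfig(
    maxPages: 3,
    maxDepth: 10,
    followPagination: true,
    validateLinks: true,
  ),
  // Trivial extractor for the sketch: record the size of each page.
  extractor: (html, pageUrl) async => html.length,
);

print('Scraped ${firstRun.pageCount} pages: ${firstRun.pageUrls}');

// Resume from the next page reported by the first run, if any.
if (firstRun.hasMorePages && firstRun.nextPageUrl != null) {
  final secondRun = await scraper.scrapeWithPagination<int>(
    url: firstRun.nextPageUrl!,
    config: PaginationConfig(
      maxPages: 3,
      maxDepth: 10,
      followPagination: true,
      validateLinks: true,
    ),
    extractor: (html, pageUrl) async => html.length,
  );
  print('Scraped ${secondRun.pageCount} more pages');
}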