WebScraper constructor
WebScraper({
  required ProxyManager proxyManager,
  ProxyHttpClient? httpClient,
  String? defaultUserAgent,
  Map<String, String>? defaultHeaders,
  int defaultTimeout = 30000,
  int maxRetries = 3,
  AdaptiveScrapingStrategy? adaptiveStrategy,
  SiteReputationTracker? reputationTracker,
  ScrapingLogger? logger,
  RobotsTxtHandler? robotsTxtHandler,
  StreamingHtmlParser? streamingParser,
  ContentValidator? contentValidator,
  StructuredDataValidator? structuredDataValidator,
  SelectorValidator? selectorValidator,
  RateLimiter? rateLimiter,
  RequestQueue? requestQueue,
  StructuredDataExtractor? structuredDataExtractor,
  ContentDetector? contentDetector,
  TextExtractor? textExtractor,
  HeadlessBrowser? headlessBrowser,
  LazyLoadDetector? lazyLoadDetector,
  LazyLoadHandler? lazyLoadHandler,
  PaginationHandler? paginationHandler,
  bool respectRobotsTxt = true,
})
Creates a new WebScraper with the given parameters.
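A minimal sketch of constructing a WebScraper, assuming a ProxyManager instance is already available in the surrounding code; every optional dependency falls back to the defaults shown in the implementation below.

// Hypothetical setup: `proxyManager` is an existing, configured ProxyManager.
final scraper = WebScraper(
  proxyManager: proxyManager,
  defaultTimeout: 60000, // override the 30000 ms default timeout
  maxRetries: 5,         // retry failed requests up to five times
);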
Implementation
WebScraper({
required this.proxyManager,
ProxyHttpClient? httpClient,
String? defaultUserAgent,
Map<String, String>? defaultHeaders,
int defaultTimeout = 30000,
int maxRetries = 3,
AdaptiveScrapingStrategy? adaptiveStrategy,
SiteReputationTracker? reputationTracker,
ScrapingLogger? logger,
RobotsTxtHandler? robotsTxtHandler,
StreamingHtmlParser? streamingParser,
ContentValidator? contentValidator,
StructuredDataValidator? structuredDataValidator,
SelectorValidator? selectorValidator,
RateLimiter? rateLimiter,
RequestQueue? requestQueue,
StructuredDataExtractor? structuredDataExtractor,
ContentDetector? contentDetector,
TextExtractor? textExtractor,
HeadlessBrowser? headlessBrowser,
LazyLoadDetector? lazyLoadDetector,
LazyLoadHandler? lazyLoadHandler,
PaginationHandler? paginationHandler,
bool respectRobotsTxt = true,
}) : _httpClient =
httpClient ??
ProxyHttpClient(
proxyManager: proxyManager,
useValidatedProxies: true,
rotateProxies: true,
),
_defaultUserAgent =
defaultUserAgent ??
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
_defaultHeaders = defaultHeaders ?? {},
_defaultTimeout = defaultTimeout,
_maxRetries = maxRetries,
_reputationTracker = reputationTracker ?? SiteReputationTracker(),
_logger = logger ?? ScrapingLogger(),
_adaptiveStrategy =
adaptiveStrategy ??
AdaptiveScrapingStrategy(reputationTracker: reputationTracker),
_robotsTxtHandler =
robotsTxtHandler ??
RobotsTxtHandler(
proxyManager: proxyManager,
logger: logger,
defaultUserAgent: defaultUserAgent,
respectRobotsTxt: respectRobotsTxt,
),
_respectRobotsTxt = respectRobotsTxt,
_streamingParser =
streamingParser ?? StreamingHtmlParser(logger: logger),
_contentValidator =
contentValidator ?? ContentValidator(logger: Logger('WebScraper')),
_structuredDataValidator =
structuredDataValidator ??
StructuredDataValidator(logger: Logger('WebScraper')),
_selectorValidator =
selectorValidator ?? SelectorValidator(logger: Logger('WebScraper')),
_rateLimiter = rateLimiter ?? RateLimiter(logger: Logger('WebScraper')),
_requestQueue =
requestQueue ??
RequestQueue(
rateLimiter:
rateLimiter ?? RateLimiter(logger: Logger('WebScraper')),
logger: Logger('WebScraper'),
),
_structuredDataExtractor =
structuredDataExtractor ??
StructuredDataExtractor(logger: Logger('WebScraper')),
_contentDetector =
contentDetector ?? ContentDetector(logger: Logger('WebScraper')),
_textExtractor =
textExtractor ?? TextExtractor(logger: Logger('WebScraper')),
_headlessBrowser = headlessBrowser ?? HeadlessBrowser(),
_lazyLoadHandler =
lazyLoadHandler ??
LazyLoadHandler(
headlessBrowser: headlessBrowser ?? HeadlessBrowser(),
logger: Logger('WebScraper'),
) {
// Initialize pagination handler after construction
_paginationHandler =
paginationHandler ??
PaginationHandler(webScraper: this, logger: Logger('WebScraper'));
}
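As a sketch of overriding the built-in defaults, the collaborators can also be injected explicitly. The example below reuses only constructors that appear in the initializer list above (ProxyHttpClient and ScrapingLogger); the specific option values are illustrative assumptions, not recommended settings.

// Hypothetical setup: `proxyManager` is an existing, configured ProxyManager.
final scraper = WebScraper(
  proxyManager: proxyManager,
  httpClient: ProxyHttpClient(
    proxyManager: proxyManager,
    useValidatedProxies: false, // e.g. allow unvalidated proxies
    rotateProxies: true,
  ),
  logger: ScrapingLogger(),   // share one logger with the default RobotsTxtHandler
  respectRobotsTxt: false,    // e.g. when scraping internal test targets
);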