ConcurrentWebScraper constructor
ConcurrentWebScraper({
  required ProxyManager proxyManager,
  int maxConcurrentTasks = 5,
  ProxyHttpClient? httpClient,
  String? defaultUserAgent,
  Map<String, String>? defaultHeaders,
  int defaultTimeout = 30000,
  int maxRetries = 3,
  ScrapingLogger? logger,
  RobotsTxtHandler? robotsTxtHandler,
  StreamingHtmlParser? streamingParser,
  bool respectRobotsTxt = true,
})
Creates a new ConcurrentWebScraper with the given parameters.

proxyManager is the proxy manager for getting proxies.
maxConcurrentTasks is the maximum number of tasks that may run concurrently.
httpClient is the HTTP client to use.
defaultUserAgent is the default user agent to use.
defaultHeaders are the default headers to use.
defaultTimeout is the default timeout for requests, in milliseconds.
maxRetries is the maximum number of retry attempts.
logger is the logger for scraping operations.
robotsTxtHandler is the robots.txt handler.
streamingParser is the streaming HTML parser.
respectRobotsTxt controls whether to respect robots.txt rules.
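As a quick illustration, here is a minimal construction sketch; the proxyManager instance and the specific values shown (concurrency, user agent, timeout) are assumptions chosen for the example, not values prescribed by the library.

// Assumes an existing ProxyManager instance; its construction is
// library-specific and not shown here.
final scraper = ConcurrentWebScraper(
  proxyManager: proxyManager,
  maxConcurrentTasks: 10, // raise the default concurrency of 5
  defaultUserAgent: 'ExampleBot/1.0', // hypothetical user agent
  defaultTimeout: 15000, // 15 seconds, expressed in milliseconds
);

Parameters left out fall back to the defaults shown in the signature above, including respectRobotsTxt = true.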
Implementation
ConcurrentWebScraper({
  required ProxyManager proxyManager,
  int maxConcurrentTasks = 5,
  ProxyHttpClient? httpClient,
  String? defaultUserAgent,
  Map<String, String>? defaultHeaders,
  int defaultTimeout = 30000,
  int maxRetries = 3,
  ScrapingLogger? logger,
  RobotsTxtHandler? robotsTxtHandler,
  StreamingHtmlParser? streamingParser,
  bool respectRobotsTxt = true,
}) : _webScraper = WebScraper(
       proxyManager: proxyManager,
       httpClient: httpClient,
       defaultUserAgent: defaultUserAgent,
       defaultHeaders: defaultHeaders,
       defaultTimeout: defaultTimeout,
       maxRetries: maxRetries,
       logger: logger,
       robotsTxtHandler: robotsTxtHandler,
       streamingParser: streamingParser,
       respectRobotsTxt: respectRobotsTxt,
     ),
     _taskQueue = ScrapingTaskQueue(
       maxConcurrentTasks: maxConcurrentTasks,
       logger: logger,
     ),
     _logger = logger ?? ScrapingLogger();
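Note that the constructor composes rather than inherits: per-request concerns (client, headers, timeout, retries, robots.txt handling) are delegated to an internal WebScraper, concurrency limiting is delegated to a ScrapingTaskQueue bounded by maxConcurrentTasks, and both share the same logger; if none is supplied, a fresh ScrapingLogger is created.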