if ( ! empty( $current_crawler['uid'] ) ) {
if ( empty( $this->_server_ip ) ) {
self::debug( '🛑 Terminated crawler due to Server IP not set' );
$vary_name = $this->cls( 'Vary' )->get_vary_name();
$vary_val = $this->cls( 'Vary' )->finalize_default_vary( $current_crawler['uid'] );
$this->_crawler_conf['cookies'][ $vary_name ] = $vary_val;
$this->_crawler_conf['cookies']['litespeed_hash'] = Router::cls()->get_hash( $current_crawler['uid'] );
* Get crawler duration allowance.
// Works out the crawler run-duration allowance in seconds.
// Uses the LITESPEED_CRAWLER_DURATION constant (cast to int) when it is defined,
// otherwise falls back to 900.
public function get_crawler_duration() {
$run_duration = defined( 'LITESPEED_CRAWLER_DURATION' ) ? (int) constant( 'LITESPEED_CRAWLER_DURATION' ) : 900;
// 900 seconds is the ceiling as well as the default: larger values are clamped.
if ( $run_duration > 900 ) {
$run_duration = 900; // reset to default value if defined higher than 900 seconds.
// Starts one crawl run: refreshes the thread count, records the start time,
// derives the time budget for this run, then calls _prepare_running() and
// _terminate_running().
private function _engine_start() {
// The thread count is load-dependent; a count of 0 is treated as a heavy-load stop.
$this->_adjust_current_threads();
if ( 0 === (int) $this->_cur_threads ) {
$this->_end_reason = 'stopped_highload';
self::debug( 'Stopped due to heavy load.' );
// Record when this run started.
self::save_summary( [ 'last_start_time' => time() ] );
// Read PHP's configured execution time limit (seconds).
$max_time = (int) ini_get( 'max_execution_time' );
self::debug( 'ini_get max_execution_time=' . $max_time );
$max_time = 300; // hardlimit.
// Never plan to run longer than the configured run_duration.
if ( $max_time >= (int) $this->_crawler_conf['run_duration'] ) {
$max_time = (int) $this->_crawler_conf['run_duration'];
self::debug( 'Use run_duration setting as max_execution_time=' . $max_time );
// Otherwise try to raise PHP's limit to run_duration plus a 15-second margin;
// when ini_set() succeeds, run_duration becomes the budget.
// phpcs:ignore WordPress.PHP.IniSet.max_execution_time_Disallowed -- Required for crawler functionality.
} elseif ( ini_set( 'max_execution_time', $this->_crawler_conf['run_duration'] + 15 ) !== false ) {
$max_time = $this->_crawler_conf['run_duration'];
self::debug( 'ini_set max_execution_time=' . $max_time );
self::debug( 'final max_execution_time=' . $max_time );
// Turn the budget into an absolute deadline (Unix timestamp); _do_running()
// compares its last update time against this value.
$this->_max_run_time = $max_time + time();
$this->_prepare_running();
$this->_terminate_running();
* @return int Load or -1 if unsupported.
// Reads the system load average through sys_getloadavg() and logs it.
// _adjust_current_threads() treats a result of -1 from this method as
// "sys_getloadavg unavailable".
public function get_server_load() {
// sys_getloadavg() is not implemented on every platform, so probe for it first.
if ( ! function_exists( 'sys_getloadavg' ) ) {
$curload = sys_getloadavg();
// Index 0 is the most recent (1-minute) load sample.
$curload = (float) $curload[0];
self::debug( 'Server load: ' . $curload );
* Adjust threads dynamically.
// Recalculates $this->_cur_threads from the current server load relative to the
// configured load_limit, and stamps $this->_cur_thread_time with the current time.
private function _adjust_current_threads() {
$curload = $this->get_server_load();
// -1 signals that the load could not be read (sys_getloadavg missing).
if ( -1 === (int) $curload ) {
self::debug( 'set threads=0 due to func sys_getloadavg not exist!' );
// Normalise the raw load by the CPU count held in $this->_ncpu.
$curload /= (float) $this->_ncpu;
// Upper bound on threads: LITESPEED_CRAWLER_THREADS when defined, otherwise 3.
$crawler_threads = defined( 'LITESPEED_CRAWLER_THREADS' ) ? (int) constant( 'LITESPEED_CRAWLER_THREADS' ) : 3;
$load_limit = (float) $this->_crawler_conf['load_limit'];
$current_threads = (int) $this->_cur_threads;
// NOTE(review): -1 looks like the "thread count not initialised yet" marker — confirm.
if ( -1 === $current_threads ) {
if ( $curload > $load_limit ) {
} elseif ( $curload >= ( $load_limit - 1 ) ) {
// Derive a thread count from the headroom below the limit, capped at $crawler_threads.
$curthreads = (int) ( $load_limit - $curload );
if ( $curthreads > $crawler_threads ) {
$curthreads = $crawler_threads;
// Start from the thread count currently in use.
$curthreads = $current_threads;
// Load is at least 1 above the limit: pause before continuing.
if ( $curload >= ( $load_limit + 1 ) ) {
sleep( 5 ); // sleep 5 secs.
if ( $curthreads >= 1 ) {
} elseif ( $curload >= $load_limit ) {
} elseif ( ( $curload + 1 ) < $load_limit ) {
if ( $curthreads < $crawler_threads ) {
// Publish the new thread count and remember when it was adjusted;
// _do_running() re-adjusts once more than 60 seconds have passed since this stamp.
$this->_cur_threads = (int) $curthreads;
$this->_cur_thread_time = time();
// Resets the run-state fields in $this->_summary before crawling begins.
private function _prepare_running() {
// is_running stores the current timestamp rather than a boolean flag.
$this->_summary['is_running'] = time();
$this->_summary['done'] = 0; // reset done status.
$this->_summary['last_status'] = 'prepare running';
$this->_summary['last_crawled'] = 0;
// Current crawler starttime mark.
// last_pos of 0 means the current crawler is starting from the top of its list.
if ( 0 === (int) $this->_summary['last_pos'] ) {
$this->_summary['curr_crawler_beginning_time'] = time();
// First crawler at position 0: a full pass is starting, so record its start
// time and refresh the map size from Crawler_Map::count_map().
if ( 0 === (int) $this->_summary['curr_crawler'] && 0 === (int) $this->_summary['last_pos'] ) {
$this->_summary['this_full_beginning_time'] = time();
$this->_summary['list_size'] = $this->cls( 'Crawler_Map' )->count_map();
// The stored end_reason is 'end' and the position is back at 0: clear the
// per-status counters for the current crawler.
if ( 'end' === $this->_summary['end_reason'] && 0 === (int) $this->_summary['last_pos'] ) {
$this->_summary['crawler_stats'][ $this->_summary['curr_crawler'] ] = [];
// Claims the crawler lane by writing LITESPEED_LANE_HASH into the lane file,
// which is json_local_path() with a '.pid' suffix.
private function _take_over_lane() {
self::debug( 'Take over lane as lane is free: ' . $this->json_local_path() . '.pid' );
// _check_valid_lane() later compares this file's content against LITESPEED_LANE_HASH.
File::save( $this->json_local_path() . '.pid', LITESPEED_LANE_HASH );
* Update lane file mtime.
// Refreshes the modification time of the lane file (json_local_path() . '.pid').
// _check_valid_lane() uses that mtime to decide whether a lane is older than 1h.
private function _touch_lane() {
// phpcs:ignore WordPress.WP.AlternativeFunctions.file_system_operations_touch
touch( $this->json_local_path() . '.pid' );
// Releases the crawler lane held through the lane file (json_local_path() . '.pid').
public function Release_lane() {
$lane_file = $this->json_local_path() . '.pid';
// Nothing to release when no lane file exists.
if ( ! file_exists( $lane_file ) ) {
self::debug( 'Release lane' );
// NOTE(review): the directive below suppresses the unlink sniff, so the lane file
// is presumably removed with unlink() here — confirm.
// phpcs:ignore WordPress.WP.AlternativeFunctions.unlink_unlink
* Check if lane is used by other crawlers.
* @param bool $strict_mode Strict check that file must exist.
* @return bool True if valid lane.
// Inspects the lane file (json_local_path() . '.pid') to decide whether this
// process may use the crawler lane.
private function _check_valid_lane( $strict_mode = false ) {
$lane_file = $this->json_local_path() . '.pid';
// Missing lane file: how this is handled depends on $strict_mode.
if ( ! file_exists( $lane_file ) ) {
self::debug( 'lane file not existed, strict mode is false [file] ' . $lane_file );
$pid = File::read( $lane_file );
// A non-empty value that differs from LITESPEED_LANE_HASH means the lane file
// was written with a different hash than this process's.
if ( $pid && LITESPEED_LANE_HASH !== $pid ) {
// If lane file is older than 1h, ignore.
// Age is measured from the file's mtime, which _touch_lane() refreshes.
if ( ( time() - filemtime( $lane_file ) ) > 3600 ) {
self::debug( 'Lane file is older than 1h, releasing lane' );
* Test port for simulator.
* @return bool true if success and can continue crawling, false otherwise.
// Probes which local port (443 first, then 80) serves a test file when the site
// host is pinned to $this->_server_ip. On success the port is defined as
// LITESPEED_CRAWLER_LOCAL_PORT and stored in $this->_summary['test_port'].
private function _test_port() {
// Without a server IP there is nothing to pin the host to.
if ( empty( $this->_server_ip ) ) {
// The current crawler has no 'uid' set.
if ( empty( $this->_crawlers[ $this->_summary['curr_crawler'] ]['uid'] ) ) {
self::debug( 'Bypass test port as Server IP is not set' );
self::debug( '❌ Server IP not set' );
// The constant is already defined, so no probing is needed.
if ( defined( 'LITESPEED_CRAWLER_LOCAL_PORT' ) ) {
self::debug( '✅ LITESPEED_CRAWLER_LOCAL_PORT already defined' );
// Don't repeat testing in 120s.
if ( ! empty( $this->_summary['test_port_tts'] ) && ( time() - (int) $this->_summary['test_port_tts'] ) < 120 ) {
// Reuse the port recorded by the previous test.
if ( ! empty( $this->_summary['test_port'] ) ) {
self::debug( '✅ Use tested local port: ' . $this->_summary['test_port'] );
define( 'LITESPEED_CRAWLER_LOCAL_PORT', (int) $this->_summary['test_port'] );
// Stamp the time of this test so the 120s throttle above applies to the next call.
$this->_summary['test_port_tts'] = time();
$options = $this->_get_curl_options();
// Write a static test file containing $home; the same value is the expected response body.
File::save( LITESPEED_STATIC_DIR . '/crawler/test_port.html', $home, true );
$url = LITESPEED_STATIC_URL . '/crawler/test_port.html';
$parsed_url = wp_parse_url( $url );
if ( empty( $parsed_url['host'] ) ) {
self::debug( '❌ Test port failed, invalid URL: ' . $url );
// CURLOPT_RESOLVE entries use the form host:port:address; this pins host:443 to the server IP.
$resolved = $parsed_url['host'] . ':443:' . $this->_server_ip;
$options[ CURLOPT_RESOLVE ] = [ $resolved ];
$options[ CURLOPT_DNS_USE_GLOBAL_CACHE ] = false;
// Exclude response headers from the output so it can be compared directly with $home.
$options[ CURLOPT_HEADER ] = false;
self::debug( 'Test local 443 port for ' . $resolved );
// cURL is intentionally used for speed; suppress sniffs in this method.
// phpcs:disable WordPress.WP.AlternativeFunctions
curl_setopt_array( $ch, $options );
curl_setopt( $ch, CURLOPT_URL, $url );
$result = curl_exec( $ch );
// The 443 attempt fails on any cURL error or when the body differs from $home.
if ( curl_errno( $ch ) || $result !== $home ) {
if ( curl_errno( $ch ) ) {
self::debug( '❌ Test port curl error: [errNo] ' . curl_errno( $ch ) . ' [err] ' . curl_error( $ch ) );
} elseif ( $result !== $home ) {
self::debug( '❌ Test port response is wrong: ' . $result );
self::debug( '❌ Test local 443 port failed, try port 80' );
// Second attempt: pin host:80 to the server IP and request the http:// URL.
$resolved = $parsed_url['host'] . ':80:' . $this->_server_ip;
$options[ CURLOPT_RESOLVE ] = [ $resolved ];
$url = str_replace( 'https://', 'http://', $url );
// Add the X-Forwarded-Proto: https header once, unless it is already present.
if ( empty( $options[ CURLOPT_HTTPHEADER ] ) || ! in_array( 'X-Forwarded-Proto: https', $options[ CURLOPT_HTTPHEADER ], true ) ) {
$options[ CURLOPT_HTTPHEADER ][] = 'X-Forwarded-Proto: https';
curl_setopt_array( $ch, $options );
curl_setopt( $ch, CURLOPT_URL, $url );
$result = curl_exec( $ch );
if ( curl_errno( $ch ) ) {
self::debug( '❌ Test port curl error: [errNo] ' . curl_errno( $ch ) . ' [err] ' . curl_error( $ch ) );
} elseif ( $result !== $home ) {
self::debug( '❌ Test port response is wrong: ' . $result );
// Port 80 answered with the expected body: define the constant and record the port.
self::debug( '✅ Test local 80 port successfully' );
define( 'LITESPEED_CRAWLER_LOCAL_PORT', 80 );
$this->_summary['test_port'] = 80;
// Port 443 answered with the expected body: define the constant and record the port.
self::debug( '✅ Tested local 443 port successfully' );
define( 'LITESPEED_CRAWLER_LOCAL_PORT', 443 );
$this->_summary['test_port'] = 443;
* @throws \Exception When lane becomes invalid during run.
// Main crawl loop: fetches URLs from the crawler map in batches, records the
// status of each response, advances the stored position, and stops when a
// limit (lane, max time, reset file, high load, crawler disabled) is hit.
private function _do_running() {
$options = $this->_get_curl_options( true );
// If is role simulator and not defined local port, check port once.
$test_result = $this->_test_port();
$this->_end_reason = 'port_test_failed';
self::debug( '❌ Test port failed, crawler stopped.' );
// Load up to self::CHUNKS map rows, starting from the stored position.
$url_chunks = $this->cls( 'Crawler_Map' )->list_map( self::CHUNKS, $this->_summary['last_pos'] );
if ( empty( $url_chunks ) ) {
// Split the rows into batches of _cur_threads; each batch goes to one _multi_request() call.
$url_chunks = array_chunk( $url_chunks, (int) $this->_cur_threads );
foreach ( $url_chunks as $rows ) {
// Strict lane check before every batch.
if ( ! $this->_check_valid_lane( true ) ) {
$this->_end_reason = 'lane_invalid';
self::debug( '🛑 The crawler lane is used by newer crawler.' );
throw new \Exception( 'invalid crawler lane' );
// Results are looked up by row id below.
$rets = $this->_multi_request( $rows, $options );
foreach ( $rows as $row ) {
// No result is stored for this row id.
if ( empty( $rets[ $row['id'] ] ) ) {
if ( 428 === (int) $rets[ $row['id'] ]['code'] ) {
// HTTP/1.1 428 Precondition Required (need to test)
$this->_end_reason = 'crawler_disabled';
self::debug( 'crawler_disabled' );
$status = $this->_status_parse( $rets[ $row['id'] ]['header'], $rets[ $row['id'] ]['code'], $row['url'] ); // B or H or M or N(nocache).
self::debug( '[status] ' . $this->_status2title( $status ) . "\t\t [url] " . $row['url'] );
// Queue the result per status and row id; the queue is handed to save_map_status() below.
$this->_map_status_list[ $status ][ $row['id'] ] = [
'code' => (int) $rets[ $row['id'] ]['code'], // 201 or 200 or 404.
// Per-crawler, per-status counter: initialise on first use, then increment.
if ( empty( $this->_summary['crawler_stats'][ $this->_summary['curr_crawler'] ][ $status ] ) ) {
$this->_summary['crawler_stats'][ $this->_summary['curr_crawler'] ][ $status ] = 0;
++$this->_summary['crawler_stats'][ $this->_summary['curr_crawler'] ][ $status ];
// update offset position.
$this->_summary['last_count'] = count( $rows );
$this->_summary['last_pos'] += $this->_summary['last_count'];
$this->_summary['last_crawled'] += $this->_summary['last_count'];
$this->_summary['last_update_time'] = $_time;
$this->_summary['last_status'] = 'updated position';
// _max_run_time is the absolute deadline computed in _engine_start().
if ( $this->_summary['last_update_time'] > $this->_max_run_time ) {
$this->_end_reason = 'stopped_maxtime';
self::debug( 'Terminated due to maxtime' );
// make sure at least each 10s save meta & map status once.
if ( $_time - $this->_summary['meta_save_time'] > 10 ) {
// The status queue is replaced with whatever save_map_status() returns.
$this->_map_status_list = $this->cls( 'Crawler_Map' )->save_map_status( $this->_map_status_list, $this->_summary['curr_crawler'] );
// check if need to reset pos each 5s.
if ( $_time > $this->_summary['pos_reset_check'] ) {
$this->_summary['pos_reset_check'] = $_time + 5;
// The reset file acts as a one-shot signal: it is deleted as part of the check.
if ( file_exists( $this->_resetfile ) && unlink( $this->_resetfile ) ) { // phpcs:ignore WordPress.WP.AlternativeFunctions.unlink_unlink
self::debug( 'Terminated due to reset file' );
// Rewind to the first crawler at position 0 and clear its counters.
$this->_summary['last_pos'] = 0;
$this->_summary['curr_crawler'] = 0;
$this->_summary['crawler_stats'][ $this->_summary['curr_crawler'] ] = [];
$this->_summary['done'] = 0;
$this->_summary['this_full_beginning_time'] = 0;
$this->_end_reason = 'stopped_reset';
// Re-evaluate the thread count once more than 60 seconds have passed since the last adjustment.
if ( ( $this->_summary['last_update_time'] - $this->_cur_thread_time ) > 60 ) {
$this->_adjust_current_threads();
if ( 0 === (int) $this->_cur_threads ) {
$this->_end_reason = 'stopped_highload';
self::debug( '🛑 Terminated due to highload' );
// NOTE(review): usleep() takes microseconds, while the status text labels the
// same run_delay value as 'ms' — confirm the intended unit.
$this->_summary['last_status'] = 'sleeping ' . (int) $this->_crawler_conf['run_delay'] . 'ms';
usleep( (int) $this->_crawler_conf['run_delay'] );
// All URLs are done for current crawler.
$this->_end_reason = 'end';
// NOTE(review): 'W' is presumably the "waiting" status counter — confirm.
$this->_summary['crawler_stats'][ $this->_summary['curr_crawler'] ]['W'] = 0;
self::debug( 'Crawler #' . $this->_summary['curr_crawler'] . ' touched end' );
* If need to resolve DNS or not.
// Decides whether requests need forced DNS resolution, based on the configured
// server IP and the crawler's 'litespeed_hash' cookie.
private function _should_force_resolve_dns() {
// A server IP has been configured.
if ( ! empty( $this->_server_ip ) ) {
// The crawler config carries a non-empty 'litespeed_hash' cookie.
if ( ! empty( $this->_crawler_conf['cookies'] ) && ! empty( $this->_crawler_conf['cookies']['litespeed_hash'] ) ) {
* Send multi curl requests.
* If res=B/N, bypass request and won't return.
* @param array<int,array<string,mixed>> $rows Rows to crawl.
* @param array $options cURL options.
* @return array<int,array{header:string,code:int}>
private function _multi_request( $rows, $options ) {
if ( ! function_exists( 'curl_multi_init' ) ) {
exit( 'curl_multi_init disabled' );
// phpcs:disable WordPress.WP.AlternativeFunctions
$crawler_drop_domain = defined( 'LITESPEED_CRAWLER_DROP_DOMAIN' ) ? (bool) constant( 'LITESPEED_CRAWLER_DROP_DOMAIN' ) : false;
foreach ( $rows as $row ) {
if ( self::STATUS_BLACKLIST === substr( $row['res'], $this->_summary['curr_crawler'], 1 ) ) {
if ( self::STATUS_NOCACHE === substr( $row['res'], $this->_summary['curr_crawler'], 1 ) ) {