PHP 下基于 curl_multi 的并发(俗称“多线程”)采集函数。

/**
 * Build one cURL easy handle configured for fetching a single page.
 *
 * Declared at file level (not inside curl_multi()) because PHP registers a
 * nested named function globally the first time the enclosing function runs;
 * a second call to curl_multi() would then abort with
 * "Cannot redeclare createCh()".
 *
 * @param string $url Address to fetch; also sent as the Referer header.
 * @return resource|\CurlHandle|false The configured handle, or false if curl_init() failed.
 */
function createCh($url) {
    $ch = curl_init();
    if ($ch === false) {
        return false;
    }
    curl_setopt($ch, CURLOPT_URL, $url);
    // Present ourselves as a desktop browser.
    curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko');
    // Use the target URL itself as the referer.
    curl_setopt($ch, CURLOPT_REFERER, $url);
    // Ask for gzip-compressed responses; cURL decodes them transparently.
    curl_setopt($ch, CURLOPT_ENCODING, "gzip");
    // Hand the body back to the caller instead of printing it.
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    // Follow 301/302 redirects, but give up after 5 hops.
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
    curl_setopt($ch, CURLOPT_MAXREDIRS, 5);
    // NOTE(review): TLS peer and host-name verification are both disabled, so
    // HTTPS responses are not authenticated. Kept as in the original so hosts
    // with broken certificates still work; enable both if integrity matters.
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, FALSE);
    curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, FALSE);
    // Upper bound, in seconds, for one whole transfer.
    curl_setopt($ch, CURLOPT_TIMEOUT, 20);
    // Do not include the response headers in the returned body.
    curl_setopt($ch, CURLOPT_HEADER, 0);
    return $ch;
}

/**
 * Fetch several URLs concurrently through the cURL multi interface.
 *
 * @param array $urls URLs to fetch; the keys are preserved in the result.
 * @return array|false Response bodies keyed like $urls. A URL whose transfer
 *                     did not complete successfully has no entry. Returns
 *                     false when $urls is not a non-empty array.
 */
function curl_multi($urls) {
    if (!is_array($urls) || count($urls) === 0) {
        return false;
    }

    $handle = curl_multi_init();
    $curl = array();
    foreach ($urls as $k => $url) {
        $ch = createCh($url);
        if ($ch === false) {
            continue;
        }
        if (curl_multi_add_handle($handle, $ch) !== CURLM_OK) {
            curl_close($ch);
            continue;
        }
        $curl[$k] = $ch;
    }

    // Drive all transfers. curl_multi_select() blocks until there is socket
    // activity; it returns -1 when it has nothing to wait on, in which case we
    // pause briefly ourselves so the loop does not spin at 100% CPU.
    $active = 0;
    do {
        $mrc = curl_multi_exec($handle, $active);
        if ($mrc === CURLM_OK && $active && curl_multi_select($handle, 1.0) === -1) {
            usleep(1000);
        }
    } while ($active && ($mrc === CURLM_OK || $mrc === CURLM_CALL_MULTI_PERFORM));

    // Per-transfer results are only reported through the multi handle's message
    // queue; curl_error() on an easy handle is not a reliable indicator here.
    $succeeded = array();
    while (($info = curl_multi_info_read($handle)) !== false) {
        if ($info['msg'] === CURLMSG_DONE && $info['result'] === CURLE_OK) {
            $key = array_search($info['handle'], $curl, true);
            if ($key !== false) {
                $succeeded[$key] = true;
            }
        }
    }

    $text = array();
    foreach ($curl as $k => $ch) {
        if (isset($succeeded[$k])) {
            $text[$k] = (string) curl_multi_getcontent($ch);
        }
        curl_multi_remove_handle($handle, $ch);
        curl_close($ch);
    }
    curl_multi_close($handle);

    return $text;
}

用法:直接调用 curl_multi($urls),其中 $urls 是由要采集的多条网址组成的数组;函数返回以相同键名对应的页面内容数组,若 $urls 不是非空数组则返回 false。

Last modification:October 2nd, 2020 at 06:18 pm
如果觉得我的文章对你有用,请随意赞赏