PHP dynamic website crawling problem

URL http://app1.sfda.gov.cn/datas.

[Screenshot: clipboard.png]

<?php
// Fetches one CFDA detail page through curlOpen() and stores the raw response in cfda.txt.
require_once "curl.func.php";

// The URL must not carry trailing whitespace: a stray space inside the literal
// becomes part of the request target sent to the server.
$url = "http://app1.sfda.gov.cn/datasearch/face3/content.jsp?tableId=25&tableName=TABLE25&tableView=%B9%FA%B2%FA%D2%A9%C6%B7&Id=29813&bcId=124356560303886909015737447882";

// Cookie string copied from a browser session.
// NOTE(review): these values look session-bound and will presumably expire; refresh them from the browser if the response changes.
$cookie = "JSESSIONID=CB48D05599167A38CFCDCB53416B6AE1.7; FSSBBIl1UgzbN7N82S=vZCYhV7eHgrdkkhDoKwBA2ck5t.Y0NbG8rONrlN7HoM_GuZzRR6fNkJkR7MJF3u_; FSSBBIl1UgzbN7N82T=2gjgV3eNRxQ1nzOjyrE_N4bSy84kQZ6HotJaeBD3VycZ4kDwb.PVnyEC0aiuxiuFTKyJXv_pFn150mftlM9Yqo4_MKfuJuWrCkEjcOwXZaaZnqPAXlurB5n5wtzNlBShlr1BMYc_g7I9dSbJFg2pdyyW4S3d4DwpxPwQfwYlY1SA758_pgEakKCZafgq_13s2_QXWHN0JKsU_1geEVR2ymIqyNFt7yOTTjorHW2_crSBlqfhnF9kGgGIak1K_83t_jA3SBf6aCp6pp_6UotA50yP6Wb5mGb_4enYZnEYmY23wgeX984XbcM3Jkf0keLOpjjGjuzqIUUXZMNoSBUL286ZJvrmuIcYknISGHtYBSxRFJz62v9auesdmkflTIaF_ta5PUjx0Nml_ejCKW0ynSEEp";

$config = array();

// Use the dedicated "useragent" option of curlOpen() instead of a raw header.
$config["useragent"] = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36";

// curlOpen() treats $config["cookie"] as a cookie *file path* (CURLOPT_COOKIEJAR /
// CURLOPT_COOKIEFILE), so a raw "name=value; ..." string placed there is never sent.
// Send the browser cookies as an explicit request header instead.
$config["header"][] = "Cookie: " . $cookie;
$config["header"][] = "Cache-Control: max-age=0";
$config["header"][] = "Upgrade-Insecure-Requests: 1";
// No Content-Type header: this is a GET without a request body.
$config["header"][] = "Accept-Encoding: gzip, deflate";
$config["header"][] = "Accept-Language: zh-CN,zh;q=0.9";
$config["header"][] = "Accept:text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8";

$result = curlopen($url, $config);

// curlOpen() returns false when the transfer fails; do not silently write an empty file.
if ($result === false) {
    throw new RuntimeException("Request failed: " . $url);
}

if (file_put_contents("cfda.txt", $result) === false) {
    throw new RuntimeException("Could not write cfda.txt");
}

Code of the curlOpen function:

/**
 * Perform an HTTP request with cURL and return the result of curl_exec().
 *
 * Recognised $config keys (defaults in parentheses):
 *  - "post"      (false)  POST payload; an array is URL-encoded with http_build_query()
 *                         unless "isupfile" is not false, in which case it is passed to cURL as is.
 *  - "referer"   ($url)   Value of the Referer header.
 *  - "cookie"    ("")     Path of a cookie FILE used as both cookie jar and cookie source.
 *                         This is not a cookie string; to send a raw "name=value; ..." string,
 *                         add a "Cookie: ..." entry to "header".
 *  - "useragent" (MSIE 8) Value of the User-Agent header.
 *  - "timeout"   (20)     Total timeout in seconds.
 *  - "return"    (true)   Passed to CURLOPT_RETURNTRANSFER.
 *  - "proxy"     ("")     Proxy address; "userpwd" supplies the proxy credentials.
 *  - "nobody"    (false)  Passed to CURLOPT_NOBODY.
 *  - "header"    (array)  List of raw request header lines. A special "ip" key is expanded
 *                         into X-FORWARDED-FOR and CLIENT-IP headers.
 *  - "gzip"      (true)   Ask for and decode gzip/deflate responses.
 *  - "ssl"       (false)  When true, DISABLES TLS peer and host verification (insecure).
 *  - "isupfile"  (false)  See "post".
 *  - "maxredirs" (0)      Passed to CURLOPT_MAXREDIRS. The default of 0 keeps the historical
 *                         behaviour in which no redirect is followed.
 *
 * Response headers are always included in the output because CURLOPT_HEADER is enabled.
 *
 * @param string $url    Target URL.
 * @param array  $config Option overrides, merged over the defaults listed above.
 * @return string|bool   Result of curl_exec(): the response text when "return" is true,
 *                       otherwise true; false when the handle cannot be created or the
 *                       transfer fails.
 */
function curlOpen($url, $config = array())
{
    $arr = array(
        "post" => false,
        "referer" => $url,
        "cookie" => "",
        "useragent" => "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; SLCC1; .NET CLR 2.0.50727; .NET CLR 3.0.04506; customie8)",
        "timeout" => 20,
        "return" => true,
        "proxy" => "",
        "userpwd" => "",
        "nobody" => false,
        "header" => array(),
        "gzip" => true,
        "ssl" => false,
        "isupfile" => false,
        "maxredirs" => 0,
    );
    $arr = array_merge($arr, $config);

    $ch = curl_init();
    if ($ch === false) {
        return false;
    }

    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, $arr["return"]);
    curl_setopt($ch, CURLOPT_NOBODY, $arr["nobody"]);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($ch, CURLOPT_USERAGENT, $arr["useragent"]);
    curl_setopt($ch, CURLOPT_REFERER, $arr["referer"]);
    curl_setopt($ch, CURLOPT_TIMEOUT, $arr["timeout"]);
    curl_setopt($ch, CURLOPT_MAXREDIRS, $arr["maxredirs"]);
    // Include the response headers in the returned text.
    curl_setopt($ch, CURLOPT_HEADER, true);

    if ($arr["gzip"]) {
        curl_setopt($ch, CURLOPT_ENCODING, "gzip,deflate");
    }

    if ($arr["ssl"]) {
        // Insecure: certificate and host name checks are switched off.
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
        curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 0);
    }

    if (!empty($arr["cookie"])) {
        // "cookie" is a file path: cookies are read from it and written back to it.
        curl_setopt($ch, CURLOPT_COOKIEJAR, $arr["cookie"]);
        curl_setopt($ch, CURLOPT_COOKIEFILE, $arr["cookie"]);
    }

    if (!empty($arr["proxy"])) {
        curl_setopt($ch, CURLOPT_PROXY, $arr["proxy"]);
        if (!empty($arr["userpwd"])) {
            curl_setopt($ch, CURLOPT_PROXYUSERPWD, $arr["userpwd"]);
        }
    }

    // Expand the pseudo-header "ip" into the two client-address headers.
    if (!empty($arr["header"]["ip"])) {
        array_push(
            $arr["header"],
            "X-FORWARDED-FOR:" . $arr["header"]["ip"],
            "CLIENT-IP:" . $arr["header"]["ip"]
        );
        unset($arr["header"]["ip"]);
    }

    // Drop empty header entries before handing the list to cURL.
    $arr["header"] = array_filter($arr["header"]);
    if (!empty($arr["header"])) {
        curl_setopt($ch, CURLOPT_HTTPHEADER, $arr["header"]);
    }

    if (!empty($arr["post"])) {
        curl_setopt($ch, CURLOPT_POST, true);
        if (is_array($arr["post"]) && $arr["isupfile"] === false) {
            $post = http_build_query($arr["post"]);
        } else {
            // File uploads (or pre-encoded strings) are passed through untouched.
            $post = $arr["post"];
        }
        curl_setopt($ch, CURLOPT_POSTFIELDS, $post);
    }

    $result = curl_exec($ch);
    curl_close($ch);

    return $result;
}

curlOpen never returns the desired content. That is, the content returned by the GET request is inconsistent with what the browser displays.

Php
Mar.29,2021

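The data may be loaded separately via AJAX, so you need to capture the network traffic (for example in the browser's developer tools) and look at which requests are actually made.
I also suggest that you post the code of curlOpen so we can take a look.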


You can try a headless browser.
For dynamic websites, you can first visit the page with a headless browser, get the result after the JavaScript has run, and then analyze the resulting HTML structure.
For example, I know of PhantomJS, and there are many similar tools.

.
Menu