Java爬虫01-httpclient和jsoup

Httpclient

GET请求

百度一下

百度搜索林俊杰,爬取搜索结果页下载为html

1
2
3
4
5
6
7
<dependencies>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.13</version>
</dependency>
</dependencies>
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
    /**
     * Crawls the Baidu search result page for "林俊杰" (URL-encoded in the query
     * string) and saves the response body to ./search.html.
     *
     * Fixes vs. original: the input stream was closed twice, and neither the
     * response nor the client was ever closed — all resources are now managed
     * with try-with-resources.
     */
    public static void main(String[] args) throws IOException {
        // https://www.baidu.com/s?wd=林俊杰 (wd parameter is URL-encoded)
        HttpGet httpGet = new HttpGet("https://www.baidu.com/s?wd=%E6%9E%97%E4%BF%8A%E6%9D%B0");
        // Set a browser-like User-Agent header so the server treats us as a normal browser.
        httpGet.addHeader("user-agent","Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36");

        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpGet)) {
            StatusLine statusLine = response.getStatusLine();
            System.out.println(statusLine.getStatusCode());

            HttpEntity entity = response.getEntity();
            File file = new File("./search.html");
            // Stream the body to disk in 1 KiB chunks; try-with-resources
            // guarantees both streams are closed exactly once.
            try (InputStream is = entity.getContent();
                 FileOutputStream fileOutputStream = new FileOutputStream(file)) {
                byte[] bytes = new byte[1024];
                int ch;
                while ((ch = is.read(bytes)) != -1) {
                    fileOutputStream.write(bytes, 0, ch);
                }
                fileOutputStream.flush();
            }

            // Alternative: read the whole body as a string first.
            // String responseBody = EntityUtils.toString(entity, "utf-8");
            // Files.write(Paths.get("./search.html"), responseBody.getBytes("utf-8"));
        }
    }

image-20240908223509742

POST请求

百度翻译

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
/**
 * Sends a JSON POST request to the Baidu Translate endpoint to translate
 * {@code searchWord} from Chinese to English, then prints the raw response.
 *
 * Fixes vs. original: the client and response are now closed via
 * try-with-resources, and the response body is decoded explicitly as UTF-8
 * instead of the platform default charset.
 */
public static void main(String[] args) throws IOException {
    String searchWord = "爬虫";
    // Template request body captured from the browser; "query" and
    // "milliTimestamp" are overwritten below before sending.
    String defaultReqBody = "{\n" +
            " \"from\": \"zh\",\n" +
            " \"to\": \"en\",\n" +
            " \"reference\": \"\",\n" +
            " \"corpusIds\": [\n" +
            " \n" +
            " ],\n" +
            " \"qcSettings\": [\n" +
            " \"1\",\n" +
            " \"2\",\n" +
            " \"3\",\n" +
            " \"4\",\n" +
            " \"5\",\n" +
            " \"6\",\n" +
            " \"7\",\n" +
            " \"8\",\n" +
            " \"9\",\n" +
            " \"10\",\n" +
            " \"11\"\n" +
            " ],\n" +
            " \"needPhonetic\": false,\n" +
            " \"domain\": \"common\",\n" +
            " \"milliTimestamp\": 1725808602840\n" +
            "}";

    HttpPost httpPost = new HttpPost("https://fanyi.baidu.com/ait/text/translate");
    JSONObject params = JSONObject.parseObject(defaultReqBody);
    params.put("query", searchWord);
    // Use the current time so the request does not look replayed.
    params.put("milliTimestamp", System.currentTimeMillis());
    httpPost.setEntity(new StringEntity(params.toString(), "UTF-8"));
    httpPost.addHeader("Content-Type", "application/json");

    // Alternative: form-encoded POST to the suggestion endpoint.
    // HttpPost httpPost = new HttpPost("https://fanyi.baidu.com/sug");
    // List<BasicNameValuePair> formData = Arrays.asList(new BasicNameValuePair("kw", searchWord));
    // httpPost.setEntity(new UrlEncodedFormEntity(formData));

    System.out.println(params);
    try (CloseableHttpClient httpClient = HttpClients.createDefault();
         CloseableHttpResponse response = httpClient.execute(httpPost)) {
        HttpEntity entity = response.getEntity();
        // Decode explicitly as UTF-8; the no-charset overload falls back to
        // ISO-8859-1/platform defaults and garbles Chinese text.
        String result = EntityUtils.toString(entity, "UTF-8");
        System.out.println(result);
    }
}

image-20240909233515742

Jsoup爬取数据

1
2
3
4
5
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.11.3</version>
</dependency>
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
/**
 * Fetches the JD.com home page with jsoup and prints the page title plus the
 * text of the list items inside the "#navitems-group1" element.
 *
 * Fix vs. original: {@code getElementById} returns null when the element is
 * absent (JD renders parts of the page via JS, so the raw HTML may lack it),
 * which previously caused a NullPointerException — now guarded explicitly.
 */
public static void main(String[] args) throws IOException {
    URL url = new URL("https://www.jd.com/?cu=true");
    // 5000 ms connect/read timeout.
    Document document = Jsoup.parse(url, 5000);
    Elements elements = document.getElementsByTag("title");
    System.out.println(elements.get(0).text());

    // Element-API lookup; may be null if the nav is injected client-side.
    Element navGroup = document.getElementById("navitems-group1");
    if (navGroup != null) {
        Elements liElements = navGroup.getElementsByTag("li");
        System.out.println(liElements.stream().map(e -> e.text()).collect(Collectors.toList()));
    } else {
        System.out.println("#navitems-group1 not found (page likely rendered via JS)");
    }

    // CSS-selector lookup for the same container; select() never returns null,
    // just an empty Elements, so no guard is needed here.
    Elements select = document.select("#navitems-group1");
    System.out.println(select.stream().map(e -> e.text()).collect(Collectors.toList()));
}
京东(JD.COM)-正品低价、品质保障、配送及时、轻松购物!
[企业会员, 大牌奥莱, 京东五金城, 进口好物]
[企业会员 大牌奥莱 京东五金城 进口好物]

image-20240910231627632

发现加载结果与实际访问结果不一致,说明京东做了一定的反爬或者前端使用Ajax异步加载,这些往往比较难以识别爬取,需要依赖框架或者浏览器驱动来模拟爬取

结论

​ 目前主流爬虫框架底层也是采用Httpclient和Jsoup来发起请求和解析HTML